In [1]:
import os
# Step up one directory so the rest of the notebook runs from the parent folder.
# NOTE: not idempotent -- re-running this cell moves up one more level each time.
os.chdir(os.pardir + "/")

In [2]:
# Show the current working directory to confirm the chdir in the first cell took effect.
os.getcwd()

'/home/benchuser/code'

In [3]:
import numpy as np
import pandas as pd
import ast 
import geopandas as gpd
from shapely.geometry import Point
from src.utils import get_continent
import yaml
from pathlib import Path
import shutil
from tqdm import tqdm

In [4]:
from src.data_cleaning import add_point, drop_rows

In [5]:
# Maps the short platform prefix (the part of a "<prefix>_dates" column name before the
# underscore) to the platform name used in the source file names.
platform_name_dict = dict(
    s2='sentinel_2',
    s1='sentinel_1',
    landsat='landsat',
)

In [6]:
# Resolve config.yml against the current working directory (set by the chdir in the
# first cell) instead of hard-coding one machine's absolute home path.
config_path = os.path.join(os.getcwd(), 'config.yml')

In [7]:
# Parse the YAML configuration and pull out the settings the following cells use.
config = yaml.safe_load(Path(config_path).read_text())

# Directories are wrapped in Path so they can be joined with "/" later on.
working_dir = Path(config['working_dir'])
output_dir = Path(config['output_dir'])

# Dataset version (used as a sub-folder name) and the metadata file name.
version = config['dataset']['version']
metadata = config['metadata']['file']

# Echo the full configuration for the record.
print(yaml.dump(config))

aoi:
  version: v0.30
chips:
  chip_size: 960
  sample_size: 960
dataset:
  version: v0.40.2
dem:
  collection: cop-dem-glo-30
  native_crs: false
  resolution: 30
  year: 2021-01-02/2021-12-31
excluded_aoi_indices:
- 12
- 25
- 46
- 60
- 81
- 153
land_cover:
  collection: io-lulc-annual-v02
  native_crs: false
  resolution: 10
  sampling_factor: null
  year: 2023-01-02/2023-12-31
landsat:
  bands:
  - coastal
  - blue
  - green
  - red
  - nir08
  - swir16
  - swir22
  - qa_pixel
  cloud_band: qa_pixel
  cloud_cover: 50
  collection: landsat-c2-l2
  native_crs: false
  platforms:
  - landsat-8
  - landsat-9
  resolution: 30
log_errors: true
metadata:
  file: metadata_df.csv
output_dir: /home/benchuser/final_data
sentinel_1:
  bands:
  - vv
  - vh
  collection: sentinel-1-rtc
  native_crs: false
  nodata_pixel_percentage: 5
  resolution: 10
sentinel_2:
  bands:
  - B01
  - B02
  - B03
  - B04
  - B05
  - B06
  - B07
  - B08
  - B8A
  - B09
  - B11
  - B12
  - SCL
  cloud_band: SCL
  clo

In [9]:
# Load the per-chip metadata table for this dataset version.
metadata_df = pd.read_csv(working_dir / version / metadata)

# ensure only desired lc classes are present
# .copy() yields an independent frame, so later column assignments do not operate on a
# slice of the frame returned by read_csv (avoids SettingWithCopyWarning).
metadata_df = metadata_df[metadata_df['lc'].isin([1, 2, 5, 7, 8, 11])].copy()

# get sampling factor, max count, and min count
# Class sizes are taken as row counts per class (size) rather than the non-null count
# of whichever column happens to come first in the CSV.
sampling_factor = config['land_cover']['sampling_factor']
class_sizes = metadata_df.groupby("lc").size()
max_count = class_sizes.max()
min_count = class_sizes.min()

In [10]:
# One more than the largest chip_id; matches the row count when ids are 0-based and
# contiguous -- presumably a sanity check on the chip numbering (TODO confirm).
metadata_df['chip_id'].max() + 1

1097

In [11]:
# Number of chips (non-null chip_id values) per land-cover class.
chips_per_class = metadata_df.groupby("lc")['chip_id'].count()
print(chips_per_class)

lc
2    1097
Name: chip_id, dtype: int64


In [12]:
# Total number of rows left after the land-cover class filter.
metadata_df.shape[0]

1097

In [13]:
# Display the filtered metadata table (the rendered output is truncated for long frames).
metadata_df

Unnamed: 0,chip_id,aoi_index,s2_dates,s1_dates,landsat_dates,lc,x_center,y_center,epsg,error_msg
0,1096,1,"['20230216', '20230407', '20230701', '20231218']","['20230216', '20230405', '20230628', '20231213']","['20230215', '20230412', '20230701', '20231216']",2,301670.0,593140.0,32634,
1,1095,1,"['20230216', '20230407', '20230701', '20231218']","['20230216', '20230405', '20230628', '20231213']","['20230215', '20230412', '20230701', '20231216']",2,277670.0,593140.0,32634,
2,1094,1,"['20230216', '20230407', '20230701', '20231218']","['20230216', '20230405', '20230628', '20231213']","['20230215', '20230412', '20230701', '20231216']",2,272870.0,593140.0,32634,
3,1093,1,"['20230216', '20230407', '20230701', '20231218']","['20230216', '20230405', '20230628', '20231213']","['20230215', '20230412', '20230701', '20231216']",2,267110.0,593140.0,32634,
4,1092,1,"['20230216', '20230407', '20230701', '20231218']","['20230216', '20230405', '20230628', '20231213']","['20230215', '20230412', '20230701', '20231216']",2,259430.0,593140.0,32634,
...,...,...,...,...,...,...,...,...,...,...
1092,4,1,,,,2,231590.0,646900.0,32634,s1_values_missing
1093,3,1,,,,2,230630.0,646900.0,32634,s1_values_missing
1094,2,1,,,,2,225830.0,646900.0,32634,s1_values_missing
1095,1,1,,,,2,210470.0,646900.0,32634,s1_values_missing


In [12]:
# use sampling factor to calculate correction factor, for proportional class drop quantities
# The factor is the fraction of each class's surplus (rows above min_count) to drop so
# that the largest class ends up at min_count * sampling_factor.
max_distance = max_count - min_count

if sampling_factor is None or max_distance <= 0:
    # Either no sampling factor is configured (YAML `null` loads as None) or every class
    # already has the same size. The ratio below would raise or be undefined, and there
    # is nothing to rebalance, so drop nothing.
    max_end_value = max_count
    max_distance_to_max_end_value = 0
    correction_factor = 0.0
else:
    max_end_value = min_count * sampling_factor
    max_distance_to_max_end_value = max_count - max_end_value
    # Clamp to [0, 1]: a target above the current maximum must not yield a negative
    # drop quantity, and a class is never cut below min_count.
    correction_factor = min(1.0, max(0.0, max_distance_to_max_end_value / max_distance))
print(max_count, min_count)

6012 405


In [None]:

# Thin out every class in proportion to how far it sits above the smallest class:
# drop_quantity = correction_factor * (class size - min_count), truncated to an int.
# The per-class counts are computed once, before any rows are dropped.
chips_per_class = metadata_df.groupby("lc").count()['chip_id']
for lc_class, class_count in chips_per_class.items():
    surplus = class_count - min_count
    metadata_df = drop_rows(metadata_df, lc_class, int(correction_factor * surplus))

# Renumber the remaining chips 0..n-1 as the new chip_id and keep the previous id
# under original_chip_id.
metadata_df = metadata_df.assign(index=np.arange(0, len(metadata_df))).rename(
    columns={"chip_id" : "original_chip_id", "index" : "chip_id"}
)

# Build one geometry per row from the chip centre and its EPSG code.
# NOTE(review): the frame is tagged EPSG:4326, so add_point presumably returns
# lon/lat points -- confirm against src.data_cleaning.
center_columns = ["x_center", "y_center", "epsg"]
metadata_df["geometry"] = metadata_df[center_columns].apply(add_point, axis=1)
metadata_gdf = gpd.GeoDataFrame(metadata_df, geometry="geometry", crs="EPSG:4326")

In [None]:
# Export the cleaned dataset: write the metadata table, then copy each chip's files from
# the working directory to the output directory under its renumbered chip_id.
source_dir = working_dir / version
version_output_dir = output_dir / version
# parents=True so a not-yet-existing output_dir does not make the export fail.
version_output_dir.mkdir(parents=True, exist_ok=True)
metadata_gdf.to_csv(version_output_dir / 'cleaned_df.csv', index=False)

date_columns = ["s2_dates", "s1_dates", "landsat_dates"]
for index, row in tqdm(metadata_df.iterrows(), total=len(metadata_gdf), desc="copying files to output dir..."):
    # Rows for chips that failed upstream carry NaN instead of a date list, and
    # ast.literal_eval raises on NaN. Such rows are skipped here.
    # NOTE(review): these rows are still written to cleaned_df.csv above -- confirm
    # whether they should be filtered out of the table as well.
    if row[date_columns].isna().any():
        continue

    # Pulled out of the f-strings: reusing the enclosing quote type inside an f-string
    # expression only parses on Python >= 3.12.
    original_id = row["original_chip_id"]
    chip_id = row["chip_id"]

    for col in date_columns:
        platform = col.split("_")[0]
        src_platform = platform_name_dict[platform]
        # The date columns hold the string repr of a list of date strings.
        for i, date in enumerate(ast.literal_eval(row[col])):
            src_stem = f"{src_platform}_{original_id:06}_{i}_{date}"
            dst_stem = f"{platform}_{chip_id:06}_{date}"
            shutil.copy2(source_dir / f"{src_stem}.tif", version_output_dir / f"{dst_stem}.tif")
            if platform in ['s2', 'landsat']:
                # The PNG paths used to be computed but never copied.
                # NOTE(review): the PNG is treated as optional (copied only when the
                # source exists) -- confirm whether a missing PNG should be an error.
                src_png = source_dir / f"{src_stem}.png"
                if src_png.exists():
                    shutil.copy2(src_png, version_output_dir / f"{dst_stem}.png")

    src_file = source_dir / f"dem_{original_id:06}.tif"
    dst_file = version_output_dir / f"dem_{chip_id:06}.tif"
    shutil.copy2(src_file, dst_file)

# NOTE(review): this archives the *working* folder (all original files), not the cleaned
# output folder, and stores the zip inside the output folder -- confirm this is intended.
folder_to_zip = working_dir / version
output_zip_file = output_dir / version / version
shutil.make_archive(output_zip_file, 'zip', folder_to_zip)
