In [1]:
import os

# Run the notebook from the repository root so that the `src.*` imports in
# the import cell resolve. The guard keeps this cell idempotent: without it,
# every re-run would climb one more directory up.
# NOTE(review): assumes the repository root is the directory that contains
# `src/` — confirm against the project layout.
if not os.path.isdir("src"):
    os.chdir("../")

In [2]:
# Display the current working directory to confirm the chdir above landed
# where expected.
os.getcwd()

'/home/benchuser/code'

In [22]:
import ast
import shutil
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
import yaml
from shapely import wkt
from shapely.geometry import Point
from tqdm import tqdm

from src.gelos_config import GELOSConfig
from src.data_cleaner import DataCleaner


In [23]:
# Load the project configuration and build the cleaner from it.
# The path is derived from the working directory (set to the repository root
# in the first cell) instead of a machine-specific absolute path.
config = GELOSConfig.from_yaml(str(Path.cwd() / 'config.yml'))
cleaner = DataCleaner(config)

In [25]:
# Load the per-chip metadata table and parse the WKT footprints into geometries.
metadata_df = pd.read_csv(cleaner.working_dir / cleaner.version / "chip_metadata.csv")
metadata_df['chip_footprint'] = gpd.GeoSeries(
    metadata_df['chip_footprint'].dropna().map(wkt.loads), crs=4326
)
metadata_gdf = gpd.GeoDataFrame(metadata_df, geometry='chip_footprint', crs=4326)

# keep only successful chips whose land_cover class is one of the desired ones;
# .copy() so the column assignments below never write into a view of the full table
keep_mask = (
    (metadata_gdf['status'] == 'success')
    & metadata_gdf['land_cover'].isin([1, 2, 5, 7, 8, 11])
)
metadata_gdf = metadata_gdf[keep_mask].copy()

# get sampling factor, max count, and min count
sampling_factor = cleaner.config.land_cover.sampling_factor
if sampling_factor:
    # rows per class; size() does not depend on any particular column existing
    # (the `chip_id` column is only created further down in this cell)
    class_counts = metadata_gdf.groupby("land_cover").size()
    max_count = class_counts.max()
    min_count = class_counts.min()

    # use sampling factor to calculate correction factor, for proportional class drop quantities
    max_distance = max_count - min_count
    max_end_value = min_count * sampling_factor
    max_distance_to_max_end_value = max_count - max_end_value

    # use correction factor to determine proportion of samples above min to drop for each class
    # the number of samples dropped will be proportional to the number of samples above minimum
    # this scales the number of samples between min and min * sampling factor
    # (max_distance > 0 guards the division when every class has the same count)
    if max_distance > 0 and max_distance_to_max_end_value > 0:
        correction_factor = max_distance_to_max_end_value / max_distance

        for land_cover_class, class_count in class_counts.items():
            class_distance = class_count - min_count
            drop_quantity = int(correction_factor * class_distance)
            # NOTE(review): `drop_rows` is not defined or imported in the visible
            # cells — confirm where it comes from before a Restart & Run All.
            metadata_gdf = drop_rows(metadata_gdf, land_cover_class, drop_quantity)

# assign new contiguous chip ids and keep the source id for file lookups
metadata_gdf["chip_id"] = np.arange(0, len(metadata_gdf))
metadata_gdf['land_cover'] = metadata_gdf['land_cover'].astype(int)

# compute the centroids once and reuse them for both coordinates
# NOTE(review): centroids are taken in EPSG:4326, which geopandas warns about
# (see the output of this cell) — confirm this is acceptable for these footprints.
centroids = metadata_gdf.geometry.centroid
metadata_gdf['x_center'] = centroids.x
metadata_gdf['y_center'] = centroids.y
metadata_gdf = metadata_gdf.rename(columns={"chip_index": "original_chip_id"})


  metadata_gdf['x_center'] = metadata_gdf.geometry.centroid.x

  metadata_gdf['y_center'] = metadata_gdf.geometry.centroid.y


In [33]:
# Number of chips remaining after cleaning.
metadata_gdf.shape[0]

77547

In [36]:
# Index the table by chip_id while keeping chip_id available as a column.
metadata_gdf = metadata_gdf.set_index('chip_id', drop=False)

In [38]:
# Preview the first rows of the cleaned metadata table.
metadata_gdf.head()

Unnamed: 0_level_0,original_chip_id,aoi_index,sentinel_2_dates,sentinel_1_dates,landsat_dates,land_cover,chip_footprint,epsg,status,chip_id,x_center,y_center
chip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,318,0,"['20230218', '20230419', '20230713', '20231230']","['20230218', '20230419', '20230712', '20231227']","['20230217', '20230524', '20230921', '20231218']",2,"POLYGON ((21.82991 4.28125, 21.82992 4.28993, ...",32634,success,0,21.825585,4.285593
1,335,0,"['20230218', '20230419', '20230713', '20231230']","['20230218', '20230419', '20230712', '20231227']","['20230217', '20230524', '20230921', '20231218']",2,"POLYGON ((21.8472 4.27254, 21.84721 4.28123, 2...",32634,success,1,21.842877,4.27689
2,336,0,"['20230218', '20230419', '20230713', '20231230']","['20230218', '20230419', '20230712', '20231227']","['20230217', '20230524', '20230921', '20231218']",2,"POLYGON ((21.85585 4.27253, 21.85586 4.28122, ...",32634,success,2,21.851527,4.276881
3,361,0,"['20230218', '20230419', '20230713', '20231230']","['20230218', '20230419', '20230712', '20231227']","['20230217', '20230524', '20230921', '20231218']",2,"POLYGON ((21.86449 4.26384, 21.8645 4.27252, 2...",32634,success,3,21.860168,4.268187
4,381,0,"['20230218', '20230419', '20230713', '20231230']","['20230218', '20230419', '20230712', '20231227']","['20230217', '20230524', '20230921', '20231218']",1,"POLYGON ((21.50116 4.25548, 21.50117 4.26416, ...",32634,success,4,21.496842,4.259824


In [None]:
# Move files into dataset destination
src_dir = cleaner.working_dir / cleaner.version
dst_dir = cleaner.output_dir / cleaner.version

# parents=True so this also works when the output root does not exist yet
dst_dir.mkdir(parents=True, exist_ok=True)
metadata_gdf.to_file(dst_dir / 'cleaned_df.geojson', driver='GeoJSON', index=False)

for _, row in tqdm(metadata_gdf.iterrows(), total=len(metadata_gdf), desc="copying files to output dir..."):
    original_id = row['original_chip_id']
    chip_id = row['chip_id']

    for col in ["sentinel_2_dates", "sentinel_1_dates", "landsat_dates"]:
        platform = col[:-6]  # strip the trailing "_dates"
        # the date columns hold stringified Python lists; parse them back
        for i, date in enumerate(ast.literal_eval(row[col])):
            # each acquisition has a .tif and a .png; files are renamed from the
            # original chip id (plus sequence index) to the new chip id
            for ext in ("tif", "png"):
                shutil.copy2(
                    src_dir / f"{platform}_{original_id:06}_{i}_{date}.{ext}",
                    dst_dir / f"{platform}_{chip_id:06}_{date}.{ext}",
                )

    shutil.copy2(src_dir / f"dem_{original_id:06}.tif", dst_dir / f"dem_{chip_id:06}.tif")

# NOTE(review): this archives the working directory (the source chips), not
# the cleaned output directory populated above — confirm that is intended.
folder_to_zip = src_dir
output_zip_file = dst_dir / cleaner.version
shutil.make_archive(output_zip_file, 'zip', folder_to_zip)


## Get a subset of successful chips

In [51]:
# Build a small sample dataset from the first 10 cleaned chips.
metadata_gdf_sample = metadata_gdf.head(10)
# sample location is derived from the working directory (repository root)
# instead of a machine-specific absolute path
cleaner.sample_dir = Path.cwd() / 'data'
cleaner.output_version = 'v0.40'

src_dir = cleaner.working_dir / cleaner.version
sample_dst = cleaner.sample_dir / cleaner.output_version
sample_dst.mkdir(parents=True, exist_ok=True)

# write only the sampled rows so the metadata matches the files copied below
metadata_gdf_sample.to_file(sample_dst / 'cleaned_df.geojson', driver='GeoJSON', index=False)

for _, row in tqdm(metadata_gdf_sample.iterrows(), total=len(metadata_gdf_sample), desc="copying files to output dir..."):
    original_id = row['original_chip_id']
    chip_id = row['chip_id']

    for col in ["sentinel_2_dates", "sentinel_1_dates", "landsat_dates"]:
        platform = col[:-6]  # strip the trailing "_dates"
        # the date columns hold stringified Python lists; parse them back
        for i, date in enumerate(ast.literal_eval(row[col])):
            # each acquisition has a .tif and a .png
            for ext in ("tif", "png"):
                shutil.copy2(
                    src_dir / f"{platform}_{original_id:06}_{i}_{date}.{ext}",
                    sample_dst / f"{platform}_{chip_id:06}_{date}.{ext}",
                )

    shutil.copy2(src_dir / f"dem_{original_id:06}.tif", sample_dst / f"dem_{chip_id:06}.tif")

copying files to output dir...: 100%|██████████| 10/10 [00:00<00:00, 227.68it/s]
