# Data Cleaning

In [1]:
import os
# Move the working directory up one level; the relative "config.yml" path and
# the `src` package import used later are presumably resolved from there.
# NOTE: this cell is not idempotent - every re-run climbs one more directory.
os.chdir("../")

In [2]:
os.getcwd()

'/home/benchuser/code'

In [18]:
import numpy as np
import pandas as pd
import ast 
import geopandas as gpd
from shapely.geometry import Point
from src.utils import get_continent
import yaml
from pathlib import Path

In [19]:
# Read config.yml as text and parse it with the safe YAML loader.
config = yaml.safe_load(Path("config.yml").read_text())

In [20]:
# Dataset version tag; used below as a sub-directory name when building paths.
version = config['dataset']['version']
# Directory containing the versioned inputs (the metadata CSV is read from here).
working_dir = Path(config['working_dir'])
# Output directory from the config.
# NOTE(review): not referenced again in the visible cells, which write to
# hard-coded paths instead - confirm whether those should use output_dir.
output_dir = Path(config['output_dir'])
# File name of the metadata CSV located under working_dir / version.
metadata = config['metadata']['file']

In [21]:
# Load the chip metadata table for the configured dataset version.
metadata_df = pd.read_csv(working_dir / version / metadata)

In [22]:
version

'v0.40'

In [23]:
metadata_df

Unnamed: 0,chip_id,aoi_index,s2_dates,s1_dts,landsat_dts,lc,x_center,y_center,epsg
0,20125,407,"['20230214', '20230530', '20230922', '20231002']","['20230529', '20231003']","['20230213', '20230527', '20230925', '20231002']",[5],607970.0,5017940.0,32618
1,20124,407,"['20230214', '20230530', '20230922', '20231002']","['20230529', '20231003']","['20230213', '20230527', '20230925', '20231002']",[5],607970.0,5020820.0,32618
2,20123,407,"['20230214', '20230530', '20230922', '20231002']","['20230529', '20231003']","['20230213', '20230527', '20230925', '20231002']",[2],562850.0,5030420.0,32618
3,20122,407,"['20230214', '20230530', '20230922', '20231002']","['20230529', '20231003']","['20230213', '20230527', '20230925', '20231002']",[2],554210.0,5044820.0,32618
4,20121,407,"['20230214', '20230530', '20230922', '20231002']","['20230529', '20231003']","['20230213', '20230527', '20230925', '20231002']",[7],599330.0,5045780.0,32618
...,...,...,...,...,...,...,...,...,...
20121,4,1,"['20230216', '20230407', '20230701', '20231218']","['20230216', '20230405', '20230628', '20231213']","['20230215', '20230412', '20230701', '20231216']",[2],209530.0,618410.0,32634
20122,3,1,"['20230216', '20230407', '20230701', '20231218']","['20230216', '20230405', '20230628', '20231213']","['20230215', '20230412', '20230701', '20231216']",[2],227770.0,619370.0,32634
20123,2,1,"['20230216', '20230407', '20230701', '20231218']","['20230216', '20230405', '20230628', '20231213']","['20230215', '20230412', '20230701', '20231216']",[2],204730.0,619370.0,32634
20124,1,1,"['20230216', '20230407', '20230701', '20231218']","['20230216', '20230405', '20230628', '20231213']","['20230215', '20230412', '20230701', '20231216']",[2],220090.0,620330.0,32634


In [24]:
# 'lc' comes out of the CSV as a bracketed string such as "[5]"; drop the first
# and last character and convert the remainder to int.
# The dtype guard keeps the cell re-runnable: once the column is already
# integer, a second execution is a no-op instead of a TypeError from slicing
# an int.
if not pd.api.types.is_integer_dtype(metadata_df['lc']):
    metadata_df['lc'] = metadata_df['lc'].apply(lambda value: int(value[1:-1]))

In [25]:
# Keep only chips whose land-cover class is 1, 2, 5, 7, 8 or 11.
# According to the class legend further down in this notebook, this removes
# Flooded vegetation (4) as well as No Data (0), Snow/ice (9) and Clouds (10).
metadata_df = metadata_df[metadata_df['lc'].isin([1, 2, 5, 7, 8, 11])]

In [26]:
metadata_df.groupby("lc").count() 	

Unnamed: 0_level_0,chip_id,aoi_index,s2_dates,s1_dts,landsat_dts,x_center,y_center,epsg
lc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,4822,4822,4822,4822,4822,4822,4822,4822
2,1565,1565,1565,1565,1565,1565,1565,1565
5,3149,3149,3149,3149,3149,3149,3149,3149
7,212,212,212,212,212,212,212,212
8,1865,1865,1865,1865,1865,1865,1865,1865
11,8512,8512,8512,8512,8512,8512,8512,8512


In [27]:
len(metadata_df) 

20125

In [28]:
metadata_df

Unnamed: 0,chip_id,aoi_index,s2_dates,s1_dts,landsat_dts,lc,x_center,y_center,epsg
0,20125,407,"['20230214', '20230530', '20230922', '20231002']","['20230529', '20231003']","['20230213', '20230527', '20230925', '20231002']",5,607970.0,5017940.0,32618
1,20124,407,"['20230214', '20230530', '20230922', '20231002']","['20230529', '20231003']","['20230213', '20230527', '20230925', '20231002']",5,607970.0,5020820.0,32618
2,20123,407,"['20230214', '20230530', '20230922', '20231002']","['20230529', '20231003']","['20230213', '20230527', '20230925', '20231002']",2,562850.0,5030420.0,32618
3,20122,407,"['20230214', '20230530', '20230922', '20231002']","['20230529', '20231003']","['20230213', '20230527', '20230925', '20231002']",2,554210.0,5044820.0,32618
4,20121,407,"['20230214', '20230530', '20230922', '20231002']","['20230529', '20231003']","['20230213', '20230527', '20230925', '20231002']",7,599330.0,5045780.0,32618
...,...,...,...,...,...,...,...,...,...
20121,4,1,"['20230216', '20230407', '20230701', '20231218']","['20230216', '20230405', '20230628', '20231213']","['20230215', '20230412', '20230701', '20231216']",2,209530.0,618410.0,32634
20122,3,1,"['20230216', '20230407', '20230701', '20231218']","['20230216', '20230405', '20230628', '20231213']","['20230215', '20230412', '20230701', '20231216']",2,227770.0,619370.0,32634
20123,2,1,"['20230216', '20230407', '20230701', '20231218']","['20230216', '20230405', '20230628', '20231213']","['20230215', '20230412', '20230701', '20231216']",2,204730.0,619370.0,32634
20124,1,1,"['20230216', '20230407', '20230701', '20231218']","['20230216', '20230405', '20230628', '20231213']","['20230215', '20230412', '20230701', '20231216']",2,220090.0,620330.0,32634


In [29]:
metadata_df.columns

Index(['chip_id', 'aoi_index', 's2_dates', 's1_dts', 'landsat_dts', 'lc',
       'x_center', 'y_center', 'epsg'],
      dtype='object')

In [45]:
metadata_df.groupby("lc").count().max().iloc[0]

6607

In [14]:
# Print "<land-cover class> <number of chips>" on one line per class.
chips_per_class = metadata_df.groupby("lc")["chip_id"].count()
for lc_value, n_chips in chips_per_class.items():
    print(lc_value, n_chips)

1 4422
2 1187
5 3139
7 191
8 643
11 7325


In [15]:
def drop_rows(metadata_df, lc_class, count_to_drop, seed=None):
    """Randomly remove ``count_to_drop`` rows belonging to one land-cover class.

    Args:
        metadata_df: DataFrame with an ``lc`` column. It is not modified.
        lc_class: Value of ``lc`` whose rows are candidates for removal.
        count_to_drop: Number of rows of that class to remove.
        seed: Optional seed for a private random generator, making the
            selection reproducible. With ``None`` (the default) the global
            ``random`` module state is used, exactly as before.

    Returns:
        A new DataFrame without the sampled rows.

    Raises:
        ValueError: If ``count_to_drop`` is negative or exceeds the number of
            rows available in the class.
    """
    import random

    # Sorted so that the sampled labels depend only on the RNG state, not on
    # the current row order of the frame.
    candidates = sorted(metadata_df[metadata_df["lc"] == lc_class].index.values)

    if not 0 <= count_to_drop <= len(candidates):
        raise ValueError(
            f"cannot drop {count_to_drop} rows from class {lc_class!r}: "
            f"{len(candidates)} rows available"
        )

    rng = random if seed is None else random.Random(seed)
    index_to_drop = rng.sample(candidates, count_to_drop)

    return metadata_df.drop(index_to_drop)

In [16]:
# Down-sample the land-cover classes so that class sizes end up between the
# smallest class size and (smallest class size * sampling_factor).
# NOTE: this cell reassigns metadata_df and samples randomly, so re-running it
# drops further rows and gives a different selection each time.
sampling_factor = config['land_cover']['sampling_factor']

# Snapshot of the per-class chip counts before anything is dropped.
class_counts = metadata_df.groupby("lc")["chip_id"].count()
max_count = class_counts.max()
min_count = class_counts.min()

# use sampling factor to calculate correction factor, for proportional class drop quantities
max_distance = max_count - min_count
max_end_value = min_count * sampling_factor
max_distance_to_max_end_value = max_count - max_end_value
if max_distance > 0:
    # A negative ratio means the largest class is already below
    # min_count * sampling_factor; in that case nothing has to be dropped.
    correction_factor = max(0.0, max_distance_to_max_end_value / max_distance)
else:
    # All classes already have the same size; avoid dividing by zero.
    correction_factor = 0.0

# use correction factor to determine proportion of samples above min to drop for each class
# the number of samples dropped will be proportional to the number of samples above minimum
# this scales the number of samples between min and min * sampling factor

for lc_class, class_count in class_counts.items():
    class_distance = class_count - min_count
    drop_quantity = int(correction_factor * class_distance)
    if drop_quantity > 0:
        metadata_df = drop_rows(metadata_df, lc_class, drop_quantity)

In [17]:
metadata_df.groupby("lc").count() 	

Unnamed: 0_level_0,chip_id,aoi_index,s2_dates,s1_dts,landsat_dts,x_center,y_center,epsg
lc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,645,645,645,645,645,645,645,645
2,298,298,298,298,298,298,298,298
5,507,507,507,507,507,507,507,507
7,191,191,191,191,191,191,191,191
8,240,240,240,240,240,240,240,240
11,955,955,955,955,955,955,955,955


In [64]:
# Add a running 0..N-1 counter over the remaining rows; the following cells
# rename it to `chip_id`, giving the kept chips new consecutive ids.
metadata_df["index"] = np.arange(0, len(metadata_df))

In [65]:
# Keep the id from the source metadata under a new name so the column name
# `chip_id` becomes free for the renumbered ids.
metadata_df = metadata_df.rename(columns={"chip_id" : "original_chip_id"})

In [66]:
# Promote the running counter stored in "index" to be the new `chip_id`.
metadata_df = metadata_df.rename(columns={"index" : "chip_id"})

In [67]:
metadata_df.head()

Unnamed: 0,original_chip_id,aoi_index,s2_dates,s1_dts,landsat_dts,lc,x_center,y_center,epsg,chip_id
0,20449,275,"['20230309', '20230518', '20230717', '20231005']","['20230312', '20230516', '20230715', '20231007']","['20230314', '20230518', '20230720', '20231008']",1,429260.0,7164220.0,32603,0
1,20448,275,"['20230309', '20230518', '20230717', '20231005']","['20230312', '20230516', '20230715', '20231007']","['20230314', '20230518', '20230720', '20231008']",1,429260.0,7166780.0,32603,1
2,20447,275,"['20230309', '20230518', '20230717', '20231005']","['20230312', '20230516', '20230715', '20231007']","['20230314', '20230518', '20230720', '20231008']",1,429260.0,7167420.0,32603,2
3,20446,275,"['20230309', '20230518', '20230717', '20231005']","['20230312', '20230516', '20230715', '20231007']","['20230314', '20230518', '20230720', '20231008']",1,427980.0,7167420.0,32603,3
4,20445,275,"['20230309', '20230518', '20230717', '20231005']","['20230312', '20230516', '20230715', '20231007']","['20230314', '20230518', '20230720', '20231008']",1,429260.0,7168060.0,32603,4


In [None]:
# {'No Data': 0,
#  'Water': 1,
#  'Trees': 2,
#  'Flooded vegetation': 4,
#  'Crops': 5,
#  'Built area': 7,
#  'Bare ground': 8,
#  'Snow/ice': 9,
#  'Clouds': 10,
#  'Rangeland': 11}

In [68]:
def add_point(row):
    """Return the chip centre as a point geometry reprojected to EPSG:4326.

    Args:
        row: Mapping or Series providing ``x_center`` and ``y_center``
            (coordinates expressed in the CRS identified by ``epsg``) and
            ``epsg`` (numeric EPSG code of that CRS).

    Returns:
        The reprojected point geometry.
    """
    # A row-wise DataFrame.apply over mixed int/float columns hands the code
    # over as a float (e.g. 32618.0); cast it so the CRS string is "EPSG:32618".
    # Building the string from a local also avoids nesting double quotes inside
    # the f-string, which is a SyntaxError before Python 3.12.
    epsg_code = int(row["epsg"])
    point = Point(row["x_center"], row["y_center"])
    gdf = gpd.GeoDataFrame([{'geometry': point}], crs=f"EPSG:{epsg_code}")
    gdf_reprojected = gdf.to_crs(epsg=4326)

    return gdf_reprojected.geometry.iloc[0]

In [69]:
# Compute one EPSG:4326 point per chip; only the three columns that add_point
# reads are passed to the row-wise apply.
metadata_df["geometry"] = metadata_df[["x_center", "y_center", "epsg"]].apply(add_point, axis=1)

In [None]:
# Wrap the table in a GeoDataFrame, using the computed points as the geometry
# column and declaring their CRS as EPSG:4326.
metadata_gdf = gpd.GeoDataFrame(metadata_df, geometry="geometry", crs="EPSG:4326")

In [None]:
# Write the cleaned metadata next to the exported chips for this version.
# The folder is created first so the export also works on a fresh machine.
# NOTE(review): the path is hard-coded although `output_dir` is loaded from the
# config - confirm whether it should be used here.
final_dir = Path(f"/home/benchuser/final_data/{version}")
final_dir.mkdir(parents=True, exist_ok=True)
metadata_gdf.to_csv(final_dir / "cleaned_df.csv", index=False)

In [None]:
import shutil

# Copy every kept Sentinel-2 chip to the versioned output folder, renaming it
# from its original chip id to the renumbered chip id, then zip that folder.
source_dir = Path("/home/benchuser/data")
final_dir = Path(f"/home/benchuser/final_data/{version}")
final_dir.mkdir(parents=True, exist_ok=True)

# The metadata stores the Sentinel-2 acquisition dates in `s2_dates`; fall back
# to a plain `dates` column for metadata files that use that name.
dates_column = "s2_dates" if "s2_dates" in metadata_df.columns else "dates"

for _, chip in metadata_df.iterrows():
    # Cast to int so the zero-padded `:06` format works regardless of how the
    # row-wise iteration typed the values.
    original_id = int(chip["original_chip_id"])
    new_id = int(chip["chip_id"])
    # The date list is stored as a string literal, e.g. "['20230214', ...]".
    for date in ast.literal_eval(chip[dates_column]):
        src_file = source_dir / f"s2_{original_id:06}_{date}.tif"
        dst_file = final_dir / f"s2_{new_id:06}_{date}.tif"
        shutil.copy2(src_file, dst_file)

# make_archive appends ".zip" to the base name, so the archive is written next
# to the folder (not inside it).
folder_to_zip = f'/home/benchuser/final_data/{version}'
output_zip_file = f'/home/benchuser/final_data/{version}'
shutil.make_archive(output_zip_file, 'zip', folder_to_zip)

## Sample Visualizations

In [None]:
# NOTE(review): this overrides the `version` loaded from config.yml earlier in
# the notebook (displayed there as 'v0.40'), so the cells below read the v0.30
# export - confirm that this is intended.
version = "v0.30"

In [None]:
# Load the cleaned metadata CSV for the version selected above.
cleaned_df = pd.read_csv(f'/home/benchuser/final_data/{version}/cleaned_df.csv')

In [None]:
import random
import matplotlib.pyplot as plt
import rasterio
import ast

# For each land-cover class, draw a grid of randomly sampled chips:
# one column per chip (up to 6) and one row per acquisition date (up to 4),
# then save the grid as a PNG.
n_rows, n_cols = 4, 6

# The cleaned metadata stores the Sentinel-2 dates in `s2_dates`; fall back to
# a plain `dates` column for exports that use that name.
sample_dates_column = "s2_dates" if "s2_dates" in cleaned_df.columns else "dates"

for lc in [1, 2, 5, 7, 8, 11]:
    fig, axes = plt.subplots(n_rows, n_cols, figsize = (15, 10), layout="constrained")
    # Hide every axis up front so panels without an image stay blank.
    for ax in axes.ravel():
        ax.set_axis_off()

    df = cleaned_df[cleaned_df["lc"] == lc]
    # Classes with fewer chips than columns would make sample() raise.
    df_elements = df.sample(n=min(n_cols, len(df)))

    for j, (_, chip) in enumerate(df_elements.iterrows()):
        times = ast.literal_eval(chip[sample_dates_column])
        # Only the first n_rows dates fit on the grid.
        for i, date in enumerate(times[:n_rows]):
            file_path = f'/home/benchuser/final_data/{version}/s2_{chip["chip_id"]:06}_{date}.tif'
            with rasterio.open(file_path, 'r') as src:
                img = src.read(fill_value=0)
            # Take the first three bands in reversed order, move the band axis
            # last for imshow, and scale the values by 1/5000.
            axes[i, j].imshow(np.flip(img[:3,:,:], 0).transpose(1,2,0)/5000, vmax=0.9)

    fig.savefig(f"/home/benchuser/samples_lc_{lc:02}.png", dpi = 600)
    # Display the figure now and release it, so figures do not pile up in
    # memory across loop iterations.
    plt.show()
    plt.close(fig)