# Data Cleaning

In [None]:
import os
# Move the working directory one level up. NOTE(review): presumably this makes
# the project root the cwd so that `src.utils` (imported below) resolves —
# confirm. The path is relative, so re-running this cell moves up once more
# each time it is executed.
os.chdir("../")

In [None]:
import numpy as np
import pandas as pd
import ast 
import geopandas as gpd
from shapely.geometry import Point
from src.utils import get_continent

In [None]:
# Load the chip metadata table. Later cells rely on its `lc`, `chip_id`,
# `dates`, `x_center`, `y_center` and `epsg` columns.
metadata_df = pd.read_csv('/home/benchuser/data/metadata_df.csv')

In [None]:
# Non-null value count of every column, grouped by `lc` code, before any
# rows are removed.
metadata_df.groupby("lc").count()

In [None]:
# Total number of rows before any filtering.
len(metadata_df)

In [None]:
# Drop flooded vegetation (lc code 4). The original line assigned the result
# to `metadata_df` twice in one chained assignment; once is enough.
metadata_df = metadata_df[metadata_df.lc != 4]

In [None]:
# Per-`lc` counts again, now that rows with lc == 4 have been removed.
metadata_df.groupby("lc").count()

In [None]:
def drop_rows(metadata_df, lc_class, count_to_drop, random_state=None):
    """Return ``metadata_df`` without a random selection of rows of one class.

    Args:
        metadata_df: DataFrame with an ``lc`` column.
        lc_class: Value of ``lc`` whose rows are candidates for removal.
        count_to_drop: Number of rows of that class to remove.
        random_state: Optional seed. When given, rows are picked with a
            dedicated ``random.Random(random_state)`` so the selection is
            reproducible; when ``None`` the module-level ``random`` generator
            is used, exactly as before.

    Returns:
        A new DataFrame without the sampled rows; the input is not modified.

    Raises:
        ValueError: If ``count_to_drop`` is negative or larger than the number
            of rows whose ``lc`` equals ``lc_class``.
    """
    import random

    # Sorted so that a given seed always selects the same index labels,
    # independent of the current row order.
    candidates = sorted(metadata_df[metadata_df.lc == lc_class].index.values)
    if not 0 <= count_to_drop <= len(candidates):
        raise ValueError(
            f"cannot drop {count_to_drop} rows of class {lc_class}: "
            f"{len(candidates)} rows available"
        )

    rng = random if random_state is None else random.Random(random_state)
    index_to_drop = rng.sample(candidates, count_to_drop)
    return metadata_df.drop(index_to_drop)

In [None]:
# Randomly remove a fixed number of rows from selected classes, in order:
# each pair is (lc code, number of rows to drop).
for lc_class, n_rows in (
    (1, 12700),
    (2, 3998),
    (5, 5007),
    (8, 11900),
    (11, 58648),
):
    metadata_df = drop_rows(metadata_df, lc_class, n_rows)

In [None]:
# Per-`lc` counts after the random row removal above.
metadata_df.groupby("lc").count()

In [None]:
# Number the remaining rows 0..N-1 in their current order. This column is
# renamed to `chip_id` two cells below.
metadata_df["index"] = np.arange(0, len(metadata_df))

In [None]:
# Keep the source identifier under a new name; it is used later to build the
# source .tif file names.
metadata_df = metadata_df.rename(columns={"chip_id" : "original_chip_id"})

In [None]:
# The sequential numbering becomes the new `chip_id`, used later to build the
# destination .tif file names.
metadata_df = metadata_df.rename(columns={"index" : "chip_id"})

In [None]:
# Preview the first rows to check the renamed columns.
metadata_df.head()

In [None]:
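# Class name -> code table (presumably the values of the `lc` column; code 4
# is the flooded-vegetation class dropped above):
# {'No Data': 0,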
#  'Water': 1,
#  'Trees': 2,
#  'Flooded vegetation': 4,
#  'Crops': 5,
#  'Built area': 7,
#  'Bare ground': 8,
#  'Snow/ice': 9,
#  'Clouds': 10,
#  'Rangeland': 11}

In [None]:
# Release tag: selects the /home/benchuser/final_data/<version>/ output folder
# and the base name of the zip archive written by the cells below.
version = "v0.20"


In [None]:
def add_point(row):
    """Return the centre of ``row`` as a shapely point reprojected to EPSG:4326.

    Args:
        row: Mapping/Series providing ``x_center`` and ``y_center``
            coordinates and the ``epsg`` code of the CRS they are given in.

    Returns:
        The point geometry after reprojection to EPSG:4326.
    """
    # A row produced by ``DataFrame.apply(axis=1)`` over numeric columns has a
    # single common dtype, so an integer code can arrive as a float (e.g.
    # 32633.0). Normalise it so the CRS string is always "EPSG:<int>".
    epsg_code = int(row["epsg"])

    # The CRS string is built from a local name rather than nesting double
    # quotes inside a double-quoted f-string, which is a SyntaxError before
    # Python 3.12. A one-element GeoSeries is enough to reproject a point.
    source = gpd.GeoSeries(
        [Point(row["x_center"], row["y_center"])],
        crs=f"EPSG:{epsg_code}",
    )
    return source.to_crs(epsg=4326).iloc[0]

In [None]:
# Build one EPSG:4326 point per row by calling add_point row-wise.
# NOTE(review): only these three columns are passed, so each row Series has
# one common dtype; if the coordinates are floats, an integer `epsg` arrives
# in add_point as a float — confirm the CRS string it builds is still valid.
metadata_df["geometry"] = metadata_df[["x_center", "y_center", "epsg"]].apply(add_point, axis=1)

In [None]:
# Wrap the table as a GeoDataFrame, declaring the new `geometry` column as
# EPSG:4326 (the CRS add_point reprojects to).
metadata_gdf = gpd.GeoDataFrame(metadata_df, geometry="geometry", crs="EPSG:4326")

In [None]:
# Write the cleaned table (without the index column) into the versioned
# output folder. The folder is created first: nothing earlier in the notebook
# creates it, and to_csv fails if the parent directory is missing.
output_dir = f"/home/benchuser/final_data/{version}"
os.makedirs(output_dir, exist_ok=True)
metadata_gdf.to_csv(f"{output_dir}/cleaned_df.csv", index=False)

In [None]:
import shutil

# Copy every chip image into the versioned folder, renaming it from its
# original chip id to the new sequential chip id (both zero-padded to 6).
for _row_label, row in metadata_df.iterrows():
    # `dates` holds a Python literal (a sequence of date strings) as text.
    for date in ast.literal_eval(row["dates"]):
        # Single quotes inside the double-quoted f-strings: reusing the outer
        # quote character is a SyntaxError before Python 3.12.
        src_file = f"/home/benchuser/data/s2_{row['original_chip_id']:06}_{date}.tif"
        dst_file = f"/home/benchuser/final_data/{version}/s2_{row['chip_id']:06}_{date}.tif"
        shutil.copy2(src_file, dst_file)

In [None]:
import shutil

# Pack the whole versioned folder into a zip archive. `make_archive` appends
# the ".zip" extension to the base name itself.
output_zip_file = f'/home/benchuser/{version}'
folder_to_zip = f'/home/benchuser/final_data/{version}'

shutil.make_archive(base_name=output_zip_file, format='zip', root_dir=folder_to_zip)

## Sample Visualizations

In [None]:
# Release tag used by the visualization cells below.
# NOTE(review): this rebinds `version` to a different value than the "v0.20"
# used by the cleaning cells above — confirm which release should be plotted.
version = "v0.11"

In [None]:
# Reload the cleaned table of the selected release from disk.
cleaned_df = pd.read_csv(f'/home/benchuser/final_data/{version}/cleaned_df.csv')

In [None]:
import random
import matplotlib.pyplot as plt
import rasterio
import ast

# For each listed lc code, draw a 4x6 grid: 6 randomly sampled chips (one per
# column) with their dates down the rows, then save the grid as a PNG.
# The grid has 4 rows, so a chip listing more than 4 dates raises IndexError,
# and `sample(n=6)` raises ValueError for a class with fewer than 6 rows.
for lc in [1, 2, 5, 7, 8, 11]:
    fig, axes = plt.subplots(4, 6, figsize=(15, 10), layout="constrained")
    df = cleaned_df[cleaned_df["lc"] == lc]
    df_elements = df.sample(n=6)
    # `j` is the grid column of the chip, `i` the grid row of the date.
    for j, (_row_label, chip) in enumerate(df_elements.iterrows()):
        times = ast.literal_eval(chip["dates"])
        for i, time in enumerate(times):
            file_path = f'/home/benchuser/final_data/{version}/s2_{chip["chip_id"]:06}_{time}.tif'
            with rasterio.open(file_path, 'r') as src:
                img = src.read(fill_value=0)
            # First three bands in reversed order, bands moved to the last
            # axis, values scaled by 1/5000 for display. Presumably this turns
            # B,G,R reflectance into an RGB image — confirm the band order.
            rgb = np.flip(img[:3, :, :], 0).transpose(1, 2, 0) / 5000
            axes[i, j].imshow(rgb, vmax=0.9)
            axes[i, j].set_axis_off()
    plt.savefig(f"/home/benchuser/samples_lc_{lc:02}.png", dpi=600)

In [None]:
import zipfile

In [None]:
# Unpack the dataset archive into final_data/. Both paths are relative to the
# current working directory.
with zipfile.ZipFile(file="final_data/data.zip", mode="r") as zip_ref:
    zip_ref.extractall(path="final_data/")