# Data Cleaning

In [1]:
import os
import glob
import rasterio
from tqdm import tqdm
import numpy as np
import pandas as pd

In [2]:
# Collect the S2 chips. The pattern contains no "**" component, so glob's
# recursive flag would have no effect and is omitted. Sorting makes the
# processing order (and the indices assigned downstream) independent of the
# order in which the filesystem happens to list the files.
file_paths = sorted(glob.glob("data/s2*.tif"))

In [3]:
# Build one metadata row per usable S2 chip and persist the table as CSV.
# A chip is kept only when
#   * its S2 raster has exactly EXPECTED_NONZERO non-zero values,
#   * its S2 raster has exactly EXPECTED_SIZE values in total, and
#   * the matching land-cover ("lc") raster holds exactly two distinct values.
# NOTE(review): 301056 == 6 * 224 * 224 and 60000 == 6 * 100 * 100, so these
# presumably encode band count and chip / valid-area size -- confirm.
EXPECTED_NONZERO = 60000
EXPECTED_SIZE = 301056
METADATA_COLUMNS = ["index", "original_chip_id", "lc", "bounds", "epsg"]

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; concatenating DataFrames inside the loop copies the table every time.
records = []
global_index = 0
for file_path in tqdm(file_paths, desc="Processing S2 files"):
    with rasterio.open(file_path) as src:
        img = src.read(fill_value=0)

    if np.count_nonzero(img) != EXPECTED_NONZERO or img.size != EXPECTED_SIZE:
        continue

    with rasterio.open(file_path.replace("s2", "lc")) as lc_src:
        lc_img = lc_src.read(fill_value=-1)
        lc_values = np.unique(lc_img)
        if len(lc_values) != 2:
            continue
        # Read the georeferencing while the dataset is still open.
        bounds = lc_src.bounds
        epsg = lc_src.crs.to_epsg()

    # np.unique returns sorted values, so index 1 is the larger of the two.
    # NOTE(review): presumably the smaller one is the no-data value -- confirm.
    lc = lc_values[1]
    records.append(
        {
            "index": global_index,
            "original_chip_id": file_path.split("_")[-1][:-4],
            "lc": lc,
            "bounds": bounds,
            "epsg": epsg,
        }
    )
    global_index += 1

# The previous implementation prepended every new row, so the table was stored
# newest-first; the reversal keeps that row order.
metadata_df = pd.DataFrame(records[::-1], columns=METADATA_COLUMNS)

metadata_df.to_csv('/home/benchuser/data/metadata_df.csv', index=False)

Processing S2 files: 100%|████████████████████████████████████████████████████████████████████████████████████| 22160/22160 [02:11<00:00, 169.12it/s]


In [4]:
# Reload the table from the CSV written by the scan cell above
# (presumably so later cells can start here without repeating the scan).
metadata_df = pd.read_csv('/home/benchuser/data/metadata_df.csv')

In [5]:
# Row count of the reloaded metadata table.
len(metadata_df)

17710

In [6]:
# Rows per land-cover class ("lc") before any class is dropped.
metadata_df.groupby("lc").count()

Unnamed: 0_level_0,index,original_chip_id,bounds,epsg
lc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1141,1141,1141,1141
2,6940,6940,6940,6940
4,17,17,17,17
5,2166,2166,2166,2166
7,783,783,783,783
8,1395,1395,1395,1395
11,5268,5268,5268,5268


In [7]:
# Drop class 4 ("Flooded vegetation" in this notebook's class legend).
metadata_df = metadata_df[metadata_df.lc != 4]

In [8]:
# Rows per land-cover class after class 4 has been removed.
metadata_df.groupby("lc").count()

Unnamed: 0_level_0,index,original_chip_id,bounds,epsg
lc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1141,1141,1141,1141
2,6940,6940,6940,6940
5,2166,2166,2166,2166
7,783,783,783,783
8,1395,1395,1395,1395
11,5268,5268,5268,5268


In [9]:
def drop_rows(metadata_df, lc_class, count_to_drop, seed=None):
    """Return a copy of ``metadata_df`` with randomly chosen rows of one class removed.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        Table with an ``lc`` column. It is not modified.
    lc_class
        Value of ``lc`` whose rows are candidates for removal.
    count_to_drop : int
        Number of rows of that class to remove.
    seed : optional
        When given, the rows are drawn from a private ``random.Random(seed)``
        so repeated calls select the same rows. When ``None`` (the default)
        the module-level ``random`` generator is used.

    Returns
    -------
    pandas.DataFrame
        A new frame without the selected rows.

    Raises
    ------
    ValueError
        If ``count_to_drop`` is negative or larger than the number of rows
        whose ``lc`` equals ``lc_class``.
    """
    import random

    # Sorted so that, for a given generator state, the selection does not
    # depend on the current row order of the frame.
    candidates = sorted(metadata_df.loc[metadata_df.lc == lc_class].index)
    if not 0 <= count_to_drop <= len(candidates):
        raise ValueError(
            f"cannot drop {count_to_drop} rows of class {lc_class}: "
            f"only {len(candidates)} available"
        )

    rng = random if seed is None else random.Random(seed)
    index_to_drop = rng.sample(candidates, count_to_drop)
    return metadata_df.drop(index_to_drop)

In [10]:
# Thin out the three largest classes. Keys are "lc" class ids, values are the
# number of rows to remove. Re-running this cell removes further rows because
# metadata_df is reassigned in place.
rows_to_remove = {
    2: 5700,   # Trees
    5: 900,    # Crops
    11: 4000,  # Rangeland
}
for lc_class, n_rows in rows_to_remove.items():
    metadata_df = drop_rows(metadata_df, lc_class, n_rows)

In [11]:
# Rows per land-cover class after the random down-sampling.
metadata_df.groupby("lc").count()

Unnamed: 0_level_0,index,original_chip_id,bounds,epsg
lc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1141,1141,1141,1141
2,1240,1240,1240,1240
5,1266,1266,1266,1266
7,783,783,783,783
8,1395,1395,1395,1395
11,1268,1268,1268,1268


In [12]:
# Renumber the remaining rows 0..n-1 in their current order; this "index"
# value is what the copy cell further down uses for the new file names.
metadata_df["index"] = np.arange(len(metadata_df))

In [13]:
# Persist the filtered, renumbered table (the DataFrame's own index is not written).
metadata_df.to_csv('/home/benchuser/data/cleaned_df.csv', index=False)

In [14]:
# {'No Data': 0,
#  'Water': 1,
#  'Trees': 2,
#  'Flooded vegetation': 4,
#  'Crops': 5,
#  'Built area': 7,
#  'Bare ground': 8,
#  'Snow/ice': 9,
#  'Clouds': 10,
#  'Rangeland': 11}

In [15]:
import shutil

# Copy every retained S2 chip into the final dataset folder, renaming it from
# its original chip id to its new consecutive index.
os.makedirs("/home/benchuser/final_data", exist_ok=True)  # copy2 does not create folders
for chip_id, new_index in zip(metadata_df["original_chip_id"], metadata_df["index"]):
    # No quotes are nested inside the f-strings, so the cell also parses on
    # Python versions older than 3.12.
    src_file = f"/home/benchuser/data/s2_{chip_id:04}.tif"
    dst_file = f"/home/benchuser/final_data/s2_{new_index:05}.tif"
    shutil.copy2(src_file, dst_file)

In [16]:
import shutil

folder_to_zip = '/home/benchuser/final_data'
# make_archive treats this as the archive's base name and appends ".zip" itself.
output_zip_file = '/home/benchuser/data'

# Last expression of the cell, so the path of the written archive is displayed.
shutil.make_archive(base_name=output_zip_file, format='zip', root_dir=folder_to_zip)

'/home/benchuser/data.zip'

In [None]:
# Show the first row of the final table as a quick sanity check.
metadata_df.iloc[0]

## Sample Visualizations

In [None]:
# Load the cleaned table from the CSV path that the cleaning section writes to.
cleaned_df = pd.read_csv('/home/benchuser/data/cleaned_df.csv')

In [None]:
import random
import matplotlib.pyplot as plt

# Grid of randomly chosen example chips: one column per land-cover class,
# SAMPLES_PER_CLASS rows. Class names follow this notebook's class legend.
CLASS_NAMES = {
    1: "Water",
    2: "Trees",
    5: "Crops",
    7: "Built area",
    8: "Bare ground",
    11: "Rangeland",
}
SAMPLES_PER_CLASS = 4

fig, axes = plt.subplots(
    SAMPLES_PER_CLASS, len(CLASS_NAMES), figsize=(15, 10), layout="constrained"
)

for j, (lc, class_name) in enumerate(CLASS_NAMES.items()):
    df = cleaned_df[cleaned_df["lc"] == lc]
    # random.sample raises ValueError if a class has fewer rows than requested.
    im_ids = random.sample(sorted(df["index"]), SAMPLES_PER_CLASS)
    for i, im_id in enumerate(im_ids):
        file_path = f"/home/benchuser/final_data/s2_{im_id:05}.tif"
        with rasterio.open(file_path, 'r') as src:
            img = src.read(fill_value=0)
        # First three bands in reversed order, moved to (rows, cols, channels)
        # and scaled by 1/5000.
        # NOTE(review): presumably this turns B,G,R reflectance into RGB -- confirm.
        rgb = np.flip(img[:3, :, :], 0).transpose(1, 2, 0) / 5000
        # imshow ignores vmin/vmax for RGB input and clips floats to [0, 1]
        # with a warning; clipping here gives the same picture without it.
        axes[i, j].imshow(np.clip(rgb, 0, 1))
        axes[i, j].set_axis_off()
    # Label each column once, on its top panel.
    axes[0, j].set_title(class_name)

plt.savefig("samples.png", dpi=600)

In [None]:
import zipfile

In [None]:
# Unpack data.zip into final_data/ (both paths are relative to the working directory).
# NOTE(review): the archive built earlier in this notebook is reported at
# '/home/benchuser/data.zip', not 'final_data/data.zip' -- confirm this cell is
# meant to run elsewhere or after the archive has been moved.
with zipfile.ZipFile("final_data/data.zip", "r") as zip_ref:
    zip_ref.extractall("final_data/")