# PyRaws - RAW Granule filtering notebook. 

This notebook demonstrates the effects of a lack of preprocessing on onboard classification. 

# 1) - Imports, paths and variables

Limit CUDA visible devices.

In [None]:
# Restrict CUDA to device index 0. This runs before `torch` is imported further
# down in the notebook.
import os
os.environ['CUDA_VISIBLE_DEVICES']='0'

Autoreload.

In [None]:
# Load IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to project modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

Imports.

In [None]:
import os
import sys

# Make the directory two levels up importable before loading project-local modules.
sys.path.insert(1, os.path.join("..",".."))

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from tqdm import tqdm

from coregistration_study_notebooks_utils import generate_histograms
from pyraws.utils.constants import BAND_SPATIAL_RESOLUTION_DICT
from pyraws.utils.database_utils import DATABASE_FILE_DICTIONARY, get_cfg_file_dict

This environment variable works around odd errors about `libiomp5md.dll`. If you do not get them, you can skip this cell.

In [None]:
# Workaround for the `libiomp5md.dll` errors mentioned above: sets the
# KMP_DUPLICATE_LIB_OK environment variable for this process.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

Set torch device. Use "CUDA" as default if available.

In [None]:
# Prefer the CUDA device whenever torch reports one; otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2) - Parameters to select

Creating a requested band list and dictionary with index positions

In [None]:
# Band identifiers, in the order used by this study.
bands = [
    'B02', 'B08', 'B03', 'B10', 'B04', 'B05', 'B11',
    'B06', 'B07', 'B8A', 'B12', 'B01', 'B09',
]

# Positional selection of two consecutive bands starting at index k.
# NOTE: this selection is immediately overridden by the explicit pair below.
k = 1
requested_bands = bands[k:k + 2]  # Requested bands
requested_bands = ['B8A', 'B12']  # Requested bands

# Map every requested band to its position in the request list.
band_dict = {band: position for position, band in enumerate(requested_bands)}

# 4) - Parsing results

Create an empty `shift_lut_df`.

In [None]:
# Column layout of the shift look-up table: three key columns followed by one
# column per band pair.
database_col_names=["satellite","registration_mode","detector","S08_02","S03_08","S10_03","S04_10","S05_04","S11_05","S06_11","S07_06","S8A_07","S12_8A","S01_12","S09_01","S05_03"]
shift_col_names = database_col_names[3:]

# Build every row up front and create the frame once: `DataFrame.append` was
# removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
#
# Rows are ordered detector-major (detector 1..12), and within each detector:
# S2A/downsampling, S2A/upsampling, S2B/downsampling, S2B/upsampling.
# Later cells address rows by position, so this order must not change.
#
# Every shift cell starts as its own `[0, 0]` list, and all declared columns
# (including the last one, `S05_03`) are present.
shift_lut_records = [
    {
        "satellite": satellite,
        "registration_mode": registration_mode,
        "detector": detector,
        **{shift_col: [0, 0] for shift_col in shift_col_names},
    }
    for detector in range(1, 13)
    for satellite in ("S2A", "S2B")
    for registration_mode in ("downsampling", "upsampling")
]
shift_lut_df = pd.DataFrame(shift_lut_records, columns=database_col_names)

shift_lut_df

Create EVENT/SATELLITE dictionary.

In [None]:
# Project configuration; its "database" entry is the directory holding the CSV databases.
cfg = get_cfg_file_dict()

# Load the coregistration study database.
coregistration_db_csv = os.path.join(get_cfg_file_dict()["database"], "coregistration_study_db.csv")
df = pd.read_csv(coregistration_db_csv)

# One entry per database row: the event identifier and its satellite label.
event = df["ID_event"].to_list()
sat = df["Sat"].to_list()

# Event identifier -> satellite label (the last row wins if an identifier repeats).
event_sat_dict = dict(zip(event, sat))

Path to the directory containing the study files.

In [None]:
from glob import glob
# Relative path of the directory whose entries are listed with `glob` in the
# next cell; only the entries ending in "csv" are processed there.
coregistration_study_files=os.path.join("coregistration_study_results","coregistration_study_results_dataset_v1")

In [None]:
coregistration_files=glob(os.path.join(coregistration_study_files, "*"))


def _mean_shift(detector_df):
    """Return the rounded mean shift ``[N_v, N_h]`` of a study subset as two ints.

    Raises:
        ValueError: if the subset is empty or contains NaN. The mean is then
            NaN, which cannot be converted to ``int``.
    """
    return [
        int(np.round(np.array(detector_df["N_v"].to_list()).mean())),
        int(np.round(np.array(detector_df["N_h"].to_list()).mean())),
    ]


for c_file in tqdm(coregistration_files, desc="Processing studies..."):
    if c_file[-3:] != "csv":
        continue

    # Skip the first 20 characters of the file name, then read the two band
    # identifiers at fixed positions
    # (file names look like "coregistration_study_B8A_B11_...").
    c_file_name = c_file.split(os.sep)[-1][20:]
    # The LUT column is named after the second band followed by the first one.
    # A dedicated name is used so the global `bands` list is not overwritten.
    study_bands = [c_file_name[5:8], c_file_name[1:4]]
    # The column only depends on the file, so compute it once, outside the
    # detector loop and outside any `try` block.
    column_name = "S" + study_bands[0][1:] + "_" + study_bands[1][1:]

    study_df = (
        pd.read_csv(c_file)
        .dropna(axis=0, subset=["ID_event"])
        # Removing duplicates due to manual merging of study files
        .drop_duplicates(keep="first")
    )

    # Adding satellite column to the df.
    # There are repeated entries for every event because of multiple granules.
    study_df["satellite"] = [event_sat_dict[event_id] for event_id in study_df["ID_event"]]
    study_by_satellite = {
        satellite: study_df[study_df["satellite"] == satellite]
        for satellite in ("S2A", "S2B")
    }

    # Fallback shift per satellite, used when a detector yields no usable value.
    # It must be a list of two ints: a "[0,0]" string here would be indexed
    # character by character and written to the table as "[[,0]".
    fallback_shift = {"S2A": [0, 0], "S2B": [0, 0]}

    for detector in tqdm(range(1,13), desc= "Processing detectors..."):
        for satellite, satellite_df in study_by_satellite.items():
            try:
                shift_mean = _mean_shift(satellite_df[satellite_df["detector_number"] == detector])
                # The stored fallback is the sign-flipped shift of the last detector with data.
                # NOTE(review): presumably because neighbouring detectors shift in
                # opposite directions — confirm.
                fallback_shift[satellite] = [-shift_mean[0], -shift_mean[1]]
            except Exception as error:
                # Best effort: keep going with the fallback, but do not swallow
                # KeyboardInterrupt/SystemExit and do report what went wrong.
                print("Warning: fail to get values for: ", study_bands, detector, ". Using previous iterations values for " + satellite + ".", "(" + repr(error) + ")")
                shift_mean = fallback_shift[satellite]

            # Both registration modes ("downsampling" and "upsampling") of this
            # satellite/detector pair receive the same value.
            target_rows = (shift_lut_df["satellite"] == satellite) & (shift_lut_df["detector"] == detector)
            shift_lut_df.loc[target_rows, column_name] = "[" + str(shift_mean[0]) + "," + str(shift_mean[1]) + "]"
shift_lut_df

Fix S04_10 by using S09_10 (taken as the negated `S10_09` column) together with the chained shifts from S05_04 to S09_01.

In [None]:
# Band-pair columns that are chained, together with S10_09, to rebuild S04_10.
SHIFT=["S05_04","S11_05","S06_11", "S07_06","S8A_07", "S12_8A","S01_12", "S09_01"]
# Rows are addressed by index label.
# NOTE(review): 20-23 are presumably the four detector-6 rows of the table — confirm.
for r in [20, 21,22,23]:
    # A cell is either a "[v,h]" string or a two-element sequence. Dispatch on
    # the type explicitly instead of relying on a bare `except`.
    base = shift_lut_df.loc[r, "S10_09"]
    base = base[1:-1].split(",") if isinstance(base, str) else base

    # Start from the negated S10_09 shift, scaled by the B10/B04 resolution ratio.
    # The horizontal component caps the B10 resolution at 20.
    v=-float(base[0]) * BAND_SPATIAL_RESOLUTION_DICT["B10"]/BAND_SPATIAL_RESOLUTION_DICT["B04"]
    h=-float(base[1]) * min(BAND_SPATIAL_RESOLUTION_DICT["B10"],20)/BAND_SPATIAL_RESOLUTION_DICT["B04"]

    for shift_name in SHIFT:
        cell = shift_lut_df.loc[r, shift_name]
        components = cell[1:-1].split(",") if isinstance(cell, str) else cell
        k, l = float(components[0]), float(components[1])

        # Band whose resolution scales this shift: the first band in the column name.
        b="B"+shift_name[1:3]
        # NOTE(review): v is scaled with min(resolution, 20) here, while the
        # initial S10_09 term above uses the full B10 resolution for v —
        # confirm which is intended.
        v-=k * min(BAND_SPATIAL_RESOLUTION_DICT[b],20)/BAND_SPATIAL_RESOLUTION_DICT["B04"]
        h-=l * min(BAND_SPATIAL_RESOLUTION_DICT[b],20)/BAND_SPATIAL_RESOLUTION_DICT["B04"]

    # Stored as the string form of a list of two ints, e.g. "[3, -1]".
    shift_lut_df.loc[r, "S04_10"]=str([int(round(v)),int(round(h))])


Fix S10_03 by using S05_03, S04_10 and S05_04.

In [None]:

# Band-pair columns subtracted from S05_03 to rebuild S10_03.
SHIFT=["S04_10","S05_04"]
# Rows are addressed by index label.
# NOTE(review): 20-23 are presumably the four detector-6 rows of the table — confirm.
for r in [20, 21,22,23]:
    # A cell is either a "[v,h]" string or a two-element sequence. Dispatch on
    # the type explicitly instead of relying on a bare `except`.
    base = shift_lut_df.loc[r, "S05_03"]
    base = base[1:-1].split(",") if isinstance(base, str) else base

    # Start from the S05_03 shift, scaled by the B05/B10 resolution ratio.
    # The horizontal component caps both resolutions at 20.
    v=float(base[0]) * BAND_SPATIAL_RESOLUTION_DICT["B05"]/BAND_SPATIAL_RESOLUTION_DICT["B10"]
    h=float(base[1]) * min(BAND_SPATIAL_RESOLUTION_DICT["B05"],20)/min(BAND_SPATIAL_RESOLUTION_DICT["B10"],20)

    for shift_name in SHIFT:
        cell = shift_lut_df.loc[r, shift_name]
        components = cell[1:-1].split(",") if isinstance(cell, str) else cell
        k, l = float(components[0]), float(components[1])

        # Band whose resolution scales this shift: the first band in the column name.
        b="B"+shift_name[1:3]

        v-=k * BAND_SPATIAL_RESOLUTION_DICT[b]/BAND_SPATIAL_RESOLUTION_DICT["B10"]
        h-=l * min(BAND_SPATIAL_RESOLUTION_DICT[b],20)/min(BAND_SPATIAL_RESOLUTION_DICT["B10"],20)

    # Stored as the string form of a list of two ints, e.g. "[3, -1]".
    shift_lut_df.loc[r, "S10_03"]=str([int(round(v)),int(round(h))])

Path to the database_file.

In [None]:
# Output CSV path, inside the directory given by the "database" entry of `cfg`.
out_db_path=os.path.join(cfg["database"], "shift_lut_test.csv")

Save results.

In [None]:
# Write the look-up table to CSV. With pandas' defaults the row index is written
# as the first column as well.
shift_lut_df.to_csv(out_db_path)

# 5) - Showing histograms (WIP)

Path to the B8A vs B11 bands study.

In [None]:
# Relative path of the study results CSV for the B8A/B11 band pair (THRAWS results folder).
database_results_csv=os.path.join("coregistration_study_results","coregistration_study_results_THRAWS","coregistration_study_B8A_B11_2023_2_1_20_36_8.csv")

Parsing studies results.

This analysis is performed on the final version of the dataset. So `THRAWS` will be used.

In [None]:
# Study results: drop exact duplicate rows and renumber the index from 0.
results_df = (
    pd.read_csv(database_results_csv)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Database registered under the "THRAWS" key, read from the configured database directory.
thraws_db_csv = os.path.join(get_cfg_file_dict()["database"], DATABASE_FILE_DICTIONARY["THRAWS"])
coregistration_study_db = pd.read_csv(thraws_db_csv).reset_index(drop=True)

# Event identifier of every result row.
event_names = results_df["ID_event"]

Generate histograms of N_h for the different Sentinel-2 satellites.

In [None]:
# For every result row, take the "Sat" value of the first database row with the
# same event identifier (raises IndexError if the event is not in the database).
Sat_col = [
    coregistration_study_db.loc[coregistration_study_db["ID_event"] == event_id, "Sat"].to_list()[0]
    for event_id in event_names.to_list()
]
results_df["Sat"] = Sat_col

# N_h values per satellite, and for all rows together.
is_s2a = results_df["Sat"] == "S2A"
is_s2b = results_df["Sat"] == "S2B"
results_df_Nh_S2A = results_df.loc[is_s2a, "N_h"].to_list()
results_df_Nh_S2B = results_df.loc[is_s2b, "N_h"].to_list()
results_df_Nh = results_df["N_h"].to_list()

In [None]:
sns.set_style("whitegrid")


def _plot_shift_histogram(values, title):
    """Draw one small histogram of ``values`` (20 bins, KDE overlay) titled ``title``."""
    plt.figure(figsize=(2,2), dpi=400)
    sns.histplot(values, bins=20, kde=True)

    # Add a title and labels to the plot (axis labels are left empty).
    plt.title(title)
    plt.xlabel("")
    plt.ylabel("")
    # Show the plot
    plt.show()


# One figure for all rows, then one per satellite. The first figure shows the
# data of both satellites, so it must not be titled "Sentinel-2A".
for data, title in (
    (results_df_Nh, "Sentinel-2A and Sentinel-2B"),
    (results_df_Nh_S2A, "Sentinel-2A"),
    (results_df_Nh_S2B, "Sentinel-2B"),
):
    _plot_shift_histogram(data, title)

Generate histograms for each detector number.

In [None]:

# Attach to every result row the "Sat" value of the first database row sharing
# its event identifier.
Sat_col = [
    coregistration_study_db.loc[coregistration_study_db["ID_event"] == event_id, "Sat"].to_list()[0]
    for event_id in event_names.to_list()
]
results_df["Sat"] = Sat_col


# One histogram call per detector (1..12) and satellite, S2A before S2B.
for detector in range(1, 13):
    for satellite in ("S2A", "S2B"):
        generate_histograms(results_df, satellite, detector)

