# Data Loading and Pre-processing

## Imports and data-loading

In [23]:
# Standard imports
import pandas as pd
import numpy as np

# utilities
from sklearn.preprocessing import StandardScaler


In [24]:
# Load the raw splits and strip the index column each CSV carries
# ('Unnamed: 0' in train, 'S.No' in test). Reading and dropping in one
# expression keeps the cell safe to re-run on its own.
train = pd.read_csv('../data/orig/train.csv').drop(columns=['Unnamed: 0'])
test = pd.read_csv('../data/orig/test.csv').drop(columns=['S.No'])

In [13]:
# Preview the first five rows of the training frame
train.head(n=5)

Unnamed: 0,S2_B2_jan,S2_B3_jan,S2_B4_jan,S2_B5_jan,S2_B6_jan,S2_B7_jan,S2_B8_jan,S2_B8A_jan,S2_B9_jan,S2_B11_jan,...,S2_B11_dec,S2_B12_dec,S1_VV_dec,S1_VH_dec,ERA5_temperature_2m_dec,ERA5_total_precipitation_dec,topo_elevation_dec,topo_slope_dec,NDVI_dec,LABELS
0,-14.271277,-21.134172,957.531174,1019.557045,1230.022834,1435.138891,1693.70227,1805.411109,1891.557355,2020.567257,...,2203.025469,795.862978,2747.777894,1688.997611,289.38222,0.000147,1886.384195,0.749163,0.307887,0.0
1,-12.341429,-14.744978,949.227883,895.845779,987.517322,1350.496916,1930.239804,2048.686546,2375.680146,2259.90329,...,2922.382762,1140.79618,2528.590979,1934.261859,266.57837,0.002876,69.876216,1.385904,0.097779,1.0
2,-12.317847,-17.026201,1087.616069,1086.401035,1185.755955,1529.674085,1858.981635,1904.667487,2082.954737,2115.713139,...,2548.726966,521.102434,1556.082048,741.212901,276.467873,0.001622,91.279743,2.125908,0.418492,1.0
3,-15.332478,-20.978203,2987.224569,3163.553102,3412.572434,3908.443371,4044.232162,4231.35377,3682.709329,4518.882323,...,2537.545246,1073.588565,2388.141112,1367.682434,276.101799,0.00372,767.817294,8.427714,0.336528,1.0
4,-11.739502,-11.374262,691.825857,782.111227,478.448214,976.753128,3189.134129,3813.893119,3861.824527,4174.591554,...,4276.302643,371.861994,2080.641023,692.730434,298.663246,0.001763,61.500923,1.203899,0.595404,1.0


In [14]:
# Per-column dtypes of the training frame (pandas abbreviates the listing for wide frames)
train.dtypes

S2_B2_jan                       float64
S2_B3_jan                       float64
S2_B4_jan                       float64
S2_B5_jan                       float64
S2_B6_jan                       float64
                                 ...   
ERA5_total_precipitation_dec    float64
topo_elevation_dec              float64
topo_slope_dec                  float64
NDVI_dec                        float64
LABELS                          float64
Length: 217, dtype: object

## Removal of correlated features

In [15]:
# Threshold for removing correlated variables
threshold = 0.9

# Absolute value correlation matrix.
# NOTE(review): `train` still contains the LABELS target at this point, so the
# target takes part in the matrix as well.
corr_matrix = train.corr().abs()
# Keep only the strict upper triangle (k=1 skips the diagonal) so every pair of
# columns appears once and no column is compared with itself
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
# Preview the pairs above the threshold; everything else shows as NaN.
# Uses `threshold` rather than a repeated literal so the preview always
# matches the value used for the actual selection.
upper[upper > threshold].head(20)

Unnamed: 0,S2_B2_jan,S2_B3_jan,S2_B4_jan,S2_B5_jan,S2_B6_jan,S2_B7_jan,S2_B8_jan,S2_B8A_jan,S2_B9_jan,S2_B11_jan,...,S2_B11_dec,S2_B12_dec,S1_VV_dec,S1_VH_dec,ERA5_temperature_2m_dec,ERA5_total_precipitation_dec,topo_elevation_dec,topo_slope_dec,NDVI_dec,LABELS
S2_B2_jan,,,,,,,,,,,...,,,,,,,,,,
S2_B3_jan,,,,,,,,,,,...,,,,,,,,,,
S2_B4_jan,,,,0.996445,0.986259,0.983646,0.943895,0.900987,,,...,,,,,,,,,,
S2_B5_jan,,,,,0.993192,0.991253,0.95572,0.915622,0.914422,,...,,,,,,,,,,
S2_B6_jan,,,,,,0.995991,0.943824,,,,...,,,,,,,,,,
S2_B7_jan,,,,,,,0.96179,0.92091,0.918681,,...,,,,,,,,,,
S2_B8_jan,,,,,,,,0.991239,0.986729,0.978979,...,,,,,,,,,,
S2_B8A_jan,,,,,,,,,0.99394,0.996131,...,,,,,,,,,,
S2_B9_jan,,,,,,,,,,0.990915,...,,,,,,,,,,
S2_B11_jan,,,,,,,,,,,...,,,,,,,,,,


Let's drop every column whose absolute correlation with an earlier column exceeds the threshold (0.9), so that for each highly correlated pair only the first column is kept.

In [16]:
# Select columns with correlations above threshold
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
print(f'Columns to drop: {to_drop}')
print('There are %d columns to remove.' % (len(to_drop)))
train = train.drop(columns = to_drop)
test = test.drop(columns = to_drop)

print('Training shape: ', train.shape)
print('Testing shape: ', test.shape)

Columns to drop: ['S2_B5_jan', 'S2_B6_jan', 'S2_B7_jan', 'S2_B8_jan', 'S2_B8A_jan', 'S2_B9_jan', 'S2_B11_jan', 'S1_VH_jan', 'S2_B5_feb', 'S2_B6_feb', 'S2_B7_feb', 'S2_B8_feb', 'S2_B8A_feb', 'S2_B9_feb', 'S2_B11_feb', 'S1_VH_feb', 'ERA5_temperature_2m_feb', 'topo_elevation_feb', 'topo_slope_feb', 'S2_B5_mar', 'S2_B6_mar', 'S2_B7_mar', 'S2_B8_mar', 'S2_B8A_mar', 'S2_B9_mar', 'S2_B11_mar', 'S1_VH_mar', 'ERA5_temperature_2m_mar', 'topo_elevation_mar', 'topo_slope_mar', 'S2_B5_apr', 'S2_B6_apr', 'S2_B7_apr', 'S2_B8A_apr', 'S2_B9_apr', 'S2_B11_apr', 'S1_VH_apr', 'ERA5_temperature_2m_apr', 'topo_elevation_apr', 'topo_slope_apr', 'S2_B5_may', 'S2_B6_may', 'S2_B7_may', 'S2_B8A_may', 'S2_B9_may', 'S2_B11_may', 'S1_VH_may', 'topo_elevation_may', 'topo_slope_may', 'S2_B5_jun', 'S2_B6_jun', 'S2_B7_jun', 'S2_B8A_jun', 'S2_B9_jun', 'S2_B11_jun', 'S1_VH_jun', 'ERA5_temperature_2m_jun', 'topo_elevation_jun', 'topo_slope_jun', 'S2_B5_jul', 'S2_B6_jul', 'S2_B7_jul', 'S2_B8A_jul', 'S2_B9_jul', 'S2_B11_jul

## Missing Values

In [17]:
# Fraction of missing values per train column (0.0 = none, 1.0 = all), largest first
train_missing = (
    train.isnull()
    .sum()
    .div(len(train))
    .sort_values(ascending=False)
)
train_missing.head()

S2_B2_jan    0.0
S2_B4_aug    0.0
S2_B8_sep    0.0
S2_B4_sep    0.0
S2_B3_sep    0.0
dtype: float64

In [18]:
# Fraction of missing values per test column (0.0 = none, 1.0 = all), largest first
test_missing = (
    test.isnull()
    .sum()
    .div(len(test))
    .sort_values(ascending=False)
)
test_missing.head()

S2_B2_jan    0.0
S2_B3_aug    0.0
S2_B4_sep    0.0
S2_B3_sep    0.0
S2_B2_sep    0.0
dtype: float64

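Nice! Even the columns with the highest share of missing values are at 0.0 in both splits, so neither dataset has missing values and no imputation is needed.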

## Feature Scaling

In [21]:
# Every column except the target is standardized
columns_to_normalize = train.columns.drop('LABELS')
scaler = StandardScaler()

# Fit the scaler on the training features and replace them with the scaled values
train_scaled = scaler.fit_transform(train[columns_to_normalize].values)
train[columns_to_normalize] = pd.DataFrame(
    train_scaled,
    columns=columns_to_normalize,
    index=train.index,
)

# Apply the already-fitted scaler to the test features (transform only, no refit)
test_scaled = scaler.transform(test[columns_to_normalize].values)
test[columns_to_normalize] = pd.DataFrame(
    test_scaled,
    columns=columns_to_normalize,
    index=test.index,
)

print('Training shape: ', train.shape)
print('Testing shape: ', test.shape)

Training shape:  (62000, 93)
Testing shape:  (1200, 92)


Save our processed datasets

In [22]:
# Write both processed splits to disk; index=False keeps the row index out of the files
train.to_csv(path_or_buf='../data/processed/train_processed.csv', index=False)
test.to_csv(path_or_buf='../data/processed/test_processed.csv', index=False)

In [25]:
# Show the first five rows of the training frame
train.head(n=5)

Unnamed: 0,S2_B2_jan,S2_B3_jan,S2_B4_jan,S2_B5_jan,S2_B6_jan,S2_B7_jan,S2_B8_jan,S2_B8A_jan,S2_B9_jan,S2_B11_jan,...,S2_B11_dec,S2_B12_dec,S1_VV_dec,S1_VH_dec,ERA5_temperature_2m_dec,ERA5_total_precipitation_dec,topo_elevation_dec,topo_slope_dec,NDVI_dec,LABELS
0,-14.271277,-21.134172,957.531174,1019.557045,1230.022834,1435.138891,1693.70227,1805.411109,1891.557355,2020.567257,...,2203.025469,795.862978,2747.777894,1688.997611,289.38222,0.000147,1886.384195,0.749163,0.307887,0.0
1,-12.341429,-14.744978,949.227883,895.845779,987.517322,1350.496916,1930.239804,2048.686546,2375.680146,2259.90329,...,2922.382762,1140.79618,2528.590979,1934.261859,266.57837,0.002876,69.876216,1.385904,0.097779,1.0
2,-12.317847,-17.026201,1087.616069,1086.401035,1185.755955,1529.674085,1858.981635,1904.667487,2082.954737,2115.713139,...,2548.726966,521.102434,1556.082048,741.212901,276.467873,0.001622,91.279743,2.125908,0.418492,1.0
3,-15.332478,-20.978203,2987.224569,3163.553102,3412.572434,3908.443371,4044.232162,4231.35377,3682.709329,4518.882323,...,2537.545246,1073.588565,2388.141112,1367.682434,276.101799,0.00372,767.817294,8.427714,0.336528,1.0
4,-11.739502,-11.374262,691.825857,782.111227,478.448214,976.753128,3189.134129,3813.893119,3861.824527,4174.591554,...,4276.302643,371.861994,2080.641023,692.730434,298.663246,0.001763,61.500923,1.203899,0.595404,1.0


In [26]:
# Number of rows per target value (missing labels are not counted)
train['LABELS'].value_counts(dropna=True)

1.0    41217
0.0    20783
Name: LABELS, dtype: int64