In [24]:
import pandas as pd
import numpy as np
import warnings
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import griddata
from math import radians, sin, cos, sqrt, atan2
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import GridSearchCV
from scipy.spatial.distance import pdist, squareform
import dask.array as da
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from math import radians, sin, cos, sqrt, atan2
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from joblib import Parallel, delayed
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.preprocessing import StandardScaler


#Extra settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
warnings.filterwarnings("ignore")

In [25]:
#Loading train and test data
try:
    train_df=pd.read_csv('/home/anuragverma/Desktop/Kaggle/GeoAI Ground-level NO2 _Zindi/Datasets/Train.csv')
    test_df=pd.read_csv('/home/anuragverma/Desktop/Kaggle/GeoAI Ground-level NO2 _Zindi/Datasets/Test.csv')
    print("Train df shape: " ,train_df.shape)
    print("Test df shape: ", test_df.shape)

except FileNotFoundError:
    print('File not loaded')

Train df shape:  (86584, 14)
Test df shape:  (6576, 13)


In [26]:

def Prep_linear(df1):
    df=df1.copy()
    numeric_columns =['Precipitation','LST','AAI','CloudFraction','TropopausePressure','GT_NO2']
    
    for col in numeric_columns:
        # Fill missing values temporarily using linear interpolation
        data_interpolated = df[col].interpolate(method='linear')
        
        # Handle cases where interpolation might still leave NaNs at the ends
        if data_interpolated.isna().sum() > 0:
            data_interpolated.fillna(method='bfill', inplace=True)
            data_interpolated.fillna(method='ffill', inplace=True)

        # Decompose the time series to extract the trend component
        decomposition = seasonal_decompose(data_interpolated, model='additive', period=30)
        trend = decomposition.trend
        
        # Handle cases where the trend might still have NaNs at the ends
        trend.fillna(method='bfill', inplace=True)
        trend.fillna(method='ffill', inplace=True)

        # Replace original NaN values with the trend component
        df[col] = df[col].combine_first(trend)
        
        # Fill any remaining NaN values with the mean of the column
        df[col].fillna(value=df[col].mean(),inplace=True)

    return df

def Prep_linear_test(df1):
    df=df1.copy()
    numeric_columns =['Precipitation','LST','AAI','CloudFraction','TropopausePressure']
    
    for col in numeric_columns:
        # Fill missing values temporarily using linear interpolation
        data_interpolated = df[col].interpolate(method='linear')
        
        # Handle cases where interpolation might still leave NaNs at the ends
        if data_interpolated.isna().sum() > 0:
            data_interpolated.fillna(method='bfill', inplace=True)
            data_interpolated.fillna(method='ffill', inplace=True)

        # Decompose the time series to extract the trend component
        decomposition = seasonal_decompose(data_interpolated, model='additive', period=30)
        trend = decomposition.trend
        
        # Handle cases where the trend might still have NaNs at the ends
        trend.fillna(method='bfill', inplace=True)
        trend.fillna(method='ffill', inplace=True)

        # Replace original NaN values with the trend component
        df[col] = df[col].combine_first(trend)
        
        # Fill any remaining NaN values with the mean of the column
        df[col].fillna(value=df[col].mean(),inplace=True)

    return df

#Testing cubic interpolcation second



def Prep_spline(df1):
    df=df1.copy()
    numeric_columns =['NO2_strat','NO2_total','NO2_trop']
    
    for col in numeric_columns:
        # Fill missing values temporarily using linear interpolation
        data_interpolated = df[col].interpolate(method='spline',order=2)
        
        # Handle cases where interpolation might still leave NaNs at the ends
        if data_interpolated.isna().sum() > 0:
            data_interpolated.fillna(method='bfill', inplace=True)
            data_interpolated.fillna(method='ffill', inplace=True)

        # Decompose the time series to extract the trend component
        decomposition = seasonal_decompose(data_interpolated, model='additive', period=30)
        trend = decomposition.trend
        
        # Handle cases where the trend might still have NaNs at the ends
        trend.fillna(method='bfill', inplace=True)
        trend.fillna(method='ffill', inplace=True)

        # Replace original NaN values with the trend component
        df[col] = df[col].combine_first(trend)
        
        # Fill any remaining NaN values with the mean of the column
        df[col].fillna(value=df[col].mean(),inplace=True)

    return df

In [27]:
#Model with score 10.72 after filling missing values
train_model3_df_prep_mix=Prep_linear(train_df)
train_model3_df_prep_mix=Prep_spline(train_model3_df_prep_mix)

test_model3_df_prep_mix=Prep_linear_test(test_df)
test_model3_df_prep_mix=Prep_spline(test_model3_df_prep_mix)

# Select only numeric columns for both train and test datasets
train_model3_df_prep_mix = train_model3_df_prep_mix.select_dtypes(include=['number'])
test_model3_df_prep_mix = test_model3_df_prep_mix.select_dtypes(include=['number'])

# Separate the target variable 'GT_NO2' from the features in the training dataset
train_model3_df_prep_mix_GT_NO2_mix = train_model3_df_prep_mix['GT_NO2']
train_model3_df_prep_mix = train_model3_df_prep_mix.drop('GT_NO2', axis=1)

#Checking skewness for all cols.
Skewed_cols=train_model3_df_prep_mix.skew()[abs(train_model3_df_prep_mix.skew())>0.5].index.to_list()
Skewed_cols.remove('LON')
print(train_model3_df_prep_mix.skew()[abs(train_model3_df_prep_mix.skew())>0.5])
print('\n')
print(Skewed_cols)
print('\n')

#GT_NO2 is also skewed
print(train_df['GT_NO2'].skew())


LON              0.911077
Precipitation    4.569618
CloudFraction    1.236881
NO2_total        4.169193
NO2_trop         2.935214
dtype: float64


['Precipitation', 'CloudFraction', 'NO2_total', 'NO2_trop']


1.507939283863649


In [28]:
train_model3_df_prep_mix.head(2)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure
0,45.601585,11.903551,0.0,280.097333,0.230527,0.559117,2.4e-05,0.000117,0.000131,14440.82126
1,45.371005,11.84083,3.047342,280.097333,-0.074006,0.869309,2.4e-05,0.000127,0.000131,14441.79815


In [29]:
test_model3_df_prep_mix.head(2)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure
0,45.289376,11.642394,3.277529,279.369667,-0.313361,0.771456,2.4e-05,7.5e-05,0.000114,14440.02819
1,45.836941,12.510362,0.0,279.369667,-0.229512,0.398208,2.3e-05,0.00012,0.000114,14434.0479


In [30]:
train_df.head(2)

Unnamed: 0,ID_Zindi,Date,ID,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,GT_NO2
0,ID_ENTGC7,1/1/19,PD01,45.601585,11.903551,0.0,,0.230527,0.559117,2.4e-05,0.000117,,14440.82126,31.0
1,ID_8JCCXC,1/1/19,PD04,45.371005,11.84083,3.047342,,-0.074006,0.869309,2.4e-05,0.000127,,14441.79815,42.0


In [31]:
Rolling_window=3
Cols_for_Rolling_Window=train_model3_df_prep_mix.columns.to_list()
Cols_for_Rolling_Window.remove('LAT')
Cols_for_Rolling_Window.remove('LON')
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted=train_model3_df_prep_mix.copy()
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted['Date']=train_df['Date'].copy()

In [32]:
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted.head(5)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date
0,45.601585,11.903551,0.0,280.097333,0.230527,0.559117,2.4e-05,0.000117,0.000131,14440.82126,1/1/19
1,45.371005,11.84083,3.047342,280.097333,-0.074006,0.869309,2.4e-05,0.000127,0.000131,14441.79815,1/1/19
2,45.045825,12.060869,0.0,280.097333,0.02447,0.67416,2.4e-05,8.6e-05,0.000131,14437.38294,1/1/19
3,45.104075,11.553241,1.200467,280.097333,-0.010442,0.920054,2.4e-05,0.000124,0.000131,14440.83831,1/1/19
4,45.038758,11.790152,1.274564,280.097333,-0.176178,0.747464,2.4e-05,0.000116,0.000131,14438.79037,1/1/19


In [33]:
# train_model3_df_prep_mix_roll_wind_mean_stddev_sorted=train_model3_df_prep_mix_roll_wind_mean_stddev_sorted.sort_values(by=['LON', 'LAT', 'Date'])

In [37]:
#We will be sorting by date so creating a ordered PK to sort it back
# Create an ordered PK
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted['PK'] = range(1, len(train_model3_df_prep_mix_roll_wind_mean_stddev_sorted) + 1)

In [43]:
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted=train_model3_df_prep_mix_roll_wind_mean_stddev_sorted.sort_values(by=['LON', 'LAT', 'Date'])

In [44]:
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted.head(5)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,PK
35,45.526551,8.736497,0.0,282.74,-0.567872,0.148464,2.3e-05,0.000154,0.000132,14428.88172,1/1/19,36
28870,45.526551,8.736497,0.0,282.06,-1.425637,0.009579,2.8e-05,0.000182,0.000154,14432.34899,1/1/20,28871
57784,45.526551,8.736497,15.828151,259.937455,-0.103099,0.869997,4.6e-05,0.000137,8.9e-05,22138.140916,1/1/21,57785
21602,45.526551,8.736497,0.0,296.98,-1.998369,0.054525,4.5e-05,7.4e-05,2.9e-05,14420.39429,1/10/19,21603
50516,45.526551,8.736497,0.0,298.858491,-1.103071,0.519731,3.8e-05,9.6e-05,8.4e-05,16686.98116,1/10/20,50517


In [45]:
Cols_for_Rolling_Window

['Precipitation',
 'LST',
 'AAI',
 'CloudFraction',
 'NO2_strat',
 'NO2_total',
 'NO2_trop',
 'TropopausePressure']

In [47]:

# Function to apply rolling mean to specified columns
# Function to apply rolling mean and stddev to specified columns
def apply_rolling_stats(df, group_cols, cols_to_roll, window=3):
    for col in cols_to_roll:
        new_mean_col_name = f'Rolling Mean {col}'
        new_stddev_col_name = f'Rolling Stddev {col}'
        df[new_mean_col_name] = df.groupby(group_cols)[col].transform(lambda x: x.rolling(window=window).mean())
        df[new_stddev_col_name] = df.groupby(group_cols)[col].transform(lambda x: x.rolling(window=window).std())
    return df

In [49]:
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted=apply_rolling_stats(train_model3_df_prep_mix_roll_wind_mean_stddev_sorted,['LAT','LON'],Cols_for_Rolling_Window,window=3)

In [51]:
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted.head(10)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,PK,Roll_Mean_Precipitation,Roll_Mean_LST,Roll_Mean_AAI,Roll_Mean_CloudFraction,Roll_Mean_NO2_strat,Roll_Mean_NO2_total,Roll_Mean_NO2_trop,Roll_Mean_TropopausePressure
35,45.526551,8.736497,0.0,282.74,-0.567872,0.148464,2.3e-05,0.000154,0.000132,14428.88172,1/1/19,36,,,,,,,,
28870,45.526551,8.736497,0.0,282.06,-1.425637,0.009579,2.8e-05,0.000182,0.000154,14432.34899,1/1/20,28871,,,,,,,,
57784,45.526551,8.736497,15.828151,259.937455,-0.103099,0.869997,4.6e-05,0.000137,8.9e-05,22138.140916,1/1/21,57785,5.27605,274.912485,-0.698869,0.34268,3.2e-05,0.000158,0.000125,16999.790542
21602,45.526551,8.736497,0.0,296.98,-1.998369,0.054525,4.5e-05,7.4e-05,2.9e-05,14420.39429,1/10/19,21603,5.27605,279.659152,-1.175702,0.311367,3.9e-05,0.000131,9.1e-05,16996.961399
50516,45.526551,8.736497,0.0,298.858491,-1.103071,0.519731,3.8e-05,9.6e-05,8.4e-05,16686.98116,1/10/20,50517,5.27605,285.258648,-1.06818,0.481417,4.3e-05,0.000102,6.8e-05,17748.505455
79351,45.526551,8.736497,0.0,299.24,-1.141254,0.092145,4.2e-05,0.000169,0.000127,16705.56841,1/10/21,79352,0.0,298.359497,-1.414231,0.222133,4.2e-05,0.000113,8e-05,15937.647953
24051,45.526551,8.736497,15.313245,280.69095,-0.878648,0.44193,4e-05,0.000172,9.7e-05,14429.041631,1/11/19,24052,5.104415,292.929814,-1.040991,0.351268,4e-05,0.000145,0.000103,15940.5304
52965,45.526551,8.736497,0.0,287.246392,-2.059892,0.20729,4.3e-05,0.000142,8.5e-05,14432.467061,1/11/20,52966,5.104415,289.059114,-1.359931,0.247122,4.2e-05,0.000161,0.000103,15189.025701
81800,45.526551,8.736497,14.338717,289.910769,0.111101,0.235162,4.6e-05,0.000189,0.000131,16712.134464,1/11/21,81801,9.883987,285.94937,-0.94248,0.294794,4.3e-05,0.000167,0.000105,15191.214385
26421,45.526551,8.736497,0.0,268.066316,-2.340317,0.432437,4.5e-05,0.000143,9.4e-05,14434.946091,1/12/19,26422,4.779572,281.741159,-1.429703,0.29163,4.5e-05,0.000158,0.000104,15193.182538
