In [74]:
import pandas as pd
import numpy as np
import warnings
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import griddata
from math import radians, sin, cos, sqrt, atan2
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import GridSearchCV
from scipy.spatial.distance import pdist, squareform
import dask.array as da
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from math import radians, sin, cos, sqrt, atan2
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from joblib import Parallel, delayed
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.preprocessing import StandardScaler


# Extra settings: lift every pandas display limit (columns, rows, cell width)
# and silence all Python warnings for the rest of the session.
for _display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(_display_option, None)
warnings.filterwarnings("ignore")

In [72]:
#Loading train and test data
# Both CSVs live in one directory; this is an absolute, machine-specific path,
# so edit DATA_DIR when running the notebook anywhere else.
DATA_DIR = '/home/anuragverma/Desktop/Kaggle/GeoAI Ground-level NO2 _Zindi/Datasets'

try:
    train_df = pd.read_csv(f'{DATA_DIR}/Train.csv')
    test_df = pd.read_csv(f'{DATA_DIR}/Test.csv')
except FileNotFoundError:
    print('File not loaded')
    # Re-raise instead of swallowing: every later cell reads train_df/test_df,
    # so continuing would only fail further down with a misleading NameError.
    raise
else:
    print("Train df shape: " ,train_df.shape)
    print("Test df shape: ", test_df.shape)

Train df shape:  (86584, 14)
Test df shape:  (6576, 13)


In [75]:

def _fill_missing_with_trend(df1, columns, period=30, **interpolate_kwargs):
    """Fill NaNs in ``columns`` with the trend of an interpolated copy of each column.

    For every column:
      1. build a temporary gap-free series with ``Series.interpolate`` (options
         come from ``interpolate_kwargs``), back/forward-filling whatever the
         interpolation leaves at the ends;
      2. run an additive ``seasonal_decompose`` on that series and take its trend,
         again back/forward-filling the NaNs at the ends of the trend;
      3. keep every originally observed value and use the trend only where the
         original column was NaN;
      4. fall back to the column mean for anything that is still NaN.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Input frame. It is copied; the caller's frame is not modified.
    columns : iterable of str
        Columns to impute. An empty iterable returns a plain copy.
    period : int, default 30
        Seasonal period handed to ``seasonal_decompose``.
    **interpolate_kwargs
        Forwarded to ``Series.interpolate`` (e.g. ``method='linear'``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df1`` with the listed columns imputed.

    Notes
    -----
    Interpolation and decomposition follow the row order of ``df1`` as given.
    NOTE(review): if rows from several locations are interleaved, values are
    interpolated across locations - confirm this is intended.
    Errors raised by ``seasonal_decompose`` are not caught here.
    """
    df = df1.copy()

    for col in columns:
        # Temporary gap-free version of the column, used only to estimate the trend.
        # bfill/ffill cover NaNs the interpolation leaves at the ends of the series.
        interpolated = df[col].interpolate(**interpolate_kwargs).bfill().ffill()

        # Decompose the series to extract the trend component.
        decomposition = seasonal_decompose(interpolated, model='additive', period=period)
        trend = decomposition.trend.bfill().ffill()

        # Observed values win; the trend only fills the original gaps.
        filled = df[col].combine_first(trend)

        # Assign the result back explicitly: `df[col].fillna(..., inplace=True)`
        # operates on a temporary under pandas Copy-on-Write and would leave
        # `df` unchanged.
        df[col] = filled.fillna(filled.mean())

    return df


def Prep_linear(df1, numeric_columns_linear, period=30):
    """Impute ``numeric_columns_linear`` using linear interpolation plus trend.

    Returns a new DataFrame; ``df1`` is left untouched. ``period`` is the
    seasonal period used for the trend decomposition.
    """
    return _fill_missing_with_trend(df1, numeric_columns_linear, period=period, method='linear')


def Prep_linear_test(df1, numeric_columns_linear, period=30):
    """Same imputation as ``Prep_linear``; kept as a separate name for the test set."""
    return _fill_missing_with_trend(df1, numeric_columns_linear, period=period, method='linear')


# Second variant: order-2 (quadratic) spline interpolation instead of linear.
def Prep_spline(df1, numeric_columns_spline, period=30):
    """Impute ``numeric_columns_spline`` using an order-2 spline plus trend.

    Returns a new DataFrame; ``df1`` is left untouched. ``period`` is the
    seasonal period used for the trend decomposition.
    """
    return _fill_missing_with_trend(
        df1, numeric_columns_spline, period=period, method='spline', order=2
    )

In [73]:
# Columns imputed with the order-2 spline variant (Prep_spline)
numeric_columns_spline = [
    'NO2_strat',
    'NO2_total',
    'NO2_trop',
]
# Columns imputed with the linear variant (Prep_linear / Prep_linear_test)
numeric_columns_linear = [
    'Precipitation',
    'LST',
    'AAI',
    'CloudFraction',
    'TropopausePressure',
]

In [76]:
# Impute the "linear" columns first, then the "spline" columns, for train and test
train_model3_df_prep_mix = Prep_linear(train_df, numeric_columns_linear)
train_model3_df_prep_mix = Prep_spline(train_model3_df_prep_mix, numeric_columns_spline)

test_model3_df_prep_mix = Prep_linear_test(test_df, numeric_columns_linear)
test_model3_df_prep_mix = Prep_spline(test_model3_df_prep_mix, numeric_columns_spline)

# Select only numeric columns for both train and test datasets
train_model3_df_prep_mix = train_model3_df_prep_mix.select_dtypes(include=['number'])
test_model3_df_prep_mix = test_model3_df_prep_mix.select_dtypes(include=['number'])

# Separate the target variable 'GT_NO2' from the features in the training dataset
train_model3_df_prep_mix_GT_NO2_mix = train_model3_df_prep_mix['GT_NO2']
train_model3_df_prep_mix = train_model3_df_prep_mix.drop('GT_NO2', axis=1)

# Checking skewness for all cols: compute it once and keep |skew| > 0.5
feature_skew = train_model3_df_prep_mix.skew()
skewed_features = feature_skew[feature_skew.abs() > 0.5]

# LON is excluded from the list of skewed columns. Filtering (rather than
# list.remove) keeps the cell working when LON is not above the threshold.
Skewed_cols = [col for col in skewed_features.index if col != 'LON']

print(skewed_features)
print('\n')
print(Skewed_cols)
print('\n')

#GT_NO2 is also skewed
print(train_df['GT_NO2'].skew())


LON              0.911077
Precipitation    4.569618
CloudFraction    1.236881
NO2_total        4.169193
NO2_trop         2.935214
dtype: float64


['Precipitation', 'CloudFraction', 'NO2_total', 'NO2_trop']


1.507939283863649


In [77]:
# Build the working frame in one pass:
#   * Date is taken from train_df (index-aligned) and parsed to datetime;
#   * PK is a 1-based counter in the current row order, so the original order
#     can be restored after sorting;
#   * rows are then ordered by LAT, LON and Date.
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted = (
    train_model3_df_prep_mix
    .assign(
        Date=pd.to_datetime(train_df['Date']),
        PK=range(1, len(train_model3_df_prep_mix) + 1),
    )
    .sort_values(by=['LAT', 'LON', 'Date'])
)


In [79]:
# Preview the first five rows of the sorted frame (head() defaults to 5 rows)
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted.head()

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,PK
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,70
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,2519
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,4731
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,7180
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,9550


In [78]:
# train_model3_df_prep_mix_roll_wind_mean_stddev_sorted[train_model3_df_prep_mix_roll_wind_mean_stddev_sorted['LAT']==44.99954599].head(100)
# Rolling window length, and the feature columns that get rolling statistics:
# every column of the prepared frame except the LAT/LON coordinates.
Rolling_window = 3
Cols_for_Rolling_Window = list(train_model3_df_prep_mix.columns)
for _coordinate_col in ('LAT', 'LON'):
    Cols_for_Rolling_Window.remove(_coordinate_col)

In [80]:
# Preview the first five rows of the prepared (unsorted) feature frame
train_model3_df_prep_mix.head()

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure
0,45.601585,11.903551,0.0,280.097333,0.230527,0.559117,2.4e-05,0.000117,0.000131,14440.82126
1,45.371005,11.84083,3.047342,280.097333,-0.074006,0.869309,2.4e-05,0.000127,0.000131,14441.79815
2,45.045825,12.060869,0.0,280.097333,0.02447,0.67416,2.4e-05,8.6e-05,0.000131,14437.38294
3,45.104075,11.553241,1.200467,280.097333,-0.010442,0.920054,2.4e-05,0.000124,0.000131,14440.83831
4,45.038758,11.790152,1.274564,280.097333,-0.176178,0.747464,2.4e-05,0.000116,0.000131,14438.79037


In [82]:
# Function to apply rolling mean and stddev to specified columns
def apply_rolling_stats(df, group_cols, cols_to_roll, window=3):
    """Add per-group rolling mean and standard deviation columns to ``df``.

    For every column ``c`` in ``cols_to_roll`` two columns are added, in this
    order: ``Rolling_Mean_<c>`` and ``Rolling_Stddev_<c>``. The statistics are
    computed inside each ``group_cols`` group over ``window`` consecutive rows,
    in the row order ``df`` already has (sort it beforehand if order matters).

    The first ``window - 1`` rows of each group have no full window and come out
    as NaN; those gaps are back-filled and then forward-filled along the row
    order of the whole frame, so a group with fewer than ``window`` rows takes
    its values from the neighbouring group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place and also returned.
    group_cols : str or list of str
        Column(s) identifying a group.
    cols_to_roll : iterable of str
        Columns to compute rolling statistics for.
    window : int, default 3
        Rolling window length in rows.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns added.
    """
    for col in cols_to_roll:
        per_group = df.groupby(group_cols)[col]
        rolling_mean = per_group.transform(lambda s: s.rolling(window=window).mean())
        rolling_std = per_group.transform(lambda s: s.rolling(window=window).std())

        # Fill first and assign once: a chained `df[name].fillna(..., inplace=True)`
        # acts on a temporary under pandas Copy-on-Write and would leave the NaNs.
        df[f'Rolling_Mean_{col}'] = rolling_mean.bfill().ffill()
        df[f'Rolling_Stddev_{col}'] = rolling_std.bfill().ffill()
    return df

In [84]:
# Add rolling mean/stddev columns per (LAT, LON) group. The window length comes
# from the Rolling_window constant so it is configured in one place only.
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted = apply_rolling_stats(
    train_model3_df_prep_mix_roll_wind_mean_stddev_sorted,
    ['LAT', 'LON'],
    Cols_for_Rolling_Window,
    window=Rolling_window,
)


In [81]:
# First ten rows of the sorted frame whose LAT equals 44.92469405
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted.loc[
    lambda frame: frame['LAT'] == 44.92469405
].head(10)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,PK
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,70
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,2519
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,4731
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,7180
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,9550
11998,44.924694,10.517502,0.0,305.52,-1.123626,0.0,6.1e-05,9.1e-05,3e-05,16715.14273,2019-01-06,11999
14368,44.924694,10.517502,0.0,308.28,-1.486456,0.017318,6e-05,0.000105,4.5e-05,8614.530051,2019-01-07,14369
16817,44.924694,10.517502,0.0,305.82,-1.51968,0.022716,6.2e-05,0.000102,4e-05,14427.88873,2019-01-08,16818
19266,44.924694,10.517502,0.0,309.72,-1.879356,0.0,5.6e-05,9.2e-05,3.6e-05,14426.71668,2019-01-09,19267
21636,44.924694,10.517502,0.0,298.46,-1.289062,0.04211,4.7e-05,8e-05,3.3e-05,13056.62147,2019-01-10,21637


In [83]:
#Now starting lag features
# Independent copy of the rolling-stats frame, ordered by LAT, LON and Date
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1 = (
    train_model3_df_prep_mix_roll_wind_mean_stddev_sorted
    .copy()
    .sort_values(by=['LAT', 'LON', 'Date'])
)

In [85]:
# First six rows of the sorted frame whose LAT equals 44.92469405
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted.loc[
    lambda frame: frame['LAT'] == 44.92469405
].head(6)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,70,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,2519,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,4731,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,7180,2.737313,4.741165,289.418444,8.359263,-0.786601,0.179379,0.451098,0.457169,3.9e-05,1e-05,0.000399,0.000411,0.000121,2.1e-05,18419.303213,1495.877829
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,9550,0.0,0.0,297.249778,5.205007,-0.922692,0.056517,0.127627,0.137646,5e-05,1.3e-05,0.000146,2.8e-05,9.1e-05,3.4e-05,19282.631057,3.648099
11998,44.924694,10.517502,0.0,305.52,-1.123626,0.0,6.1e-05,9.1e-05,3e-05,16715.14273,2019-01-06,11999,0.0,0.0,301.006667,5.968059,-1.00183,0.115421,0.036466,0.063154,5.8e-05,7e-06,0.00012,3.1e-05,6.2e-05,3.7e-05,18427.898923,1483.29212


In [86]:
# First six rows of the lag-feature frame whose LAT equals 44.92469405
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.loc[
    lambda frame: frame['LAT'] == 44.92469405
].head(6)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,PK
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,70
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,2519
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,4731
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,7180
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,9550
11998,44.924694,10.517502,0.0,305.52,-1.123626,0.0,6.1e-05,9.1e-05,3e-05,16715.14273,2019-01-06,11999


In [88]:
# Group the lag-feature frame by its (LAT, LON) pair; reused for every lag below
grouped_v1 = train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.groupby(by=['LAT', 'LON'])

In [87]:
# Lag features: for each source column, the row offsets to shift by within a
# (LAT, LON) group. Dict/tuple order fixes the order the new columns are added.
_lag_spec = {
    'AAI': (3,),
    'LST': (1,),
    'NO2_strat': (1, 2, 3),
    'NO2_total': (1, 2, 3),
    'NO2_trop': (1, 2, 3),
    'TropopausePressure': (1, 2),
}

for _source_col, _lags in _lag_spec.items():
    for _lag in _lags:
        # The first `_lag` rows of every group have no earlier row and stay NaN
        train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1[f'{_source_col}_LAG{_lag}'] = (
            grouped_v1[_source_col].shift(_lag)
        )

In [89]:
# Check the freshly added lag columns on the rows whose LAT equals 44.92469405
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.loc[
    lambda frame: frame['LAT'] == 44.92469405
].head(6)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,PK,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,70,,,,,,,,,,,,,
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,2519,,280.675857,2.4e-05,,,0.000395,,,0.00014,,,14436.75358,
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,4731,,279.766,3e-05,2.4e-05,,0.000873,0.000395,,0.000143,0.00014,,16692.0173,14436.75358
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,7180,0.286079,294.249333,3.7e-05,3e-05,2.4e-05,0.000171,0.000873,0.000395,0.000119,0.000143,0.00014,19279.33913,16692.0173
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,9550,-0.579522,294.24,5e-05,3.7e-05,3e-05,0.000152,0.000171,0.000873,0.000102,0.000119,0.000143,19286.55321,19279.33913
11998,44.924694,10.517502,0.0,305.52,-1.123626,0.0,6.1e-05,9.1e-05,3e-05,16715.14273,2019-01-06,11999,-0.886214,303.26,6.3e-05,5e-05,3.7e-05,0.000116,0.000152,0.000171,5.3e-05,0.000102,0.000119,19282.00083,19286.55321


In [91]:
# Missing-value count per column of the lag-feature frame
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.isna().sum()

LAT                          0
LON                          0
Precipitation                0
LST                          0
AAI                          0
CloudFraction                0
NO2_strat                    0
NO2_total                    0
NO2_trop                     0
TropopausePressure           0
Date                         0
PK                           0
AAI_LAG3                   237
LST_LAG1                    79
NO2_strat_LAG1              79
NO2_strat_LAG2             158
NO2_strat_LAG3             237
NO2_total_LAG1              79
NO2_total_LAG2             158
NO2_total_LAG3             237
NO2_trop_LAG1               79
NO2_trop_LAG2              158
NO2_trop_LAG3              237
TropopausePressure_LAG1     79
TropopausePressure_LAG2    158
dtype: int64

In [90]:
# Preview the first five rows of the lag-feature frame
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.head()

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,PK,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,70,,,,,,,,,,,,,
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,2519,,280.675857,2.4e-05,,,0.000395,,,0.00014,,,14436.75358,
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,4731,,279.766,3e-05,2.4e-05,,0.000873,0.000395,,0.000143,0.00014,,16692.0173,14436.75358
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,7180,0.286079,294.249333,3.7e-05,3e-05,2.4e-05,0.000171,0.000873,0.000395,0.000119,0.000143,0.00014,19279.33913,16692.0173
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,9550,-0.579522,294.24,5e-05,3.7e-05,3e-05,0.000152,0.000171,0.000873,0.000102,0.000119,0.000143,19286.55321,19279.33913


In [93]:
# Names of all columns containing "LAG", in frame order
LAG_Cols = list(
    filter(
        lambda name: "LAG" in name,
        train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.columns,
    )
)
LAG_Cols

['AAI_LAG3',
 'LST_LAG1',
 'NO2_strat_LAG1',
 'NO2_strat_LAG2',
 'NO2_strat_LAG3',
 'NO2_total_LAG1',
 'NO2_total_LAG2',
 'NO2_total_LAG3',
 'NO2_trop_LAG1',
 'NO2_trop_LAG2',
 'NO2_trop_LAG3',
 'TropopausePressure_LAG1',
 'TropopausePressure_LAG2']

In [100]:
# Lag columns before the backfill, for the rows whose LAT equals 44.92469405
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.loc[
    lambda frame: frame['LAT'] == 44.92469405
].head(6)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,PK,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
0,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,70,,,,,,,,,,,,,14436.75358
1,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,2519,,280.675857,2.4e-05,,,0.000395,,,0.00014,,,14436.75358,14436.75358
2,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,4731,,279.766,3e-05,2.4e-05,,0.000873,0.000395,,0.000143,0.00014,,16692.0173,14436.75358
3,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,7180,0.286079,294.249333,3.7e-05,3e-05,2.4e-05,0.000171,0.000873,0.000395,0.000119,0.000143,0.00014,19279.33913,16692.0173
4,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,9550,-0.579522,294.24,5e-05,3.7e-05,3e-05,0.000152,0.000171,0.000873,0.000102,0.000119,0.000143,19286.55321,19279.33913
5,44.924694,10.517502,0.0,305.52,-1.123626,0.0,6.1e-05,9.1e-05,3e-05,16715.14273,2019-01-06,11999,-0.886214,303.26,6.3e-05,5e-05,3.7e-05,0.000116,0.000152,0.000171,5.3e-05,0.000102,0.000119,19282.00083,19286.55321


In [101]:
# Lag columns before the backfill, for the rows whose LAT equals 44.99954599
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.loc[
    lambda frame: frame['LAT'] == 44.99954599
].head(6)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,PK,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
1096,44.999546,9.008437,0.0,280.818714,0.108347,0.738709,2.4e-05,0.000302,0.000132,14437.76944,2019-01-01,73,,,,,,,,,,,,,14437.76944
1097,44.999546,9.008437,0.0,279.87875,0.310969,1.0,2.7e-05,0.000716,0.000143,16693.96459,2019-01-02,2522,,280.818714,2.4e-05,,,0.000302,,,0.000132,,,14437.76944,14437.76944
1098,44.999546,9.008437,0.0,297.62,-0.865564,0.051724,3.8e-05,0.000107,6.8e-05,19283.41137,2019-01-03,4734,,279.87875,2.7e-05,2.4e-05,,0.000716,0.000302,,0.000143,0.000132,,16693.96459,14437.76944
1099,44.999546,9.008437,0.0,299.52,-0.799233,0.192144,5e-05,0.000127,7.7e-05,19289.1028,2019-01-04,7183,0.108347,297.62,3.8e-05,2.7e-05,2.4e-05,0.000107,0.000716,0.000302,6.8e-05,0.000143,0.000132,19283.41137,16693.96459
1100,44.999546,9.008437,0.0,303.7,-1.081019,0.01421,6.1e-05,0.000111,4.9e-05,19287.64829,2019-01-05,9553,0.310969,299.52,5e-05,3.8e-05,2.7e-05,0.000127,0.000107,0.000716,7.7e-05,6.8e-05,0.000143,19289.1028,19283.41137
1101,44.999546,9.008437,0.0,309.3,-1.199881,0.026664,5.8e-05,0.00011,5.2e-05,16719.80479,2019-01-06,12002,-0.865564,303.7,6.1e-05,5e-05,3.8e-05,0.000111,0.000127,0.000107,4.9e-05,7.7e-05,6.8e-05,19287.64829,19289.1028


In [102]:
#Doing backfill
# Fill every NaN with the next valid value further down the same column.
# DataFrame.bfill replaces the deprecated fillna(method='bfill'); inplace=True is
# kept so the existing frame object (already wrapped by grouped_v1) is updated.
# NOTE(review): the fill runs over the whole frame in row order, not per
# (LAT, LON) group - confirm no group can end in NaN and borrow from the next one.
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.bfill(inplace=True)

In [103]:
# Lag columns after the backfill, for the rows whose LAT equals 44.92469405
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.loc[
    lambda frame: frame['LAT'] == 44.92469405
].head(6)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,PK,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
0,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,70,0.286079,280.675857,2.4e-05,2.4e-05,2.4e-05,0.000395,0.000395,0.000395,0.00014,0.00014,0.00014,14436.75358,14436.75358
1,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,2519,0.286079,280.675857,2.4e-05,2.4e-05,2.4e-05,0.000395,0.000395,0.000395,0.00014,0.00014,0.00014,14436.75358,14436.75358
2,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,4731,0.286079,279.766,3e-05,2.4e-05,2.4e-05,0.000873,0.000395,0.000395,0.000143,0.00014,0.00014,16692.0173,14436.75358
3,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,7180,0.286079,294.249333,3.7e-05,3e-05,2.4e-05,0.000171,0.000873,0.000395,0.000119,0.000143,0.00014,19279.33913,16692.0173
4,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,9550,-0.579522,294.24,5e-05,3.7e-05,3e-05,0.000152,0.000171,0.000873,0.000102,0.000119,0.000143,19286.55321,19279.33913
5,44.924694,10.517502,0.0,305.52,-1.123626,0.0,6.1e-05,9.1e-05,3e-05,16715.14273,2019-01-06,11999,-0.886214,303.26,6.3e-05,5e-05,3.7e-05,0.000116,0.000152,0.000171,5.3e-05,0.000102,0.000119,19282.00083,19286.55321


In [104]:
# Lag columns after the backfill, for the rows whose LAT equals 44.99954599
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.loc[
    lambda frame: frame['LAT'] == 44.99954599
].head(6)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,PK,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
1096,44.999546,9.008437,0.0,280.818714,0.108347,0.738709,2.4e-05,0.000302,0.000132,14437.76944,2019-01-01,73,0.108347,280.818714,2.4e-05,2.4e-05,2.4e-05,0.000302,0.000302,0.000302,0.000132,0.000132,0.000132,14437.76944,14437.76944
1097,44.999546,9.008437,0.0,279.87875,0.310969,1.0,2.7e-05,0.000716,0.000143,16693.96459,2019-01-02,2522,0.108347,280.818714,2.4e-05,2.4e-05,2.4e-05,0.000302,0.000302,0.000302,0.000132,0.000132,0.000132,14437.76944,14437.76944
1098,44.999546,9.008437,0.0,297.62,-0.865564,0.051724,3.8e-05,0.000107,6.8e-05,19283.41137,2019-01-03,4734,0.108347,279.87875,2.7e-05,2.4e-05,2.4e-05,0.000716,0.000302,0.000302,0.000143,0.000132,0.000132,16693.96459,14437.76944
1099,44.999546,9.008437,0.0,299.52,-0.799233,0.192144,5e-05,0.000127,7.7e-05,19289.1028,2019-01-04,7183,0.108347,297.62,3.8e-05,2.7e-05,2.4e-05,0.000107,0.000716,0.000302,6.8e-05,0.000143,0.000132,19283.41137,16693.96459
1100,44.999546,9.008437,0.0,303.7,-1.081019,0.01421,6.1e-05,0.000111,4.9e-05,19287.64829,2019-01-05,9553,0.310969,299.52,5e-05,3.8e-05,2.7e-05,0.000127,0.000107,0.000716,7.7e-05,6.8e-05,0.000143,19289.1028,19283.41137
1101,44.999546,9.008437,0.0,309.3,-1.199881,0.026664,5.8e-05,0.00011,5.2e-05,16719.80479,2019-01-06,12002,-0.865564,303.7,6.1e-05,5e-05,3.8e-05,0.000111,0.000127,0.000107,4.9e-05,7.7e-05,6.8e-05,19287.64829,19289.1028
