In [32]:
import pandas as pd
import numpy as np
import warnings
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import griddata
from math import radians, sin, cos, sqrt, atan2
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import GridSearchCV
from scipy.spatial.distance import pdist, squareform
import dask.array as da
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from math import radians, sin, cos, sqrt, atan2
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from joblib import Parallel, delayed
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.preprocessing import StandardScaler


# Notebook-wide display settings: never truncate columns, rows or cell contents.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

In [33]:
# Loading train and test data.
# NOTE(review): DATA_DIR is an absolute, machine-specific location -- point it
# at your own copy of the datasets before running.
DATA_DIR = '/home/anuragverma/Desktop/Kaggle/GeoAI Ground-level NO2 _Zindi/Datasets'

try:
    train_df = pd.read_csv(f'{DATA_DIR}/Train.csv')
    test_df = pd.read_csv(f'{DATA_DIR}/Test.csv')
except FileNotFoundError:
    # Every later cell needs train_df/test_df, so stop here with the real
    # error instead of continuing into a confusing NameError further down.
    print('File not loaded')
    raise

print("Train df shape: ", train_df.shape)
print("Test df shape: ", test_df.shape)

Train df shape:  (86584, 14)
Test df shape:  (6576, 13)


In [34]:

def Prep_linear(df1, numeric_columns_linear):
    """Impute missing values using the trend of a linearly interpolated copy.

    For every column in ``numeric_columns_linear``:

    1. Build a gap-free helper series: linear interpolation, then back-fill
       and forward-fill for any NaNs left at the ends.
    2. Run an additive ``seasonal_decompose`` (``period=30``) on the helper and
       take its trend component, back-/forward-filling the trend's own NaNs.
    3. Fill only the originally missing cells with the trend value at the same
       index label; observed values are left untouched.
    4. Fill anything that is still missing with the column mean.

    Rows are processed in their existing order; no grouping or sorting is done.

    Args:
        df1: Input DataFrame. It is copied and never modified.
        numeric_columns_linear: Iterable of column names to impute.

    Returns:
        A new DataFrame with the listed columns imputed.
    """
    df = df1.copy()

    for col in numeric_columns_linear:
        # .bfill()/.ffill() replace the deprecated fillna(method=...). Results
        # are reassigned instead of filled "in place" so the update cannot be
        # lost on a temporary object.
        data_interpolated = df[col].interpolate(method='linear').bfill().ffill()

        # Decompose the gap-free helper series and keep only its trend.
        decomposition = seasonal_decompose(data_interpolated, model='additive', period=30)
        trend = decomposition.trend.bfill().ffill()

        # combine_first keeps observed values and takes the trend only where
        # the original column is NaN.
        df[col] = df[col].combine_first(trend)

        # Last resort: column mean for anything still missing.
        df[col] = df[col].fillna(df[col].mean())

    return df

def Prep_linear_test(df1, numeric_columns_linear):
    """Impute the test set with exactly the same procedure as ``Prep_linear``.

    Kept as a separate name for existing callers. It delegates to
    ``Prep_linear`` so the train and test preprocessing cannot drift apart.

    Args:
        df1: Input DataFrame, passed through to ``Prep_linear``.
        numeric_columns_linear: Iterable of column names to impute.

    Returns:
        Whatever ``Prep_linear`` returns for the same arguments.
    """
    return Prep_linear(df1, numeric_columns_linear)

# Second imputation variant: spline interpolation (the code below uses order=2, i.e. a quadratic rather than cubic spline)



def Prep_spline(df1, numeric_columns_spline):
    """Impute missing values using the trend of a spline-interpolated copy.

    For every column in ``numeric_columns_spline``:

    1. Build a gap-free helper series: order-2 spline interpolation, then
       back-fill and forward-fill for any NaNs left at the ends.
    2. Run an additive ``seasonal_decompose`` (``period=30``) on the helper and
       take its trend component, back-/forward-filling the trend's own NaNs.
    3. Fill only the originally missing cells with the trend value at the same
       index label; observed values are left untouched.
    4. Fill anything that is still missing with the column mean.

    Rows are processed in their existing order; no grouping or sorting is done.

    Args:
        df1: Input DataFrame. It is copied and never modified.
        numeric_columns_spline: Iterable of column names to impute.

    Returns:
        A new DataFrame with the listed columns imputed.
    """
    df = df1.copy()

    for col in numeric_columns_spline:
        # Temporary gap-free series from a quadratic (order=2) spline.
        # .bfill()/.ffill() replace the deprecated fillna(method=...); results
        # are reassigned instead of filled "in place" so the update cannot be
        # lost on a temporary object.
        data_interpolated = (
            df[col].interpolate(method='spline', order=2).bfill().ffill()
        )

        # Decompose the gap-free helper series and keep only its trend.
        decomposition = seasonal_decompose(data_interpolated, model='additive', period=30)
        trend = decomposition.trend.bfill().ffill()

        # combine_first keeps observed values and takes the trend only where
        # the original column is NaN.
        df[col] = df[col].combine_first(trend)

        # Last resort: column mean for anything still missing.
        df[col] = df[col].fillna(df[col].mean())

    return df

In [35]:
# Columns imputed with the spline-based routine (the three NO2 columns).
numeric_columns_spline = [f'NO2_{suffix}' for suffix in ('strat', 'total', 'trop')]

# Columns imputed with the linear-interpolation routine.
numeric_columns_linear = [
    'Precipitation',
    'LST',
    'AAI',
    'CloudFraction',
    'TropopausePressure',
]

In [36]:
# Impute missing values: linear-interpolation columns first, then the
# spline-interpolation columns, for train and test alike.
train_model3_df_prep_mix = Prep_linear(train_df, numeric_columns_linear)
train_model3_df_prep_mix = Prep_spline(train_model3_df_prep_mix, numeric_columns_spline)

test_model3_df_prep_mix = Prep_linear_test(test_df, numeric_columns_linear)
test_model3_df_prep_mix = Prep_spline(test_model3_df_prep_mix, numeric_columns_spline)

# Select only numeric columns for both train and test datasets
train_model3_df_prep_mix = train_model3_df_prep_mix.select_dtypes(include=['number'])
test_model3_df_prep_mix = test_model3_df_prep_mix.select_dtypes(include=['number'])

# Separate the target variable 'GT_NO2' from the features in the training dataset
train_model3_df_prep_mix_GT_NO2_mix = train_model3_df_prep_mix['GT_NO2']
train_model3_df_prep_mix = train_model3_df_prep_mix.drop('GT_NO2', axis=1)

# Checking skewness for all cols: compute it once and reuse the result.
feature_skewness = train_model3_df_prep_mix.skew()
skewed_features = feature_skewness[feature_skewness.abs() > 0.5]

# LON is left out of the list. Filtering (rather than list.remove) keeps this
# cell from raising ValueError when LON is not above the threshold.
Skewed_cols = [col for col in skewed_features.index if col != 'LON']

print(skewed_features)
print('\n')
print(Skewed_cols)
print('\n')

#GT_NO2 is also skewed
print(train_df['GT_NO2'].skew())


LON              0.911077
Precipitation    4.569618
CloudFraction    1.236881
NO2_total        4.169193
NO2_trop         2.935214
dtype: float64


['Precipitation', 'CloudFraction', 'NO2_total', 'NO2_trop']


1.507939283863649


In [37]:
# Preview the first five rows of the raw training data (head() defaults to 5).
train_df.head()

Unnamed: 0,ID_Zindi,Date,ID,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,GT_NO2
0,ID_ENTGC7,1/1/19,PD01,45.601585,11.903551,0.0,,0.230527,0.559117,2.4e-05,0.000117,,14440.82126,31.0
1,ID_8JCCXC,1/1/19,PD04,45.371005,11.84083,3.047342,,-0.074006,0.869309,2.4e-05,0.000127,,14441.79815,42.0
2,ID_V3136Z,1/1/19,RO01,45.045825,12.060869,0.0,,0.02447,0.67416,2.4e-05,8.6e-05,,14437.38294,31.0
3,ID_KRVZDJ,1/1/19,RO02,45.104075,11.553241,1.200467,,-0.010442,0.920054,2.4e-05,0.000124,,14440.83831,30.0
4,ID_PR351A,1/1/19,RO03,45.038758,11.790152,1.274564,,-0.176178,0.747464,2.4e-05,0.000116,,14438.79037,58.0


In [38]:
# Re-attach Date (parsed to datetime) and ID_Zindi from the raw frames, add an
# ordered PK (1..n in the current row order) so the original order can be
# restored later, then sort by 'LAT', 'LON' and 'Date'.
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted = (
    train_model3_df_prep_mix
    .assign(
        Date=pd.to_datetime(train_df['Date']),
        ID_Zindi=train_df['ID_Zindi'],
        PK=range(1, len(train_model3_df_prep_mix) + 1),
    )
    .sort_values(by=['LAT', 'LON', 'Date'])
)

test_model3_df_prep_mix_roll_wind_mean_stddev_sorted = (
    test_model3_df_prep_mix
    .assign(
        Date=pd.to_datetime(test_df['Date']),
        ID_Zindi=test_df['ID_Zindi'],
        PK=range(1, len(test_model3_df_prep_mix) + 1),
    )
    .sort_values(by=['LAT', 'LON', 'Date'])
)


In [39]:
# First five rows of the sorted training frame.
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted.head()

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,ID_ELHEMB,70
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,ID_RLD66Y,2519
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,ID_JLN5ZY,4731
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,ID_YC6QCK,7180
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,ID_I56YD1,9550


In [40]:
# First five rows of the sorted test frame.
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted.head()

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK
3,45.131947,10.015742,1.928031,279.369667,0.132952,0.756917,2.4e-05,0.000266,0.000114,14443.09006,2019-01-01,ID_QGSNTZ,4
9,45.131947,10.015742,0.0,279.369667,-0.626818,0.391763,3.4e-05,0.000223,0.000114,19330.30774,2019-01-02,ID_8JASJD,10
15,45.131947,10.015742,0.0,277.44,-0.845165,0.000761,3e-05,0.00012,9e-05,22279.10381,2019-01-03,ID_C2YTPV,16
21,45.131947,10.015742,0.0,279.066222,-1.342863,0.04985,2e-05,0.000122,0.000102,16744.84559,2019-01-04,ID_O6FGA6,22
27,45.131947,10.015742,1.928031,276.58,-0.621298,0.061076,1.8e-05,0.000255,0.000193,16732.51469,2019-01-05,ID_ANJZBA,28


In [41]:
# Rolling-window size for the rolling statistics.
Rolling_window = 3
# Every feature column except the LAT/LON coordinates gets rolling statistics.
Cols_for_Rolling_Window = train_model3_df_prep_mix.columns.drop(['LAT', 'LON']).to_list()

In [42]:
# First five rows of the imputed, numeric-only training features.
train_model3_df_prep_mix.head()

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure
0,45.601585,11.903551,0.0,280.097333,0.230527,0.559117,2.4e-05,0.000117,0.000131,14440.82126
1,45.371005,11.84083,3.047342,280.097333,-0.074006,0.869309,2.4e-05,0.000127,0.000131,14441.79815
2,45.045825,12.060869,0.0,280.097333,0.02447,0.67416,2.4e-05,8.6e-05,0.000131,14437.38294
3,45.104075,11.553241,1.200467,280.097333,-0.010442,0.920054,2.4e-05,0.000124,0.000131,14440.83831
4,45.038758,11.790152,1.274564,280.097333,-0.176178,0.747464,2.4e-05,0.000116,0.000131,14438.79037


In [43]:
def apply_rolling_stats(df, group_cols, cols_to_roll, window=3):
    """Add per-group rolling mean and standard deviation columns to ``df``.

    For each ``col`` in ``cols_to_roll`` two columns are written:
    ``Rolling_Mean_<col>`` and ``Rolling_Stddev_<col>``. Both are computed over
    ``window`` consecutive rows inside each ``group_cols`` group, using the
    frame's current row order (sort beforehand if the order matters).

    Rows without a complete window are NaN after the rolling step. They are
    back-filled, then forward-filled, from values of the *same group*, so one
    group's statistics never leak into another even when the groups' rows are
    interleaved. Anything still NaN afterwards (for example a group shorter
    than ``window``) falls back to a frame-wide back-/forward-fill.

    Args:
        df: DataFrame to extend. It is modified in place.
        group_cols: Column name or list of names defining the groups.
        cols_to_roll: Iterable of column names to compute statistics for.
        window: Number of rows per rolling window.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    for col in cols_to_roll:
        # Build the group-by once per column and compute both statistics
        # before any new column is written to the frame.
        per_group = df.groupby(group_cols)[col]
        rolling_mean = per_group.transform(
            lambda s: s.rolling(window=window).mean().bfill().ffill()
        )
        rolling_std = per_group.transform(
            lambda s: s.rolling(window=window).std().bfill().ffill()
        )

        # Plain assignment (not chained fillna(..., inplace=True)) so the
        # filled values are guaranteed to land in df. The outer bfill/ffill is
        # the frame-wide fallback described in the docstring.
        df[f'Rolling_Mean_{col}'] = rolling_mean.bfill().ffill()
        df[f'Rolling_Stddev_{col}'] = rolling_std.bfill().ffill()
    return df

In [44]:
# Add rolling mean/stddev features per (LAT, LON) group. The window size comes
# from the Rolling_window setting instead of a repeated literal.
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted = apply_rolling_stats(
    train_model3_df_prep_mix_roll_wind_mean_stddev_sorted,
    ['LAT', 'LON'],
    Cols_for_Rolling_Window,
    window=Rolling_window,
)
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted = apply_rolling_stats(
    test_model3_df_prep_mix_roll_wind_mean_stddev_sorted,
    ['LAT', 'LON'],
    Cols_for_Rolling_Window,
    window=Rolling_window,
)


In [45]:
# Inspect the first ten rows with LAT == 44.92469405.
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted.loc[
    train_model3_df_prep_mix_roll_wind_mean_stddev_sorted['LAT'].eq(44.92469405)
].head(10)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,ID_ELHEMB,70,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,ID_RLD66Y,2519,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,ID_JLN5ZY,4731,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,ID_YC6QCK,7180,2.737313,4.741165,289.418444,8.359263,-0.786601,0.179379,0.451098,0.457169,3.9e-05,1e-05,0.000399,0.000411,0.000121,2.1e-05,18419.303213,1495.877829
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,ID_I56YD1,9550,0.0,0.0,297.249778,5.205007,-0.922692,0.056517,0.127627,0.137646,5e-05,1.3e-05,0.000146,2.8e-05,9.1e-05,3.4e-05,19282.631057,3.648099
11998,44.924694,10.517502,0.0,305.52,-1.123626,0.0,6.1e-05,9.1e-05,3e-05,16715.14273,2019-01-06,ID_FKZF1C,11999,0.0,0.0,301.006667,5.968059,-1.00183,0.115421,0.036466,0.063154,5.8e-05,7e-06,0.00012,3.1e-05,6.2e-05,3.7e-05,18427.898923,1483.29212
14368,44.924694,10.517502,0.0,308.28,-1.486456,0.017318,6e-05,0.000105,4.5e-05,8614.530051,2019-01-07,ID_BBYHQH,14369,0.0,0.0,305.686667,2.514147,-1.199292,0.257798,0.005776,0.009996,6.1e-05,1e-06,0.000104,1.3e-05,4.3e-05,1.1e-05,14870.55787,5567.81853
16817,44.924694,10.517502,0.0,305.82,-1.51968,0.022716,6.2e-05,0.000102,4e-05,14427.88873,2019-01-08,ID_W8OH93,16818,0.0,0.0,306.54,1.514332,-1.376587,0.2197,0.013345,0.011868,6.1e-05,1e-06,9.9e-05,7e-06,3.8e-05,7e-06,13252.520504,4176.254218
19266,44.924694,10.517502,0.0,309.72,-1.879356,0.0,5.6e-05,9.2e-05,3.6e-05,14426.71668,2019-01-09,ID_FYHRGF,19267,0.0,0.0,307.94,1.972105,-1.628497,0.217884,0.013345,0.011868,5.9e-05,3e-06,0.0001,7e-06,4e-05,5e-06,12489.71182,3356.005908
21636,44.924694,10.517502,0.0,298.46,-1.289062,0.04211,4.7e-05,8e-05,3.3e-05,13056.62147,2019-01-10,ID_NL8HHQ,21637,0.0,0.0,304.666667,5.717913,-1.562699,0.297489,0.021609,0.021077,5.5e-05,8e-06,9.1e-05,1.1e-05,3.6e-05,4e-06,13970.40896,791.363397


In [46]:
# First six rows of the test frame with rolling statistics.
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted.iloc[:6]

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure
3,45.131947,10.015742,1.928031,279.369667,0.132952,0.756917,2.4e-05,0.000266,0.000114,14443.09006,2019-01-01,ID_QGSNTZ,4,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655
9,45.131947,10.015742,0.0,279.369667,-0.626818,0.391763,3.4e-05,0.000223,0.000114,19330.30774,2019-01-02,ID_8JASJD,10,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655
15,45.131947,10.015742,0.0,277.44,-0.845165,0.000761,3e-05,0.00012,9e-05,22279.10381,2019-01-03,ID_C2YTPV,16,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655
21,45.131947,10.015742,0.0,279.066222,-1.342863,0.04985,2e-05,0.000122,0.000102,16744.84559,2019-01-04,ID_O6FGA6,22,0.0,0.0,278.625296,1.037649,-0.938282,0.366992,0.147458,0.212993,2.8e-05,7e-06,0.000155,5.9e-05,0.000102,1.2e-05,19451.419047,2769.11619
27,45.131947,10.015742,1.928031,276.58,-0.621298,0.061076,1.8e-05,0.000255,0.000193,16732.51469,2019-01-05,ID_ANJZBA,28,0.642677,1.113149,277.695407,1.262636,-0.936442,0.369341,0.037229,0.032077,2.3e-05,6e-06,0.000166,7.7e-05,0.000128,5.6e-05,18585.48803,3198.771039
33,45.131947,10.015742,0.0,279.38,-0.634519,0.040908,2.2e-05,0.000181,0.000172,16733.67227,2019-01-06,ID_ZOMAXE,34,0.642677,1.113149,278.342074,1.534045,-0.866226,0.412832,0.050612,0.010105,2e-05,2e-06,0.000186,6.7e-05,0.000156,4.8e-05,16737.01085,6.809725


In [47]:
# Now starting lag features: work on fresh copies ordered by LAT, LON and Date.
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1 = (
    train_model3_df_prep_mix_roll_wind_mean_stddev_sorted
    .copy()
    .sort_values(by=['LAT', 'LON', 'Date'])
)

test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1 = (
    test_model3_df_prep_mix_roll_wind_mean_stddev_sorted
    .copy()
    .sort_values(by=['LAT', 'LON', 'Date'])
)



In [48]:
# Inspect the first six rows with LAT == 44.92469405.
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted.loc[
    train_model3_df_prep_mix_roll_wind_mean_stddev_sorted['LAT'].eq(44.92469405)
].head(6)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,ID_ELHEMB,70,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,ID_RLD66Y,2519,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,ID_JLN5ZY,4731,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,ID_YC6QCK,7180,2.737313,4.741165,289.418444,8.359263,-0.786601,0.179379,0.451098,0.457169,3.9e-05,1e-05,0.000399,0.000411,0.000121,2.1e-05,18419.303213,1495.877829
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,ID_I56YD1,9550,0.0,0.0,297.249778,5.205007,-0.922692,0.056517,0.127627,0.137646,5e-05,1.3e-05,0.000146,2.8e-05,9.1e-05,3.4e-05,19282.631057,3.648099
11998,44.924694,10.517502,0.0,305.52,-1.123626,0.0,6.1e-05,9.1e-05,3e-05,16715.14273,2019-01-06,ID_FKZF1C,11999,0.0,0.0,301.006667,5.968059,-1.00183,0.115421,0.036466,0.063154,5.8e-05,7e-06,0.00012,3.1e-05,6.2e-05,3.7e-05,18427.898923,1483.29212


In [49]:
# Same inspection on the _v1 copy: first six rows with LAT == 44.92469405.
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.loc[
    train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['LAT'].eq(44.92469405)
].head(6)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,ID_ELHEMB,70,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,ID_RLD66Y,2519,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,ID_JLN5ZY,4731,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,ID_YC6QCK,7180,2.737313,4.741165,289.418444,8.359263,-0.786601,0.179379,0.451098,0.457169,3.9e-05,1e-05,0.000399,0.000411,0.000121,2.1e-05,18419.303213,1495.877829
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,ID_I56YD1,9550,0.0,0.0,297.249778,5.205007,-0.922692,0.056517,0.127627,0.137646,5e-05,1.3e-05,0.000146,2.8e-05,9.1e-05,3.4e-05,19282.631057,3.648099
11998,44.924694,10.517502,0.0,305.52,-1.123626,0.0,6.1e-05,9.1e-05,3e-05,16715.14273,2019-01-06,ID_FKZF1C,11999,0.0,0.0,301.006667,5.968059,-1.00183,0.115421,0.036466,0.063154,5.8e-05,7e-06,0.00012,3.1e-05,6.2e-05,3.7e-05,18427.898923,1483.29212


In [50]:
# Group the train and test _v1 frames by their (LAT, LON) pair.
grouped_v1_train, grouped_v1_test = (
    frame.groupby(['LAT', 'LON'])
    for frame in (
        train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1,
        test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1,
    )
)

In [51]:
# Lag features. For each source column, the listed offsets (rows back within
# the same (LAT, LON) group) become columns named '<column>_LAG<n>'.
LAG_FEATURES = {
    'AAI': (3,),
    'LST': (1,),
    'NO2_strat': (1, 2, 3),
    'NO2_total': (1, 2, 3),
    'NO2_trop': (1, 2, 3),
    'TropopausePressure': (1, 2),
}

# One loop builds both train and test, so the two sets of lag columns cannot
# drift apart. Column creation order matches the spec above.
for lag_frame, lag_groups in (
    (train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1, grouped_v1_train),
    (test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1, grouped_v1_test),
):
    for source_col, lags in LAG_FEATURES.items():
        for lag in lags:
            lag_frame[f'{source_col}_LAG{lag}'] = lag_groups[source_col].shift(lag)

In [52]:
# Preview the first six rows of one training location to eyeball the new lag
# columns. The LAT value is matched by exact float equality.
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.loc[
    train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['LAT'].eq(44.92469405)
].head(6)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,ID_ELHEMB,70,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,,,,,,,,,,,,,
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,ID_RLD66Y,2519,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,,280.675857,2.4e-05,,,0.000395,,,0.00014,,,14436.75358,
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,ID_JLN5ZY,4731,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,,279.766,3e-05,2.4e-05,,0.000873,0.000395,,0.000143,0.00014,,16692.0173,14436.75358
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,ID_YC6QCK,7180,2.737313,4.741165,289.418444,8.359263,-0.786601,0.179379,0.451098,0.457169,3.9e-05,1e-05,0.000399,0.000411,0.000121,2.1e-05,18419.303213,1495.877829,0.286079,294.249333,3.7e-05,3e-05,2.4e-05,0.000171,0.000873,0.000395,0.000119,0.000143,0.00014,19279.33913,16692.0173
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,ID_I56YD1,9550,0.0,0.0,297.249778,5.205007,-0.922692,0.056517,0.127627,0.137646,5e-05,1.3e-05,0.000146,2.8e-05,9.1e-05,3.4e-05,19282.631057,3.648099,-0.579522,294.24,5e-05,3.7e-05,3e-05,0.000152,0.000171,0.000873,0.000102,0.000119,0.000143,19286.55321,19279.33913
11998,44.924694,10.517502,0.0,305.52,-1.123626,0.0,6.1e-05,9.1e-05,3e-05,16715.14273,2019-01-06,ID_FKZF1C,11999,0.0,0.0,301.006667,5.968059,-1.00183,0.115421,0.036466,0.063154,5.8e-05,7e-06,0.00012,3.1e-05,6.2e-05,3.7e-05,18427.898923,1483.29212,-0.886214,303.26,6.3e-05,5e-05,3.7e-05,0.000116,0.000152,0.000171,5.3e-05,0.000102,0.000119,19282.00083,19286.55321


In [53]:
# First six rows of the test frame, to check the lag columns were added.
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.iloc[:6]

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
3,45.131947,10.015742,1.928031,279.369667,0.132952,0.756917,2.4e-05,0.000266,0.000114,14443.09006,2019-01-01,ID_QGSNTZ,4,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655,,,,,,,,,,,,,
9,45.131947,10.015742,0.0,279.369667,-0.626818,0.391763,3.4e-05,0.000223,0.000114,19330.30774,2019-01-02,ID_8JASJD,10,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655,,279.369667,2.4e-05,,,0.000266,,,0.000114,,,14443.09006,
15,45.131947,10.015742,0.0,277.44,-0.845165,0.000761,3e-05,0.00012,9e-05,22279.10381,2019-01-03,ID_C2YTPV,16,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655,,279.369667,3.4e-05,2.4e-05,,0.000223,0.000266,,0.000114,0.000114,,19330.30774,14443.09006
21,45.131947,10.015742,0.0,279.066222,-1.342863,0.04985,2e-05,0.000122,0.000102,16744.84559,2019-01-04,ID_O6FGA6,22,0.0,0.0,278.625296,1.037649,-0.938282,0.366992,0.147458,0.212993,2.8e-05,7e-06,0.000155,5.9e-05,0.000102,1.2e-05,19451.419047,2769.11619,0.132952,277.44,3e-05,3.4e-05,2.4e-05,0.00012,0.000223,0.000266,9e-05,0.000114,0.000114,22279.10381,19330.30774
27,45.131947,10.015742,1.928031,276.58,-0.621298,0.061076,1.8e-05,0.000255,0.000193,16732.51469,2019-01-05,ID_ANJZBA,28,0.642677,1.113149,277.695407,1.262636,-0.936442,0.369341,0.037229,0.032077,2.3e-05,6e-06,0.000166,7.7e-05,0.000128,5.6e-05,18585.48803,3198.771039,-0.626818,279.066222,2e-05,3e-05,3.4e-05,0.000122,0.00012,0.000223,0.000102,9e-05,0.000114,16744.84559,22279.10381
33,45.131947,10.015742,0.0,279.38,-0.634519,0.040908,2.2e-05,0.000181,0.000172,16733.67227,2019-01-06,ID_ZOMAXE,34,0.642677,1.113149,278.342074,1.534045,-0.866226,0.412832,0.050612,0.010105,2e-05,2e-06,0.000186,6.7e-05,0.000156,4.8e-05,16737.01085,6.809725,-0.845165,276.58,1.8e-05,2e-05,3e-05,0.000255,0.000122,0.00012,0.000193,0.000102,9e-05,16732.51469,16744.84559


In [54]:
# Row counts for one training location and two test locations
# (LAT matched by exact float equality).
print(len(train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.loc[
    train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['LAT'].eq(44.92469405)
]))
print(len(test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.loc[
    test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['LAT'].eq(45.13194691)
]))
print(len(test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.loc[
    test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['LAT'].eq(45.28937609)
]))

1096
1096
1096


In [57]:
# First five rows of a single test location (exact float match on LAT).
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.loc[
    test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['LAT'].eq(45.13194691)
].head(5)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
3,45.131947,10.015742,1.928031,279.369667,0.132952,0.756917,2.4e-05,0.000266,0.000114,14443.09006,2019-01-01,ID_QGSNTZ,4,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655,,,,,,,,,,,,,
9,45.131947,10.015742,0.0,279.369667,-0.626818,0.391763,3.4e-05,0.000223,0.000114,19330.30774,2019-01-02,ID_8JASJD,10,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655,,279.369667,2.4e-05,,,0.000266,,,0.000114,,,14443.09006,
15,45.131947,10.015742,0.0,277.44,-0.845165,0.000761,3e-05,0.00012,9e-05,22279.10381,2019-01-03,ID_C2YTPV,16,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655,,279.369667,3.4e-05,2.4e-05,,0.000223,0.000266,,0.000114,0.000114,,19330.30774,14443.09006
21,45.131947,10.015742,0.0,279.066222,-1.342863,0.04985,2e-05,0.000122,0.000102,16744.84559,2019-01-04,ID_O6FGA6,22,0.0,0.0,278.625296,1.037649,-0.938282,0.366992,0.147458,0.212993,2.8e-05,7e-06,0.000155,5.9e-05,0.000102,1.2e-05,19451.419047,2769.11619,0.132952,277.44,3e-05,3.4e-05,2.4e-05,0.00012,0.000223,0.000266,9e-05,0.000114,0.000114,22279.10381,19330.30774
27,45.131947,10.015742,1.928031,276.58,-0.621298,0.061076,1.8e-05,0.000255,0.000193,16732.51469,2019-01-05,ID_ANJZBA,28,0.642677,1.113149,277.695407,1.262636,-0.936442,0.369341,0.037229,0.032077,2.3e-05,6e-06,0.000166,7.7e-05,0.000128,5.6e-05,18585.48803,3198.771039,-0.626818,279.066222,2e-05,3e-05,3.4e-05,0.000122,0.00012,0.000223,0.000102,9e-05,0.000114,16744.84559,22279.10381


In [58]:
# First five rows of the training frame, lag columns included.
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.iloc[:5]

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,ID_ELHEMB,70,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,,,,,,,,,,,,,
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,ID_RLD66Y,2519,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,,280.675857,2.4e-05,,,0.000395,,,0.00014,,,14436.75358,
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,ID_JLN5ZY,4731,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,,279.766,3e-05,2.4e-05,,0.000873,0.000395,,0.000143,0.00014,,16692.0173,14436.75358
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,ID_YC6QCK,7180,2.737313,4.741165,289.418444,8.359263,-0.786601,0.179379,0.451098,0.457169,3.9e-05,1e-05,0.000399,0.000411,0.000121,2.1e-05,18419.303213,1495.877829,0.286079,294.249333,3.7e-05,3e-05,2.4e-05,0.000171,0.000873,0.000395,0.000119,0.000143,0.00014,19279.33913,16692.0173
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,ID_I56YD1,9550,0.0,0.0,297.249778,5.205007,-0.922692,0.056517,0.127627,0.137646,5e-05,1.3e-05,0.000146,2.8e-05,9.1e-05,3.4e-05,19282.631057,3.648099,-0.579522,294.24,5e-05,3.7e-05,3e-05,0.000152,0.000171,0.000873,0.000102,0.000119,0.000143,19286.55321,19279.33913


In [59]:
# Names of every training column containing the literal substring "LAG",
# kept in frame order (plain substring match, not a regex).
_train_columns_v1 = train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.columns
LAG_Cols = _train_columns_v1[_train_columns_v1.str.contains("LAG", regex=False)].tolist()
LAG_Cols

['AAI_LAG3',
 'LST_LAG1',
 'NO2_strat_LAG1',
 'NO2_strat_LAG2',
 'NO2_strat_LAG3',
 'NO2_total_LAG1',
 'NO2_total_LAG2',
 'NO2_total_LAG3',
 'NO2_trop_LAG1',
 'NO2_trop_LAG2',
 'NO2_trop_LAG3',
 'TropopausePressure_LAG1',
 'TropopausePressure_LAG2']

In [60]:
# Backfill the NaNs left at the start of each location by the shift()-based
# lag columns: every NaN takes the next valid value below it in the same column.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` keyword is
# deprecated since pandas 2.1 (the notebook's global warnings filter hides it).
# NOTE(review): this is a frame-wide backfill over ALL columns, not a
# per-(LAT, LON) one. It only stays within a location if that location's rows
# are contiguous and longer than the largest lag -- confirm the upstream sort.
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.bfill(inplace=True)
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.bfill(inplace=True)

In [61]:
# Same training location as before, re-displayed after the backfill so the
# previously empty lag cells can be inspected.
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.loc[
    train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['LAT'].eq(44.92469405)
].head(6)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,ID_ELHEMB,70,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,0.286079,280.675857,2.4e-05,2.4e-05,2.4e-05,0.000395,0.000395,0.000395,0.00014,0.00014,0.00014,14436.75358,14436.75358
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,ID_RLD66Y,2519,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,0.286079,280.675857,2.4e-05,2.4e-05,2.4e-05,0.000395,0.000395,0.000395,0.00014,0.00014,0.00014,14436.75358,14436.75358
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,ID_JLN5ZY,4731,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,0.286079,279.766,3e-05,2.4e-05,2.4e-05,0.000873,0.000395,0.000395,0.000143,0.00014,0.00014,16692.0173,14436.75358
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,ID_YC6QCK,7180,2.737313,4.741165,289.418444,8.359263,-0.786601,0.179379,0.451098,0.457169,3.9e-05,1e-05,0.000399,0.000411,0.000121,2.1e-05,18419.303213,1495.877829,0.286079,294.249333,3.7e-05,3e-05,2.4e-05,0.000171,0.000873,0.000395,0.000119,0.000143,0.00014,19279.33913,16692.0173
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,ID_I56YD1,9550,0.0,0.0,297.249778,5.205007,-0.922692,0.056517,0.127627,0.137646,5e-05,1.3e-05,0.000146,2.8e-05,9.1e-05,3.4e-05,19282.631057,3.648099,-0.579522,294.24,5e-05,3.7e-05,3e-05,0.000152,0.000171,0.000873,0.000102,0.000119,0.000143,19286.55321,19279.33913
11998,44.924694,10.517502,0.0,305.52,-1.123626,0.0,6.1e-05,9.1e-05,3e-05,16715.14273,2019-01-06,ID_FKZF1C,11999,0.0,0.0,301.006667,5.968059,-1.00183,0.115421,0.036466,0.063154,5.8e-05,7e-06,0.00012,3.1e-05,6.2e-05,3.7e-05,18427.898923,1483.29212,-0.886214,303.26,6.3e-05,5e-05,3.7e-05,0.000116,0.000152,0.000171,5.3e-05,0.000102,0.000119,19282.00083,19286.55321


In [62]:
# Same test location as before, re-displayed after the backfill.
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.loc[
    test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['LAT'].eq(45.13194691)
].head(5)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
3,45.131947,10.015742,1.928031,279.369667,0.132952,0.756917,2.4e-05,0.000266,0.000114,14443.09006,2019-01-01,ID_QGSNTZ,4,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655,0.132952,279.369667,2.4e-05,2.4e-05,2.4e-05,0.000266,0.000266,0.000266,0.000114,0.000114,0.000114,14443.09006,14443.09006
9,45.131947,10.015742,0.0,279.369667,-0.626818,0.391763,3.4e-05,0.000223,0.000114,19330.30774,2019-01-02,ID_8JASJD,10,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655,0.132952,279.369667,2.4e-05,2.4e-05,2.4e-05,0.000266,0.000266,0.000266,0.000114,0.000114,0.000114,14443.09006,14443.09006
15,45.131947,10.015742,0.0,277.44,-0.845165,0.000761,3e-05,0.00012,9e-05,22279.10381,2019-01-03,ID_C2YTPV,16,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655,0.132952,279.369667,3.4e-05,2.4e-05,2.4e-05,0.000223,0.000266,0.000266,0.000114,0.000114,0.000114,19330.30774,14443.09006
21,45.131947,10.015742,0.0,279.066222,-1.342863,0.04985,2e-05,0.000122,0.000102,16744.84559,2019-01-04,ID_O6FGA6,22,0.0,0.0,278.625296,1.037649,-0.938282,0.366992,0.147458,0.212993,2.8e-05,7e-06,0.000155,5.9e-05,0.000102,1.2e-05,19451.419047,2769.11619,0.132952,277.44,3e-05,3.4e-05,2.4e-05,0.00012,0.000223,0.000266,9e-05,0.000114,0.000114,22279.10381,19330.30774
27,45.131947,10.015742,1.928031,276.58,-0.621298,0.061076,1.8e-05,0.000255,0.000193,16732.51469,2019-01-05,ID_ANJZBA,28,0.642677,1.113149,277.695407,1.262636,-0.936442,0.369341,0.037229,0.032077,2.3e-05,6e-06,0.000166,7.7e-05,0.000128,5.6e-05,18585.48803,3198.771039,-0.626818,279.066222,2e-05,3e-05,3.4e-05,0.000122,0.00012,0.000223,0.000102,9e-05,0.000114,16744.84559,22279.10381


In [71]:
# Per-column count of missing values in both frames.
print('train null check\n', train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.isna().sum(), sep=' ')
print('\n')
print('test null check\n', test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.isna().sum(), sep=' ')

train null check
 LAT                                  0
LON                                  0
Precipitation                        0
LST                                  0
AAI                                  0
CloudFraction                        0
NO2_strat                            0
NO2_total                            0
NO2_trop                             0
TropopausePressure                   0
Date                                 0
ID_Zindi                             0
PK                                   0
Rolling_Mean_Precipitation           0
Rolling_Stddev_Precipitation         0
Rolling_Mean_LST                     0
Rolling_Stddev_LST                   0
Rolling_Mean_AAI                     0
Rolling_Stddev_AAI                   0
Rolling_Mean_CloudFraction           0
Rolling_Stddev_CloudFraction         0
Rolling_Mean_NO2_strat               0
Rolling_Stddev_NO2_strat             0
Rolling_Mean_NO2_total               0
Rolling_Stddev_NO2_total             0
Rolling