In [97]:
import pandas as pd
import numpy as np
import warnings
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import griddata
from math import radians, sin, cos, sqrt, atan2
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import GridSearchCV
from scipy.spatial.distance import pdist, squareform
import dask.array as da
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from math import radians, sin, cos, sqrt, atan2
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from joblib import Parallel, delayed
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.preprocessing import StandardScaler


#Extra settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
warnings.filterwarnings("ignore")

In [98]:
#Loading train and test data
try:
    train_df=pd.read_csv('/home/anuragverma/Desktop/Kaggle/GeoAI Ground-level NO2 _Zindi/Datasets/Train.csv')
    test_df=pd.read_csv('/home/anuragverma/Desktop/Kaggle/GeoAI Ground-level NO2 _Zindi/Datasets/Test.csv')
    print("Train df shape: " ,train_df.shape)
    print("Test df shape: ", test_df.shape)

except FileNotFoundError:
    print('File not loaded')

Train df shape:  (86584, 14)
Test df shape:  (6576, 13)


In [99]:

def Prep_linear(df1,numeric_columns_linear):
    df=df1.copy()

    
    for col in numeric_columns_linear:
        # Fill missing values temporarily using linear interpolation
        data_interpolated = df[col].interpolate(method='linear')
        
        # Handle cases where interpolation might still leave NaNs at the ends
        if data_interpolated.isna().sum() > 0:
            data_interpolated.fillna(method='bfill', inplace=True)
            data_interpolated.fillna(method='ffill', inplace=True)

        # Decompose the time series to extract the trend component
        decomposition = seasonal_decompose(data_interpolated, model='additive', period=30)
        trend = decomposition.trend
        
        # Handle cases where the trend might still have NaNs at the ends
        trend.fillna(method='bfill', inplace=True)
        trend.fillna(method='ffill', inplace=True)

        # Replace original NaN values with the trend component
        df[col] = df[col].combine_first(trend)
        
        # Fill any remaining NaN values with the mean of the column
        df[col].fillna(value=df[col].mean(),inplace=True)

    return df

def Prep_linear_test(df1,numeric_columns_linear):
    df=df1.copy()
    
    
    for col in numeric_columns_linear:
        # Fill missing values temporarily using linear interpolation
        data_interpolated = df[col].interpolate(method='linear')
        
        # Handle cases where interpolation might still leave NaNs at the ends
        if data_interpolated.isna().sum() > 0:
            data_interpolated.fillna(method='bfill', inplace=True)
            data_interpolated.fillna(method='ffill', inplace=True)

        # Decompose the time series to extract the trend component
        decomposition = seasonal_decompose(data_interpolated, model='additive', period=30)
        trend = decomposition.trend
        
        # Handle cases where the trend might still have NaNs at the ends
        trend.fillna(method='bfill', inplace=True)
        trend.fillna(method='ffill', inplace=True)

        # Replace original NaN values with the trend component
        df[col] = df[col].combine_first(trend)
        
        # Fill any remaining NaN values with the mean of the column
        df[col].fillna(value=df[col].mean(),inplace=True)

    return df

#Testingtg cubic interpolcation second



def Prep_spline(df1,numeric_columns_spline):
    df=df1.copy()
    
    
    for col in numeric_columns_spline:
        # Fill missing values temporarily using linear interpolation
        data_interpolated = df[col].interpolate(method='spline',order=2)
        
        # Handle cases where interpolation might still leave NaNs at the ends
        if data_interpolated.isna().sum() > 0:
            data_interpolated.fillna(method='bfill', inplace=True)
            data_interpolated.fillna(method='ffill', inplace=True)

        # Decompose the time series to extract the trend component
        decomposition = seasonal_decompose(data_interpolated, model='additive', period=30)
        trend = decomposition.trend
        
        # Handle cases where the trend might still have NaNs at the ends
        trend.fillna(method='bfill', inplace=True)
        trend.fillna(method='ffill', inplace=True)

        # Replace original NaN values with the trend component
        df[col] = df[col].combine_first(trend)
        
        # Fill any remaining NaN values with the mean of the column
        df[col].fillna(value=df[col].mean(),inplace=True)

    return df

In [100]:
numeric_columns_spline =['NO2_strat','NO2_total','NO2_trop']
numeric_columns_linear_train =['Precipitation','LST','AAI','CloudFraction','TropopausePressure','GT_NO2']
numeric_columns_linear_test =['Precipitation','LST','AAI','CloudFraction','TropopausePressure']

In [101]:
train_model3_df_prep_mix=Prep_linear(train_df,numeric_columns_linear_train)
train_model3_df_prep_mix=Prep_spline(train_model3_df_prep_mix,numeric_columns_spline)

test_model3_df_prep_mix=Prep_linear_test(test_df,numeric_columns_linear_test)
test_model3_df_prep_mix=Prep_spline(test_model3_df_prep_mix,numeric_columns_spline)

# Select only numeric columns for both train and test datasets
train_model3_df_prep_mix = train_model3_df_prep_mix.select_dtypes(include=['number'])
test_model3_df_prep_mix = test_model3_df_prep_mix.select_dtypes(include=['number'])

# Separate the target variable 'GT_NO2' from the features in the training dataset
train_model3_df_prep_mix_GT_NO2_mix = train_model3_df_prep_mix['GT_NO2']
train_model3_df_prep_mix = train_model3_df_prep_mix.drop('GT_NO2', axis=1)

#Checking skewness for all cols.
Skewed_cols=train_model3_df_prep_mix.skew()[abs(train_model3_df_prep_mix.skew())>0.5].index.to_list()
Skewed_cols.remove('LON')
print(train_model3_df_prep_mix.skew()[abs(train_model3_df_prep_mix.skew())>0.5])
print('\n')
print(Skewed_cols)
print('\n')

#GT_NO2 is also skewed
print(train_df['GT_NO2'].skew())

#Best till now 10.53

LON              0.911077
Precipitation    4.569618
CloudFraction    1.236881
NO2_total        4.169193
NO2_trop         2.935214
dtype: float64


['Precipitation', 'CloudFraction', 'NO2_total', 'NO2_trop']


1.507939283863649


In [107]:
train_model3_df_prep_mix_GT_NO2_mix.isnull().sum()

0

In [102]:
train_df.head(5)

Unnamed: 0,ID_Zindi,Date,ID,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,GT_NO2
0,ID_ENTGC7,1/1/19,PD01,45.601585,11.903551,0.0,,0.230527,0.559117,2.4e-05,0.000117,,14440.82126,31.0
1,ID_8JCCXC,1/1/19,PD04,45.371005,11.84083,3.047342,,-0.074006,0.869309,2.4e-05,0.000127,,14441.79815,42.0
2,ID_V3136Z,1/1/19,RO01,45.045825,12.060869,0.0,,0.02447,0.67416,2.4e-05,8.6e-05,,14437.38294,31.0
3,ID_KRVZDJ,1/1/19,RO02,45.104075,11.553241,1.200467,,-0.010442,0.920054,2.4e-05,0.000124,,14440.83831,30.0
4,ID_PR351A,1/1/19,RO03,45.038758,11.790152,1.274564,,-0.176178,0.747464,2.4e-05,0.000116,,14438.79037,58.0


In [103]:
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted=train_model3_df_prep_mix.copy()
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted=test_model3_df_prep_mix.copy()

train_model3_df_prep_mix_roll_wind_mean_stddev_sorted['Date']=train_df['Date'].copy()
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted['ID_Zindi']=train_df['ID_Zindi'].copy()

test_model3_df_prep_mix_roll_wind_mean_stddev_sorted['Date']=test_df['Date'].copy()
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted['ID_Zindi']=test_df['ID_Zindi'].copy()

#We will be sorting by date so creating a ordered PK to sort it back
# Create an ordered PK
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted['PK'] = range(1, len(train_model3_df_prep_mix_roll_wind_mean_stddev_sorted) + 1)
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted['PK'] = range(1, len(test_model3_df_prep_mix_roll_wind_mean_stddev_sorted) + 1)


# Ensure 'Date' is in datetime format
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted['Date'] = pd.to_datetime(train_model3_df_prep_mix_roll_wind_mean_stddev_sorted['Date'])
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted['Date'] = pd.to_datetime(test_model3_df_prep_mix_roll_wind_mean_stddev_sorted['Date'])

# Sort by 'LAT', 'LON', and 'Date'
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted = train_model3_df_prep_mix_roll_wind_mean_stddev_sorted.sort_values(by=['LAT', 'LON', 'Date'])
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted = test_model3_df_prep_mix_roll_wind_mean_stddev_sorted.sort_values(by=['LAT', 'LON', 'Date'])


In [104]:
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted.head(5)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,ID_ELHEMB,70
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,ID_RLD66Y,2519
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,ID_JLN5ZY,4731
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,ID_YC6QCK,7180
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,ID_I56YD1,9550


In [105]:
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted.head(5)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK
3,45.131947,10.015742,1.928031,279.369667,0.132952,0.756917,2.4e-05,0.000266,0.000114,14443.09006,2019-01-01,ID_QGSNTZ,4
9,45.131947,10.015742,0.0,279.369667,-0.626818,0.391763,3.4e-05,0.000223,0.000114,19330.30774,2019-01-02,ID_8JASJD,10
15,45.131947,10.015742,0.0,277.44,-0.845165,0.000761,3e-05,0.00012,9e-05,22279.10381,2019-01-03,ID_C2YTPV,16
21,45.131947,10.015742,0.0,279.066222,-1.342863,0.04985,2e-05,0.000122,0.000102,16744.84559,2019-01-04,ID_O6FGA6,22
27,45.131947,10.015742,1.928031,276.58,-0.621298,0.061076,1.8e-05,0.000255,0.000193,16732.51469,2019-01-05,ID_ANJZBA,28


In [106]:
# train_model3_df_prep_mix_roll_wind_mean_stddev_sorted[train_model3_df_prep_mix_roll_wind_mean_stddev_sorted['LAT']==44.99954599].head(100)
Rolling_window=3
Cols_for_Rolling_Window=train_model3_df_prep_mix.columns.to_list()
Cols_for_Rolling_Window.remove('LAT')
Cols_for_Rolling_Window.remove('LON')

In [11]:
train_model3_df_prep_mix.head(5)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure
0,45.601585,11.903551,0.0,280.097333,0.230527,0.559117,2.4e-05,0.000117,0.000131,14440.82126
1,45.371005,11.84083,3.047342,280.097333,-0.074006,0.869309,2.4e-05,0.000127,0.000131,14441.79815
2,45.045825,12.060869,0.0,280.097333,0.02447,0.67416,2.4e-05,8.6e-05,0.000131,14437.38294
3,45.104075,11.553241,1.200467,280.097333,-0.010442,0.920054,2.4e-05,0.000124,0.000131,14440.83831
4,45.038758,11.790152,1.274564,280.097333,-0.176178,0.747464,2.4e-05,0.000116,0.000131,14438.79037


In [108]:
# Function to apply rolling mean to specified columns
# Function to apply rolling mean and stddev to specified columns
def apply_rolling_stats(df, group_cols, cols_to_roll, window=3):
    for col in cols_to_roll:
        new_mean_col_name = f'Rolling_Mean_{col}'
        new_stddev_col_name = f'Rolling_Stddev_{col}'
        df[new_mean_col_name] = df.groupby(group_cols)[col].transform(lambda x: x.rolling(window=window).mean())
        df[new_stddev_col_name] = df.groupby(group_cols)[col].transform(lambda x: x.rolling(window=window).std())
        df[new_mean_col_name].fillna(method='bfill', inplace=True)
        df[new_mean_col_name].fillna(method='ffill', inplace=True)
        df[new_stddev_col_name].fillna(method='bfill', inplace=True)
        df[new_stddev_col_name].fillna(method='ffill', inplace=True)
    return df

In [109]:
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted=apply_rolling_stats(train_model3_df_prep_mix_roll_wind_mean_stddev_sorted,['LAT','LON'],Cols_for_Rolling_Window,window=3)
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted=apply_rolling_stats(test_model3_df_prep_mix_roll_wind_mean_stddev_sorted,['LAT','LON'],Cols_for_Rolling_Window,window=3)


In [110]:
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted[train_model3_df_prep_mix_roll_wind_mean_stddev_sorted['LAT']==44.92469405].head(10)
#44.92469405

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,ID_ELHEMB,70,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,ID_RLD66Y,2519,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,ID_JLN5ZY,4731,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,ID_YC6QCK,7180,2.737313,4.741165,289.418444,8.359263,-0.786601,0.179379,0.451098,0.457169,3.9e-05,1e-05,0.000399,0.000411,0.000121,2.1e-05,18419.303213,1495.877829
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,ID_I56YD1,9550,0.0,0.0,297.249778,5.205007,-0.922692,0.056517,0.127627,0.137646,5e-05,1.3e-05,0.000146,2.8e-05,9.1e-05,3.4e-05,19282.631057,3.648099
11998,44.924694,10.517502,0.0,305.52,-1.123626,0.0,6.1e-05,9.1e-05,3e-05,16715.14273,2019-01-06,ID_FKZF1C,11999,0.0,0.0,301.006667,5.968059,-1.00183,0.115421,0.036466,0.063154,5.8e-05,7e-06,0.00012,3.1e-05,6.2e-05,3.7e-05,18427.898923,1483.29212
14368,44.924694,10.517502,0.0,308.28,-1.486456,0.017318,6e-05,0.000105,4.5e-05,8614.530051,2019-01-07,ID_BBYHQH,14369,0.0,0.0,305.686667,2.514147,-1.199292,0.257798,0.005776,0.009996,6.1e-05,1e-06,0.000104,1.3e-05,4.3e-05,1.1e-05,14870.55787,5567.81853
16817,44.924694,10.517502,0.0,305.82,-1.51968,0.022716,6.2e-05,0.000102,4e-05,14427.88873,2019-01-08,ID_W8OH93,16818,0.0,0.0,306.54,1.514332,-1.376587,0.2197,0.013345,0.011868,6.1e-05,1e-06,9.9e-05,7e-06,3.8e-05,7e-06,13252.520504,4176.254218
19266,44.924694,10.517502,0.0,309.72,-1.879356,0.0,5.6e-05,9.2e-05,3.6e-05,14426.71668,2019-01-09,ID_FYHRGF,19267,0.0,0.0,307.94,1.972105,-1.628497,0.217884,0.013345,0.011868,5.9e-05,3e-06,0.0001,7e-06,4e-05,5e-06,12489.71182,3356.005908
21636,44.924694,10.517502,0.0,298.46,-1.289062,0.04211,4.7e-05,8e-05,3.3e-05,13056.62147,2019-01-10,ID_NL8HHQ,21637,0.0,0.0,304.666667,5.717913,-1.562699,0.297489,0.021609,0.021077,5.5e-05,8e-06,9.1e-05,1.1e-05,3.6e-05,4e-06,13970.40896,791.363397


In [111]:
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted.head(6)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure
3,45.131947,10.015742,1.928031,279.369667,0.132952,0.756917,2.4e-05,0.000266,0.000114,14443.09006,2019-01-01,ID_QGSNTZ,4,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655
9,45.131947,10.015742,0.0,279.369667,-0.626818,0.391763,3.4e-05,0.000223,0.000114,19330.30774,2019-01-02,ID_8JASJD,10,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655
15,45.131947,10.015742,0.0,277.44,-0.845165,0.000761,3e-05,0.00012,9e-05,22279.10381,2019-01-03,ID_C2YTPV,16,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655
21,45.131947,10.015742,0.0,279.066222,-1.342863,0.04985,2e-05,0.000122,0.000102,16744.84559,2019-01-04,ID_O6FGA6,22,0.0,0.0,278.625296,1.037649,-0.938282,0.366992,0.147458,0.212993,2.8e-05,7e-06,0.000155,5.9e-05,0.000102,1.2e-05,19451.419047,2769.11619
27,45.131947,10.015742,1.928031,276.58,-0.621298,0.061076,1.8e-05,0.000255,0.000193,16732.51469,2019-01-05,ID_ANJZBA,28,0.642677,1.113149,277.695407,1.262636,-0.936442,0.369341,0.037229,0.032077,2.3e-05,6e-06,0.000166,7.7e-05,0.000128,5.6e-05,18585.48803,3198.771039
33,45.131947,10.015742,0.0,279.38,-0.634519,0.040908,2.2e-05,0.000181,0.000172,16733.67227,2019-01-06,ID_ZOMAXE,34,0.642677,1.113149,278.342074,1.534045,-0.866226,0.412832,0.050612,0.010105,2e-05,2e-06,0.000186,6.7e-05,0.000156,4.8e-05,16737.01085,6.809725


In [112]:
#Now starting lag features
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1=train_model3_df_prep_mix_roll_wind_mean_stddev_sorted.copy()
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1=train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.sort_values(by=['LAT','LON','Date'])

test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1=test_model3_df_prep_mix_roll_wind_mean_stddev_sorted.copy()
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1=test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.sort_values(by=['LAT','LON','Date'])



In [113]:
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted[train_model3_df_prep_mix_roll_wind_mean_stddev_sorted['LAT']==44.92469405].head(6)
#44.92469405

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,ID_ELHEMB,70,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,ID_RLD66Y,2519,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,ID_JLN5ZY,4731,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,ID_YC6QCK,7180,2.737313,4.741165,289.418444,8.359263,-0.786601,0.179379,0.451098,0.457169,3.9e-05,1e-05,0.000399,0.000411,0.000121,2.1e-05,18419.303213,1495.877829
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,ID_I56YD1,9550,0.0,0.0,297.249778,5.205007,-0.922692,0.056517,0.127627,0.137646,5e-05,1.3e-05,0.000146,2.8e-05,9.1e-05,3.4e-05,19282.631057,3.648099
11998,44.924694,10.517502,0.0,305.52,-1.123626,0.0,6.1e-05,9.1e-05,3e-05,16715.14273,2019-01-06,ID_FKZF1C,11999,0.0,0.0,301.006667,5.968059,-1.00183,0.115421,0.036466,0.063154,5.8e-05,7e-06,0.00012,3.1e-05,6.2e-05,3.7e-05,18427.898923,1483.29212


In [114]:
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1[train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['LAT']==44.92469405].head(6)
#44.92469405

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,ID_ELHEMB,70,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,ID_RLD66Y,2519,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,ID_JLN5ZY,4731,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,ID_YC6QCK,7180,2.737313,4.741165,289.418444,8.359263,-0.786601,0.179379,0.451098,0.457169,3.9e-05,1e-05,0.000399,0.000411,0.000121,2.1e-05,18419.303213,1495.877829
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,ID_I56YD1,9550,0.0,0.0,297.249778,5.205007,-0.922692,0.056517,0.127627,0.137646,5e-05,1.3e-05,0.000146,2.8e-05,9.1e-05,3.4e-05,19282.631057,3.648099
11998,44.924694,10.517502,0.0,305.52,-1.123626,0.0,6.1e-05,9.1e-05,3e-05,16715.14273,2019-01-06,ID_FKZF1C,11999,0.0,0.0,301.006667,5.968059,-1.00183,0.115421,0.036466,0.063154,5.8e-05,7e-06,0.00012,3.1e-05,6.2e-05,3.7e-05,18427.898923,1483.29212


In [115]:
grouped_v1_train=train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.groupby(['LAT','LON'])
grouped_v1_test=test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.groupby(['LAT','LON'])

In [116]:
#train
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['AAI_LAG3'] = grouped_v1_train['AAI'].shift(3)
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['LST_LAG1'] = grouped_v1_train['LST'].shift(1)

train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['NO2_strat_LAG1'] = grouped_v1_train['NO2_strat'].shift(1)
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['NO2_strat_LAG2'] = grouped_v1_train['NO2_strat'].shift(2)
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['NO2_strat_LAG3'] = grouped_v1_train['NO2_strat'].shift(3)

train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['NO2_total_LAG1'] = grouped_v1_train['NO2_total'].shift(1)
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['NO2_total_LAG2'] = grouped_v1_train['NO2_total'].shift(2)
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['NO2_total_LAG3'] = grouped_v1_train['NO2_total'].shift(3)

train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['NO2_trop_LAG1'] = grouped_v1_train['NO2_trop'].shift(1)
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['NO2_trop_LAG2'] = grouped_v1_train['NO2_trop'].shift(2)
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['NO2_trop_LAG3'] = grouped_v1_train['NO2_trop'].shift(3)

train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['TropopausePressure_LAG1'] = grouped_v1_train['TropopausePressure'].shift(1)
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['TropopausePressure_LAG2'] = grouped_v1_train['TropopausePressure'].shift(2)

#test
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['AAI_LAG3'] = grouped_v1_test['AAI'].shift(3)
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['LST_LAG1'] = grouped_v1_test['LST'].shift(1)

test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['NO2_strat_LAG1'] = grouped_v1_test['NO2_strat'].shift(1)
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['NO2_strat_LAG2'] = grouped_v1_test['NO2_strat'].shift(2)
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['NO2_strat_LAG3'] = grouped_v1_test['NO2_strat'].shift(3)

test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['NO2_total_LAG1'] = grouped_v1_test['NO2_total'].shift(1)
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['NO2_total_LAG2'] = grouped_v1_test['NO2_total'].shift(2)
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['NO2_total_LAG3'] = grouped_v1_test['NO2_total'].shift(3)

test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['NO2_trop_LAG1'] = grouped_v1_test['NO2_trop'].shift(1)
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['NO2_trop_LAG2'] = grouped_v1_test['NO2_trop'].shift(2)
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['NO2_trop_LAG3'] = grouped_v1_test['NO2_trop'].shift(3)

test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['TropopausePressure_LAG1'] = grouped_v1_test['TropopausePressure'].shift(1)
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['TropopausePressure_LAG2'] = grouped_v1_test['TropopausePressure'].shift(2)

In [117]:
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1[train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['LAT']==44.92469405].head(6)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,ID_ELHEMB,70,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,,,,,,,,,,,,,
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,ID_RLD66Y,2519,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,,280.675857,2.4e-05,,,0.000395,,,0.00014,,,14436.75358,
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,ID_JLN5ZY,4731,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,,279.766,3e-05,2.4e-05,,0.000873,0.000395,,0.000143,0.00014,,16692.0173,14436.75358
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,ID_YC6QCK,7180,2.737313,4.741165,289.418444,8.359263,-0.786601,0.179379,0.451098,0.457169,3.9e-05,1e-05,0.000399,0.000411,0.000121,2.1e-05,18419.303213,1495.877829,0.286079,294.249333,3.7e-05,3e-05,2.4e-05,0.000171,0.000873,0.000395,0.000119,0.000143,0.00014,19279.33913,16692.0173
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,ID_I56YD1,9550,0.0,0.0,297.249778,5.205007,-0.922692,0.056517,0.127627,0.137646,5e-05,1.3e-05,0.000146,2.8e-05,9.1e-05,3.4e-05,19282.631057,3.648099,-0.579522,294.24,5e-05,3.7e-05,3e-05,0.000152,0.000171,0.000873,0.000102,0.000119,0.000143,19286.55321,19279.33913
11998,44.924694,10.517502,0.0,305.52,-1.123626,0.0,6.1e-05,9.1e-05,3e-05,16715.14273,2019-01-06,ID_FKZF1C,11999,0.0,0.0,301.006667,5.968059,-1.00183,0.115421,0.036466,0.063154,5.8e-05,7e-06,0.00012,3.1e-05,6.2e-05,3.7e-05,18427.898923,1483.29212,-0.886214,303.26,6.3e-05,5e-05,3.7e-05,0.000116,0.000152,0.000171,5.3e-05,0.000102,0.000119,19282.00083,19286.55321


In [118]:
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.head(6)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
3,45.131947,10.015742,1.928031,279.369667,0.132952,0.756917,2.4e-05,0.000266,0.000114,14443.09006,2019-01-01,ID_QGSNTZ,4,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655,,,,,,,,,,,,,
9,45.131947,10.015742,0.0,279.369667,-0.626818,0.391763,3.4e-05,0.000223,0.000114,19330.30774,2019-01-02,ID_8JASJD,10,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655,,279.369667,2.4e-05,,,0.000266,,,0.000114,,,14443.09006,
15,45.131947,10.015742,0.0,277.44,-0.845165,0.000761,3e-05,0.00012,9e-05,22279.10381,2019-01-03,ID_C2YTPV,16,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655,,279.369667,3.4e-05,2.4e-05,,0.000223,0.000266,,0.000114,0.000114,,19330.30774,14443.09006
21,45.131947,10.015742,0.0,279.066222,-1.342863,0.04985,2e-05,0.000122,0.000102,16744.84559,2019-01-04,ID_O6FGA6,22,0.0,0.0,278.625296,1.037649,-0.938282,0.366992,0.147458,0.212993,2.8e-05,7e-06,0.000155,5.9e-05,0.000102,1.2e-05,19451.419047,2769.11619,0.132952,277.44,3e-05,3.4e-05,2.4e-05,0.00012,0.000223,0.000266,9e-05,0.000114,0.000114,22279.10381,19330.30774
27,45.131947,10.015742,1.928031,276.58,-0.621298,0.061076,1.8e-05,0.000255,0.000193,16732.51469,2019-01-05,ID_ANJZBA,28,0.642677,1.113149,277.695407,1.262636,-0.936442,0.369341,0.037229,0.032077,2.3e-05,6e-06,0.000166,7.7e-05,0.000128,5.6e-05,18585.48803,3198.771039,-0.626818,279.066222,2e-05,3e-05,3.4e-05,0.000122,0.00012,0.000223,0.000102,9e-05,0.000114,16744.84559,22279.10381
33,45.131947,10.015742,0.0,279.38,-0.634519,0.040908,2.2e-05,0.000181,0.000172,16733.67227,2019-01-06,ID_ZOMAXE,34,0.642677,1.113149,278.342074,1.534045,-0.866226,0.412832,0.050612,0.010105,2e-05,2e-06,0.000186,6.7e-05,0.000156,4.8e-05,16737.01085,6.809725,-0.845165,276.58,1.8e-05,2e-05,3e-05,0.000255,0.000122,0.00012,0.000193,0.000102,9e-05,16732.51469,16744.84559


In [119]:
print(train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1[train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['LAT']==44.92469405].shape[0])
print(test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1[test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['LAT']==45.13194691].shape[0])
print(test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1[test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['LAT']==45.28937609].shape[0])

1096
1096
1096


In [120]:
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1[test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['LAT']==45.13194691].head(5)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
3,45.131947,10.015742,1.928031,279.369667,0.132952,0.756917,2.4e-05,0.000266,0.000114,14443.09006,2019-01-01,ID_QGSNTZ,4,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655,,,,,,,,,,,,,
9,45.131947,10.015742,0.0,279.369667,-0.626818,0.391763,3.4e-05,0.000223,0.000114,19330.30774,2019-01-02,ID_8JASJD,10,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655,,279.369667,2.4e-05,,,0.000266,,,0.000114,,,14443.09006,
15,45.131947,10.015742,0.0,277.44,-0.845165,0.000761,3e-05,0.00012,9e-05,22279.10381,2019-01-03,ID_C2YTPV,16,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655,,279.369667,3.4e-05,2.4e-05,,0.000223,0.000266,,0.000114,0.000114,,19330.30774,14443.09006
21,45.131947,10.015742,0.0,279.066222,-1.342863,0.04985,2e-05,0.000122,0.000102,16744.84559,2019-01-04,ID_O6FGA6,22,0.0,0.0,278.625296,1.037649,-0.938282,0.366992,0.147458,0.212993,2.8e-05,7e-06,0.000155,5.9e-05,0.000102,1.2e-05,19451.419047,2769.11619,0.132952,277.44,3e-05,3.4e-05,2.4e-05,0.00012,0.000223,0.000266,9e-05,0.000114,0.000114,22279.10381,19330.30774
27,45.131947,10.015742,1.928031,276.58,-0.621298,0.061076,1.8e-05,0.000255,0.000193,16732.51469,2019-01-05,ID_ANJZBA,28,0.642677,1.113149,277.695407,1.262636,-0.936442,0.369341,0.037229,0.032077,2.3e-05,6e-06,0.000166,7.7e-05,0.000128,5.6e-05,18585.48803,3198.771039,-0.626818,279.066222,2e-05,3e-05,3.4e-05,0.000122,0.00012,0.000223,0.000102,9e-05,0.000114,16744.84559,22279.10381


In [121]:
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.head(5)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,ID_ELHEMB,70,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,,,,,,,,,,,,,
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,ID_RLD66Y,2519,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,,280.675857,2.4e-05,,,0.000395,,,0.00014,,,14436.75358,
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,ID_JLN5ZY,4731,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,,279.766,3e-05,2.4e-05,,0.000873,0.000395,,0.000143,0.00014,,16692.0173,14436.75358
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,ID_YC6QCK,7180,2.737313,4.741165,289.418444,8.359263,-0.786601,0.179379,0.451098,0.457169,3.9e-05,1e-05,0.000399,0.000411,0.000121,2.1e-05,18419.303213,1495.877829,0.286079,294.249333,3.7e-05,3e-05,2.4e-05,0.000171,0.000873,0.000395,0.000119,0.000143,0.00014,19279.33913,16692.0173
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,ID_I56YD1,9550,0.0,0.0,297.249778,5.205007,-0.922692,0.056517,0.127627,0.137646,5e-05,1.3e-05,0.000146,2.8e-05,9.1e-05,3.4e-05,19282.631057,3.648099,-0.579522,294.24,5e-05,3.7e-05,3e-05,0.000152,0.000171,0.000873,0.000102,0.000119,0.000143,19286.55321,19279.33913


In [122]:
LAG_Cols=[col for col in train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.columns if "LAG" in col]
LAG_Cols

['AAI_LAG3',
 'LST_LAG1',
 'NO2_strat_LAG1',
 'NO2_strat_LAG2',
 'NO2_strat_LAG3',
 'NO2_total_LAG1',
 'NO2_total_LAG2',
 'NO2_total_LAG3',
 'NO2_trop_LAG1',
 'NO2_trop_LAG2',
 'NO2_trop_LAG3',
 'TropopausePressure_LAG1',
 'TropopausePressure_LAG2']

In [123]:
#Doing backfill 
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.fillna(method='bfill', inplace=True)
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.fillna(method='bfill', inplace=True)

In [124]:
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1[train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['LAT']==44.92469405].head(6)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,ID_ELHEMB,70,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,0.286079,280.675857,2.4e-05,2.4e-05,2.4e-05,0.000395,0.000395,0.000395,0.00014,0.00014,0.00014,14436.75358,14436.75358
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,ID_RLD66Y,2519,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,0.286079,280.675857,2.4e-05,2.4e-05,2.4e-05,0.000395,0.000395,0.000395,0.00014,0.00014,0.00014,14436.75358,14436.75358
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,ID_JLN5ZY,4731,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,0.286079,279.766,3e-05,2.4e-05,2.4e-05,0.000873,0.000395,0.000395,0.000143,0.00014,0.00014,16692.0173,14436.75358
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,ID_YC6QCK,7180,2.737313,4.741165,289.418444,8.359263,-0.786601,0.179379,0.451098,0.457169,3.9e-05,1e-05,0.000399,0.000411,0.000121,2.1e-05,18419.303213,1495.877829,0.286079,294.249333,3.7e-05,3e-05,2.4e-05,0.000171,0.000873,0.000395,0.000119,0.000143,0.00014,19279.33913,16692.0173
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,ID_I56YD1,9550,0.0,0.0,297.249778,5.205007,-0.922692,0.056517,0.127627,0.137646,5e-05,1.3e-05,0.000146,2.8e-05,9.1e-05,3.4e-05,19282.631057,3.648099,-0.579522,294.24,5e-05,3.7e-05,3e-05,0.000152,0.000171,0.000873,0.000102,0.000119,0.000143,19286.55321,19279.33913
11998,44.924694,10.517502,0.0,305.52,-1.123626,0.0,6.1e-05,9.1e-05,3e-05,16715.14273,2019-01-06,ID_FKZF1C,11999,0.0,0.0,301.006667,5.968059,-1.00183,0.115421,0.036466,0.063154,5.8e-05,7e-06,0.00012,3.1e-05,6.2e-05,3.7e-05,18427.898923,1483.29212,-0.886214,303.26,6.3e-05,5e-05,3.7e-05,0.000116,0.000152,0.000171,5.3e-05,0.000102,0.000119,19282.00083,19286.55321


In [125]:
test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1[test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1['LAT']==45.13194691].head(5)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
3,45.131947,10.015742,1.928031,279.369667,0.132952,0.756917,2.4e-05,0.000266,0.000114,14443.09006,2019-01-01,ID_QGSNTZ,4,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655,0.132952,279.369667,2.4e-05,2.4e-05,2.4e-05,0.000266,0.000266,0.000266,0.000114,0.000114,0.000114,14443.09006,14443.09006
9,45.131947,10.015742,0.0,279.369667,-0.626818,0.391763,3.4e-05,0.000223,0.000114,19330.30774,2019-01-02,ID_8JASJD,10,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655,0.132952,279.369667,2.4e-05,2.4e-05,2.4e-05,0.000266,0.000266,0.000266,0.000114,0.000114,0.000114,14443.09006,14443.09006
15,45.131947,10.015742,0.0,277.44,-0.845165,0.000761,3e-05,0.00012,9e-05,22279.10381,2019-01-03,ID_C2YTPV,16,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655,0.132952,279.369667,3.4e-05,2.4e-05,2.4e-05,0.000223,0.000266,0.000266,0.000114,0.000114,0.000114,19330.30774,14443.09006
21,45.131947,10.015742,0.0,279.066222,-1.342863,0.04985,2e-05,0.000122,0.000102,16744.84559,2019-01-04,ID_O6FGA6,22,0.0,0.0,278.625296,1.037649,-0.938282,0.366992,0.147458,0.212993,2.8e-05,7e-06,0.000155,5.9e-05,0.000102,1.2e-05,19451.419047,2769.11619,0.132952,277.44,3e-05,3.4e-05,2.4e-05,0.00012,0.000223,0.000266,9e-05,0.000114,0.000114,22279.10381,19330.30774
27,45.131947,10.015742,1.928031,276.58,-0.621298,0.061076,1.8e-05,0.000255,0.000193,16732.51469,2019-01-05,ID_ANJZBA,28,0.642677,1.113149,277.695407,1.262636,-0.936442,0.369341,0.037229,0.032077,2.3e-05,6e-06,0.000166,7.7e-05,0.000128,5.6e-05,18585.48803,3198.771039,-0.626818,279.066222,2e-05,3e-05,3.4e-05,0.000122,0.00012,0.000223,0.000102,9e-05,0.000114,16744.84559,22279.10381


In [126]:
#Checking if any null present or not
print('train null check\n',train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.isnull().sum())
print('\n')
print('test null check\n',test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.isnull().sum())

train null check
 LAT                                  0
LON                                  0
Precipitation                        0
LST                                  0
AAI                                  0
CloudFraction                        0
NO2_strat                            0
NO2_total                            0
NO2_trop                             0
TropopausePressure                   0
Date                                 0
ID_Zindi                             0
PK                                   0
Rolling_Mean_Precipitation           0
Rolling_Stddev_Precipitation         0
Rolling_Mean_LST                     0
Rolling_Stddev_LST                   0
Rolling_Mean_AAI                     0
Rolling_Stddev_AAI                   0
Rolling_Mean_CloudFraction           0
Rolling_Stddev_CloudFraction         0
Rolling_Mean_NO2_strat               0
Rolling_Stddev_NO2_strat             0
Rolling_Mean_NO2_total               0
Rolling_Stddev_NO2_total             0
Rolling

In [127]:
train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.head(5)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Date,ID_Zindi,PK,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
69,44.924694,10.517502,0.0,280.675857,0.286079,0.954099,2.4e-05,0.000395,0.00014,14436.75358,2019-01-01,ID_ELHEMB,70,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,0.286079,280.675857,2.4e-05,2.4e-05,2.4e-05,0.000395,0.000395,0.000395,0.00014,0.00014,0.00014,14436.75358,14436.75358
2518,44.924694,10.517502,8.211939,279.766,-0.579522,0.970421,3e-05,0.000873,0.000143,16692.0173,2019-01-02,ID_RLD66Y,2519,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,0.286079,280.675857,2.4e-05,2.4e-05,2.4e-05,0.000395,0.000395,0.000395,0.00014,0.00014,0.00014,14436.75358,14436.75358
4730,44.924694,10.517502,0.0,294.249333,-0.886214,0.273483,3.7e-05,0.000171,0.000119,19279.33913,2019-01-03,ID_JLN5ZY,4731,2.737313,4.741165,284.897063,8.11207,-0.393219,0.607947,0.732668,0.39775,3.1e-05,7e-06,0.00048,0.000359,0.000134,1.3e-05,16802.703337,2423.189479,0.286079,279.766,3e-05,2.4e-05,2.4e-05,0.000873,0.000395,0.000395,0.000143,0.00014,0.00014,16692.0173,14436.75358
7179,44.924694,10.517502,0.0,294.24,-0.894068,0.10939,5e-05,0.000152,0.000102,19286.55321,2019-01-04,ID_YC6QCK,7180,2.737313,4.741165,289.418444,8.359263,-0.786601,0.179379,0.451098,0.457169,3.9e-05,1e-05,0.000399,0.000411,0.000121,2.1e-05,18419.303213,1495.877829,0.286079,294.249333,3.7e-05,3e-05,2.4e-05,0.000171,0.000873,0.000395,0.000119,0.000143,0.00014,19279.33913,16692.0173
9549,44.924694,10.517502,0.0,303.26,-0.987795,8e-06,6.3e-05,0.000116,5.3e-05,19282.00083,2019-01-05,ID_I56YD1,9550,0.0,0.0,297.249778,5.205007,-0.922692,0.056517,0.127627,0.137646,5e-05,1.3e-05,0.000146,2.8e-05,9.1e-05,3.4e-05,19282.631057,3.648099,-0.579522,294.24,5e-05,3.7e-05,3e-05,0.000152,0.000171,0.000873,0.000102,0.000119,0.000143,19286.55321,19279.33913


In [128]:
train_model3_df_prep_mix_roll_wind_mean_stddev_org_v1 = train_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.sort_values(by='PK')
test_model3_df_prep_mix_roll_wind_mean_stddev_org_v1 = test_model3_df_prep_mix_roll_wind_mean_stddev_sorted_v1.sort_values(by='PK')
train_model3_df_prep_mix_roll_wind_mean_stddev_org_v1=train_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.drop(['ID_Zindi','PK','Date'],axis=1)
test_model3_df_prep_mix_roll_wind_mean_stddev_org_v1=test_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.drop(['ID_Zindi','PK','Date'],axis=1)

In [129]:
train_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.head(5)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
0,45.601585,11.903551,0.0,280.097333,0.230527,0.559117,2.4e-05,0.000117,0.000131,14440.82126,6.584876,11.405339,283.669317,7.679005,-0.287823,0.483869,0.719599,0.139724,3.2e-05,7e-06,0.000183,0.000121,0.000161,4.1e-05,17478.750221,2648.341256,0.230527,280.097333,2.4e-05,2.4e-05,2.4e-05,0.000117,0.000117,0.000117,0.000131,0.000131,0.000131,14440.82126,14440.82126
1,45.371005,11.84083,3.047342,280.097333,-0.074006,0.869309,2.4e-05,0.000127,0.000131,14441.79815,5.571793,7.175196,283.63682,7.758017,-0.426385,0.316991,0.757004,0.14117,3.2e-05,7e-06,0.000214,0.000104,0.000159,4e-05,17419.601916,2609.250967,-0.074006,280.097333,2.4e-05,2.4e-05,2.4e-05,0.000127,0.000127,0.000127,0.000131,0.000131,0.000131,14441.79815,14441.79815
2,45.045825,12.060869,0.0,280.097333,0.02447,0.67416,2.4e-05,8.6e-05,0.000131,14437.38294,3.979073,6.891956,283.587963,7.83351,-0.345919,0.34386,0.76843,0.081828,3.3e-05,8e-06,0.000172,0.000135,0.000158,3.8e-05,17350.081865,2567.367039,0.02447,280.097333,2.4e-05,2.4e-05,2.4e-05,8.6e-05,8.6e-05,8.6e-05,0.000131,0.000131,0.000131,14437.38294,14437.38294
3,45.104075,11.553241,1.200467,280.097333,-0.010442,0.920054,2.4e-05,0.000124,0.000131,14440.83831,2.328808,3.053554,283.516375,7.892164,-0.159093,0.268755,0.856576,0.183586,3.1e-05,7e-06,0.000304,0.000281,0.000156,3.4e-05,16813.63926,2431.51377,-0.010442,280.097333,2.4e-05,2.4e-05,2.4e-05,0.000124,0.000124,0.000124,0.000131,0.000131,0.000131,14440.83831,14440.83831
4,45.038758,11.790152,1.274564,280.097333,-0.176178,0.747464,2.4e-05,0.000116,0.000131,14438.79037,4.241536,6.275195,283.48328,7.998216,-0.357413,0.178524,0.7746,0.097796,3.3e-05,8e-06,0.000207,0.000129,0.000155,3.1e-05,17231.772274,2508.138542,-0.176178,280.097333,2.4e-05,2.4e-05,2.4e-05,0.000116,0.000116,0.000116,0.000131,0.000131,0.000131,14438.79037,14438.79037


In [130]:
test_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.head(5)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
0,45.289376,11.642394,3.277529,279.369667,-0.313361,0.771456,2.4e-05,7.5e-05,0.000114,14440.02819,1.09251,1.892282,278.209889,3.593232,-0.402533,0.280688,0.312794,0.405887,2.9e-05,5e-06,8.9e-05,3.4e-05,6.9e-05,4.1e-05,19153.006053,4147.00114,-0.313361,279.369667,2.4e-05,2.4e-05,2.4e-05,7.5e-05,7.5e-05,7.5e-05,0.000114,0.000114,0.000114,14440.02819,14440.02819
1,45.836941,12.510362,0.0,279.369667,-0.229512,0.398208,2.3e-05,0.00012,0.000114,14434.0479,0.0,0.0,280.769889,1.504183,-0.528932,0.269267,0.148067,0.217846,2.8e-05,4e-06,0.000105,3.6e-05,8.4e-05,4.5e-05,19131.785407,4135.206982,-0.229512,279.369667,2.3e-05,2.3e-05,2.3e-05,0.00012,0.00012,0.00012,0.000114,0.000114,0.000114,14434.0479,14434.0479
2,45.582894,8.842165,0.0,282.98,-0.470822,0.153694,2.3e-05,0.000171,0.000148,14427.42478,0.0,0.0,282.366667,1.629028,-0.679175,0.183454,0.051231,0.088735,2.8e-05,5e-06,0.00017,9.6e-05,0.000143,9.8e-05,18626.888577,3916.558243,-0.470822,282.98,2.3e-05,2.3e-05,2.3e-05,0.000171,0.000171,0.000171,0.000148,0.000148,0.000148,14427.42478,14427.42478
3,45.131947,10.015742,1.928031,279.369667,0.132952,0.756917,2.4e-05,0.000266,0.000114,14443.09006,0.642677,1.113149,278.726444,1.114094,-0.446344,0.513426,0.383147,0.378152,2.9e-05,5e-06,0.000203,7.5e-05,0.000106,1.4e-05,18684.167203,3957.764655,0.132952,279.369667,2.4e-05,2.4e-05,2.4e-05,0.000266,0.000266,0.000266,0.000114,0.000114,0.000114,14443.09006,14443.09006
4,45.186329,9.146666,0.0,279.369667,-0.198272,0.678858,2.3e-05,0.000149,0.000114,14440.8584,0.0,0.0,280.842222,2.631201,-0.774266,0.58342,0.333002,0.339611,2.9e-05,5e-06,0.000116,3.4e-05,9.2e-05,3.9e-05,18673.04112,3949.25678,-0.198272,279.369667,2.3e-05,2.3e-05,2.3e-05,0.000149,0.000149,0.000149,0.000114,0.000114,0.000114,14440.8584,14440.8584


In [131]:
print(train_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.skew()[abs(train_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.skew())>0.5])
Skewed_cols=train_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.skew()[abs(train_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.skew())>0.5].index.to_list()

print(Skewed_cols)

LON                                  0.911077
Precipitation                        4.569618
CloudFraction                        1.236881
NO2_total                            4.169193
NO2_trop                             2.935214
Rolling_Mean_Precipitation           2.702031
Rolling_Stddev_Precipitation         2.793419
Rolling_Stddev_LST                   1.407023
Rolling_Mean_AAI                     0.558595
Rolling_Stddev_AAI                   1.204749
Rolling_Mean_CloudFraction           0.824397
Rolling_Stddev_CloudFraction         0.507065
Rolling_Stddev_NO2_strat             1.383593
Rolling_Mean_NO2_total               2.855243
Rolling_Stddev_NO2_total             4.382217
Rolling_Mean_NO2_trop                2.107716
Rolling_Stddev_NO2_trop              2.970711
Rolling_Stddev_TropopausePressure    0.806626
NO2_total_LAG1                       4.179662
NO2_total_LAG2                       4.183827
NO2_total_LAG3                       4.180559
NO2_trop_LAG1                     

In [132]:
print(test_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.skew()[abs(test_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.skew())>0.5])

LAT                                  0.839399
Precipitation                        4.409900
AAI                                  0.557857
CloudFraction                        1.295619
NO2_total                            5.237289
NO2_trop                             3.275638
Rolling_Mean_Precipitation           2.613215
Rolling_Stddev_Precipitation         2.756800
Rolling_Stddev_LST                   2.293982
Rolling_Mean_AAI                     0.667658
Rolling_Stddev_AAI                   1.358639
Rolling_Mean_CloudFraction           0.978377
Rolling_Stddev_CloudFraction         0.884107
Rolling_Stddev_NO2_strat             1.327646
Rolling_Mean_NO2_total               4.225154
Rolling_Stddev_NO2_total             5.097974
Rolling_Mean_NO2_trop                1.971349
Rolling_Stddev_NO2_trop              4.451370
Rolling_Stddev_TropopausePressure    0.806643
AAI_LAG3                             0.555439
NO2_total_LAG1                       5.250821
NO2_total_LAG2                    

In [133]:
#Before treating for skewness lets create a model

model= LinearRegression()
model.fit(train_model3_df_prep_mix_roll_wind_mean_stddev_org_v1, train_model3_df_prep_mix_GT_NO2_mix)

# Predict using the test dataset
y_pred = model.predict(test_model3_df_prep_mix_roll_wind_mean_stddev_org_v1)

# Add additional 'ID_Zindi' to y_pred3 and create a DataFrame
y_pred_df = pd.DataFrame(y_pred, columns=['GT_NO2'])
y_pred_result_df = pd.concat([test_df['ID_Zindi'], y_pred_df], axis=1)

# Save the results to a CSV file
y_pred_result_df.to_csv('/home/anuragverma/Desktop/Kaggle/GeoAI Ground-level NO2 _Zindi/model5_lag_roll.csv', index=False)


In [134]:
print(train_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.skew()[abs(train_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.skew())>0.5])
Skewed_cols=train_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.skew()[abs(train_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.skew())>0.5].index.to_list()
print(Skewed_cols)
Skewed_cols.remove('LON')

LON                                  0.911077
Precipitation                        4.569618
CloudFraction                        1.236881
NO2_total                            4.169193
NO2_trop                             2.935214
Rolling_Mean_Precipitation           2.702031
Rolling_Stddev_Precipitation         2.793419
Rolling_Stddev_LST                   1.407023
Rolling_Mean_AAI                     0.558595
Rolling_Stddev_AAI                   1.204749
Rolling_Mean_CloudFraction           0.824397
Rolling_Stddev_CloudFraction         0.507065
Rolling_Stddev_NO2_strat             1.383593
Rolling_Mean_NO2_total               2.855243
Rolling_Stddev_NO2_total             4.382217
Rolling_Mean_NO2_trop                2.107716
Rolling_Stddev_NO2_trop              2.970711
Rolling_Stddev_TropopausePressure    0.806626
NO2_total_LAG1                       4.179662
NO2_total_LAG2                       4.183827
NO2_total_LAG3                       4.180559
NO2_trop_LAG1                     

In [135]:
#Treating skewness with log
def log_transfrom_trt(series):
    if series.min()<0:
        offset=abs(series.min())+0.0001
    else:
        offset=0.0001
    return np.log(series+offset)

In [136]:
train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1=train_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.copy()
train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1.head(20)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
0,45.601585,11.903551,0.0,280.097333,0.230527,0.559117,2.4e-05,0.000117,0.000131,14440.82126,6.584876,11.405339,283.669317,7.679005,-0.287823,0.483869,0.719599,0.139724,3.2e-05,7e-06,0.000183,0.000121,0.000161,4.1e-05,17478.750221,2648.341256,0.230527,280.097333,2.4e-05,2.4e-05,2.4e-05,0.000117,0.000117,0.000117,0.000131,0.000131,0.000131,14440.82126,14440.82126
1,45.371005,11.84083,3.047342,280.097333,-0.074006,0.869309,2.4e-05,0.000127,0.000131,14441.79815,5.571793,7.175196,283.63682,7.758017,-0.426385,0.316991,0.757004,0.14117,3.2e-05,7e-06,0.000214,0.000104,0.000159,4e-05,17419.601916,2609.250967,-0.074006,280.097333,2.4e-05,2.4e-05,2.4e-05,0.000127,0.000127,0.000127,0.000131,0.000131,0.000131,14441.79815,14441.79815
2,45.045825,12.060869,0.0,280.097333,0.02447,0.67416,2.4e-05,8.6e-05,0.000131,14437.38294,3.979073,6.891956,283.587963,7.83351,-0.345919,0.34386,0.76843,0.081828,3.3e-05,8e-06,0.000172,0.000135,0.000158,3.8e-05,17350.081865,2567.367039,0.02447,280.097333,2.4e-05,2.4e-05,2.4e-05,8.6e-05,8.6e-05,8.6e-05,0.000131,0.000131,0.000131,14437.38294,14437.38294
3,45.104075,11.553241,1.200467,280.097333,-0.010442,0.920054,2.4e-05,0.000124,0.000131,14440.83831,2.328808,3.053554,283.516375,7.892164,-0.159093,0.268755,0.856576,0.183586,3.1e-05,7e-06,0.000304,0.000281,0.000156,3.4e-05,16813.63926,2431.51377,-0.010442,280.097333,2.4e-05,2.4e-05,2.4e-05,0.000124,0.000124,0.000124,0.000131,0.000131,0.000131,14440.83831,14440.83831
4,45.038758,11.790152,1.274564,280.097333,-0.176178,0.747464,2.4e-05,0.000116,0.000131,14438.79037,4.241536,6.275195,283.48328,7.998216,-0.357413,0.178524,0.7746,0.097796,3.3e-05,8e-06,0.000207,0.000129,0.000155,3.1e-05,17231.772274,2508.138542,-0.176178,280.097333,2.4e-05,2.4e-05,2.4e-05,0.000116,0.000116,0.000116,0.000131,0.000131,0.000131,14438.79037,14438.79037
5,45.889734,12.307124,0.0,278.38,-0.366831,0.324392,2.3e-05,0.000109,0.000131,14432.05624,6.467497,6.946312,282.900712,8.495183,-0.406773,0.070077,0.711479,0.335296,3.2e-05,8e-06,0.000188,0.000158,0.000155,3.1e-05,17172.840333,2478.232614,-0.366831,278.38,2.3e-05,2.3e-05,2.3e-05,0.000109,0.000109,0.000109,0.000131,0.000131,0.000131,14432.05624,14432.05624
6,45.671721,12.237807,0.0,280.097333,0.188599,0.818422,2.4e-05,0.000135,0.000131,14436.70176,8.585276,10.632579,283.5256,8.223582,-0.217691,0.351931,0.84458,0.060162,3.2e-05,8e-06,0.000206,0.00016,0.000155,3.2e-05,17154.1402,2477.350827,0.188599,280.097333,2.4e-05,2.4e-05,2.4e-05,0.000135,0.000135,0.000135,0.000131,0.000131,0.000131,14436.70176,14436.70176
7,45.629092,12.590682,0.0,280.097333,0.507837,0.926018,2.4e-05,0.000137,0.000131,14435.0196,5.56395,9.637044,283.605664,8.374745,-0.106631,0.535749,0.893003,0.059437,3.2e-05,8e-06,0.000206,0.000162,0.000155,3.1e-05,17126.544369,2465.640962,0.507837,280.097333,2.4e-05,2.4e-05,2.4e-05,0.000137,0.000137,0.000137,0.000131,0.000131,0.000131,14435.0196,14435.0196
8,45.499618,12.261249,0.0,280.097333,0.087363,0.835097,2.4e-05,0.000104,0.000131,14438.51425,5.121933,8.871448,283.590035,8.353166,-0.176963,0.229061,0.82616,0.109202,3.2e-05,8e-06,0.000204,0.00016,0.000155,3.1e-05,17111.016002,2463.19211,0.087363,280.097333,2.4e-05,2.4e-05,2.4e-05,0.000104,0.000104,0.000104,0.000131,0.000131,0.000131,14438.51425,14438.51425
9,45.428424,12.31293,0.650355,280.097333,0.208678,0.812696,2.4e-05,0.000117,0.000131,14437.71015,3.558747,5.610138,283.561935,8.301308,-0.153326,0.31573,0.837929,0.090406,3.2e-05,8e-06,0.000202,0.000164,0.000154,3e-05,17080.645429,2453.268242,0.208678,280.097333,2.4e-05,2.4e-05,2.4e-05,0.000117,0.000117,0.000117,0.000131,0.000131,0.000131,14437.71015,14437.71015


In [137]:
for col in Skewed_cols:
    train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1[col]=log_transfrom_trt(train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1[col])


In [138]:
train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1.head(10)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
0,45.601585,11.903551,-9.21034,280.097333,0.230527,-0.581218,2.4e-05,-8.382225,-8.268476,14440.82126,1.884791,2.43409,283.669317,2.038503,1.099332,-0.725735,-0.328923,-1.967372,3.2e-05,-9.13992,-8.1688,-8.417132,-8.251292,-8.864902,17478.750221,7.881689,0.230527,280.097333,2.4e-05,2.4e-05,2.4e-05,-8.382225,-8.382225,-8.382225,-8.268476,-8.268476,-8.268476,14440.82126,14440.82126
1,45.371005,11.84083,1.114303,280.097333,-0.074006,-0.139942,2.4e-05,-8.339466,-8.268476,14441.79815,1.717735,1.970644,283.63682,2.04874,1.052079,-1.148567,-0.278255,-1.957082,3.2e-05,-9.138728,-8.065476,-8.496386,-8.257614,-8.874134,17419.601916,7.866819,-0.074006,280.097333,2.4e-05,2.4e-05,2.4e-05,-8.339466,-8.339466,-8.339466,-8.268476,-8.268476,-8.268476,14441.79815,14441.79815
2,45.045825,12.060869,-9.21034,280.097333,0.02447,-0.394139,2.4e-05,-8.526739,-8.268476,14437.38294,1.381074,1.93037,283.587963,2.058424,1.079791,-1.06723,-0.263276,-2.501912,3.3e-05,-9.135579,-8.208074,-8.355529,-8.261479,-8.88828,17350.081865,7.850636,0.02447,280.097333,2.4e-05,2.4e-05,2.4e-05,-8.526739,-8.526739,-8.526739,-8.268476,-8.268476,-8.268476,14437.38294,14437.38294
3,45.104075,11.553241,0.182794,280.097333,-0.010442,-0.083215,2.4e-05,-8.352103,-8.268476,14440.83831,0.8454,1.116339,283.516375,2.065883,1.141317,-1.313584,-0.154695,-1.694526,3.1e-05,-9.146313,-7.814921,-7.87378,-8.26999,-8.920634,16813.63926,7.796269,-0.010442,280.097333,2.4e-05,2.4e-05,2.4e-05,-8.352103,-8.352103,-8.352103,-8.268476,-8.268476,-8.268476,14440.83831,14440.83831
4,45.038758,11.790152,0.242682,280.097333,-0.176178,-0.290936,2.4e-05,-8.386604,-8.268476,14438.79037,1.444949,1.836621,283.48328,2.079231,1.075879,-1.72247,-0.25528,-2.32385,3.3e-05,-9.135782,-8.089858,-8.383253,-8.276172,-8.943668,17231.772274,7.827296,-0.176178,280.097333,2.4e-05,2.4e-05,2.4e-05,-8.386604,-8.386604,-8.386604,-8.268476,-8.268476,-8.268476,14438.79037,14438.79037
5,45.889734,12.307124,-9.21034,278.38,-0.366831,-1.125495,2.3e-05,-8.4178,-8.268476,14432.05624,1.866805,1.938225,282.900712,2.139511,1.058905,-2.656729,-0.340269,-1.092445,3.2e-05,-9.137187,-8.152338,-8.260812,-8.274299,-8.937191,17172.840333,7.815301,-0.366831,278.38,2.3e-05,2.3e-05,2.3e-05,-8.4178,-8.4178,-8.4178,-8.268476,-8.268476,-8.268476,14432.05624,14432.05624
6,45.671721,12.237807,-9.21034,280.097333,0.188599,-0.200256,2.4e-05,-8.306527,-8.268476,14436.70176,2.15006,2.363932,283.5256,2.107018,1.122424,-1.044035,-0.168797,-2.809058,3.2e-05,-9.137828,-8.092234,-8.2555,-8.273439,-8.934068,17154.1402,7.814945,0.188599,280.097333,2.4e-05,2.4e-05,2.4e-05,-8.306527,-8.306527,-8.306527,-8.268476,-8.268476,-8.268476,14436.70176,14436.70176
7,45.629092,12.590682,-9.21034,280.097333,0.507837,-0.076753,2.4e-05,-8.298459,-8.268476,14435.0196,1.716326,2.265625,283.605664,2.125233,1.157935,-0.623903,-0.113053,-2.821164,3.2e-05,-9.137338,-8.09053,-8.245707,-8.275722,-8.941309,17126.544369,7.810207,0.507837,280.097333,2.4e-05,2.4e-05,2.4e-05,-8.298459,-8.298459,-8.298459,-8.268476,-8.268476,-8.268476,14435.0196,14435.0196
8,45.499618,12.261249,-9.21034,280.097333,0.087363,-0.180088,2.4e-05,-8.440695,-8.268476,14438.51425,1.633551,2.182849,283.590035,2.122653,1.135593,-1.473329,-0.190845,-2.213637,3.2e-05,-9.137289,-8.098633,-8.253633,-8.274293,-8.936575,17111.016002,7.809213,0.087363,280.097333,2.4e-05,2.4e-05,2.4e-05,-8.440695,-8.440695,-8.440695,-8.268476,-8.268476,-8.268476,14438.51425,14438.51425
9,45.428424,12.31293,-0.430082,280.097333,0.208678,-0.207276,2.4e-05,-8.382225,-8.268476,14437.71015,1.269437,1.724593,283.561935,2.116425,1.143158,-1.15255,-0.176703,-2.402338,3.2e-05,-9.13753,-8.105916,-8.237994,-8.278622,-8.950891,17080.645429,7.805176,0.208678,280.097333,2.4e-05,2.4e-05,2.4e-05,-8.382225,-8.382225,-8.382225,-8.268476,-8.268476,-8.268476,14437.71015,14437.71015


In [139]:
skew_incr=[]
for col in Skewed_cols:
    print('Before:',col,train_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.skew()[col])
    print('After:',col,train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1.skew()[col])
    if not abs(train_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.skew()[col])>abs(train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1.skew()[col]):
        skew_incr.append(col)
    print('\n')

Before: Precipitation 4.569618429259263
After: Precipitation 1.5171822647199633


Before: CloudFraction 1.2368812312528774
After: CloudFraction -1.4714216645089138


Before: NO2_total 4.169192733648738
After: NO2_total 1.6688966196282646


Before: NO2_trop 2.935213957795029
After: NO2_trop 1.3232412318536575


Before: Rolling_Mean_Precipitation 2.702030626182702
After: Rolling_Mean_Precipitation 0.2614906171333378


Before: Rolling_Stddev_Precipitation 2.7934187108952053
After: Rolling_Stddev_Precipitation 0.2582753345866494


Before: Rolling_Stddev_LST 1.407022882161994
After: Rolling_Stddev_LST -0.6201144706329049


Before: Rolling_Mean_AAI 0.5585953628123189
After: Rolling_Mean_AAI -0.794397295904864


Before: Rolling_Stddev_AAI 1.204749435254367
After: Rolling_Stddev_AAI -0.8403124994451319


Before: Rolling_Mean_CloudFraction 0.8243970114592594
After: Rolling_Mean_CloudFraction -2.26457274415614


Before: Rolling_Stddev_CloudFraction 0.507064503175373
After: Rolling_Stddev_CloudFr

In [140]:
#Skewness incresead for these cols. We will do sqrt instead
print(skew_incr)
for col in skew_incr:
    train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1[col]=train_model3_df_prep_mix_roll_wind_mean_stddev_org_v1[col].copy()

['CloudFraction', 'Rolling_Mean_AAI', 'Rolling_Mean_CloudFraction', 'Rolling_Stddev_CloudFraction', 'Rolling_Stddev_TropopausePressure']


In [141]:
train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1[skew_incr].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CloudFraction,86584.0,0.238371,0.274434,0.0,0.03064,0.112237,0.372591,1.0
Rolling_Mean_AAI,86584.0,-1.255426,0.534608,-3.289884,-1.630519,-1.322073,-0.950591,0.769099
Rolling_Mean_CloudFraction,86584.0,0.238663,0.180119,0.0,0.086085,0.208139,0.351432,0.976606
Rolling_Stddev_CloudFraction,86584.0,0.202189,0.153922,0.0,0.063585,0.170843,0.322412,0.57735
Rolling_Stddev_TropopausePressure,86584.0,1905.627055,1376.21449,0.0,1153.349783,1499.896124,2722.220593,7291.919792


In [142]:
#Org skewness
train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1[skew_incr].skew()

CloudFraction                        1.236881
Rolling_Mean_AAI                     0.558595
Rolling_Mean_CloudFraction           0.824397
Rolling_Stddev_CloudFraction         0.507065
Rolling_Stddev_TropopausePressure    0.806626
dtype: float64

In [143]:
for col in skew_incr:
    train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1[col]=np.sqrt(train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1[col].abs())

In [144]:

print(train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1[skew_incr].skew())
#Lets leave  Rolling_Mean_AAI as it is and not screw it up
Sqrt_trt_col_skew=skew_incr
Sqrt_trt_col_skew.remove('Rolling_Mean_AAI')
print(Sqrt_trt_col_skew)



CloudFraction                        0.470742
Rolling_Mean_AAI                    -1.152743
Rolling_Mean_CloudFraction          -0.016732
Rolling_Stddev_CloudFraction        -0.097127
Rolling_Stddev_TropopausePressure   -0.516428
dtype: float64
['CloudFraction', 'Rolling_Mean_CloudFraction', 'Rolling_Stddev_CloudFraction', 'Rolling_Stddev_TropopausePressure']


In [145]:
for col in skew_incr:
    Skewed_cols.remove(col)


In [146]:
Skewed_cols.remove('Rolling_Mean_AAI')


In [147]:
Log_trt_col_Skew=Skewed_cols
print(Sqrt_trt_col_skew)
print('\n')
print(Log_trt_col_Skew)

['CloudFraction', 'Rolling_Mean_CloudFraction', 'Rolling_Stddev_CloudFraction', 'Rolling_Stddev_TropopausePressure']


['Precipitation', 'NO2_total', 'NO2_trop', 'Rolling_Mean_Precipitation', 'Rolling_Stddev_Precipitation', 'Rolling_Stddev_LST', 'Rolling_Stddev_AAI', 'Rolling_Stddev_NO2_strat', 'Rolling_Mean_NO2_total', 'Rolling_Stddev_NO2_total', 'Rolling_Mean_NO2_trop', 'Rolling_Stddev_NO2_trop', 'NO2_total_LAG1', 'NO2_total_LAG2', 'NO2_total_LAG3', 'NO2_trop_LAG1', 'NO2_trop_LAG2', 'NO2_trop_LAG3']


In [148]:
for col in Log_trt_col_Skew:
    print('Before:',col,train_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.skew()[col])
    print('After:',col,train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1.skew()[col])

    print('\n')

Before: Precipitation 4.569618429259263
After: Precipitation 1.5171822647199633


Before: NO2_total 4.169192733648738
After: NO2_total 1.6688966196282646


Before: NO2_trop 2.935213957795029
After: NO2_trop 1.3232412318536575


Before: Rolling_Mean_Precipitation 2.702030626182702
After: Rolling_Mean_Precipitation 0.2614906171333378


Before: Rolling_Stddev_Precipitation 2.7934187108952053
After: Rolling_Stddev_Precipitation 0.2582753345866494


Before: Rolling_Stddev_LST 1.407022882161994
After: Rolling_Stddev_LST -0.6201144706329049


Before: Rolling_Stddev_AAI 1.204749435254367
After: Rolling_Stddev_AAI -0.8403124994451319


Before: Rolling_Stddev_NO2_strat 1.383593386266386
After: Rolling_Stddev_NO2_strat 1.2553984439680619


Before: Rolling_Mean_NO2_total 2.8552433803339774
After: Rolling_Mean_NO2_total 1.3952458634231528


Before: Rolling_Stddev_NO2_total 4.382217297092999
After: Rolling_Stddev_NO2_total 1.7125693551922305


Before: Rolling_Mean_NO2_trop 2.107715578643439
After: R

In [149]:
for col in Sqrt_trt_col_skew:
    print('Before:',col,train_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.skew()[col])
    print('After:',col,train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1.skew()[col])

    print('\n')

#Final Train for Model 6: train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1


Before: CloudFraction 1.2368812312528774
After: CloudFraction 0.4707422286390351


Before: Rolling_Mean_CloudFraction 0.8243970114592594
After: Rolling_Mean_CloudFraction -0.016731679621355043


Before: Rolling_Stddev_CloudFraction 0.507064503175373
After: Rolling_Stddev_CloudFraction -0.09712652807714382


Before: Rolling_Stddev_TropopausePressure 0.8066263954114595
After: Rolling_Stddev_TropopausePressure -0.5164283952433941




In [150]:
train_model3_df_prep_mix_GT_NO2_mix.isnull().sum()

0

In [151]:
# #Now treating GT_NO2

print('Before treatement',train_model3_df_prep_mix_GT_NO2_mix.skew())
train_model3_df_prep_mix_skew_trt_GT_NO2=np.log1p(train_model3_df_prep_mix_GT_NO2_mix)
print('After treatement',train_model3_df_prep_mix_skew_trt_GT_NO2.skew())



Before treatement 1.4995334759205738
After treatement -0.19615003960412955


In [None]:
#Checkpoint: #Final GT_NO2 for Model 6: train_model3_df_prep_mix_skew_trt_GT_NO2
#Final Train for Model 6: train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1

In [152]:
test_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1=test_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.copy()

In [153]:
#Now treating skewness in Test in similar way # First doing long
for col in Log_trt_col_Skew:
    test_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1[col]=log_transfrom_trt(test_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1[col])

In [154]:
#Now doing sqrt treatment
for col in Sqrt_trt_col_skew:
    test_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1[col]=np.sqrt(test_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1[col].abs())

In [155]:
for col in Log_trt_col_Skew:
    print('Before:',col,test_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.skew()[col])
    print('After:',col,test_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1.skew()[col])

    print('\n')

Before: Precipitation 4.409900124327067
After: Precipitation 1.5310861116506305


Before: NO2_total 5.237289466539959
After: NO2_total 1.987697663090459


Before: NO2_trop 3.275637690439222
After: NO2_trop 1.37284394791529


Before: Rolling_Mean_Precipitation 2.613215098283561
After: Rolling_Mean_Precipitation 0.3764374593092157


Before: Rolling_Stddev_Precipitation 2.7567998292347546
After: Rolling_Stddev_Precipitation 0.3780436004704904


Before: Rolling_Stddev_LST 2.293981682270173
After: Rolling_Stddev_LST -0.8802842147816398


Before: Rolling_Stddev_AAI 1.3586385217712897
After: Rolling_Stddev_AAI -1.0498045254475055


Before: Rolling_Stddev_NO2_strat 1.3276459016380369
After: Rolling_Stddev_NO2_strat 1.261921769810858


Before: Rolling_Mean_NO2_total 4.225154105376492
After: Rolling_Mean_NO2_total 1.8364780780513235


Before: Rolling_Stddev_NO2_total 5.097974453526616
After: Rolling_Stddev_NO2_total 2.2955603937824116


Before: Rolling_Mean_NO2_trop 1.9713487193928567
After: Rol

In [156]:
for col in Sqrt_trt_col_skew:
    print('Before:',col,test_model3_df_prep_mix_roll_wind_mean_stddev_org_v1.skew()[col])
    print('After:',col,test_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1.skew()[col])

    print('\n')

Before: CloudFraction 1.29561885475446
After: CloudFraction 0.44339351502902985


Before: Rolling_Mean_CloudFraction 0.9783771484616129
After: Rolling_Mean_CloudFraction 0.11265560819866258


Before: Rolling_Stddev_CloudFraction 0.8841065626851445
After: Rolling_Stddev_CloudFraction 0.13377003963794548


Before: Rolling_Stddev_TropopausePressure 0.8066432331630113
After: Rolling_Stddev_TropopausePressure -0.5472115449680394




In [157]:
#Checkpoint: #Final GT_NO2 for Model 6: train_model3_df_prep_mix_skew_trt_GT_NO2
#Final Train for Model 6: train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1
#Final Test for Model 6: test_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1

In [163]:
#Before treating for scaling  lets create a model

model= LinearRegression()
model.fit(train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1, train_model3_df_prep_mix_skew_trt_GT_NO2)

# Predict using the test dataset
y_pred = model.predict(test_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1)
y_pred=np.expm1(y_pred)
# Add additional 'ID_Zindi' to y_pred3 and create a DataFrame
y_pred_df = pd.DataFrame(y_pred, columns=['GT_NO2'])
y_pred_result_df = pd.concat([test_df['ID_Zindi'], y_pred_df], axis=1)

# Save the results to a CSV file
y_pred_result_df.to_csv('/home/anuragverma/Desktop/Kaggle/GeoAI Ground-level NO2 _Zindi/model6_lag_roll_skew_trt.csv', index=False)

#This worsened the model 11.15. Will check if scaling helps.


In [171]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for alpha
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Initialize Ridge Regression model
ridge_model = Ridge()

# Initialize GridSearchCV with the model and parameter grid
grid_search = GridSearchCV(ridge_model, param_grid, cv=5)

# Fit the grid search to the data
grid_search.fit(train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1, train_model3_df_prep_mix_skew_trt_GT_NO2)

# Get the best alpha value
best_alpha = grid_search.best_params_['alpha']

# Use the best model to make predictions
best_model = grid_search.best_estimator_
y_pred = best_model.predict(test_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1)
y_pred = np.expm1(y_pred)

# Add additional 'ID_Zindi' to y_pred and create a DataFrame
y_pred_df = pd.DataFrame(y_pred, columns=['GT_NO2'])
y_pred_result_df = pd.concat([test_df['ID_Zindi'], y_pred_df], axis=1)

# Save the results to a CSV file
y_pred_result_df.to_csv('/home/anuragverma/Desktop/Kaggle/GeoAI Ground-level NO2 _Zindi/model6_lag_roll_skew_trt_best.csv', index=False)
#Gave 11.48

In [172]:
#Trying Lasso
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for alpha
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Initialize Lasso Regression model
lasso_model = Lasso()

# Initialize GridSearchCV with the model and parameter grid
grid_search = GridSearchCV(lasso_model, param_grid, cv=5)

# Fit the grid search to the data
grid_search.fit(train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1, train_model3_df_prep_mix_skew_trt_GT_NO2)

# Get the best alpha value
best_alpha = grid_search.best_params_['alpha']

# Use the best model to make predictions
best_model = grid_search.best_estimator_
y_pred = best_model.predict(test_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1)
y_pred = np.expm1(y_pred)

# Add additional 'ID_Zindi' to y_pred and create a DataFrame
y_pred_df = pd.DataFrame(y_pred, columns=['GT_NO2'])
y_pred_result_df = pd.concat([test_df['ID_Zindi'], y_pred_df], axis=1)

# Save the results to a CSV file
y_pred_result_df.to_csv('/home/anuragverma/Desktop/Kaggle/GeoAI Ground-level NO2 _Zindi/model6_lag_roll_skew_trt_lasso.csv', index=False)

#11.80

In [170]:
#We will continue scaling the log version .Let's see if it takes us to the moon.
#Checkpoint: #Final GT_NO2 for Model 6: train_model3_df_prep_mix_skew_trt_GT_NO2
#Final Train for Model 6: train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1
#Final Test for Model 6: test_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1

In [182]:
train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1.head(5)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
0,45.601585,11.903551,-9.21034,280.097333,0.230527,0.747741,2.4e-05,-8.382225,-8.268476,14440.82126,1.884791,2.43409,283.669317,2.038503,0.536492,-0.725735,0.848292,0.373797,3.2e-05,-9.13992,-8.1688,-8.417132,-8.251292,-8.864902,17478.750221,51.462037,0.230527,280.097333,2.4e-05,2.4e-05,2.4e-05,-8.382225,-8.382225,-8.382225,-8.268476,-8.268476,-8.268476,14440.82126,14440.82126
1,45.371005,11.84083,1.114303,280.097333,-0.074006,0.932367,2.4e-05,-8.339466,-8.268476,14441.79815,1.717735,1.970644,283.63682,2.04874,0.652982,-1.148567,0.87006,0.375726,3.2e-05,-9.138728,-8.065476,-8.496386,-8.257614,-8.874134,17419.601916,51.080828,-0.074006,280.097333,2.4e-05,2.4e-05,2.4e-05,-8.339466,-8.339466,-8.339466,-8.268476,-8.268476,-8.268476,14441.79815,14441.79815
2,45.045825,12.060869,-9.21034,280.097333,0.02447,0.821073,2.4e-05,-8.526739,-8.268476,14437.38294,1.381074,1.93037,283.587963,2.058424,0.588149,-1.06723,0.876601,0.286056,3.3e-05,-9.135579,-8.208074,-8.355529,-8.261479,-8.88828,17350.081865,50.669192,0.02447,280.097333,2.4e-05,2.4e-05,2.4e-05,-8.526739,-8.526739,-8.526739,-8.268476,-8.268476,-8.268476,14437.38294,14437.38294
3,45.104075,11.553241,0.182794,280.097333,-0.010442,0.959194,2.4e-05,-8.352103,-8.268476,14440.83831,0.8454,1.116339,283.516375,2.065883,0.398865,-1.313584,0.925514,0.42847,3.1e-05,-9.146313,-7.814921,-7.87378,-8.26999,-8.920634,16813.63926,49.310382,-0.010442,280.097333,2.4e-05,2.4e-05,2.4e-05,-8.352103,-8.352103,-8.352103,-8.268476,-8.268476,-8.268476,14440.83831,14440.83831
4,45.038758,11.790152,0.242682,280.097333,-0.176178,0.86456,2.4e-05,-8.386604,-8.268476,14438.79037,1.444949,1.836621,283.48328,2.079231,0.59784,-1.72247,0.880114,0.312723,3.3e-05,-9.135782,-8.089858,-8.383253,-8.276172,-8.943668,17231.772274,50.081319,-0.176178,280.097333,2.4e-05,2.4e-05,2.4e-05,-8.386604,-8.386604,-8.386604,-8.268476,-8.268476,-8.268476,14438.79037,14438.79037


In [177]:
Columns_to_rescale=[col for col in train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1.columns]
Columns_to_rescale.remove('LAT')
Columns_to_rescale.remove('LON')

In [179]:
print(Columns_to_rescale)

['Precipitation', 'LST', 'AAI', 'CloudFraction', 'NO2_strat', 'NO2_total', 'NO2_trop', 'TropopausePressure', 'Rolling_Mean_Precipitation', 'Rolling_Stddev_Precipitation', 'Rolling_Mean_LST', 'Rolling_Stddev_LST', 'Rolling_Mean_AAI', 'Rolling_Stddev_AAI', 'Rolling_Mean_CloudFraction', 'Rolling_Stddev_CloudFraction', 'Rolling_Mean_NO2_strat', 'Rolling_Stddev_NO2_strat', 'Rolling_Mean_NO2_total', 'Rolling_Stddev_NO2_total', 'Rolling_Mean_NO2_trop', 'Rolling_Stddev_NO2_trop', 'Rolling_Mean_TropopausePressure', 'Rolling_Stddev_TropopausePressure', 'AAI_LAG3', 'LST_LAG1', 'NO2_strat_LAG1', 'NO2_strat_LAG2', 'NO2_strat_LAG3', 'NO2_total_LAG1', 'NO2_total_LAG2', 'NO2_total_LAG3', 'NO2_trop_LAG1', 'NO2_trop_LAG2', 'NO2_trop_LAG3', 'TropopausePressure_LAG1', 'TropopausePressure_LAG2']


In [180]:
train_model3_df_prep_mix_roll_wind_mean_stddev_org_scaled_v1=train_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1.copy()


In [183]:
scaler=StandardScaler()
train_model3_df_prep_mix_roll_wind_mean_stddev_org_scaled_v1[Columns_to_rescale]=scaler.fit_transform(train_model3_df_prep_mix_roll_wind_mean_stddev_org_scaled_v1[Columns_to_rescale])

In [184]:
train_model3_df_prep_mix_roll_wind_mean_stddev_org_scaled_v1.head(4)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
0,45.601585,11.903551,-0.503912,-1.360332,2.114154,1.235104,-1.830742,-0.452184,0.572214,-0.769145,1.214188,1.246431,-1.193435,0.806175,-2.05484,0.355511,2.04638,-0.184767,-1.265506,0.479609,0.542183,1.539002,1.134472,0.245224,0.306788,0.641994,2.121276,-1.360036,-1.82837,-1.824989,-1.820979,-0.451985,-0.451752,-0.451738,0.572723,0.57368,0.573152,-0.769881,-0.770612
1,45.371005,11.84083,1.765684,-1.360332,1.680799,1.885906,-1.8221,-0.291999,0.572214,-0.768821,1.182434,1.161235,-1.196632,0.817607,-1.623276,-0.211639,2.157544,-0.174498,-1.238542,0.509566,0.985481,1.267091,1.108509,0.206654,0.281344,0.621682,1.686697,-1.360036,-1.819736,-1.816366,-1.812368,-0.291452,-0.291131,-0.291173,0.572723,0.57368,0.573152,-0.769557,-0.770287
2,45.045825,12.060869,-0.503912,-1.360332,1.820932,1.493595,-1.813458,-0.993555,0.572214,-0.770285,1.118441,1.153832,-1.201438,0.828422,-1.863464,-0.102541,2.190949,-0.651726,-1.188292,0.588704,0.373688,1.750352,1.09264,0.147554,0.251439,0.59975,1.827226,-1.360036,-1.811101,-1.807743,-1.803757,-0.99453,-0.994597,-0.994391,0.572723,0.57368,0.573152,-0.771021,-0.771753
3,45.104075,11.553241,1.560917,-1.360332,1.771252,1.98047,-1.804815,-0.339339,0.572214,-0.769139,1.01662,1.004187,-1.208481,0.836752,-2.56471,-0.432979,2.440733,0.106206,-1.352251,0.318949,2.060443,3.403172,1.057693,0.012382,0.020681,0.52735,1.777406,-1.360036,-1.802467,-1.79912,-1.795147,-0.338896,-0.3386,-0.338626,0.572723,0.57368,0.573152,-0.769875,-0.770606


In [188]:
test_model3_df_prep_mix_roll_wind_mean_stddev_org_scaled_v1=test_model3_df_prep_mix_roll_wind_mean_stddev_org_skew_trt_v1.copy()


In [189]:
test_model3_df_prep_mix_roll_wind_mean_stddev_org_scaled_v1.head(3)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
0,45.289376,11.642394,1.18712,279.369667,-0.313361,0.878326,2.4e-05,-8.649012,-8.406815,14440.02819,0.088569,0.637836,278.209889,1.27908,-0.402533,-1.270156,0.55928,0.637093,2.9e-05,-9.166267,-8.571297,-8.919476,-8.68506,-8.870042,19153.006053,64.397214,-0.313361,279.369667,2.4e-05,2.4e-05,2.4e-05,-8.649012,-8.649012,-8.649012,-8.406815,-8.406815,-8.406815,14440.02819,14440.02819
1,45.836941,12.510362,-9.21034,279.369667,-0.229512,0.631038,2.3e-05,-8.421883,-8.406815,14434.0479,-9.21034,-9.21034,280.769889,0.408316,-0.528932,-1.311679,0.384795,0.46674,2.8e-05,-9.167649,-8.491688,-8.905468,-8.602787,-8.84128,19131.785407,64.305575,-0.229512,279.369667,2.3e-05,2.3e-05,2.3e-05,-8.421883,-8.421883,-8.421883,-8.406815,-8.406815,-8.406815,14434.0479,14434.0479
2,45.582894,8.842165,-9.21034,282.98,-0.470822,0.392039,2.3e-05,-8.213392,-8.26659,14427.42478,-9.21034,-9.21034,282.366667,0.488045,-0.679175,-1.695247,0.226344,0.297885,2.8e-05,-9.164888,-8.216101,-8.536875,-8.324371,-8.525659,18626.888577,62.582412,-0.470822,282.98,2.3e-05,2.3e-05,2.3e-05,-8.213392,-8.213392,-8.213392,-8.26659,-8.26659,-8.26659,14427.42478,14427.42478


In [190]:
test_model3_df_prep_mix_roll_wind_mean_stddev_org_scaled_v1[Columns_to_rescale]=scaler.transform(test_model3_df_prep_mix_roll_wind_mean_stddev_org_scaled_v1[Columns_to_rescale])

In [192]:
test_model3_df_prep_mix_roll_wind_mean_stddev_org_scaled_v1.head(3)

Unnamed: 0,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,Rolling_Mean_Precipitation,Rolling_Stddev_Precipitation,Rolling_Mean_LST,Rolling_Stddev_LST,Rolling_Mean_AAI,Rolling_Stddev_AAI,Rolling_Mean_CloudFraction,Rolling_Stddev_CloudFraction,Rolling_Mean_NO2_strat,Rolling_Stddev_NO2_strat,Rolling_Mean_NO2_total,Rolling_Stddev_NO2_total,Rolling_Mean_NO2_trop,Rolling_Stddev_NO2_trop,Rolling_Mean_TropopausePressure,Rolling_Stddev_TropopausePressure,AAI_LAG3,LST_LAG1,NO2_strat_LAG1,NO2_strat_LAG2,NO2_strat_LAG3,NO2_total_LAG1,NO2_total_LAG2,NO2_total_LAG3,NO2_trop_LAG1,NO2_trop_LAG2,NO2_trop_LAG3,TropopausePressure_LAG1,TropopausePressure_LAG2
0,45.289376,11.642394,1.792713,-1.615027,1.40631,1.753055,-1.723963,-1.104863,0.788145,-0.796123,0.928436,0.982758,-1.758289,0.964441,1.536515,-0.029301,0.646826,1.454999,-1.364856,1.129595,-0.938638,0.157977,-0.231199,0.907251,0.904748,1.844955,1.405111,-1.614742,-1.721605,-1.71841,-1.715118,-1.106786,-1.107048,-1.106387,0.788325,0.790129,0.790158,-0.796687,-0.796832
1,45.836941,12.510362,-0.501195,-1.615027,1.531781,0.890988,-1.77443,-0.163627,0.788145,-0.798146,-0.843652,-0.841472,-1.517871,0.055584,1.308294,-0.080187,-0.17426,0.538688,-1.456659,1.04561,-0.560488,0.215411,0.141367,1.06574,0.896586,1.839521,1.530526,-1.614742,-1.772025,-1.768769,-1.765414,-0.162665,-0.162126,-0.161741,0.788325,0.790129,0.790158,-0.79871,-0.798856
2,45.582894,8.842165,-0.501195,-1.283065,1.170688,0.057819,-1.816487,0.700374,1.392296,-0.800386,-0.843652,-0.841472,-1.367912,0.138801,1.037021,-0.550253,-0.919894,-0.369566,-1.456659,1.213359,0.748562,1.726611,1.402157,2.804952,0.702381,1.737331,1.169594,-1.282816,-1.814042,-1.810734,-1.807327,0.703984,0.705258,0.70539,1.392548,1.395366,1.395397,-0.800951,-0.801097


In [None]:
#Checkpoint
#train: train_model3_df_prep_mix_roll_wind_mean_stddev_org_scaled_v1
#test: test_model3_df_prep_mix_roll_wind_mean_stddev_org_scaled_v1
#Final GT_NO2 for Model 6: train_model3_df_prep_mix_skew_trt_GT_NO2

In [193]:
#Lets create model 
model= LinearRegression()
model.fit(train_model3_df_prep_mix_roll_wind_mean_stddev_org_scaled_v1, train_model3_df_prep_mix_skew_trt_GT_NO2)

# Predict using the test dataset
y_pred = model.predict(test_model3_df_prep_mix_roll_wind_mean_stddev_org_scaled_v1)
y_pred=np.expm1(y_pred)
# Add additional 'ID_Zindi' to y_pred3 and create a DataFrame
y_pred_df = pd.DataFrame(y_pred, columns=['GT_NO2'])
y_pred_result_df = pd.concat([test_df['ID_Zindi'], y_pred_df], axis=1)

# Save the results to a CSV file
y_pred_result_df.to_csv('/home/anuragverma/Desktop/Kaggle/GeoAI Ground-level NO2 _Zindi/model7_scaled.csv', index=False)
#Scaling made it worst: 11.80