# Importing Required Libraries

In [19]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import GroupKFold,KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression

# Defining Needed Functions

In [20]:
def post_process(test_data,y,feature_to_mean='date_month') -> int:
    """runs the post processing

    Keyword arguments:
    test_data -- the testing data
    y -- the target
    feature_to_mean -- feature to apply the post processing by
    Return: predictions after processing
    """
    test_data['target'] = y
    test_data['target_month'] = test_data[feature_to_mean].map(test_data[[feature_to_mean,'target']].groupby(feature_to_mean)['target'].mean())
    test_data.drop(columns='target',inplace=True)
    return test_data['target_month']

# Reading Data

In [21]:
train = pd.read_csv(r"/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/data/Train.csv")
test = pd.read_csv(r"/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/data/Test.csv")

In [22]:
print(train.shape),
print(test.shape)

(8071, 80)
(2783, 79)


# Cleaning data

In [23]:
def cleaning_data(train_data, test_data):
    train_feats = train_data.notna().sum()[train_data.notna().sum() > .15*len(train_data)].index
    test_feats = test_data.notna().sum()[test_data.notna().sum() > .15*len(test_data)].index
    train_data = train_data[train_feats]
    test_data = test_data[train_feats[:-1]]
    return train_data, test_data

train, test = cleaning_data(train, test)


In [24]:
train.shape

(8071, 73)

In [25]:
"""def viewing_site_lat(df):
    for col in df.columns:
        if df[col].dtype == float:
            print(f"you're viewing : {col}")
            print('MAX__________________-')
            print(df[col].max())
            print('MIN_________________-')
            print(df[col].min())
            print('MEAN_________________-')
            print(df[col].mean())
            print('next #################')
viewing_site_lat(train)
    """

'def viewing_site_lat(df):\n    for col in df.columns:\n        if df[col].dtype == float:\n            print(f"you\'re viewing : {col}")\n            print(\'MAX__________________-\')\n            print(df[col].max())\n            print(\'MIN_________________-\')\n            print(df[col].min())\n            print(\'MEAN_________________-\')\n            print(df[col].mean())\n            print(\'next #################\')\nviewing_site_lat(train)\n    '

In [26]:
train.columns

Index(['id', 'site_id', 'site_latitude', 'site_longitude', 'city', 'country',
       'date', 'hour', 'sulphurdioxide_so2_column_number_density',
       'sulphurdioxide_so2_column_number_density_amf',
       'sulphurdioxide_so2_slant_column_number_density',
       'sulphurdioxide_cloud_fraction', 'sulphurdioxide_sensor_azimuth_angle',
       'sulphurdioxide_sensor_zenith_angle',
       'sulphurdioxide_solar_azimuth_angle',
       'sulphurdioxide_solar_zenith_angle',
       'sulphurdioxide_so2_column_number_density_15km', 'month',
       'carbonmonoxide_co_column_number_density',
       'carbonmonoxide_h2o_column_number_density',
       'carbonmonoxide_cloud_height', 'carbonmonoxide_sensor_altitude',
       'carbonmonoxide_sensor_azimuth_angle',
       'carbonmonoxide_sensor_zenith_angle',
       'carbonmonoxide_solar_azimuth_angle',
       'carbonmonoxide_solar_zenith_angle',
       'nitrogendioxide_no2_column_number_density',
       'nitrogendioxide_tropospheric_no2_column_number_densi

In [27]:
for df in (train,test):
    df['date'] = pd.to_datetime(df['date'])
    df['hour'] = df['date'].dt.hour
    df['date_month'] = df['date'].dt.day_of_year
    df['DayOfWeek'] =  df['date'].dt.dayofweek
    df['Day'] =  df['date'].dt.day
    df['Year'] =  df['date'].dt.year
    
    
    df.drop(columns=['id','site_id','date'],inplace=True)


In [28]:
groups = train['city'] # the groups used in grouped kfold

# Label Encodig the features

In [29]:
le = LabelEncoder()
for column in ['city','country']:
    letrans = le.fit_transform(pd.concat([train,test])[column])
    train[column] = letrans[:len(train)]
    test[column] = letrans[len(train):]

# Modeling and Validation

In [30]:
model =  LGBMRegressor(random_state=42,n_estimators=200,max_depth=10,objective='mse')
n_splits = 4
cv = GroupKFold(n_splits=n_splits)

In [31]:
model.fit(train.drop(columns='pm2_5'),train['pm2_5'])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002472 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17534
[LightGBM] [Info] Number of data points in the train set: 8071, number of used features: 76
[LightGBM] [Info] Start training from score 24.639296


In [32]:
pred = model.predict(test)

In [33]:

def validate(trainset,testset,t,origin):
    model.fit(trainset.drop(columns=t),trainset[t])
    pred = model.predict(np.array(testset.drop(columns=t)))
    print('std: ', testset[t].std())

    # to validate the post processing
    origin['pm_5'] = pred
    origin['date'] = pd.to_datetime(origin['date'])
    origin['date_day'] = origin['date'].dt.dayofyear
    pred = origin['date_day'].map(origin[['date_day','pm_5']].groupby('date_day')['pm_5'].mean())
    #--------------------------------------------------------------------------------------------
    stds.append(testset[t].std())
    score = mean_squared_error(pred,testset[t],squared=False)
    print('score:', score)
    return score
stds = []
rmse = []

for v_train,v_test in cv.split(train.drop(columns='pm2_5'),train['pm2_5'],groups=groups):
    train_v, test_v= train.iloc[v_train],train.iloc[v_test]
    origin = train_set.iloc[v_test]
    rmse.append(validate(train_v,test_v,'pm2_5',origin))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001500 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17399
[LightGBM] [Info] Number of data points in the train set: 2475, number of used features: 76
[LightGBM] [Info] Start training from score 28.876230
std:  14.34682858151784
score: 14.702165241413226
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004170 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17507
[LightGBM] [Info] Number of data points in the train set: 6571, number of used features: 76
[LightGBM] [Info] Start training from score 25.954156


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['pm_5'] = pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date'] = pd.to_datetime(origin['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date_day'] = origin['date'].dt.dayofyear


std:  24.982218437683237
score: 25.05074753764463
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17478
[LightGBM] [Info] Number of data points in the train set: 7219, number of used features: 76
[LightGBM] [Info] Start training from score 22.093092


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['pm_5'] = pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date'] = pd.to_datetime(origin['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date_day'] = origin['date'].dt.dayofyear


std:  52.84388334184576
score: 52.35034443710488
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17519
[LightGBM] [Info] Number of data points in the train set: 7948, number of used features: 76
[LightGBM] [Info] Start training from score 24.545522


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['pm_5'] = pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date'] = pd.to_datetime(origin['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date_day'] = origin['date'].dt.dayofyear


std:  22.35338406057941
score: 20.48577363859997


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['pm_5'] = pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date'] = pd.to_datetime(origin['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date_day'] = origin['date'].dt.dayofyear


In [34]:
np.array(rmse).mean()

28.147257713690674

# Infrence

In [35]:
model.fit(train.drop(columns='pm2_5'),train['pm2_5'])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004400 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17534
[LightGBM] [Info] Number of data points in the train set: 8071, number of used features: 76
[LightGBM] [Info] Start training from score 24.639296


In [36]:
y = model.predict(test)

In [38]:
smaple = pd.read_csv('/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/data/SampleSubmission.csv')
smaple['pm2_5'] = post_process(test,y)
smaple.to_csv('/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/submissions/submission02.csv',index=False)