# Importing Required Libraries

In [1]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import GroupKFold,KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
import mlflow

# Defining Needed Functions

In [2]:
def post_process(test_data,y,feature_to_mean='date_month') -> int:
    """runs the post processing

    Keyword arguments:
    test_data -- the testing data
    y -- the target
    feature_to_mean -- feature to apply the post processing by
    Return: predictions after processing
    """
    test_data['target'] = y
    test_data['target_month'] = test_data[feature_to_mean].map(test_data[[feature_to_mean,'target']].groupby(feature_to_mean)['target'].mean())
    test_data.drop(columns='target',inplace=True)
    return test_data['target_month']

# Reading Data

In [3]:
train = pd.read_csv("/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/data/Train.csv")
test = pd.read_csv("/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/data/Test.csv")

train = train[train['pm2_5'] < 200]


# Cleaning data

In [4]:
train_feats = train.notna().sum()[train.notna().sum() > .15*len(train)].index
test_feats = test.notna().sum()[test.notna().sum() > .15*len(test)].index
train = train[train_feats]
test = test[train_feats[:-1]]

# Generating time-related features

In [6]:
def time_features(date: str) -> pd.DataFrame:
    for df in(train,test): 
        df['date'] = pd.to_datetime(df[date])
        df['date_month'] = df['date'].dt.day_of_year
        df['DayOfWeek'] =  df['date'].dt.dayofweek
        df['Day'] =  df['date'].dt.day
        df['Year'] =  df['date'].dt.year
        df.drop(columns=['id','site_id','date'],inplace=True)
    return df

time_features('date')

target_col = 'nitrogendioxide_no2_column_number_density'
freq = "hourly"
if freq == "hourly":
    shifts = [1, 2, 6, 12]
    time_unit = "hour"
elif freq == "daily":
    shifts = [1, 2, 3, 7]
    time_unit = "day"

for s in shifts:
    train[f"pm2_5_last_{s}_{time_unit}"] = train.groupby(["city"])[target_col].shift(s)
    test[f"pm2_5_last_{s}_{time_unit}"] = test.groupby(["city"])[target_col].shift(s)

Unnamed: 0,site_latitude,site_longitude,city,country,hour,sulphurdioxide_so2_column_number_density,sulphurdioxide_so2_column_number_density_amf,sulphurdioxide_so2_slant_column_number_density,sulphurdioxide_cloud_fraction,sulphurdioxide_sensor_azimuth_angle,...,cloud_cloud_optical_depth,cloud_surface_albedo,cloud_sensor_azimuth_angle,cloud_sensor_zenith_angle,cloud_solar_azimuth_angle,cloud_solar_zenith_angle,date_month,DayOfWeek,Day,Year
0,5.61252,-0.22955,Accra,Ghana,13,-0.000072,0.762543,-0.000055,0.079645,-100.330299,...,3.063105,0.263193,-100.317077,27.059646,-86.885670,25.530511,249,2,6,2023
1,5.61252,-0.22955,Accra,Ghana,13,,,,,,...,,,,,,,250,3,7,2023
2,5.61252,-0.22955,Accra,Ghana,13,-0.000051,1.004265,-0.000051,0.163160,73.117264,...,,,,,,,251,4,8,2023
3,5.61252,-0.22955,Accra,Ghana,12,,,,,,...,29.145922,0.314945,70.680077,61.874222,-90.875603,11.865201,252,5,9,2023
4,5.61252,-0.22955,Accra,Ghana,12,-0.000634,0.632173,-0.000401,0.000000,70.066956,...,,,,,,,263,2,20,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2778,-0.10098,34.76242,Kisumu,Kenya,10,0.000099,0.660385,0.000066,0.209272,72.440125,...,2.831732,0.250637,72.428178,55.404121,-150.645025,21.248325,27,5,27,2024
2779,3.89696,11.50603,Yaoundé,Cameroon,12,,,,,,...,13.719315,0.228072,-101.384688,16.577282,-125.511751,25.086502,52,2,21,2024
2780,3.89696,11.50603,Yaoundé,Cameroon,12,,,,,,...,,,,,,,53,3,22,2024
2781,3.89696,11.50603,Yaoundé,Cameroon,12,,,,,,...,,,,,,,54,4,23,2024


In [7]:
train

Unnamed: 0,site_latitude,site_longitude,city,country,hour,sulphurdioxide_so2_column_number_density,sulphurdioxide_so2_column_number_density_amf,sulphurdioxide_so2_slant_column_number_density,sulphurdioxide_cloud_fraction,sulphurdioxide_sensor_azimuth_angle,...,cloud_surface_albedo,cloud_sensor_azimuth_angle,cloud_sensor_zenith_angle,cloud_solar_azimuth_angle,cloud_solar_zenith_angle,pm2_5,date_month,DayOfWeek,Day,Year
0,6.532570,3.399360,Lagos,Nigeria,13,,,,,,...,,,,,,12.015000,298,2,25,2023
1,6.532570,3.399360,Lagos,Nigeria,12,,,,,,...,,,,,,42.267200,306,3,2,2023
2,6.532570,3.399360,Lagos,Nigeria,13,,,,,,...,0.192757,-96.411890,61.045123,-121.307414,41.898269,39.450741,307,4,3,2023
3,6.532570,3.399360,Lagos,Nigeria,14,,,,,,...,,,,,,10.537600,312,2,8,2023
4,6.532570,3.399360,Lagos,Nigeria,13,0.000267,0.774656,0.000207,0.223403,-97.811241,...,0.153114,-97.811241,49.513439,-126.064453,40.167355,19.431731,313,3,9,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8066,-1.268877,36.819139,Nairobi,Kenya,10,,,,,,...,,,,,,21.086071,17,2,17,2024
8067,-1.268877,36.819139,Nairobi,Kenya,11,-0.000563,0.595788,-0.000335,0.208774,-98.841057,...,,,,,,17.110000,24,2,24,2024
8068,-1.268877,36.819139,Nairobi,Kenya,11,,,,,,...,,,,,,15.080000,25,3,25,2024
8069,-1.268877,36.819139,Nairobi,Kenya,10,-0.000188,0.746123,-0.000140,0.089824,75.947845,...,,,,,,18.026750,26,4,26,2024


# Label Encodig the features

In [8]:
le = LabelEncoder()
for column in ['city','country']:
    letrans = le.fit_transform(pd.concat([train,test])[column])
    train[column] = letrans[:len(train)]
    test[column] = letrans[len(train):]

# Modeling and Validation

In [9]:
model =  LGBMRegressor(random_state=42,n_estimators=200,max_depth=10,objective='mse')
n_splits = 4
cv = GroupKFold(n_splits=n_splits)
groups = train['city']

In [10]:
model.fit(train.drop(columns='pm2_5'),train['pm2_5'])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002880 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16520
[LightGBM] [Info] Number of data points in the train set: 8038, number of used features: 73
[LightGBM] [Info] Start training from score 23.518117


In [11]:
pred = model.predict(test)

In [12]:
train_set = pd.read_csv("/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/data/Train.csv")
test_set = pd.read_csv("/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/data/Test.csv")

trainset=  train_set[train_set['pm2_5'] < 200]

def validate(trainset,testset,t,origin):
  with mlflow.start_run():
    model.fit(trainset.drop(columns=t),trainset[t])
    pred = model.predict(np.array(testset.drop(columns=t)))
    print('std: ', testset[t].std())

    # to validate the post processing
    origin['pm_5'] = pred
    origin['date'] = pd.to_datetime(origin['date'])
    origin['date_day'] = origin['date'].dt.dayofyear
    pred = origin['date_day'].map(origin[['date_day','pm_5']].groupby('date_day')['pm_5'].mean())
    #--------------------------------------------------------------------------------------------
    stds.append(testset[t].std())
    score = mean_squared_error(pred,testset[t],squared=False)
    print('score:', score)
    mlflow.log_metric("rmse", score) 
    mlflow.sklearn.log_model(model, "model")

    return score
stds = []
rmse = []

for v_train,v_test in cv.split(train.drop(columns='pm2_5'),train['pm2_5'],groups=groups):
    train_v, test_v= train.iloc[v_train],train.iloc[v_test]
    origin = train_set.iloc[v_test]
    rmse.append(validate(train_v,test_v,'pm2_5',origin))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001426 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16386
[LightGBM] [Info] Number of data points in the train set: 2442, number of used features: 73
[LightGBM] [Info] Start training from score 25.243054
std:  14.34682858151784
score: 19.650432587478278


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['pm_5'] = pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date'] = pd.to_datetime(origin['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date_day'] = origin['date'].dt.dayofyear


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002840 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16493
[LightGBM] [Info] Number of data points in the train set: 6544, number of used features: 73
[LightGBM] [Info] Start training from score 24.887755
std:  11.699535309645805
score: 11.764477970339929


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['pm_5'] = pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date'] = pd.to_datetime(origin['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date_day'] = origin['date'].dt.dayofyear


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003887 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16462
[LightGBM] [Info] Number of data points in the train set: 7213, number of used features: 73
[LightGBM] [Info] Start training from score 21.813974
std:  29.185801480076165
score: 27.795842969887197


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['pm_5'] = pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date'] = pd.to_datetime(origin['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date_day'] = origin['date'].dt.dayofyear


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002664 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16505
[LightGBM] [Info] Number of data points in the train set: 7915, number of used features: 73
[LightGBM] [Info] Start training from score 23.406529
std:  22.35338406057941
score: 21.106460672434704


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['pm_5'] = pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date'] = pd.to_datetime(origin['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date_day'] = origin['date'].dt.dayofyear


In [13]:
rmse_avg = np.array(rmse).mean()
rmse_avg

20.07930355003503

# Infrence

In [14]:
model.fit(train.drop(columns='pm2_5'),train['pm2_5'])
y = model.predict(test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004258 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16520
[LightGBM] [Info] Number of data points in the train set: 8038, number of used features: 73
[LightGBM] [Info] Start training from score 23.518117


In [None]:
smaple = pd.read_csv('/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/data/SampleSubmission.csv')
smaple['pm2_5'] = post_process(test,y)
smaple.to_csv('/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/submissions/submission15124873904263207.csv',index=False)