# Importing Required Libraries

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import GroupKFold,KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression

# Defining Needed Functions

In [27]:
def post_process(test_data,y,feature_to_mean='date_month') -> int:
    """runs the post processing

    Keyword arguments:
    test_data -- the testing data
    y -- the target
    feature_to_mean -- feature to apply the post processing by
    Return: predictions after processing
    """
    test_data['target'] = y
    test_data['target_month'] = test_data[feature_to_mean].map(test_data[[feature_to_mean,'target']].groupby(feature_to_mean)['target'].mean())
    test_data.drop(columns='target',inplace=True)
    return test_data['target_month']

# Reading Data

In [None]:
train = pd.read_csv(r"/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/data/Train.csv")
test = pd.read_csv(r"/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/data/Test.csv")

In [42]:
print(train.shape),
print(test.shape)

(8071, 73)
(2783, 72)


# Cleaning data

In [40]:
def cleaning_data(train_data, test_data):
    train_feats = train_data.notna().sum()[train_data.notna().sum() > .15*len(train_data)].index
    test_feats = test_data.notna().sum()[test_data.notna().sum() > .15*len(test_data)].index
    train_data = train_data[train_feats]
    test_data = test_data[train_feats[:-1]]
    return train_data, test_data

train, test = cleaning_data(train, test)


In [39]:
train.shape

(8071, 73)

In [70]:
def viewing_site_lat(df):
    for col in df.columns:
        if df[col].dtype == float:
            print(f"you're viewing : {col}")
            print('MAX__________________-')
            print(df[col].max())
            print('MIN_________________-')
            print(df[col].min())
            print('MEAN_________________-')
            print(df[col].mean())
            print('next #################')
viewing_site_lat(train)
    

you're viewing : site_latitude
MAX__________________-
6.5954
MIN_________________-
-3.45997
MEAN_________________-
0.6356776923873395
next #################
you're viewing : site_longitude
MAX__________________-
36.914272
MIN_________________-
3.20151
MEAN_________________-
30.246981520554996
next #################
you're viewing : sulphurdioxide_so2_column_number_density
MAX__________________-
0.0020244442857804
MIN_________________-
-0.0009796843238507
MEAN_________________-
2.40501960919318e-05
next #################
you're viewing : sulphurdioxide_so2_column_number_density_amf
MAX__________________-
1.6070524454116093
MIN_________________-
0.2993642309439732
MEAN_________________-
0.6872946419366405
next #################
you're viewing : sulphurdioxide_so2_slant_column_number_density
MAX__________________-
0.0011011678502383
MIN_________________-
-0.0006956074503251
MEAN_________________-
1.2559048694410523e-05
next #################
you're viewing : sulphurdioxide_cloud_fraction


In [53]:
train.columns.tolist

<bound method IndexOpsMixin.tolist of Index(['id', 'site_id', 'site_latitude', 'site_longitude', 'city', 'country',
       'date', 'hour', 'sulphurdioxide_so2_column_number_density',
       'sulphurdioxide_so2_column_number_density_amf',
       'sulphurdioxide_so2_slant_column_number_density',
       'sulphurdioxide_cloud_fraction', 'sulphurdioxide_sensor_azimuth_angle',
       'sulphurdioxide_sensor_zenith_angle',
       'sulphurdioxide_solar_azimuth_angle',
       'sulphurdioxide_solar_zenith_angle',
       'sulphurdioxide_so2_column_number_density_15km', 'month',
       'carbonmonoxide_co_column_number_density',
       'carbonmonoxide_h2o_column_number_density',
       'carbonmonoxide_cloud_height', 'carbonmonoxide_sensor_altitude',
       'carbonmonoxide_sensor_azimuth_angle',
       'carbonmonoxide_sensor_zenith_angle',
       'carbonmonoxide_solar_azimuth_angle',
       'carbonmonoxide_solar_zenith_angle',
       'nitrogendioxide_no2_column_number_density',
       'nitrogendioxid

# Generating time-related features

In [73]:
def time_features(df):
    for df in (train,test):
        df['date'] = pd.to_datetime(df['date'])
        df['date_month'] = df['date'].dt.day_of_year
        df['DayOfWeek'] =  df['date'].dt.dayofweek
        df['Day'] =  df['date'].dt.day
        df['Year'] =  df['date'].dt.year
        df.drop(columns=['id','site_id','date'],inplace=True)
time_features(train)

In [58]:
train

Unnamed: 0,site_latitude,site_longitude,city,country,hour,sulphurdioxide_so2_column_number_density,sulphurdioxide_so2_column_number_density_amf,sulphurdioxide_so2_slant_column_number_density,sulphurdioxide_cloud_fraction,sulphurdioxide_sensor_azimuth_angle,...,cloud_surface_albedo,cloud_sensor_azimuth_angle,cloud_sensor_zenith_angle,cloud_solar_azimuth_angle,cloud_solar_zenith_angle,pm2_5,date_month,DayOfWeek,Day,Year
0,6.532570,3.399360,Lagos,Nigeria,13,,,,,,...,,,,,,12.015000,298,2,25,2023
1,6.532570,3.399360,Lagos,Nigeria,12,,,,,,...,,,,,,42.267200,306,3,2,2023
2,6.532570,3.399360,Lagos,Nigeria,13,,,,,,...,0.192757,-96.411890,61.045123,-121.307414,41.898269,39.450741,307,4,3,2023
3,6.532570,3.399360,Lagos,Nigeria,14,,,,,,...,,,,,,10.537600,312,2,8,2023
4,6.532570,3.399360,Lagos,Nigeria,13,0.000267,0.774656,0.000207,0.223403,-97.811241,...,0.153114,-97.811241,49.513439,-126.064453,40.167355,19.431731,313,3,9,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8066,-1.268877,36.819139,Nairobi,Kenya,10,,,,,,...,,,,,,21.086071,17,2,17,2024
8067,-1.268877,36.819139,Nairobi,Kenya,11,-0.000563,0.595788,-0.000335,0.208774,-98.841057,...,,,,,,17.110000,24,2,24,2024
8068,-1.268877,36.819139,Nairobi,Kenya,11,,,,,,...,,,,,,15.080000,25,3,25,2024
8069,-1.268877,36.819139,Nairobi,Kenya,10,-0.000188,0.746123,-0.000140,0.089824,75.947845,...,,,,,,18.026750,26,4,26,2024


In [57]:
test

Unnamed: 0,site_latitude,site_longitude,city,country,hour,sulphurdioxide_so2_column_number_density,sulphurdioxide_so2_column_number_density_amf,sulphurdioxide_so2_slant_column_number_density,sulphurdioxide_cloud_fraction,sulphurdioxide_sensor_azimuth_angle,...,cloud_cloud_optical_depth,cloud_surface_albedo,cloud_sensor_azimuth_angle,cloud_sensor_zenith_angle,cloud_solar_azimuth_angle,cloud_solar_zenith_angle,date_month,DayOfWeek,Day,Year
0,5.61252,-0.22955,Accra,Ghana,13,-0.000072,0.762543,-0.000055,0.079645,-100.330299,...,3.063105,0.263193,-100.317077,27.059646,-86.885670,25.530511,249,2,6,2023
1,5.61252,-0.22955,Accra,Ghana,13,,,,,,...,,,,,,,250,3,7,2023
2,5.61252,-0.22955,Accra,Ghana,13,-0.000051,1.004265,-0.000051,0.163160,73.117264,...,,,,,,,251,4,8,2023
3,5.61252,-0.22955,Accra,Ghana,12,,,,,,...,29.145922,0.314945,70.680077,61.874222,-90.875603,11.865201,252,5,9,2023
4,5.61252,-0.22955,Accra,Ghana,12,-0.000634,0.632173,-0.000401,0.000000,70.066956,...,,,,,,,263,2,20,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2778,-0.10098,34.76242,Kisumu,Kenya,10,0.000099,0.660385,0.000066,0.209272,72.440125,...,2.831732,0.250637,72.428178,55.404121,-150.645025,21.248325,27,5,27,2024
2779,3.89696,11.50603,Yaoundé,Cameroon,12,,,,,,...,13.719315,0.228072,-101.384688,16.577282,-125.511751,25.086502,52,2,21,2024
2780,3.89696,11.50603,Yaoundé,Cameroon,12,,,,,,...,,,,,,,53,3,22,2024
2781,3.89696,11.50603,Yaoundé,Cameroon,12,,,,,,...,,,,,,,54,4,23,2024


In [74]:
groups = train['city'] # the groups used in grouped kfold

# Label Encodig the features

In [None]:
le = LabelEncoder()
def label_encoder(train_data, test_data):
  for column in ['city','country']:
      letrans = le.fit_transform(pd.concat([train_data,test_data])[column])
      train_data[column] = letrans[:len(train_data)]
      test_data[column] = letrans[len(train_data):]
label_encoder(train,test)

In [None]:
le = LabelEncoder()
for column in ['city','country']:
    letrans = le.fit_transform(pd.concat([train,test])[column])
    train[column] = letrans[:len(train)]
    test[column] = letrans[len(train):]

# Modeling and Validation

In [14]:
model =  LGBMRegressor(random_state=42,n_estimators=200,max_depth=10,objective='mse')
n_splits = 4
cv = GroupKFold(n_splits=n_splits)

In [15]:
model.fit(train.drop(columns='pm2_5'),train['pm2_5'])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004706 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16520
[LightGBM] [Info] Number of data points in the train set: 8071, number of used features: 73
[LightGBM] [Info] Start training from score 24.639296


In [16]:
pred = model.predict(test)

In [17]:
train_set = pd.read_csv('/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/data/Train.csv')
test_set = pd.read_csv('/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/data/Train.csv')

def validate(trainset,testset,t,origin):
    model.fit(trainset.drop(columns=t),trainset[t])
    pred = model.predict(np.array(testset.drop(columns=t)))
    print('std: ', testset[t].std())

    # to validate the post processing
    origin['pm_5'] = pred
    origin['date'] = pd.to_datetime(origin['date'])
    origin['date_day'] = origin['date'].dt.dayofyear
    pred = origin['date_day'].map(origin[['date_day','pm_5']].groupby('date_day')['pm_5'].mean())
    #--------------------------------------------------------------------------------------------
    stds.append(testset[t].std())
    score = mean_squared_error(pred,testset[t],squared=False)
    print('score:', score)
    return score
stds = []
rmse = []

for v_train,v_test in cv.split(train.drop(columns='pm2_5'),train['pm2_5'],groups=groups):
    train_v, test_v= train.iloc[v_train],train.iloc[v_test]
    origin = train_set.iloc[v_test]
    rmse.append(validate(train_v,test_v,'pm2_5',origin))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001271 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16385
[LightGBM] [Info] Number of data points in the train set: 2475, number of used features: 73
[LightGBM] [Info] Start training from score 28.876230
std:  14.34682858151784
score: 14.778822910734402
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003056 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16493
[LightGBM] [Info] Number of data points in the train set: 6571, number of used features: 73
[LightGBM] [Info] Start training from score 25.954156


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['pm_5'] = pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date'] = pd.to_datetime(origin['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date_day'] = origin['date'].dt.dayofyear


std:  24.982218437683237
score: 25.122209990265613
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002302 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16462
[LightGBM] [Info] Number of data points in the train set: 7219, number of used features: 73
[LightGBM] [Info] Start training from score 22.093092


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['pm_5'] = pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date'] = pd.to_datetime(origin['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date_day'] = origin['date'].dt.dayofyear


std:  52.84388334184576
score: 52.42014222855449


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['pm_5'] = pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date'] = pd.to_datetime(origin['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date_day'] = origin['date'].dt.dayofyear


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004274 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16505
[LightGBM] [Info] Number of data points in the train set: 7948, number of used features: 73
[LightGBM] [Info] Start training from score 24.545522
std:  22.35338406057941
score: 18.67513314502571


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['pm_5'] = pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date'] = pd.to_datetime(origin['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date_day'] = origin['date'].dt.dayofyear


In [18]:
np.array(rmse).mean()

27.749077068645054

# Infrence

In [19]:
model.fit(train.drop(columns='pm2_5'),train['pm2_5'])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003173 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16520
[LightGBM] [Info] Number of data points in the train set: 8071, number of used features: 73
[LightGBM] [Info] Start training from score 24.639296


In [20]:
y = model.predict(test)

In [26]:
smaple = pd.read_csv('/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/data/SampleSubmission.csv')
smaple['pm2_5'] = post_process(test,y)
smaple.to_csv('/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/submissions/submission01.csv',index=False)