# Importing Required Libraries

In [10]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import GroupKFold,KFold
from sklearn.preprocessing import LabelEncoder


# Defining Needed Functions

In [11]:
def post_process(test_data,y,feature_to_mean='date_month') -> int:
    """runs the post processing

    Keyword arguments:
    test_data -- the testing data
    y -- the target
    feature_to_mean -- feature to apply the post processing by
    Return: predictions after processing
    """
    test_data['target'] = y
    test_data['target_month'] = test_data[feature_to_mean].map(test_data[[feature_to_mean,'target']].groupby(feature_to_mean)['target'].mean())
    test_data.drop(columns='target',inplace=True)
    return test_data['target_month']

# Reading Data

In [12]:
train = pd.read_csv("/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/data/Train.csv")
test = pd.read_csv("/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/data/Test.csv")

for df in(train,test):
    df.index = pd.to_datetime(df['date'])

In [13]:
test_cities = ['Kampala', 'Lagos', 'Bujumbura']

train = train[train['city'].isin(test_cities)]
train.city.unique()

array(['Lagos', 'Bujumbura', 'Kampala'], dtype=object)

# Cleaning data

In [17]:
def clean(trainset,testset):
    train_feats = trainset.notna().sum()[trainset.notna().sum() > .15*len(trainset)].index
    test_feats = testset.notna().sum()[testset.notna().sum() > .15*len(testset)].index
    trainset = trainset[train_feats]
    testset = testset[train_feats[:-1]]
    
clean(train,test)

# Generating time-related features

In [18]:
def feature_engineering(train,test):
    for df in [train, test]:
        df['date'] = pd.to_datetime(df['date'])
        df['date_month'] = df['date'].dt.day_of_year
        df['DayOfWeek'] = df['date'].dt.dayofweek
        df['Day'] = df['date'].dt.day
        df['Year'] = df['date'].dt.year
        df.drop(columns=['id', 'site_id', 'date'], inplace=True)

    # Create lag features for the training data
    train['previous_day_pm2_5_value'] = train.groupby('Day')['pm2_5'].shift(1)
    train['previous_1_week_pm2_5_value'] = train.groupby('Day')['pm2_5'].shift(7)

    # Fill missing lag features with appropriate values (e.g., NaN or other imputation methods)
    train.fillna(method='bfill', inplace=True)

    combined = pd.concat([train, test], sort=False)

    combined['previous_day_pm2_5_value'] = combined.groupby('Day')['pm2_5'].shift(1)
    combined['previous_1_week_pm2_5_value'] = combined.groupby('Day')['pm2_5'].shift(7)

    train = combined[combined.index.isin(train.index)]
    test = combined[combined.index.isin(test.index)]

    test.fillna(method='bfill', inplace=True)

    train.drop(columns=['pm2_5'], inplace=True)
    test.drop(columns=['pm2_5'], inplace=True)


# Label Encodig the features

In [24]:
le = LabelEncoder()
for column in ['city','country']:
    letrans = le.fit_transform(pd.concat([train,test])[column])
    train[column] = letrans[:len(train)]
    test[column] = letrans[len(train):]

In [None]:
le = LabelEncoder()
for column in ['city','country']:
    letrans = le.fit_transform(pd.concat([train,test])[column])
    train[column] = letrans[:len(train)]
    test[column] = letrans[len(train):]

In [26]:
train

Unnamed: 0_level_0,id,site_id,site_latitude,site_longitude,city,country,date,hour,sulphurdioxide_so2_column_number_density,sulphurdioxide_so2_column_number_density_amf,...,cloud_cloud_top_height,cloud_cloud_base_pressure,cloud_cloud_base_height,cloud_cloud_optical_depth,cloud_surface_albedo,cloud_sensor_azimuth_angle,cloud_sensor_zenith_angle,cloud_solar_azimuth_angle,cloud_solar_zenith_angle,pm2_5
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-10-25,id_vjcx08sz91,6531a46a89b3300013914a36,6.53257,3.39936,5,4,2023-10-25,13,,,...,,,,,,,,,,12.015000
2023-11-02,id_bkg215syli,6531a46a89b3300013914a36,6.53257,3.39936,5,4,2023-11-02,12,,,...,,,,,,,,,,42.267200
2023-11-03,id_oui2pot3qd,6531a46a89b3300013914a36,6.53257,3.39936,5,4,2023-11-03,13,,,...,6791.682888,51171.802486,5791.682829,11.816715,0.192757,-96.411890,61.045123,-121.307414,41.898269,39.450741
2023-11-08,id_9aandqzy4n,6531a46a89b3300013914a36,6.53257,3.39936,5,4,2023-11-08,14,,,...,,,,,,,,,,10.537600
2023-11-09,id_ali5x2m4iw,6531a46a89b3300013914a36,6.53257,3.39936,5,4,2023-11-09,13,0.000267,0.774656,...,1451.050659,96215.906250,451.050598,10.521009,0.153114,-97.811241,49.513439,-126.064453,40.167355,19.431731
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-01-16,id_utshimidk4,64ef31227771f10013f8b066,-3.38065,29.38383,1,0,2024-01-16,12,,,...,,,,,,,,,,26.997632
2024-01-24,id_i3tbxmvr5w,64ef31227771f10013f8b066,-3.38065,29.38383,1,0,2024-01-24,11,,,...,,,,,,,,,,110.813000
2024-01-25,id_tqirgbtzoc,64ef31227771f10013f8b066,-3.38065,29.38383,1,0,2024-01-25,11,0.000229,0.638001,...,,,,,,,,,,50.078421
2024-01-26,id_rs2x3p7cgc,64ef31227771f10013f8b066,-3.38065,29.38383,1,0,2024-01-26,10,-0.000305,0.727786,...,7570.113770,46691.367187,6570.113770,9.200150,0.249005,72.482094,59.980885,-148.271301,18.270901,32.742105


# Modeling and Validation

In [22]:
groups = train['city'] # the groups used in grouped kfold
model =  LGBMRegressor(random_state=42,n_estimators=200,max_depth=10,objective='mse')
n_splits = 3
cv = GroupKFold(n_splits=n_splits)

In [25]:
train_set = pd.read_csv("/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/data/Train.csv")
test_set = pd.read_csv("/Users/ahmed/Downloads/computerscience/AirQo/PM2.5-Prediction/data/Test.csv")
def validate(trainset,testset,t,origin):
    model.fit(trainset.drop(columns=t),trainset[t])
    pred = model.predict(np.array(testset.drop(columns=t)))
    print('std: ', testset[t].std())

    # to validate the post processing
    origin['pm_5'] = pred
    origin['date'] = pd.to_datetime(origin['date'])
    origin['date_day'] = origin['date'].dt.dayofyear
    pred = origin['date_day'].map(origin[['date_day','pm_5']].groupby('date_day')['pm_5'].mean())
    #--------------------------------------------------------------------------------------------
    stds.append(testset[t].std())
    score = mean_squared_error(pred,testset[t],squared=False)
    print('score:', score)
    return score
stds = []
rmse = []

for v_train,v_test in cv.split(train.drop(columns='pm2_5'),train['pm2_5'],groups=groups):
    train_v, test_v= train.iloc[v_train],train.iloc[v_test]
    origin = train_set.iloc[v_test]
    rmse.append(validate(train_v,test_v,'pm2_5',origin))

ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: id: object, site_id: object, date: object

In [None]:
np.array(rmse).mean()

27.749077068645054

# Infrence

In [None]:
model.fit(train.drop(columns='pm2_5'),train['pm2_5'])



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16520
[LightGBM] [Info] Number of data points in the train set: 8071, number of used features: 73
[LightGBM] [Info] Start training from score 24.639296


In [None]:
y = model.predict(test)



In [None]:
smaple = pd.read_csv('/Users/ahmed/Downloads/computerscience/AirQo/pm2_5-Prediction/data/SampleSubmission.csv')
smaple['pm2_5'] = post_process(test,y)
smaple.to_csv('submission.csv',index=False)