In [1]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.5.0


In [2]:
import pandas as pd 
import numpy as np 
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold,TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
#from xgboost import XGBRegressor
#from catboost import CatBoostRegressor

In [3]:
import warnings
warnings.filterwarnings('ignore')

def stats_by_feat(train,test,feat_to_stat,feat_to_group,func):
    for df in (train,test):
        df[feat_to_stat+feat_to_group+func] = df[feat_to_group].map(df.groupby(feat_to_group)[feat_to_stat].aggregate(func))


In [5]:
train = pd.read_csv("../data/Train.csv")
test = pd.read_csv("../data/Test.csv")
train = train[train['pm2_5']<120]

In [6]:
pd.options.display.max_columns = 200

In [7]:
train_feats = train.notna().sum()[train.notna().sum() > .15*len(train)].index
test_feats = test.notna().sum()[test.notna().sum() > .15*len(test)].index
train = train[train_feats]

In [8]:
test = test[train_feats[:-1]]

In [9]:
for df in (train,test):
    df['date'] = pd.to_datetime(df['date'])
    df['date_day_of_year'] = df['date'].dt.day_of_year
    df['day_of_week'] = df['date'].dt.day_of_week
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['year'] = df['date'].dt.year
    df.drop(columns=['id','site_id','date'],inplace=True)

In [10]:
groups = train['city']

In [11]:
le = LabelEncoder()
for column in ['city','country']:
    letrans = le.fit_transform(pd.concat([train,test])[column])
    train[column] = letrans[:len(train)]
    test[column] = letrans[len(train):]

In [12]:
train = train[['hour','date_day_of_year','day_of_week','month','day', 'year','site_latitude','site_longitude','pm2_5']]
test = test[['hour','date_day_of_year','day_of_week','month','day', 'year','site_latitude','site_longitude']]

In [13]:
model =  LGBMRegressor(random_state=42,n_estimators=100)
n_splits = 4
cv = GroupKFold(n_splits=n_splits)


# Cross Validation

In [14]:
import seaborn as sns 
import matplotlib.pyplot as plt

In [15]:
stds = []
rmse = []
for _,v_test in cv.split(train.drop(columns='pm2_5'),train['pm2_5'],groups=groups):
    test_v= train.iloc[v_test]
    test_v.sort_values(by=['hour','day','month','year'],inplace=True)
    for train_loc,test_loc in TimeSeriesSplit(n_splits=5).split(test_v):
        train_set = test_v.iloc[train_loc]
        test_set = test_v.iloc[test_loc]
        model.fit(train_set.drop(columns='pm2_5'),train_set['pm2_5'])
        pred = model.predict(test_set.drop(columns='pm2_5'))
        score = mean_squared_error(test_set['pm2_5'],pred,squared=False)
        print(score)
        rmse.append(score)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000095 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 153
[LightGBM] [Info] Number of data points in the train set: 935, number of used features: 7
[LightGBM] [Info] Start training from score 24.162493
13.862035429459064
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000050 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 223
[LightGBM] [Info] Number of data points in the train set: 1866, number of used features: 7
[LightGBM] [Info] Start training from score 24.161156
12.581984237955295
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000114 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=

In [16]:
rmse

[np.float64(13.862035429459064),
 np.float64(12.581984237955295),
 np.float64(11.460919487137872),
 np.float64(9.323283511962876),
 np.float64(11.100634080518747),
 np.float64(12.019972400223072),
 np.float64(10.348387577726305),
 np.float64(7.238731306090037),
 np.float64(10.120184184325215),
 np.float64(8.016783600992538),
 np.float64(21.663161016607656),
 np.float64(13.203130390018316),
 np.float64(10.153671708699687),
 np.float64(12.997655000970536),
 np.float64(15.82714781063984),
 np.float64(11.89938808373502),
 np.float64(18.367466217388017),
 np.float64(39.387710325073826),
 np.float64(15.524702429080614),
 np.float64(13.278991951752474)]

In [17]:
score = np.array(rmse).mean()
score

np.float64(13.918797037517852)

In [18]:
model.fit(train.drop(columns='pm2_5'),train['pm2_5'])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000280 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 443
[LightGBM] [Info] Number of data points in the train set: 8004, number of used features: 8
[LightGBM] [Info] Start training from score 22.997927


In [19]:
y = model.predict(test)

In [20]:
train.head(2)

Unnamed: 0,hour,date_day_of_year,day_of_week,month,day,year,site_latitude,site_longitude,pm2_5
0,13,298,2,10,25,2023,6.53257,3.39936,12.015
1,12,306,3,11,2,2023,6.53257,3.39936,42.2672


In [21]:
train

Unnamed: 0,hour,date_day_of_year,day_of_week,month,day,year,site_latitude,site_longitude,pm2_5
0,13,298,2,10,25,2023,6.532570,3.399360,12.015000
1,12,306,3,11,2,2023,6.532570,3.399360,42.267200
2,13,307,4,11,3,2023,6.532570,3.399360,39.450741
3,14,312,2,11,8,2023,6.532570,3.399360,10.537600
4,13,313,3,11,9,2023,6.532570,3.399360,19.431731
...,...,...,...,...,...,...,...,...,...
8066,10,17,2,1,17,2024,-1.268877,36.819139,21.086071
8067,11,24,2,1,24,2024,-1.268877,36.819139,17.110000
8068,11,25,3,1,25,2024,-1.268877,36.819139,15.080000
8069,10,26,4,1,26,2024,-1.268877,36.819139,18.026750


In [23]:
smaple = pd.read_csv('../data/SampleSubmission.csv')
smaple.to_csv(f'../submissions/submission{score}.csv',index=False)
#/workspaces/AirQo-Experimentation/submissions

In [100]:
smaple

Unnamed: 0,id,pm2_5
0,id_ihxgrbq8bw,0
1,id_dg6s4fhiwe,0
2,id_f7hwwtmuzp,0
3,id_ioese5awdg,0
4,id_hdw320zpls,0
...,...,...
2778,id_5yblexf7zp,0
2779,id_c9cycuvetl,0
2780,id_8xu1ardxni,0
2781,id_aumy97t1iu,0
