## PM2.5 Prediction Using XGBoost

In [1]:
# --- Import Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

In [2]:
# --- Load Dataset ---
# The file name indicates that imputation of missing PM2.5 values was done upstream.
DATASET_CSV = "./datasets/ratnapark_pm25_after_imputation.csv"
data = pd.read_csv(DATASET_CSV)

# --- Preview the first rows, then confirm the load ---
display(data.head())
print("Dataset Loaded Successfully")

Unnamed: 0.1,Unnamed: 0,PM2.5,YEAR,MO,DY,HR,PS,WS2M,WD2M,WS10M,WD10M,PRECTOTCORR,RH2M,QV2M,T2M
0,0,75.780952,2022,1,1,0,88.03,0.47,317.6,0.77,319.2,0.0,83.08,6.7,8.85
1,1,56.584127,2022,1,1,1,88.0,0.4,310.0,0.64,313.7,0.0,80.28,6.48,8.86
2,2,49.53871,2022,1,1,2,87.96,0.26,319.6,0.46,323.9,0.0,78.29,6.27,8.74
3,3,47.398438,2022,1,1,3,87.93,0.28,300.3,0.46,306.1,0.0,77.32,6.02,8.32
4,4,42.821875,2022,1,1,4,87.91,0.29,329.0,0.51,330.4,0.0,75.66,5.8,8.09


Dataset Loaded Successfully


In [3]:
# --- Check for missing values ---
# Count the null entries in every column and print the per-column totals.
null_counts = data.isnull().sum()
print("\nMissing Values:\n", null_counts)


Missing Values:
 Unnamed: 0     0
PM2.5          0
YEAR           0
MO             0
DY             0
HR             0
PS             0
WS2M           0
WD2M           0
WS10M          0
WD10M          0
PRECTOTCORR    0
RH2M           0
QV2M           0
T2M            0
dtype: int64


In [4]:
# Disabled exploratory plots: mean / max PM2.5 grouped by month (MO), day (DY) and hour (HR).
# Nothing in this cell executes as written; uncomment a group of lines to render that plot.
# data.groupby('MO')['PM2.5'].mean().plot(color='r')
# # data.groupby('MO')['PM2.5'].max().plot(color='g')
# plt.show()
# data.groupby('DY')['PM2.5'].mean().plot(color='g')
# data.groupby('DY')['PM2.5'].max().plot()
# plt.show()
# data.groupby('HR')['PM2.5'].mean().plot(color='y')
# data.groupby('HR')['PM2.5'].max().plot()
# plt.show()

# NOTE(review): the line below uses `df` and a 'Day' column, neither of which appears in the
# visible cells of this notebook; it looks like a leftover from another draft.
# df.groupby('Day')['PM2.5'].mean().plot()



In [5]:
# --- Define Features and Target ---
# 'PM2.5' is the value to predict; the calendar fields and the remaining readings are the predictors.
# The columns are listed explicitly (instead of dropping only the target) so that the
# 'Unnamed: 0' column present in `data` never becomes a feature.
target_col = 'PM2.5'
feature_cols = [
    'YEAR', 'MO', 'DY', 'HR',
    'PS', 'WS2M', 'WD2M', 'WS10M', 'WD10M',
    'PRECTOTCORR', 'RH2M', 'QV2M', 'T2M',
]
X = data[feature_cols]
y = data[target_col]

In [6]:
# --- Train-Test Split ---
# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffle repeatable.
# NOTE(review): the rows are consecutive hourly records and this split shuffles them, so each
# test row has close temporal neighbours in the training set — confirm a random split is intended.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [7]:
# Display the training feature matrix; its row labels are the shuffled original row numbers.
X_train

Unnamed: 0,YEAR,MO,DY,HR,PS,WS2M,WD2M,WS10M,WD10M,PRECTOTCORR,RH2M,QV2M,T2M
21217,2024,6,3,1,87.04,0.82,35.6,1.63,35.5,0.04,69.49,12.42,20.96
18826,2024,2,24,10,87.90,2.48,201.5,3.15,201.4,0.00,18.51,2.57,17.17
11726,2023,5,4,14,87.36,2.49,226.6,3.15,226.5,8.99,56.59,11.24,22.76
22777,2024,8,7,1,87.00,0.51,57.9,0.91,53.5,27.18,92.67,16.53,20.88
11288,2023,4,16,8,87.40,1.46,226.9,1.85,227.4,0.00,27.03,6.84,26.87
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21575,2024,6,17,23,86.81,0.68,25.1,1.39,25.1,0.27,68.50,15.25,24.51
5390,2022,8,13,14,86.65,3.20,206.7,4.39,206.7,0.85,77.75,19.45,26.40
860,2022,2,5,20,87.37,0.76,262.4,1.57,257.5,0.07,94.17,6.05,5.43
15795,2023,10,21,3,87.78,0.74,11.8,1.17,11.9,0.00,71.16,8.36,14.48


In [8]:
import time

In [55]:
# --- Model: XGBoost Regressor ---

# Start the wall-clock timer; it is read again in the prediction cell to report elapsed time.
# NOTE(review): the timer spans several cells, so the reported figure also includes any pause
# between running them.
start_time = time.time()

# Model 2: more boosting rounds and deeper trees than the first attempt
# (which used n_estimators=300, max_depth=8 and no explicit regularisation terms).
xgb_params = dict(
    n_estimators=700,
    learning_rate=0.05,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,   # L1 regularization
    reg_lambda=1.0,  # L2 regularization
    random_state=42,
)
xgb_model = XGBRegressor(**xgb_params)

In [56]:
# --- Train the Model ---
# Fit on the training split only; the held-out rows are first touched in the prediction cell.
xgb_model.fit(X=X_train, y=y_train)


In [57]:
# --- Prediction ---
y_pred = xgb_model.predict(X_test)

# Stop the timer started in the model-definition cell and keep the elapsed seconds.
end_time = time.time()
exec_time = end_time - start_time

# The banner is printed; the bare name on the last line makes the cell display the number.
print(' --- Execution Time --- ')
exec_time

 --- Execution Time --- 


10.626759052276611

In [58]:
# --- Evaluation Metrics ---
# Score the held-out predictions: absolute error, squared error (and its root), and R².
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Assemble the report first, then emit it in one call; every value is rounded to 3 decimals.
report_lines = [
    "Model Performance:",
    f"MAE  : {mae:.3f}",
    f"RMSE : {rmse:.3f}",
    f"R²   : {r2:.3f}",
]
print("\n".join(report_lines))



Model Performance:
MAE  : 5.827
RMSE : 8.995
R²   : 0.924


In [13]:
# Disabled diagnostic plots; nothing in this cell executes as written.
# Uncomment the first group for a horizontal bar chart of the model's feature importances,
# or the second group for an actual-vs-predicted scatter with a dashed reference line.
# --- Feature Importance Plot ---
# plt.figure(figsize=(8,5))
# plt.barh(X.columns, xgb_model.feature_importances_)
# plt.xlabel("Feature Importance")
# plt.title("XGBoost Feature Importance")
# plt.show()

# --- Compare Predicted vs Actual ---
# plt.figure(figsize=(6,6))
# plt.scatter(y_test, y_pred, alpha=0.4)
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
# plt.xlabel("Actual PM2.5")
# plt.ylabel("Predicted PM2.5")
# plt.title("Actual vs Predicted PM2.5 (XGBoost)")
# plt.show()


In [41]:
# First 24 held-out targets by position, for a side-by-side look against the predictions.
y_test.iloc[:24]

5211      22.090625
26231    156.288333
2752      65.627419
3592      31.209524
4639      22.984615
8735      99.984999
8769      78.006897
25410     61.591667
6087      19.910000
14015      5.500000
6433      23.955738
17189     75.500000
15274     14.800000
23712     17.433854
3382      45.979688
15101      5.400000
26086    125.683332
2119     125.800000
5859      16.363333
21341     11.908333
2111     102.400000
20920     20.510345
20359    118.335000
1834      73.000000
Name: PM2.5, dtype: float64

In [59]:
# First 24 predictions, in the same positional order as the first 24 entries of y_test.
y_pred[:24]

array([ 10.516557, 133.65022 ,  60.334465,  37.6042  ,  20.952307,
        94.27258 ,  70.332756,  67.824196,  16.678785,   8.065895,
        20.19258 ,  62.370636,  17.651848,  15.00943 ,  37.144016,
         8.539914, 107.55639 , 107.42342 ,  18.076672,  22.388556,
        79.9569  ,  25.814531,  95.31236 ,  70.754684], dtype=float32)

In [45]:
# Single hand-written observation (YEAR 2023, MO 2, DY 3, HR 8) used to try out the trained model.
input_test = dict(
    YEAR=2023,
    MO=2,
    DY=3,
    HR=8,
    PS=87.7,
    WS2M=0.16,
    WD2M=345.1,
    WS10M=0.29,
    WD10M=352.2,
    PRECTOTCORR=0,
    RH2M=40.02,
    QV2M=4.62,
    T2M=14.23,
)
# Alternative sample for the same date at HR 13, kept for reference:
#   YEAR=2023, MO=2, DY=3, HR=13, PS=87.48, WS2M=3.4, WD2M=218.6, WS10M=4.48,
#   WD10M=218.6, PRECTOTCORR=0, RH2M=30.5, QV2M=5.36, T2M=20.84

# Wrapping the mapping in a list makes pandas build a one-row frame whose columns are the keys.
df_input_test = pd.DataFrame([input_test])
df_input_test

Unnamed: 0,YEAR,MO,DY,HR,PS,WS2M,WD2M,WS10M,WD10M,PRECTOTCORR,RH2M,QV2M,T2M
0,2023,2,3,8,87.7,0.16,345.1,0.29,352.2,0,40.02,4.62,14.23


In [60]:
# Predict PM2.5 for the single hand-built row; the bare name below displays the resulting array.
y_input_test_pred = xgb_model.predict(df_input_test)
y_input_test_pred

array([96.36242], dtype=float32)

In [65]:
# Peek at the first five training rows (positional slice, same rows as the default head()).
X_train.iloc[:5]

Unnamed: 0,YEAR,MO,DY,HR,PS,WS2M,WD2M,WS10M,WD10M,PRECTOTCORR,RH2M,QV2M,T2M
21217,2024,6,3,1,87.04,0.82,35.6,1.63,35.5,0.04,69.49,12.42,20.96
18826,2024,2,24,10,87.9,2.48,201.5,3.15,201.4,0.0,18.51,2.57,17.17
11726,2023,5,4,14,87.36,2.49,226.6,3.15,226.5,8.99,56.59,11.24,22.76
22777,2024,8,7,1,87.0,0.51,57.9,0.91,53.5,27.18,92.67,16.53,20.88
11288,2023,4,16,8,87.4,1.46,226.9,1.85,227.4,0.0,27.03,6.84,26.87


In [69]:
# Turn the original row labels into a regular 'index' column so the shuffled training rows
# can be ordered by their position in the source data.
# Guarded so the cell can be re-run safely: an unguarded second reset_index() would add a
# stray 'level_0' column, and a third would raise ValueError.
# NOTE: after this cell X_train carries an extra non-feature column; re-run the train/test
# split before fitting the model again.
if 'index' not in X_train.columns:
    X_train = X_train.reset_index()

In [70]:
# Show the training rows ordered by their original row number (the 'index' column added by
# reset_index); this returns a sorted copy and leaves X_train itself unchanged.
X_train.sort_values(by='index')

Unnamed: 0,index,YEAR,MO,DY,HR,PS,WS2M,WD2M,WS10M,WD10M,PRECTOTCORR,RH2M,QV2M,T2M
3906,0,2022,1,1,0,88.03,0.47,317.6,0.77,319.2,0.0,83.08,6.70,8.85
14318,1,2022,1,1,1,88.00,0.40,310.0,0.64,313.7,0.0,80.28,6.48,8.86
11919,2,2022,1,1,2,87.96,0.26,319.6,0.46,323.9,0.0,78.29,6.27,8.74
1969,3,2022,1,1,3,87.93,0.28,300.3,0.46,306.1,0.0,77.32,6.02,8.32
5501,4,2022,1,1,4,87.91,0.29,329.0,0.51,330.4,0.0,75.66,5.80,8.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7090,26299,2024,12,31,19,87.83,0.62,256.0,1.44,253.1,0.0,92.40,8.28,10.37
19122,26300,2024,12,31,20,87.85,0.44,262.2,0.92,263.7,0.0,91.04,7.94,9.97
2279,26301,2024,12,31,21,87.85,0.45,266.2,1.00,261.4,0.0,91.15,7.68,9.46
18476,26302,2024,12,31,22,87.85,0.58,279.0,1.19,274.8,0.0,89.81,7.39,9.11


In [95]:
# First 24 rows of the dataset, restricted to the columns the model was trained on.
preview_cols = [
    'YEAR', 'MO', 'DY', 'HR',
    'PS', 'WS2M', 'WD2M', 'WS10M', 'WD10M',
    'PRECTOTCORR', 'RH2M', 'QV2M', 'T2M',
]
test_df = data.loc[:, preview_cols].iloc[:24]
test_df

Unnamed: 0,YEAR,MO,DY,HR,PS,WS2M,WD2M,WS10M,WD10M,PRECTOTCORR,RH2M,QV2M,T2M
0,2022,1,1,0,88.03,0.47,317.6,0.77,319.2,0.0,83.08,6.7,8.85
1,2022,1,1,1,88.0,0.4,310.0,0.64,313.7,0.0,80.28,6.48,8.86
2,2022,1,1,2,87.96,0.26,319.6,0.46,323.9,0.0,78.29,6.27,8.74
3,2022,1,1,3,87.93,0.28,300.3,0.46,306.1,0.0,77.32,6.02,8.32
4,2022,1,1,4,87.91,0.29,329.0,0.51,330.4,0.0,75.66,5.8,8.09
5,2022,1,1,5,87.92,0.32,5.4,0.53,3.2,0.0,72.34,5.54,8.08
6,2022,1,1,6,87.97,0.49,29.2,0.75,26.9,0.0,68.91,5.27,8.07
7,2022,1,1,7,88.02,0.59,35.3,0.89,34.0,0.0,64.5,5.11,8.6
8,2022,1,1,8,88.09,0.24,48.4,0.43,41.2,0.0,66.84,6.38,11.38
9,2022,1,1,9,88.15,0.98,232.4,1.29,232.9,0.0,58.95,6.61,13.84


In [102]:
# Predict PM2.5 for every row of test_df; this overwrites the earlier single-row prediction.
y_input_test_pred = xgb_model.predict(test_df)
y_input_test_pred

array([ 75.57766 ,  57.06224 ,  49.64087 ,  47.06631 ,  42.51915 ,
        37.791824,  46.01212 ,  51.588554,  83.137184, 116.68954 ,
        98.75163 ,  55.0475  ,  36.143368,  20.655056,  23.159428,
        23.190153,  25.637772,  35.44169 ,  45.820797,  33.502506,
        50.99862 ,  64.88053 ,  70.54707 ,  53.339222], dtype=float32)

In [103]:
# Largest predicted PM2.5 value and where it sits in the prediction array.
# NOTE(review): argmax is a position, not an hour; the two presumably coincide only because
# the predicted rows start at HR 0 — confirm before reusing with another window.
peak_value = np.max(y_input_test_pred)
peak_position = np.argmax(y_input_test_pred)
print("Predicted value :", peak_value)
print("Peak Hour at :", peak_position)

Predicted value : 116.68954
Peak Hour at : 9


In [108]:
# Report the TOP_K largest predictions, highest first.
# The count lives in one place and the printed label is derived from it, so the label can no
# longer disagree with the number of values shown (it used to say "Top 3" while listing 5).
TOP_K = 5

# argsort is ascending: the last TOP_K positions hold the largest values; reverse for descending.
top_indices = np.argsort(y_input_test_pred)[-TOP_K:][::-1]
top_values = y_input_test_pred[top_indices]
top3_indices, top3_values = top_indices, top_values  # legacy names, kept for any later cell

print(f"Top {TOP_K} max values:", top_values)
print("Their indices:", top_indices)


Top 3 max values: [116.68954   98.75163   83.137184  75.57766   70.54707 ]
Their indices: [ 9 10  8  0 22]


In [115]:
# Rows 24-47 of the dataset (the 24 rows after the first day), limited to the model's feature columns.
second_day_cols = [
    'YEAR', 'MO', 'DY', 'HR',
    'PS', 'WS2M', 'WD2M', 'WS10M', 'WD10M',
    'PRECTOTCORR', 'RH2M', 'QV2M', 'T2M',
]
my_test_data = data.iloc[24:48][second_day_cols]
my_test_data

Unnamed: 0,YEAR,MO,DY,HR,PS,WS2M,WD2M,WS10M,WD10M,PRECTOTCORR,RH2M,QV2M,T2M
24,2022,1,2,0,87.9,0.65,308.8,1.03,312.2,0.0,80.15,5.98,7.69
25,2022,1,2,1,87.89,0.71,310.4,1.1,313.9,0.0,77.41,5.74,7.6
26,2022,1,2,2,87.86,0.76,316.1,1.19,317.7,0.0,74.4,5.48,7.5
27,2022,1,2,3,87.83,0.78,321.2,1.23,322.9,0.0,70.39,5.2,7.54
28,2022,1,2,4,87.81,0.76,323.0,1.19,324.6,0.0,66.24,4.9,7.56
29,2022,1,2,5,87.84,0.7,320.8,1.1,324.3,0.0,62.73,4.61,7.47
30,2022,1,2,6,87.88,0.56,327.4,0.89,332.0,0.0,58.37,4.34,7.65
31,2022,1,2,7,87.91,0.34,351.6,0.6,355.2,0.0,54.38,4.3,8.56
32,2022,1,2,8,87.96,0.04,236.3,0.08,330.3,0.0,61.11,5.72,11.07
33,2022,1,2,9,88.02,1.06,222.7,1.42,222.7,0.0,52.9,5.81,13.51


In [116]:
# Predict PM2.5 for every row of my_test_data, then report the peak and the TOP_K largest values.
y_input_test_pred = xgb_model.predict(my_test_data)

# argmax yields a position inside the window; read HR from that row so the reported hour
# stays correct even when the window does not start at HR 0.
peak_position = np.argmax(y_input_test_pred)
print("Predicted value :", np.max(y_input_test_pred))
print("Peak Hour at :", my_test_data['HR'].iloc[peak_position])

# The count lives in one place and the printed label is derived from it, so the label can no
# longer disagree with the number of values shown (it used to say "Top 3" while listing 5).
TOP_K = 5

# argsort is ascending: the last TOP_K positions hold the largest values; reverse for descending.
top_indices = np.argsort(y_input_test_pred)[-TOP_K:][::-1]
top_values = y_input_test_pred[top_indices]
top3_indices, top3_values = top_indices, top_values  # legacy names, kept for any later cell

print(f"Top {TOP_K} max values:", top_values)
print("Their indices:", top_indices)

Predicted value : 84.91895
Peak Hour at : 9
Top 3 max values: [84.91895  70.89205  62.59192  59.40578  56.883636]
Their indices: [ 9 10 19  8 20]
