In [5]:
# ----------------------------------------
# STEP 1: Import libraries
# ----------------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, classification_report

# ----------------------------------------
# STEP 2: Load dataset
# ----------------------------------------
data = pd.read_csv("./datasets/ratnapark_pm25_after_imputation.csv")
print("Columns:", data.columns)
print(data.head())





Columns: Index(['Unnamed: 0', 'PM2.5', 'YEAR', 'MO', 'DY', 'HR', 'PS', 'WS2M', 'WD2M',
       'WS10M', 'WD10M', 'PRECTOTCORR', 'RH2M', 'QV2M', 'T2M', 'Unnamed: 15',
       'Unnamed: 16', 'Unnamed: 17'],
      dtype='object')
   Unnamed: 0      PM2.5  YEAR  MO  DY  HR     PS  WS2M   WD2M  WS10M  WD10M  \
0           0  75.780952  2022   1   1   0  88.03  0.47  317.6   0.77  319.2   
1           1  56.584127  2022   1   1   1  88.00  0.40  310.0   0.64  313.7   
2           2  49.538710  2022   1   1   2  87.96  0.26  319.6   0.46  323.9   
3           3  47.398438  2022   1   1   3  87.93  0.28  300.3   0.46  306.1   
4           4  42.821875  2022   1   1   4  87.91  0.29  329.0   0.51  330.4   

   PRECTOTCORR   RH2M  QV2M   T2M  Unnamed: 15  Unnamed: 16  Unnamed: 17  
0          0.0  83.08  6.70  8.85          NaN          NaN          NaN  
1          0.0  80.28  6.48  8.86          NaN          NaN          NaN  
2          0.0  78.29  6.27  8.74          NaN          NaN          

In [8]:
# ----------------------------------------
# STEP 3: Create Datetime and Date
# ----------------------------------------
# Combine Year, Month, Day, Hour columns into one datetime
data['Datetime'] = pd.to_datetime(dict(year=data['YEAR'], month=data['MO'], day=data['DY'], hour=data['HR']))
data['Date'] = data['Datetime'].dt.date

# ----------------------------------------
# STEP 4: Create daily peak PM2.5 and occurrence hour
# ----------------------------------------
daily_peak = data.groupby('Date').apply(
    lambda x: pd.Series({
        'peak_pm25': x['PM2.5'].max(),
        'peak_hour': x.loc[x['PM2.5'].idxmax(), 'HR']
    })
).reset_index()


In [12]:
# Meteorological feature columns in your dataset
meteo_cols = ['T2M', 'RH2M', 'WS2M', 'PS']   # Temperature, Humidity, Wind Speed, Pressure

# Ensure all exist
missing = [c for c in meteo_cols if c not in data.columns]
if missing:
    raise ValueError(f"⚠️ Missing columns in dataset: {missing}")

# Compute daily averages
daily_features = data.groupby('Date')[meteo_cols].mean().reset_index()

# Merge with peak info
df = pd.merge(daily_peak, daily_features, on='Date')

print("\n✅ Prepared daily dataset:")
print(df.head())


✅ Prepared daily dataset:
         Date   peak_pm25  peak_hour        T2M       RH2M      WS2M  \
0  2022-01-01  117.401562        9.0  10.721667  74.199583  1.054583   
1  2022-01-02   84.952381        9.0  10.407917  68.840833  1.001250   
2  2022-01-03   85.859375        9.0  10.692500  68.275417  1.221250   
3  2022-01-04   95.376562        9.0  10.615833  69.006250  1.030833   
4  2022-01-05   58.518182        9.0  11.260417  72.972083  0.914583   

          PS  
0  87.982083  
1  87.855000  
2  87.722500  
3  87.746667  
4  87.762917  


In [14]:
# ----------------------------------------
# STEP 5: Regression – predict daily peak PM2.5
# ----------------------------------------
X_reg = df[meteo_cols]
y_reg = df['peak_pm25']

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

regressor = RandomForestRegressor(random_state=42)
regressor.fit(X_train_r, y_train_r)
y_pred_r = regressor.predict(X_test_r)



In [15]:
print("\n--- Regression Results (Peak PM2.5) ---")
print("MAE:", mean_absolute_error(y_test_r, y_pred_r))
print("R²:", r2_score(y_test_r, y_pred_r))


--- Regression Results (Peak PM2.5) ---
MAE: 19.08899909530418
R²: 0.6029928372651501


In [16]:
y_pred_r

array([ 85.83301006,  23.47704483,  87.89333546,  22.19054705,
        96.87118456, 121.16293909, 153.94336997,  27.87725172,
       133.26739768,  61.52922306,  30.64493874,  28.25563562,
        25.6471918 , 111.94614286,  31.00425103,  54.39298949,
        25.38401485,  36.21533086, 102.59247446, 133.93403826,
        31.53644972,  27.83986964, 119.82098523, 141.03595114,
        91.74294808,  39.74440497,  64.92300526, 112.04106335,
        44.98122864,  95.22211643, 136.02903274,  91.94802914,
       111.80545634,  28.37632555,  51.10331265, 112.39513892,
        65.75539074, 107.77698223, 138.20369635,  46.89459675,
       122.21998738,  28.19531579,  65.12440545, 103.72013335,
        66.49278234,  81.19287055, 106.28218307, 120.15059646,
        23.7726839 ,  32.28640711,  72.29516292,  26.20780499,
        24.45459609,  27.98449516,  25.97485825,  24.49513345,
        26.11760395,  91.71084367,  84.30426984, 112.35588506,
        52.76784058,  98.6448181 ,  79.41794667,  63.98

In [None]:
# ----------------------------------------
# STEP 6: Classification – predict hour of peak occurrence
# ----------------------------------------
X_clf = df[meteo_cols]
y_clf = df['peak_hour']

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)

classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train_c, y_train_c)
y_pred_c = classifier.predict(X_test_c)

print("\n--- Classification Results (Peak Hour) ---")
print("Accuracy:", accuracy_score(y_test_c, y_pred_c))
print("\nClassification Report:\n", classification_report(y_test_c, y_pred_c))

In [17]:
y_pred_c

array([21., 22.,  8., 10.,  7.,  7., 21.,  8.,  6.,  9.,  6.,  8., 19.,
        7.,  7.,  6.,  7.,  5.,  6.,  8.,  6.,  7.,  8.,  7.,  0.,  8.,
       20.,  8.,  7., 21.,  8.,  9.,  8., 10., 21.,  9., 21.,  7.,  9.,
        8.,  7.,  6.,  7.,  8.,  7.,  8.,  9.,  7.,  9.,  7., 22.,  6.,
        7.,  7.,  7.,  6.,  7.,  7.,  8.,  8.,  8.,  7.,  8.,  8.,  8.,
        8., 19.,  7.,  9.,  9.,  8.,  8., 19.,  8.,  9., 20., 22.,  7.,
        8.,  8.,  9.,  9.,  7.,  8.,  6.,  7.,  9.,  9.,  9., 22., 20.,
        9.,  9., 21.,  8.,  9., 23., 21.,  9., 19.,  8., 18.,  9.,  8.,
       21.,  1., 10., 21.,  8.,  7.,  8., 21.,  8.,  8.,  8.,  8.,  8.,
        8.,  9.,  7., 20., 21.,  9., 21.,  5.,  7., 21.,  7., 20., 21.,
        9.,  7.,  7.,  8.,  8.,  7.,  9.,  9., 23.,  9.,  7.,  7.,  8.,
        7., 20.,  0.,  9., 11., 19., 20.,  7.,  9.,  7., 10., 11.,  9.,
        7., 21., 22.,  9.,  9.,  7., 23.,  8.,  9.,  9.,  8.,  8.,  7.,
        6.,  9., 21.,  7.,  8.,  7.,  9., 21.,  9., 10.,  8.,  8