Setup

In [10]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import numpy as np

Load Clean Dataset

In [11]:
df = pd.read_csv("air_quality_clean.csv", index_col="Datetime", parse_dates=True)

Predict CO(GT) with horizons

In [12]:
# Pollutants to model
pollutants = ["CO(GT)", "NMHC(GT)", "C6H6(GT)", "NOx(GT)", "NO2(GT)"]
horizons = [1, 6, 12, 24]

all_results = []

In [13]:
for pollutant in pollutants:
    print(f"\n=== Modelling pollutant: {pollutant} ===")

    df_temp = df.copy()

    # Create targets for each horizon
    for h in horizons:
        df_temp[f"y_{h}"] = df_temp[pollutant].shift(-h)

    # Train/test split
    train = df_temp[df_temp.index.year == 2004].dropna()
    test = df_temp[df_temp.index.year == 2005].dropna()

    # Drop target cols + original target col
    drop_cols = [pollutant] + [f"y_{h}" for h in horizons]
    X_train = train.drop(columns=drop_cols)
    X_test = test.drop(columns=drop_cols)

    for h in horizons:
        y_train = train[f"y_{h}"]
        y_test = test[f"y_{h}"]

        # Random Forest
        rf = RandomForestRegressor(n_estimators=200, random_state=42)
        rf.fit(X_train, y_train)
        pred_rf = rf.predict(X_test)
        rmse_rf = np.sqrt(mean_squared_error(y_test, pred_rf))

        # XGBoost
        xgb = XGBRegressor(n_estimators=300, learning_rate=0.05, random_state=42)
        xgb.fit(X_train, y_train)
        pred_xgb = xgb.predict(X_test)
        rmse_xgb = np.sqrt(mean_squared_error(y_test, pred_xgb))

        all_results.append([pollutant, h, rmse_rf, rmse_xgb])


=== Modelling pollutant: CO(GT) ===

=== Modelling pollutant: NMHC(GT) ===

=== Modelling pollutant: C6H6(GT) ===

=== Modelling pollutant: NOx(GT) ===

=== Modelling pollutant: NO2(GT) ===


**Show Results**

In [14]:
# Convert to DataFrame
df_all_results = pd.DataFrame(
    all_results,
    columns=["Pollutant", "Horizon", "RMSE_RF", "RMSE_XGB"]
)

df_all_results

Unnamed: 0,Pollutant,Horizon,RMSE_RF,RMSE_XGB
0,CO(GT),1,0.069413,0.061494
1,CO(GT),6,0.098698,0.097207
2,CO(GT),12,0.111888,0.106259
3,CO(GT),24,0.108852,0.10126
4,NMHC(GT),1,0.101457,0.109205
5,NMHC(GT),6,0.110009,0.092365
6,NMHC(GT),12,0.136843,0.156554
7,NMHC(GT),24,0.069109,0.090734
8,C6H6(GT),1,0.044682,0.04615
9,C6H6(GT),6,0.09368,0.089078
