In [24]:
import pandas as pd
import numpy as np 
import os
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [25]:
DATA_PATH = "../data/cleaned_station_day_with_station_info.csv"
BASE_DIR = os.path.abspath("..")
MODEL_DIR = os.path.join(BASE_DIR, "models_forecast")

df = pd.read_csv(DATA_PATH, parse_dates=["Date"])

In [26]:
df.head()

Unnamed: 0.1,Unnamed: 0,Date,City,PM2.5,PM10,NO2,SO2,CO,O3,AQI,AQI_Bucket
0,6497,2015-04-05,Delhi,95.28,243.36,49.57,49.67,1.56,23.77,173.0,Moderate
1,41738,2015-04-05,Delhi,85.65,112.95,48.43,20.23,4.96,73.6,276.0,Poor
2,38065,2015-04-08,Delhi,104.31,104.31,71.27,18.1,11.29,192.14,368.0,Very Poor
3,38066,2015-04-09,Delhi,50.4,50.4,82.57,8.12,1.1,138.13,438.0,Severe
4,27969,2015-04-10,Delhi,67.31,143.7,52.88,20.87,1.01,76.17,168.0,Moderate


In [27]:
def create_features(city_df):
    city_df = city_df.sort_values("Date").copy()

    city_df["lag1"] = city_df["AQI"].shift(1)
    city_df["lag2"] = city_df["AQI"].shift(2)
    city_df["lag3"] = city_df["AQI"].shift(3)
    city_df["roll3"] = city_df["AQI"].rolling(3).mean()
    city_df["roll7"] = city_df["AQI"].rolling(7).mean()

    city_df["day"] = city_df["Date"].dt.day
    city_df["month"] = city_df["Date"].dt.month
    city_df["weekday"] = city_df["Date"].dt.weekday

    city_df = city_df.dropna()
    return city_df

In [28]:
for city in cities:
    city_df = df[df["City"] == city][["Date", "AQI"]].dropna()

    if len(city_df) < 100:
        print(f"Skipping {city} (not enough data)")
        continue

    city_df = create_features(city_df)

    X = city_df[["lag1", "lag2", "lag3", "roll3", "roll7", "day", "month", "weekday"]]
    y = city_df["AQI"]

    split = int(len(city_df) * 0.8)
    X_train, X_test = X.iloc[:split], X.iloc[split:]
    y_train, y_test = y.iloc[:split], y.iloc[split:]

    model = RandomForestRegressor(
        n_estimators=300,
        max_depth=20,
        random_state=42,
        n_jobs=-1
    )

    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    r2 = r2_score(y_test, preds)

    joblib.dump(model, f"{MODEL_DIR}/{city}_forecast.pkl")

    print(f"Saved forecast model for {city} | R2 Score: {round(r2,2)}")

Saved forecast model for Delhi | R2 Score: 1.0
Saved forecast model for Hyderabad | R2 Score: 0.98
Saved forecast model for Visakhapatnam | R2 Score: 0.92
Saved forecast model for Amritsar | R2 Score: 0.8
Saved forecast model for Jaipur | R2 Score: 0.95
Saved forecast model for Thiruvananthapuram | R2 Score: 0.89
Saved forecast model for Amaravati | R2 Score: 0.94
Saved forecast model for Brajrajnagar | R2 Score: 0.78
Saved forecast model for Talcher | R2 Score: 0.93
Saved forecast model for Kolkata | R2 Score: 0.95
Saved forecast model for Mumbai | R2 Score: 0.96
Saved forecast model for Bengaluru | R2 Score: 0.97
Saved forecast model for Jorapokhar | R2 Score: 0.39
Saved forecast model for Guwahati | R2 Score: 0.96
Saved forecast model for Ahmedabad | R2 Score: -3.02
Saved forecast model for Coimbatore | R2 Score: 0.43
Saved forecast model for Chennai | R2 Score: 0.42
Saved forecast model for Shillong | R2 Score: 0.37
Saved forecast model for Chandigarh | R2 Score: 0.84
Saved forecas