In [None]:
import pandas as pd

df = pd.read_csv("data/raw/city_day.csv", parse_dates=["Date"])

# Keep only necessary columns
df = df[["Date", "AQI"]].dropna()

# Sort by date
df = df.sort_values("Date")

# Create lag features
for lag in range(1, 8):   # past 7 days
    df[f"lag_{lag}"] = df["AQI"].shift(lag)

# Drop NA (from lags)
df = df.dropna()

# Features & target
X = df.drop(columns=["Date", "AQI"])
y = df["AQI"]


In [3]:
df.head()

Unnamed: 0,Date,AQI,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7
10236,2015-01-08,383.0,353.0,318.0,325.0,319.0,143.0,454.0,472.0
10237,2015-01-09,375.0,383.0,353.0,318.0,325.0,319.0,143.0,454.0
10238,2015-01-10,376.0,375.0,383.0,353.0,318.0,325.0,319.0,143.0
10239,2015-01-11,379.0,376.0,375.0,383.0,353.0,318.0,325.0,319.0
10240,2015-01-12,375.0,379.0,376.0,375.0,383.0,353.0,318.0,325.0


In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [5]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((19874, 7), (4969, 7), (19874,), (4969,))

In [6]:
X_train


Unnamed: 0,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7
10236,353.0,318.0,325.0,319.0,143.0,454.0,472.0
10237,383.0,353.0,318.0,325.0,319.0,143.0,454.0
10238,375.0,383.0,353.0,318.0,325.0,319.0,143.0
10239,376.0,375.0,383.0,353.0,318.0,325.0,319.0
10240,379.0,376.0,375.0,383.0,353.0,318.0,325.0
...,...,...,...,...,...,...,...
2864,344.0,129.0,180.0,362.0,238.0,209.0,423.0
14372,185.0,344.0,129.0,180.0,362.0,238.0,209.0
19637,170.0,185.0,344.0,129.0,180.0,362.0,238.0
7321,183.0,170.0,185.0,344.0,129.0,180.0,362.0


In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score
import numpy as np

# Train
model = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=5)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}")


R² Score: -0.01
MAE: 67.14, RMSE: 90.25


In [8]:
import optuna
from sklearn.model_selection import cross_val_score, KFold


  from .autonotebook import tqdm as notebook_tqdm


In [13]:
def objective_xgb(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10)
    }

    model = XGBRegressor(**params, random_state=42, n_jobs=-1)
    cv = KFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring="neg_root_mean_squared_error")
    return -scores.mean()

study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_xgb, n_trials=10, n_jobs=-1)

print(" Best XGBoost params:", study_xgb.best_params)
print(" Best XGBoost RMSE:", study_xgb.best_value)

best_xgb = XGBRegressor(**study_xgb.best_params, random_state=42, n_jobs=-1)
best_xgb.fit(X_train, y_train)
y_pred_best_xgb = best_xgb.predict(X_test)

print("\n Tuned XGBoost Performance:")
print("MAE:", mean_absolute_error(y_test, y_pred_best_xgb))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_best_xgb)))


[I 2025-09-05 15:05:25,981] A new study created in memory with name: no-name-2aacff1e-a35d-43a5-aab0-1987ed13595e
[I 2025-09-05 15:05:35,249] Trial 2 finished with value: 142.91347973147563 and parameters: {'n_estimators': 349, 'learning_rate': 0.0742863226426501, 'max_depth': 4, 'subsample': 0.9810608638997096, 'colsample_bytree': 0.983589480651408, 'gamma': 1.0226553857313436, 'min_child_weight': 6}. Best is trial 2 with value: 142.91347973147563.
[I 2025-09-05 15:05:40,474] Trial 3 finished with value: 142.57909555237424 and parameters: {'n_estimators': 506, 'learning_rate': 0.04792110435929606, 'max_depth': 4, 'subsample': 0.9227031025789887, 'colsample_bytree': 0.7192502780200187, 'gamma': 3.2837719448121567, 'min_child_weight': 7}. Best is trial 3 with value: 142.57909555237424.
[I 2025-09-05 15:06:17,358] Trial 5 finished with value: 148.9757356360469 and parameters: {'n_estimators': 567, 'learning_rate': 0.08311120704202002, 'max_depth': 9, 'subsample': 0.9007826844103579, 'col

 Best XGBoost params: {'n_estimators': 929, 'learning_rate': 0.013071328271443933, 'max_depth': 6, 'subsample': 0.9578970630220798, 'colsample_bytree': 0.7355857824402744, 'gamma': 2.509923847158071, 'min_child_weight': 1}
 Best XGBoost RMSE: 142.1625490874041

 Tuned XGBoost Performance:
MAE: 66.06560930454364
RMSE: 88.58696033729669


In [16]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score
import numpy as np

# Train
model = XGBRegressor(
    n_estimators=929,
    learning_rate=0.013071328271443933,
    max_depth=6,
    subsample=0.9578970630220798,
    colsample_bytree=0.7355857824402744,
    gamma=2.509923847158071,
    min_child_weight=1,
    random_state=42
)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}")


MAE: 66.07, RMSE: 88.59


In [17]:
def forecast_future(model, last_known, steps=7):
    predictions = []
    data = last_known.copy()

    for _ in range(steps):
        pred = model.predict(data.values.reshape(1, -1))[0]
        predictions.append(pred)

        # update lags
        data = data.shift(1)
        data.iloc[0] = pred

    return predictions

# Last known values (from test set)
last_known = X_test.iloc[-1]
future_preds = forecast_future(model, last_known, steps=7)

print("Next 7 days forecast:", future_preds)


Next 7 days forecast: [np.float32(118.7058), np.float32(110.15237), np.float32(129.33633), np.float32(108.301605), np.float32(118.57194), np.float32(140.76495), np.float32(167.93121)]


In [18]:
import joblib
# Save model
joblib.dump(model, "models/aqi_forecast_xgb.joblib")
print("✅ Model saved at models/aqi_forecast_xgb.joblib")

✅ Model saved at models/aqi_forecast_xgb.joblib
