# AQI Estimation Model (Colab)

This notebook builds a full AQI estimation pipeline:
1. Load & preprocess raw OpenAQ-like data
2. Aggregate/pivot to wide format + compute deterministic AQI labels
3. Feature engineering for estimation (no location-name/city identifiers as features, to avoid leakage; latitude/longitude coordinates are kept)
4. Train/compare models, select best, export artifacts


In [None]:
# === Clone repo (Colab) ===
# Fetch the project sources, then make the cloned repository the working directory:
# the setup cell derives ROOT from the current working directory and imports `src` from it.
!git clone https://github.com/AshVenn/openaq-aqi-predictor.git
%cd openaq-aqi-predictor


In [None]:
# === Setup ===
import json
import sys
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline

# Project root: the nearest directory, starting at the current working directory and
# walking upward, that contains the `src` package. This works both when the kernel
# runs from the repository root (e.g. after `%cd` into the clone) and when it runs
# from a sub-directory such as notebooks/. Falls back to the working directory when
# no `src` directory is found, which matches the previous behaviour.
_cwd = Path.cwd()
ROOT = next(
    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / "src").is_dir()),
    _cwd,
)
# Put the project root first on sys.path so `from src import ...` resolves to this checkout.
sys.path.insert(0, str(ROOT))

from src import preprocessing
from src.aqi import compute_aqi_dataframe
from src.evaluate import regression_metrics
from src.features import add_time_features

# --- Tunable configuration (read by the cells below) ---
# Pollutant columns used as model inputs; also drives the `<pollutant>_is_missing` flags.
INPUT_POLLUTANTS = ["pm25", "pm10", "no2", "o3", "co", "so2"]
# When True, observed pollutant values are randomly blanked out before training.
SIMULATE_MISSINGNESS = True
# Probability that each observed pollutant value is blanked out by the simulation.
MISSING_PROB = 0.2
# Seed shared by the missingness simulation and the seeded estimators.
RANDOM_SEED = 42
# Aggregation window passed as `freq` to preprocessing.aggregate_and_pivot.
# NOTE(review): presumably a pandas offset alias ("D" = daily) - confirm in src/preprocessing.
TRAIN_FREQ = "D"
# Fraction of rows (the latest ones by timestamp) held out as the test set.
TEST_SIZE = 0.2


## 1) Load & preprocess raw data

In [None]:
# Location of the raw measurements file inside the repository checkout.
raw_path = ROOT / "data" / "openaq.csv"

# Load the raw file, then run the project's cleaning step on it.
# The path is passed as a plain string rather than a Path object.
raw_df = preprocessing.load_raw_data(str(raw_path))
clean_df = preprocessing.clean_raw_data(raw_df)

# Row counts before/after cleaning (clean_df's count is also reused in the report cell).
print(f"Raw rows: {len(raw_df):,}")
print(f"Clean rows: {len(clean_df):,}")


## 2) Aggregate/pivot + compute deterministic AQI label

In [None]:
# Aggregate the cleaned measurements at the configured window and pivot to wide format.
# NOTE(review): presumably one column per pollutant after the pivot - confirm in src/preprocessing.
wide_df = preprocessing.aggregate_and_pivot(clean_df, freq=TRAIN_FREQ)

# Compute deterministic AQI label (US EPA breakpoints)
aqi_df = compute_aqi_dataframe(wide_df)

# Persist processed datasets
# Both CSVs are written next to the raw data, without the DataFrame index.
processed_wide_path = ROOT / "data" / "processed_wide.csv"
processed_aqi_path = ROOT / "data" / "processed_aqi.csv"
wide_df.to_csv(processed_wide_path, index=False)
aqi_df.to_csv(processed_aqi_path, index=False)

print(f"Saved: {processed_wide_path}")
print(f"Saved: {processed_aqi_path}")


## 3) Feature engineering for AQI estimation (no leakage)

In [None]:
def ensure_pollutant_columns(df, pollutant_cols):
    """Return a copy of ``df`` that has a column for every requested pollutant.

    Pollutants absent from ``df`` are appended, in the order given, as all-NaN
    columns. Existing columns and the input frame itself are left untouched.
    """
    completed = df.copy()
    absent = [name for name in pollutant_cols if name not in completed.columns]
    for name in absent:
        completed[name] = np.nan
    return completed


def simulate_missingness(df, pollutant_cols, missing_prob, random_seed):
    """Return a copy of ``df`` with observed pollutant values randomly blanked out.

    Each non-null value in ``pollutant_cols`` is replaced by NaN with probability
    ``missing_prob``, using a generator seeded with ``random_seed``. A row that
    had at least one observed pollutant never ends up fully empty: if the random
    mask wipes it out, one of its originally observed values (picked at random)
    is restored. Rows with no observed pollutant to begin with stay empty.
    """
    generator = np.random.default_rng(random_seed)
    result = df.copy()
    observed = result[pollutant_cols].copy()

    # Only values that were actually present can be dropped.
    drop_mask = (generator.random(observed.shape) < missing_prob) & observed.notna().values
    result[pollutant_cols] = observed.mask(drop_mask)

    # Give back one original value to every row the mask emptied completely.
    emptied_rows = result.index[result[pollutant_cols].isna().all(axis=1)]
    for row_label in emptied_rows:
        candidates = observed.loc[row_label].dropna()
        if candidates.empty:
            continue
        chosen = generator.choice(candidates.index)
        result.at[row_label, chosen] = observed.at[row_label, chosen]
    return result


def add_missingness_indicators(df, pollutant_cols):
    """Return a copy of ``df`` with one ``<pollutant>_is_missing`` flag per pollutant.

    Each flag is an integer column: 1 where the pollutant value is null, 0 otherwise.
    The input frame is not modified.
    """
    flagged = df.copy()
    for pollutant in pollutant_cols:
        is_null = flagged[pollutant].isna()
        flagged[f"{pollutant}_is_missing"] = is_null.astype(int)
    return flagged


# Start from the labelled frame: make sure every input pollutant has a column,
# then derive the time features from the "timestamp" column.
features_df = add_time_features(
    ensure_pollutant_columns(aqi_df, INPUT_POLLUTANTS),
    time_col="timestamp",
)

# Rows without an AQI label cannot serve as supervised training examples.
features_df = features_df.dropna(subset=["aqi"]).copy()

# Optionally blank out observed pollutant values. This happens after the labels were
# computed, so the target still reflects the complete measurements.
if SIMULATE_MISSINGNESS:
    features_df = simulate_missingness(
        features_df, INPUT_POLLUTANTS, missing_prob=MISSING_PROB, random_seed=RANDOM_SEED
    )

# Flags are added after the masking step so that they describe the masked inputs.
features_df = add_missingness_indicators(features_df, INPUT_POLLUTANTS)

# Model inputs: coordinates, time features, pollutant values, then their missingness flags.
# NOTE(review): "hour", "day_of_week" and "month" are presumably created by add_time_features - confirm.
feature_cols = [
    "latitude",
    "longitude",
    "hour",
    "day_of_week",
    "month",
    *INPUT_POLLUTANTS,
    *(f"{p}_is_missing" for p in INPUT_POLLUTANTS),
]

print("Feature columns:")
print(feature_cols)


## 4) Train, compare, select best model, export artifacts

In [None]:
def time_split(df, time_col="timestamp", test_size=0.2):
    """Split ``df`` chronologically into ``(train, test)`` copies.

    Rows are ordered by ``time_col``; the earliest ``1 - test_size`` share
    (rounded down) becomes the training set and the remainder the test set.
    """
    ordered = df.sort_values(time_col)
    n_train = int(len(ordered) * (1 - test_size))
    train_part = ordered.iloc[:n_train].copy()
    test_part = ordered.iloc[n_train:].copy()
    return train_part, test_part


def build_pipeline(estimator):
    """Wrap ``estimator`` in a two-step pipeline: median imputation, then the model.

    The steps are named "imputer" and "model", so estimator hyper-parameters are
    addressed as ``model__<param>``. The estimator instance is used as given.
    """
    imputation_step = ("imputer", SimpleImputer(strategy="median"))
    model_step = ("model", estimator)
    return Pipeline(steps=[imputation_step, model_step])


# Chronological hold-out: the latest TEST_SIZE share of rows (by "timestamp") is the test set.
train_df, test_df = time_split(features_df, test_size=TEST_SIZE)

# Feature matrices and AQI targets for both partitions (row order stays chronological).
X_train = train_df[feature_cols]
y_train = train_df["aqi"]
X_test = test_df[feature_cols]
y_test = test_df["aqi"]

# Candidate models, keyed by display name. Each entry holds:
#   "estimator"  - the regressor that build_pipeline places in the "model" step
#   "param_grid" - grid for GridSearchCV, or None to fit the model without a search
# Grid keys carry the "model__" prefix because they address the pipeline's "model" step.
model_specs = {
    "LinearRegression": {
        "estimator": LinearRegression(),
        "param_grid": None,
    },
    "Ridge": {
        "estimator": Ridge(random_state=RANDOM_SEED),
        "param_grid": {"model__alpha": [0.1, 1.0, 10.0, 50.0]},
    },
    "Lasso": {
        "estimator": Lasso(random_state=RANDOM_SEED, max_iter=5000),
        "param_grid": {"model__alpha": [0.001, 0.01, 0.1, 1.0]},
    },
    "RandomForest": {
        "estimator": RandomForestRegressor(random_state=RANDOM_SEED, n_jobs=-1),
        # 2 * 3 * 2 * 2 = 24 combinations, each fitted once per CV fold.
        "param_grid": {
            "model__n_estimators": [200, 400],
            "model__max_depth": [None, 10, 20],
            "model__min_samples_split": [2, 5],
            "model__min_samples_leaf": [1, 2],
        },
    },
    "GradientBoosting": {
        "estimator": GradientBoostingRegressor(random_state=RANDOM_SEED),
        "param_grid": {
            "model__n_estimators": [100, 200],
            "model__learning_rate": [0.05, 0.1],
            "model__max_depth": [2, 3],
        },
    },
    "HistGradientBoosting": {
        "estimator": HistGradientBoostingRegressor(random_state=RANDOM_SEED),
        "param_grid": {
            "model__max_depth": [3, 6, None],
            "model__learning_rate": [0.05, 0.1],
            "model__max_iter": [200, 400],
        },
    },
}

# Fit every candidate, tuning it with time-ordered cross-validation when it has a
# parameter grid, and score each fitted pipeline on the held-out test set.
results = []
best_models = {}
cv = TimeSeriesSplit(n_splits=3)

for name, spec in model_specs.items():
    pipeline = build_pipeline(spec["estimator"])

    if not spec["param_grid"]:
        # Nothing to tune: fit the pipeline as configured.
        best_estimator = pipeline.fit(X_train, y_train)
        best_params = None
    else:
        # Pick the grid point with the lowest cross-validated mean absolute error.
        grid = GridSearchCV(
            estimator=pipeline,
            param_grid=spec["param_grid"],
            cv=cv,
            scoring="neg_mean_absolute_error",
            n_jobs=-1,
        )
        grid.fit(X_train, y_train)
        best_estimator = grid.best_estimator_
        best_params = grid.best_params_

    preds = best_estimator.predict(X_test)
    metrics = regression_metrics(y_test, preds)

    results.append(
        {
            "model": name,
            **{key: metrics[key] for key in ("mae", "rmse", "r2")},
            "best_params": best_params,
        }
    )
    best_models[name] = best_estimator

# Lowest MAE first; RMSE breaks ties.
results_df = pd.DataFrame(results).sort_values(by=["mae", "rmse"]).reset_index(drop=True)
results_df


In [None]:
# The comparison table is sorted by MAE (then RMSE), so its first row is the winner.
best_row = results_df.iloc[0]
best_model_name = best_row["model"]
print(f"Best model: {best_model_name}")

# Refit best model on full dataset for export.
# clone() yields an unfitted copy with the same constructor parameters, so this refit
# does not touch the estimator instance stored in model_specs. For a model fitted
# without a grid search, that instance is also the fitted object kept in best_models.
best_spec = model_specs[best_model_name]
best_pipeline = build_pipeline(clone(best_spec["estimator"]))

# "best_params" is a dict for grid-searched models and None otherwise. The explicit
# dict check also covers a missing value surfacing as NaN, which is truthy.
selected_params = best_row["best_params"]
if isinstance(selected_params, dict) and selected_params:
    best_pipeline.set_params(**selected_params)

X_full = features_df[feature_cols]
y_full = features_df["aqi"]
best_pipeline.fit(X_full, y_full)


In [None]:
# === Export artifacts ===
# Everything a consumer needs goes into models/: the fitted pipeline, the ordered
# feature list, and a small metadata file describing how the model was trained.
models_dir = ROOT / "models"
models_dir.mkdir(exist_ok=True)

model_path = models_dir / "aqi_estimator.joblib"
feature_cols_path = models_dir / "feature_cols.json"
model_meta_path = models_dir / "model_meta.json"

joblib.dump(best_pipeline, model_path)

# Column order matters at inference time, so the list is stored exactly as used for training.
feature_cols_path.write_text(json.dumps(feature_cols, indent=2), encoding="utf-8")

model_meta = {
    "best_model_name": best_model_name,
    "input_pollutants": INPUT_POLLUTANTS,
    "features": feature_cols,
    "uses_missingness_indicators": True,
    "time_features": ["hour", "day_of_week", "month"],
    "expects_standard_units": True,
    "trained_freq": TRAIN_FREQ,
    "missingness_simulated": SIMULATE_MISSINGNESS,
    "missing_prob": MISSING_PROB if SIMULATE_MISSINGNESS else None,
}
model_meta_path.write_text(json.dumps(model_meta, indent=2), encoding="utf-8")

print(f"Saved model: {model_path}")
print(f"Saved feature columns: {feature_cols_path}")
print(f"Saved metadata: {model_meta_path}")


In [None]:
# === Write report summary ===
reports_path = ROOT / "reports" / "summary.md"
# The reports/ directory may be absent in a fresh checkout; create it before writing
# (same approach as the models/ directory in the export cell).
reports_path.parent.mkdir(parents=True, exist_ok=True)

# One list entry per output line; the comparison table is embedded as a Markdown table.
# DataFrame.to_markdown relies on the optional `tabulate` package.
summary_lines = [
    "# AQI Estimation Model Summary",
    "",
    "## Dataset",
    f"- Rows after cleaning: {len(clean_df):,}",
    f"- Rows after aggregation: {len(wide_df):,}",
    f"- Rows with AQI labels: {len(features_df):,}",
    f"- Aggregation window: {TRAIN_FREQ}",
    "",
    "## Model Comparison (test set)",
    results_df.to_markdown(index=False),
    "",
    "## Best Model",
    f"- {best_model_name}",
    f"- MAE: {best_row['mae']:.2f}",
    f"- RMSE: {best_row['rmse']:.2f}",
    f"- R2: {best_row['r2']:.3f}",
]

# Join with an escaped newline: a raw line break inside the quotes is a syntax error.
reports_path.write_text("\n".join(summary_lines), encoding="utf-8")
print(f"Wrote report: {reports_path}")
