# AQI Estimation Model (Colab)

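This notebook builds a full AQI estimation pipeline. After a short setup (clone the repo, install dependencies, imports and configuration), it runs the following numbered sections:
1. Load & preprocess raw OpenAQ-like data
2. Aggregate/pivot to wide format + compute deterministic AQI labels
3. Feature engineering for estimation (no leakage)
4. Safe missingness simulation
5. Train/compare models and select the best
6. Export artifacts, write the summary report, and download everything from Colab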


In [None]:
# === Clone repo (Colab) ===
# Fetch the project repository and make it the working directory, so that later
# cells can resolve `src/` and `data/` relative to the current directory.
!git clone https://github.com/AshVenn/openaq-aqi-predictor.git
%cd openaq-aqi-predictor


In [None]:
# === Install deps (Colab) ===
!pip -q install xgboost catboost


In [None]:
# === Setup ===
import json
import sys
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

# Project root: the current working directory (the cloned repo after the `%cd` cell).
ROOT = Path.cwd()
# Put the project root first on the import path so the `src` package below resolves.
sys.path.insert(0, str(ROOT))

from src import preprocessing
from src.aqi import compute_aqi_dataframe
from src.evaluate import regression_metrics
from src.features import add_time_features

# Pollutant column names the pipeline knows about.
POLLUTANTS_ALL = ["pm25", "pm10", "no2", "o3", "co", "so2"]
# Pollutants used as model inputs; re-assigned later to the subset that has data.
INPUT_POLLUTANTS = POLLUTANTS_ALL.copy()
# Whether to randomly blank out observed pollutant readings before training.
SIMULATE_MISSINGNESS = True
# Per-cell probability of blanking an observed reading during the simulation.
MISSING_PROB = 0.2
# Minimum number of observed values the simulation tries to keep per pollutant column.
MIN_NON_MISSING_PER_COL = 50
# Seed passed to the missingness simulation and to the seeded estimators.
RANDOM_SEED = 42
# Aggregation frequency handed to preprocessing.aggregate_and_pivot
# ("D" is presumably the pandas daily alias — confirm against that helper).
TRAIN_FREQ = "D"
# Fraction of the time-ordered rows held out as the test split.
TEST_SIZE = 0.2


## 1) Load & preprocess raw data

In [None]:
# Load the raw CSV from the repo's data folder and run the project's cleaning step.
raw_path = ROOT / "data" / "openaq.csv"
raw_df = preprocessing.load_raw_data(str(raw_path))
clean_df = preprocessing.clean_raw_data(raw_df)

print(f"Raw rows: {len(raw_df):,}")
print(f"Clean rows: {len(clean_df):,}")

# Diagnostics: confirm pm25 rows survived cleaning and how many carry a
# non-null `value_std` (checked only when those columns are present).
if "pollutant" not in clean_df.columns:
    print("No 'pollutant' column found in clean_df")
else:
    pm25_rows = clean_df.loc[clean_df["pollutant"] == "pm25"]
    print(f"pm25 rows in cleaned long data: {len(pm25_rows):,}")
    if "value_std" in pm25_rows.columns:
        print(f"pm25 non-null value_std: {pm25_rows['value_std'].notna().sum():,}")


## 2) Aggregate/pivot + compute deterministic AQI label

In [None]:
# Aggregate the cleaned data at TRAIN_FREQ into the wide table, then attach the
# deterministic AQI label (US EPA breakpoints) computed by the project helper.
wide_df = preprocessing.aggregate_and_pivot(clean_df, freq=TRAIN_FREQ)
aqi_df = compute_aqi_dataframe(wide_df)

# Persist both processed tables next to the raw data.
processed_wide_path = ROOT / "data" / "processed_wide.csv"
processed_aqi_path = ROOT / "data" / "processed_aqi.csv"

for _frame, _target in ((wide_df, processed_wide_path), (aqi_df, processed_aqi_path)):
    _frame.to_csv(_target, index=False)

for _target in (processed_wide_path, processed_aqi_path):
    print(f"Saved: {_target}")


## 3) Feature engineering for AQI estimation (no leakage)

In [None]:
def ensure_pollutant_columns(df, pollutant_cols):
    """Return a copy of ``df`` that has a column for every name in ``pollutant_cols``.

    Columns that are absent are appended (in the order given) and filled with NaN;
    columns that already exist are left untouched. The input frame is not modified.
    """
    padded = df.copy()
    absent = [name for name in pollutant_cols if name not in padded.columns]
    for name in absent:
        padded[name] = np.nan
    return padded


def add_missingness_indicators(df, pollutant_cols):
    """Return a copy of ``df`` with a ``<pollutant>_is_missing`` flag per pollutant.

    Each flag is an integer column: 1 where the pollutant value is NaN, else 0.
    The input frame is not modified.
    """
    flagged = df.copy()
    for name in pollutant_cols:
        is_absent = flagged[name].isna()
        flagged[f"{name}_is_missing"] = is_absent.astype(int)
    return flagged


# Make sure every pollutant column exists, then let the project helper add the
# time-derived columns (presumably the hour/day_of_week/month used below).
features_df = add_time_features(
    ensure_pollutant_columns(aqi_df, POLLUTANTS_ALL),
    time_col="timestamp",
)

# Drop rows without AQI labels
has_label = features_df["aqi"].notna()
features_df = features_df[has_label].copy()

# Observed (non-null) readings per pollutant; a column absent from the frame counts as 0.
non_null_counts = {}
for p in POLLUTANTS_ALL:
    non_null_counts[p] = features_df[p].notna().sum() if p in features_df.columns else 0

print("Non-null counts per pollutant:")
for p, count in non_null_counts.items():
    print(f"{p}: {count}")

# Only pollutants with at least one observation are usable as model inputs.
available_pollutants = [p for p in POLLUTANTS_ALL if non_null_counts[p] > 0]
INPUT_POLLUTANTS = available_pollutants
print(f"Using INPUT_POLLUTANTS: {INPUT_POLLUTANTS}")

# Final feature list: location + time features, raw pollutant values, and one
# missingness flag per input pollutant (the flags themselves are added later).
location_time_cols = ["latitude", "longitude", "hour", "day_of_week", "month"]
indicator_cols = [f"{p}_is_missing" for p in INPUT_POLLUTANTS]
feature_cols = location_time_cols + INPUT_POLLUTANTS + indicator_cols

# The label must never appear among the inputs.
assert "aqi" not in feature_cols
print("Feature columns:")
print(feature_cols)


## 4) Safe missingness simulation

In [None]:
from sklearn.model_selection import TimeSeriesSplit


def simulate_missingness(
    df,
    pollutant_cols,
    missing_prob=0.2,
    min_non_missing_per_col=50,
    seed=42,
):
    """Randomly blank out observed pollutant readings, with two safety rules.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified in place.
    pollutant_cols : list of str
        Columns in which readings may be blanked.
    missing_prob : float
        Independent per-cell probability of blanking an observed value.
    min_non_missing_per_col : int
        Floor on observed values kept per pollutant column (reached whenever the
        original data contains at least that many observations).
    seed : int
        Seed for the ``numpy.random.RandomState`` that drives every random choice.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where some pollutant cells are NaN. Values that remain are
        always the original values; cells that were NaN originally stay NaN.

    Notes
    -----
    Row and column lookups are label-based, so the index is assumed to be unique.
    """
    df = df.copy()
    rng = np.random.RandomState(seed)

    original = df[pollutant_cols].copy()

    # Apply random missingness per cell (only where value exists)
    mask = rng.rand(*original.shape) < missing_prob
    mask = mask & original.notna().values
    df[pollutant_cols] = original.mask(mask)

    # Row rule: ensure at least one pollutant remains per row if any were present
    all_missing = df[pollutant_cols].isna().all(axis=1)
    for idx in df.index[all_missing]:
        available = original.loc[idx].dropna()
        if not available.empty:
            restore_col = rng.choice(available.index)
            df.at[idx, restore_col] = original.at[idx, restore_col]

    # Column rule: ensure minimum non-missing per pollutant (global)
    for p in pollutant_cols:
        current_non_missing = df[p].notna().sum()
        if current_non_missing >= min_non_missing_per_col:
            continue
        # Only cells that were observed originally AND are blank right now can
        # raise the count. Sampling from every originally observed row would waste
        # restores on rows that are still observed and miss the floor.
        restorable = df.index[(original[p].notna() & df[p].isna()).to_numpy()]
        if len(restorable) == 0:
            continue
        needed = int(min_non_missing_per_col - current_non_missing)
        restore_rows = rng.choice(
            restorable, size=min(needed, len(restorable)), replace=False
        )
        df.loc[restore_rows, p] = original.loc[restore_rows, p]

    return df


def enforce_non_missing_per_fold(df, original, pollutant_cols, n_splits=3, seed=42):
    """Restore one reading per pollutant in any training fold where it is entirely NaN.

    Folds come from ``TimeSeriesSplit(n_splits)`` applied to row positions
    ``0..len(df)-1``, i.e. to ``df`` in its current row order. For each training
    fold and pollutant: if the fold has no observed value in ``df`` but
    ``original`` does, one randomly chosen position is copied back from
    ``original``. ``original`` is addressed positionally, so it must be
    row-aligned with ``df``. Returns a modified copy; ``df`` itself is untouched.
    """
    patched = df.copy()
    rng = np.random.RandomState(seed)
    splitter = TimeSeriesSplit(n_splits=n_splits)
    positions = np.arange(len(patched))

    for fold_train, _fold_holdout in splitter.split(positions):
        for col in pollutant_cols:
            # Nothing to do when the fold already has at least one observed value.
            if patched.iloc[fold_train][col].notna().any():
                continue

            # Positions inside this fold where the untouched data has a value.
            observed_in_original = original.iloc[fold_train][col].notna().values
            restorable = fold_train[observed_in_original]
            if len(restorable) == 0:
                continue

            chosen = rng.choice(restorable, size=1, replace=False)
            col_pos = patched.columns.get_loc(col)
            patched.iloc[chosen, col_pos] = original.iloc[chosen][col].values

    return patched


# Snapshot of the untouched pollutant values, taken BEFORE simulation so that
# blanked cells can be restored from it afterwards.
original_pollutants = features_df[INPUT_POLLUTANTS].copy()

# Skip the simulation entirely when it is disabled or there is nothing to blank.
if SIMULATE_MISSINGNESS and INPUT_POLLUTANTS:
    features_df = simulate_missingness(
        features_df,
        INPUT_POLLUTANTS,
        missing_prob=MISSING_PROB,
        min_non_missing_per_col=MIN_NON_MISSING_PER_COL,
        seed=RANDOM_SEED,
    )

    # Ensure no fold has a fully-missing pollutant column
    # NOTE(review): this check splits ALL rows of features_df in their current
    # order, while model selection later runs TimeSeriesSplit on the
    # timestamp-sorted training split only — confirm the two actually line up.
    features_df = enforce_non_missing_per_fold(
        features_df,
        original_pollutants,
        INPUT_POLLUTANTS,
        n_splits=3,
        seed=RANDOM_SEED,
    )

# Missingness indicators AFTER simulation
# (so the flags describe the values the model will actually see).
features_df = add_missingness_indicators(features_df, INPUT_POLLUTANTS)

# Sanity: no pollutant column is fully missing
for p in INPUT_POLLUTANTS:
    assert features_df[p].notna().sum() > 0, f"{p} is fully missing"


## 5) Train, tune, compare models

In [None]:
def time_split(df, time_col="timestamp", test_size=0.2):
    """Split ``df`` chronologically into ``(train, test)`` copies.

    Rows are ordered by ``time_col``; the first ``int(len(df) * (1 - test_size))``
    rows form the training part and the remainder the test part. The input frame
    keeps its original order.
    """
    ordered = df.sort_values(time_col)
    n_train = int(len(ordered) * (1 - test_size))
    train_part = ordered.iloc[:n_train].copy()
    test_part = ordered.iloc[n_train:].copy()
    return train_part, test_part


def build_pipeline(estimator):
    """Wrap ``estimator`` in a two-step pipeline: median imputation, then the model.

    The steps are named ``"imputer"`` and ``"model"``; the estimator object is
    used as passed in (it is not copied).
    """
    median_imputer = SimpleImputer(strategy="median")
    stages = [("imputer", median_imputer), ("model", estimator)]
    return Pipeline(steps=stages)


# Chronological hold-out: the latest TEST_SIZE share of rows becomes the test split.
train_df, test_df = time_split(features_df, test_size=TEST_SIZE)

# Feature matrices and AQI targets for each split.
X_train, y_train = train_df[feature_cols], train_df["aqi"]
X_test, y_test = test_df[feature_cols], test_df["aqi"]

print("Non-missing counts in TRAIN split:")
train_non_missing = X_train[INPUT_POLLUTANTS].notna().sum()
print(train_non_missing)

# Candidate regressors and the hyperparameter grid searched for each one.
# Grid keys carry the `model__` prefix because every estimator is wrapped as the
# "model" step of the pipeline returned by build_pipeline. A `param_grid` of
# None means the model is fitted once with its defaults, without a grid search.
model_specs = {
    "LinearRegression": {
        "estimator": LinearRegression(),
        "param_grid": None,
    },
    "Ridge": {
        "estimator": Ridge(random_state=RANDOM_SEED),
        "param_grid": {"model__alpha": [0.1, 1.0, 10.0, 100.0]},
    },
    "Lasso": {
        "estimator": Lasso(random_state=RANDOM_SEED, max_iter=5000),
        "param_grid": {"model__alpha": [0.001, 0.01, 0.1, 1.0]},
    },
    "RandomForest": {
        "estimator": RandomForestRegressor(random_state=RANDOM_SEED, n_jobs=-1),
        "param_grid": {
            "model__n_estimators": [200, 400],
            "model__max_depth": [None, 10, 20],
            "model__min_samples_split": [2, 5],
            "model__min_samples_leaf": [1, 2],
        },
    },
    "XGBoost": {
        "estimator": XGBRegressor(
            objective="reg:squarederror",
            random_state=RANDOM_SEED,
            n_jobs=-1,
        ),
        "param_grid": {
            "model__n_estimators": [300, 600],
            "model__max_depth": [4, 6, 8],
            "model__learning_rate": [0.05, 0.1],
            "model__subsample": [0.8, 1.0],
            "model__colsample_bytree": [0.8, 1.0],
        },
    },
    "CatBoost": {
        "estimator": CatBoostRegressor(
            loss_function="RMSE",
            random_seed=RANDOM_SEED,
            verbose=False,
        ),
        "param_grid": {
            "model__depth": [6, 8, 10],
            "model__learning_rate": [0.03, 0.1],
            "model__iterations": [500, 1000],
        },
    },
}

# One result row per model, plus the fitted pipeline for each, keyed by model name.
results = []
best_models = {}
cv = TimeSeriesSplit(n_splits=3)

for name, spec in model_specs.items():
    candidate = build_pipeline(spec["estimator"])
    grid_params = spec["param_grid"]

    if not grid_params:
        # No grid: a single fit with the estimator's configured parameters.
        best_estimator = candidate.fit(X_train, y_train)
        best_params = None
    else:
        # Time-ordered CV on the training split, selecting by mean absolute error.
        search = GridSearchCV(
            candidate,
            param_grid=grid_params,
            cv=cv,
            scoring="neg_mean_absolute_error",
            n_jobs=-1,
        )
        search.fit(X_train, y_train)
        best_estimator, best_params = search.best_estimator_, search.best_params_

    # Score the selected configuration on the held-out test split.
    scores = regression_metrics(y_test, best_estimator.predict(X_test))

    row = {"model": name}
    for metric_name in ("mae", "rmse", "r2"):
        row[metric_name] = scores[metric_name]
    row["best_params"] = best_params
    results.append(row)
    best_models[name] = best_estimator

# Rank models by test MAE, breaking ties on RMSE; the first row is the winner.
results_df = pd.DataFrame(results).sort_values(["mae", "rmse"]).reset_index(drop=True)
results_df


In [None]:
# The comparison table is sorted best-first, so row 0 is the selected model.
best_row = results_df.iloc[0]
best_model_name = best_row["model"]
best_params = best_row["best_params"]

print(f"Best model: {best_model_name}")
print(f"Best params: {best_params}")
print(f"Best metrics: MAE={best_row['mae']:.3f}, RMSE={best_row['rmse']:.3f}, R2={best_row['r2']:.3f}")

# Refit best model on full dataset for export.
# Clone first: build_pipeline wraps the very estimator object stored in
# model_specs, and for a spec without a grid that object was fitted in place
# during the comparison and is still held by best_models. Refitting it here
# would silently overwrite the evaluated model; clone() gives an unfitted copy
# with identical parameters.
best_spec = model_specs[best_model_name]
best_pipeline = clone(build_pipeline(best_spec["estimator"]))
# best_params is None for models fitted without a grid search.
if isinstance(best_params, dict) and best_params:
    best_pipeline.set_params(**best_params)

X_full = features_df[feature_cols]
y_full = features_df["aqi"]
best_pipeline.fit(X_full, y_full)


## 6) Export artifacts + Colab downloads

In [None]:
# === Export artifacts ===
# Everything needed to serve the model is written under <repo>/models.
models_dir = ROOT / "models"
models_dir.mkdir(exist_ok=True)

model_path = models_dir / "aqi_estimator.joblib"
feature_cols_path = models_dir / "feature_cols.json"
model_meta_path = models_dir / "model_meta.json"

# Fitted pipeline (imputer + model) refit on the full dataset.
joblib.dump(best_pipeline, model_path)

# Exact feature order the pipeline was trained with.
feature_cols_path.write_text(json.dumps(feature_cols, indent=2), encoding="utf-8")

# Metadata describing the selected model and the inputs it expects.
# The metrics are the test-split scores from the comparison table.
model_meta = {
    "best_model_name": best_model_name,
    "best_params": best_params,
    "metrics": {key: float(best_row[key]) for key in ("mae", "rmse", "r2")},
    "input_pollutants": INPUT_POLLUTANTS,
    "time_features": ["hour", "day_of_week", "month"],
    "uses_missingness_indicators": True,
    "expects_standard_units": True,
}
model_meta_path.write_text(json.dumps(model_meta, indent=2), encoding="utf-8")

print(f"Saved model: {model_path}")
print(f"Saved feature columns: {feature_cols_path}")
print(f"Saved metadata: {model_meta_path}")


In [None]:
# === Write report summary ===
reports_path = ROOT / "reports" / "summary.md"
# write_text does not create missing directories, so make sure reports/ exists.
reports_path.parent.mkdir(parents=True, exist_ok=True)

# One list entry per Markdown line; "" entries become blank separator lines.
# NOTE: DataFrame.to_markdown relies on the optional `tabulate` package.
summary_lines = [
    "# AQI Estimation Model Summary",
    "",
    "## Dataset",
    f"- Rows after cleaning: {len(clean_df):,}",
    f"- Rows after aggregation: {len(wide_df):,}",
    f"- Rows with AQI labels: {len(features_df):,}",
    f"- Aggregation window: {TRAIN_FREQ}",
    "",
    "## Model Comparison (test set)",
    results_df.to_markdown(index=False),
    "",
    "## Best Model",
    f"- {best_model_name}",
    f"- MAE: {best_row['mae']:.2f}",
    f"- RMSE: {best_row['rmse']:.2f}",
    f"- R2: {best_row['r2']:.3f}",
]

# Join with newlines: joining on "" would run every heading, bullet and table
# row together into one unreadable line.
reports_path.write_text("\n".join(summary_lines) + "\n", encoding="utf-8")
print(f"Wrote report: {reports_path}")


In [None]:
# === Zip + download artifacts in Colab ===
from google.colab import files
import zipfile

# Bundle the three model files under models/ and the report under reports/.
zip_path = ROOT / "aqi_artifacts.zip"
model_artifacts = (model_path, feature_cols_path, model_meta_path)

with zipfile.ZipFile(zip_path, "w") as zf:
    for artifact in model_artifacts:
        zf.write(artifact, arcname=f"models/{artifact.name}")
    zf.write(reports_path, arcname="reports/summary.md")

# Offer each file individually, then the combined archive.
for artifact in (*model_artifacts, reports_path, zip_path):
    files.download(str(artifact))
