# OpenAQ AQI Prediction — Full Colab Notebook

End-to-end pipeline:
- Load `openaq.csv` from Google Drive
- Use `preprocessing.py` from the cloned GitHub repo to clean/standardize
- Pivot long → wide (pollutants as columns)
- Compute AQI (US EPA breakpoints) from standardized pollutant concentrations
- Build time + lag features
- Train baseline + RandomForest with time-series CV

**Expected Drive path:** `/content/drive/MyDrive/data_set/openaq.csv`  
**Outputs saved to:** `/content/drive/MyDrive/data/`

In [None]:
# =============================
# 0) Setup: Drive + Repo
# =============================
# Colab-only: this cell relies on the google.colab package and on IPython "!" shell syntax,
# so it will not run as a plain Python script.
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and output paths used later live under it.
drive.mount('/content/drive')

# Clone repo (safe if already cloned): `test -d` succeeds when the directory already exists,
# which short-circuits the `||` and skips `git clone`.
!test -d /content/openaq-aqi-predictor || git clone https://github.com/AshVenn/openaq-aqi-predictor.git

from pathlib import Path
import sys

# Location of the cloned repo and its src/ directory.
REPO_ROOT = Path("/content/openaq-aqi-predictor").resolve()
SRC_DIR = REPO_ROOT / "src"
# Put src/ first on sys.path so later cells can `import preprocessing` / `from features import ...`
# by bare name (presumably those modules live in src/ — confirm against the repo layout).
# NOTE: re-running this cell inserts the same entry again.
sys.path.insert(0, str(SRC_DIR))

# Paths
# Input CSV on the mounted Drive, and the directory that receives the processed CSV outputs.
DATA_PATH = Path("/content/drive/MyDrive/data_set/openaq.csv").resolve()
OUTPUT_DIR = Path("/content/drive/MyDrive/data").resolve()
# Create the output directory (and any missing parents); no error if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Quick existence checks: a False here means the clone or the Drive path is not as expected.
print("Repo:", REPO_ROOT.exists(), "| src:", SRC_DIR.exists())
print("Dataset:", DATA_PATH.exists())
print("Output dir:", OUTPUT_DIR)


In [None]:
# =============================
# 1) Imports
# =============================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Fixed seed for reproducibility: seeds NumPy's global RNG here and is reused as
# random_state for the RandomForest in the modeling cell.
SEED = 42
np.random.seed(SEED)


In [None]:
# =============================
# 2) Load + clean (from repo preprocessing.py)
# =============================
import importlib
import preprocessing
# Reload so that edits to preprocessing.py are picked up without restarting the runtime.
importlib.reload(preprocessing)

# Import after the reload so these names bind to the freshly loaded module's functions.
from preprocessing import load_raw_data, clean_raw_data

# Both helpers come from the repo; their behavior is not visible in this notebook.
# Later cells expect clean_df to carry at least: city, location, latitude, longitude,
# timestamp, pollutant, value_std.
raw_df = load_raw_data(str(DATA_PATH))
clean_df = clean_raw_data(raw_df)

print("raw_df:", raw_df.shape)
print("clean_df:", clean_df.shape)
# display() is provided by the IPython/Colab environment; it is not imported in the visible cells.
display(clean_df.head())
print("Columns:", list(clean_df.columns))

# Save long cleaned data
processed_long_path = OUTPUT_DIR / "processed_long.csv"
clean_df.to_csv(processed_long_path, index=False)
print("Saved:", processed_long_path)


In [None]:
# =============================
# 3) Quick sanity plots (long format)
# =============================
# Parse timestamps in place; errors="coerce" turns unparseable values into NaT instead of raising.
clean_df["timestamp"] = pd.to_datetime(clean_df["timestamp"], errors="coerce")

# Number of measurement rows per calendar date (all pollutants and locations combined).
counts = clean_df.groupby(clean_df["timestamp"].dt.date).size()
plt.figure(figsize=(10,4))
counts.plot()
plt.title("Daily Measurement Counts")
plt.xlabel("Date")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

# Pollutant distribution (counts)
# Only the 15 most frequent pollutant labels are plotted.
plt.figure(figsize=(8,4))
clean_df["pollutant"].value_counts().head(15).plot(kind="bar")
plt.title("Top pollutants by count")
plt.tight_layout()
plt.show()


## AQI computation
We compute AQI from pollutant concentrations using **US EPA** breakpoints.

- Compute **sub-index (IAQI)** for each pollutant using linear interpolation
- Final **AQI = max(IAQI)** across pollutants

This notebook assumes `value_std` is already standardized by your `clean_raw_data()` step.
If a pollutant is missing at a timestamp/location, it’s ignored for the max.

In [None]:
# =============================
# 4) Pivot long -> wide (ML-ready table)
# Your clean_df columns: ['city','location','latitude','longitude','timestamp','pollutant','value_std','unit_std','source_name']
# =============================
# Pollutants that get their own column in the wide table.
wanted_pollutants = ["pm25", "pm10", "no2", "o3", "co", "so2"]

# Build a normalized copy of the long table (clean_df itself is left untouched):
# timestamps parsed (bad values -> NaT) and pollutant labels lower-cased strings.
df = clean_df.assign(
    timestamp=pd.to_datetime(clean_df["timestamp"], errors="coerce"),
    pollutant=clean_df["pollutant"].astype(str).str.lower(),
)
df = df.loc[df["pollutant"].isin(wanted_pollutants)].copy()

# One row per (city, location, latitude, longitude, timestamp), one column per pollutant;
# duplicate readings for the same key and pollutant are averaged.
features_df = pd.pivot_table(
    df,
    index=["city", "location", "latitude", "longitude", "timestamp"],
    columns="pollutant",
    values="value_std",          # IMPORTANT: standardized numeric value
    aggfunc="mean",
).reset_index()
# Drop the leftover "pollutant" label on the column axis.
features_df.columns.name = None

print("features_df:", features_df.shape)
display(features_df.head())


In [None]:
# =============================
# 5) AQI breakpoints (US EPA) + helpers
# Units (typical US EPA AQI):
# - PM2.5: µg/m³ (24h) [we treat your value_std as matching]
# - PM10 : µg/m³ (24h)
# - O3   : ppb (8h) or ppm; here use ppb-style breakpoints for simplicity
# - NO2  : ppb (1h)
# - SO2  : ppb (1h)
# - CO   : ppm (8h)
#
# NOTE: If your 'value_std' is in different units, you must convert accordingly.
# compute_aqi_row() below applies a simple ppm -> ppb heuristic (x1000) to O3/NO2/SO2
# values below 1.0; CO, PM2.5 and PM10 are used as-is and are never converted.
# =============================
from typing import Dict, List, Tuple, Optional

# AQI index bands as (low, high) pairs; they repeat the (Ilo, Ihi) values used in BREAKPOINTS.
# NOTE(review): not referenced by any code visible in this notebook.
AQI_BANDS = [
    (0, 50),
    (51, 100),
    (101, 150),
    (151, 200),
    (201, 300),
    (301, 400),
    (401, 500),
]

# Breakpoints: pollutant -> list of (BPlo, BPhi, Ilo, Ihi)
# These are commonly used US EPA breakpoint ranges.
# Each tuple maps the concentration range [BPlo, BPhi] onto the AQI index range [Ilo, Ihi];
# bands are listed in ascending order of concentration.
# Consecutive bands leave a small gap (e.g. 12.0 -> 12.1, 54 -> 55); presumably concentrations
# are expected to be truncated to the table's precision before lookup — TODO confirm.
BREAKPOINTS: Dict[str, List[Tuple[float, float, int, int]]] = {
    # PM2.5 in µg/m³ (24h).
    # NOTE(review): these bands start at 0.0-12.0; I believe EPA revised its PM2.5 breakpoints
    # in 2024 — confirm which table this project intends to follow.
    "pm25": [
        (0.0, 12.0, 0, 50),
        (12.1, 35.4, 51, 100),
        (35.5, 55.4, 101, 150),
        (55.5, 150.4, 151, 200),
        (150.5, 250.4, 201, 300),
        (250.5, 350.4, 301, 400),
        (350.5, 500.4, 401, 500),
    ],
    # PM10 in µg/m³ (24h).
    "pm10": [
        (0, 54, 0, 50),
        (55, 154, 51, 100),
        (155, 254, 101, 150),
        (255, 354, 151, 200),
        (355, 424, 201, 300),
        (425, 504, 301, 400),
        (505, 604, 401, 500),
    ],
    # O3 8-hour (ppm) breakpoints often used; here we store in ppb for convenience:
    # 0.054 ppm = 54 ppb, etc.
    # Only five bands are defined for O3; the highest one tops out at index 300.
    "o3": [
        (0, 54, 0, 50),
        (55, 70, 51, 100),
        (71, 85, 101, 150),
        (86, 105, 151, 200),
        (106, 200, 201, 300),
    ],
    # NO2 in ppb (1h).
    "no2": [
        (0, 53, 0, 50),
        (54, 100, 51, 100),
        (101, 360, 101, 150),
        (361, 649, 151, 200),
        (650, 1249, 201, 300),
        (1250, 1649, 301, 400),
        (1650, 2049, 401, 500),
    ],
    # SO2 in ppb (1h).
    "so2": [
        (0, 35, 0, 50),
        (36, 75, 51, 100),
        (76, 185, 101, 150),
        (186, 304, 151, 200),
        (305, 604, 201, 300),
        (605, 804, 301, 400),
        (805, 1004, 401, 500),
    ],
    # CO in ppm (8h)
    "co": [
        (0.0, 4.4, 0, 50),
        (4.5, 9.4, 51, 100),
        (9.5, 12.4, 101, 150),
        (12.5, 15.4, 151, 200),
        (15.5, 30.4, 201, 300),
        (30.5, 40.4, 301, 400),
        (40.5, 50.4, 401, 500),
    ],
}

def iaqi_from_breakpoints(c: float, bps: List[Tuple[float, float, int, int]]) -> Optional[float]:
    """Convert a pollutant concentration into its AQI sub-index (IAQI).

    Inside a band the sub-index is linearly interpolated:

        IAQI = (Ihi - Ilo) / (BPhi - BPlo) * (C - BPlo) + Ilo

    Args:
        c: Concentration, in the same unit as ``bps``. ``None`` or NaN means
            "no measurement".
        bps: Breakpoint bands as ``(BPlo, BPhi, Ilo, Ihi)`` tuples, sorted in
            ascending order of concentration.

    Returns:
        The sub-index as a float. A value that falls in the small gap between
        two adjacent bands (e.g. 12.05 between 12.0 and 12.1) is assigned the
        top index of the lower band, which is what truncating the value to the
        table's precision would give. Values above the highest band are capped
        at ``500.0``. ``None`` is returned for a missing value, a value below
        the first band, or an empty ``bps``.
    """
    if c is None or (isinstance(c, float) and np.isnan(c)):
        return None
    if not bps:
        # No table to look the value up in.
        return None

    lower_band_top = None  # Ihi of the band just below the one being examined
    for bp_lo, bp_hi, i_lo, i_hi in bps:
        if c < bp_lo:
            # Below the first band (e.g. a negative reading): no valid index.
            if lower_band_top is None:
                return None
            # Between two bands: previously this matched nothing and the
            # measurement was silently dropped from the AQI.
            return float(lower_band_top)
        if c <= bp_hi:
            return (i_hi - i_lo) / (bp_hi - bp_lo) * (c - bp_lo) + i_lo
        lower_band_top = i_hi

    # Out of range: cap to 500 if above highest
    if c > bps[-1][1]:
        return 500.0
    # Only reachable for values that compare False with everything (non-float NaN).
    return None

def compute_aqi_row(row: pd.Series, pollutants: List[str]) -> float:
    """Return the overall AQI for one wide-format row.

    The AQI is the maximum sub-index over the given pollutants. A pollutant is
    skipped when it has no breakpoint table, is absent from the row, or yields
    no sub-index (missing or out-of-table value).

    Args:
        row: One row of the wide table, indexed by column name.
        pollutants: Column names to consider.

    Returns:
        The maximum sub-index as a float, or NaN when no pollutant produced one.
    """
    # Gases whose table is in ppb: a reading below 1.0 is taken to be in ppm
    # and scaled by 1000 (heuristic).
    ppm_suspects = ("o3", "no2", "so2")

    sub_indices = []
    for name in pollutants:
        if not (name in BREAKPOINTS and name in row.index):
            continue
        reading = row[name]
        if name in ppm_suspects and pd.notna(reading) and reading < 1.0:
            reading = reading * 1000.0  # ppm -> ppb heuristic
        sub_index = iaqi_from_breakpoints(float(reading), BREAKPOINTS[name])
        if sub_index is None:
            continue
        sub_indices.append(sub_index)

    if not sub_indices:
        return np.nan
    return float(np.max(sub_indices))


In [None]:
# =============================
# 6) Compute AQI target
# =============================
# Pollutant columns that actually exist in the wide table, in a fixed order.
pollutant_cols_present = [
    name
    for name in ("pm25", "pm10", "no2", "o3", "co", "so2")
    if name in features_df.columns
]
print("Pollutant cols present:", pollutant_cols_present)

# Row-wise AQI; DataFrame.apply forwards the extra keyword argument to compute_aqi_row.
features_df["aqi"] = features_df.apply(
    compute_aqi_row, axis=1, pollutants=pollutant_cols_present
)

# Share of rows for which no pollutant produced a sub-index.
aqi_missing = features_df["aqi"].isna()
print("AQI null rate:", aqi_missing.mean())
display(features_df[["timestamp", "location", "aqi", *pollutant_cols_present]].head())


In [None]:
# =============================
# 7) Add time + lag features (from repo features.py)
# =============================
from features import add_time_features, add_lag_features, build_feature_columns

# add_time_features / add_lag_features / build_feature_columns come from the repo's features
# module; their exact outputs are not visible in this notebook.
# Presumably adds calendar columns such as hour / day_of_week / month (the EDA cell looks for
# those names) — confirm in features.py.
features_df = add_time_features(features_df, time_col="timestamp")

# One lag step (lags=(1,)) of every present pollutant, grouped by location.
# Presumably "lag 1" is the previous row in timestamp order within a location, not a fixed
# time offset — confirm in features.py.
features_df = add_lag_features(
    features_df,
    group_cols=["location"],
    target_cols=pollutant_cols_present,
    lags=(1,),
    time_col="timestamp"
)

# List of model input column names; the modeling cell filters it to columns that actually exist.
feature_cols = build_feature_columns(pollutant_cols_present, include_lags=True)

print("Num feature cols:", len(feature_cols))
print(feature_cols[:10], "...")

# Save engineered dataset
processed_features_path = OUTPUT_DIR / "processed_features.csv"
features_df.to_csv(processed_features_path, index=False)
print("Saved:", processed_features_path)


In [None]:
# =============================
# 8) Quick EDA: AQI distribution + correlation
# =============================
# Histogram of the computed AQI target (rows without an AQI are left out).
plt.figure(figsize=(8,4))
features_df["aqi"].dropna().hist(bins=50)
plt.title("AQI distribution")
plt.xlabel("AQI")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

# Correlation heatmap (numeric only)
# Uses whichever of the listed columns exist; .dropna() keeps only rows where ALL of them are
# present, so the correlation is computed on complete rows only.
num_df = features_df[[c for c in (pollutant_cols_present + ["aqi","hour","day_of_week","month","latitude","longitude"]) if c in features_df.columns]].dropna()
corr = num_df.corr(numeric_only=True)

# No vmin/vmax is passed, so the color scale follows the data range rather than a fixed [-1, 1].
plt.figure(figsize=(8,6))
plt.imshow(corr.values)
plt.xticks(range(len(corr.columns)), corr.columns, rotation=45, ha="right")
plt.yticks(range(len(corr.columns)), corr.columns)
plt.colorbar()
plt.title("Correlation heatmap")
plt.tight_layout()
plt.show()


In [None]:
# =============================
# 9) Modeling (baseline + RandomForest)
# =============================
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def train_test_split_time(df, time_col="timestamp", test_size=0.2):
    """Split a frame chronologically into (train, test).

    Rows are ordered by ``time_col``; the earliest ``1 - test_size`` share
    (rounded down) becomes the training set and the remainder the test set.
    Both parts are independent copies, and the input frame is not modified.
    """
    ordered = df.sort_values(time_col)
    cut = int(len(ordered) * (1 - test_size))
    earlier, later = ordered.iloc[:cut], ordered.iloc[cut:]
    return earlier.copy(), later.copy()

def regression_metrics(y_true, y_pred):
    """Return MAE, RMSE and R² for a set of predictions.

    The result is a dict with keys ``mae``, ``rmse`` and ``r2``, each a plain
    Python float.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # Take the square root of the MSE ourselves; this is compatible across sklearn versions.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return dict(mae=float(mae), rmse=float(rmse), r2=float(r2))

def summarize_metrics(m):
    """Format a metrics dict (keys ``mae``, ``rmse``, ``r2``) as a one-line summary string."""
    mae, rmse, r2 = m["mae"], m["rmse"], m["r2"]
    return f"MAE={mae:.2f}, RMSE={rmse:.2f}, R2={r2:.3f}"

# Rows without a computed AQI cannot be used as training targets.
df_model = features_df.dropna(subset=["aqi"]).copy()

# Chronological hold-out: earliest 80% of rows for training, latest 20% for testing.
# Done before feature selection so the selection below only looks at training rows.
train_df, test_df = train_test_split_time(df_model, time_col="timestamp", test_size=0.2)

# Keep only feature cols that exist AND have at least one observed value in the training split.
# SimpleImputer(strategy="median") discards columns that are entirely NaN at fit time; keeping
# such a column here would leave the fitted model with fewer inputs than there are names in
# feature_cols_final, so feature importances could no longer be matched to names.
feature_cols_final = [
    c for c in feature_cols
    if c in df_model.columns and train_df[c].notna().any()
]

X_train = train_df[feature_cols_final].values
y_train = train_df["aqi"].values
X_test  = test_df[feature_cols_final].values
y_test  = test_df["aqi"].values

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Num features:", len(feature_cols_final))

# Baseline: median imputation + ordinary least squares.
baseline_model = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("model", LinearRegression())
])
baseline_model.fit(X_train, y_train)
baseline_pred = baseline_model.predict(X_test)
baseline_m = regression_metrics(y_test, baseline_pred)
print("Baseline:", summarize_metrics(baseline_m))

# Hyper-parameter grid for the forest; the "model__" prefix targets the pipeline step named "model".
param_grid = {
    "model__n_estimators": [200, 400],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_split": [2, 5],
    "model__min_samples_leaf": [1, 2],
}
rf_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("model", RandomForestRegressor(random_state=SEED, n_jobs=-1))
])

# TimeSeriesSplit validates each fold on rows that come after its training rows; this relies
# on X_train being in timestamp order, which train_test_split_time provides.
grid = GridSearchCV(
    rf_pipe,
    param_grid=param_grid,
    cv=TimeSeriesSplit(n_splits=3),
    scoring="neg_mean_absolute_error",
    n_jobs=-1
)
grid.fit(X_train, y_train)

# Best pipeline found by the search, evaluated once on the hold-out split.
best_model = grid.best_estimator_
pred = best_model.predict(X_test)
tree_m = regression_metrics(y_test, pred)

print("Tree:", summarize_metrics(tree_m))
print("Best params:", grid.best_params_)


In [None]:
# =============================
# 10) Diagnostics
# =============================
# `y_test` and `pred` come from the modeling cell (RandomForest predictions on the hold-out split).
errors = np.abs(y_test - pred)
print("Median abs error:", float(np.median(errors)))
print("90th pct abs error:", float(np.percentile(errors, 90)))
print("99th pct abs error:", float(np.percentile(errors, 99)))

# Predicted vs Actual
plt.figure(figsize=(6,6))
plt.scatter(y_test, pred, alpha=0.4)
# Common min/max over actual and predicted values, so the dashed y = x reference line
# spans the whole plotted range.
mn = float(min(y_test.min(), pred.min()))
mx = float(max(y_test.max(), pred.max()))
plt.plot([mn, mx], [mn, mx], "r--")
plt.xlabel("Actual AQI")
plt.ylabel("Predicted AQI")
plt.title("Predicted vs Actual AQI")
plt.tight_layout()
plt.show()

# Error histogram
plt.figure(figsize=(8,4))
plt.hist(errors, bins=50)
plt.title("Absolute error distribution")
plt.xlabel("|error|")
plt.ylabel("count")
plt.tight_layout()
plt.show()

# Feature importance
# Pull the fitted estimator out of the pipeline (the step named "model").
model = best_model.named_steps["model"]
if hasattr(model, "feature_importances_"):
    importances = model.feature_importances_
    # Indices that sort features from most to least important.
    order = np.argsort(importances)[::-1]
    # NOTE(review): assumes one importance value per name in feature_cols_final, in the same order.
    plt.figure(figsize=(10,4))
    plt.bar(range(len(feature_cols_final)), importances[order])
    plt.xticks(range(len(feature_cols_final)), np.array(feature_cols_final)[order], rotation=45, ha="right")
    plt.title("Feature importance (RandomForest)")
    plt.tight_layout()
    plt.show()


## Notes / Next steps
- If AQI spikes dominate RMSE: try predicting `log1p(AQI)` and invert.
- Add rolling features (3h/6h) per location to improve temporal signal.
- Consider filtering rows with too many missing pollutant measurements.
