<a href="https://colab.research.google.com/github/ArunK-ML/Project---Live-PM2.5-Nowcast-and-Forecast---Final-Project/blob/main/PM2_5_final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install streamlit>=1.20
!pip install pandas>=2.0
!pip install numpy>=1.24
!pip install requests>=2.28
!pip install scikit-learn>=1.2
!pip install joblib>=1.3
!pip install plotly>=5.15
!pip install python-dateutil
!pip install pytz



In [None]:
# config.py
# Default settings for the data pull — point the coordinates at your own city.

# Centre of the search area (example: Chennai).
DEFAULT_LAT, DEFAULT_LON = 13.0827, 80.2707

# OpenAQ search radius around the centre, in meters.
DEFAULT_RADIUS_M = 10_000

# Number of days of historical data pulled by default.
DEFAULT_DAYS_HISTORY = 14


In [None]:
# data_fetch.py
import requests
import pandas as pd
from datetime import datetime, timedelta, date
from dateutil import parser

# Endpoint queried (with paging) by fetch_openaq_pm25.
# NOTE(review): OpenAQ has announced retirement of its v2 API in favour of v3
# (which requires an API key) — confirm this URL still responds before relying on it.
OPENAQ_API = "https://api.openaq.org/v2/measurements"
# Endpoint queried by fetch_open_meteo_weather for hourly weather variables.
OPENMETEO_API = "https://api.open-meteo.com/v1/forecast"

def fetch_openaq_pm25(lat, lon, radius_m=10000, days=14, limit_per_page=10000):
    """
    Fetch PM2.5 measurements from OpenAQ (UTC timestamps) and aggregate to hourly means.

    Pages through OPENAQ_API until a page comes back empty or shorter than
    ``limit_per_page``.

    Args:
        lat, lon: centre of the search circle, sent as ``"lat,lon"``.
        radius_m: search radius sent as ``radius``.
        days: length of the query window, counted back from today's UTC date.
        limit_per_page: page size sent as ``limit``.

    Returns:
        DataFrame with a tz-aware UTC hourly DatetimeIndex named ``timestamp``
        and one float column ``pm25`` (hourly mean; hours without any
        measurement are NaN). When no usable record is returned, an empty
        frame with the same index type and column is returned.

    Raises:
        requests.HTTPError: if any page request returns an error status.
    """
    # Function-scope import: the file header only imports datetime/timedelta/date.
    from datetime import timezone

    # datetime.utcnow() is deprecated; take today's date from an aware UTC "now".
    date_to = datetime.now(timezone.utc).date()
    date_from = date_to - timedelta(days=days)
    # NOTE(review): both bounds are plain dates. If the API reads date_to as
    # 00:00 of that day, today's readings are excluded — confirm against the API.
    params = {
        "coordinates": f"{lat},{lon}",
        "radius": radius_m,
        "parameter": "pm25",
        "date_from": date_from.isoformat(),
        "date_to": date_to.isoformat(),
        "limit": limit_per_page,
        "page": 1,
        "sort": "desc",
    }

    rows = []
    while True:
        r = requests.get(OPENAQ_API, params=params, timeout=30)
        r.raise_for_status()
        results = r.json().get("results", [])
        if not results:
            break
        for rec in results:
            # date.utc looks like "2025-10-04T12:00:00+00:00"; `or {}` also
            # covers records where "date" is present but null.
            dt = (rec.get("date") or {}).get("utc")
            if not dt:
                continue
            try:
                ts = parser.isoparse(dt)
            except (ValueError, TypeError, OverflowError):
                # Best effort: skip records whose timestamp cannot be parsed.
                continue
            rows.append({"timestamp": ts, "pm25": rec.get("value")})
        # A short page means this was the last one; stop instead of requesting
        # another (empty) page.
        if len(results) < limit_per_page:
            break
        params["page"] += 1

    if not rows:
        # Same schema as the non-empty result so callers can treat both alike.
        empty_index = pd.DatetimeIndex([], tz="UTC", name="timestamp")
        return pd.DataFrame(columns=["pm25"], index=empty_index).astype({"pm25": "float64"})

    df = pd.DataFrame(rows)
    df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
    # Null or non-numeric values become NaN so they cannot break the mean.
    df["pm25"] = pd.to_numeric(df["pm25"], errors="coerce")
    df = df.set_index("timestamp").sort_index()
    # Lower-case "h": the upper-case "H" alias is deprecated in recent pandas.
    hourly = df.resample("h").mean()
    return hourly

def fetch_open_meteo_weather(lat, lon, start_date: date, end_date: date, timezone="UTC"):
    """
    Pull hourly weather variables from Open-Meteo.

    Args:
        lat, lon: location sent as ``latitude`` / ``longitude``.
        start_date, end_date: date objects, sent as ISO ``YYYY-MM-DD`` strings.
        timezone: forwarded to the API as ``timezone`` (e.g. "UTC" or "auto").

    Returns:
        DataFrame indexed by timezone-naive UTC hourly timestamps, sorted
        ascending, with the columns returned under ``hourly``:
          temperature_2m, relativehumidity_2m, windspeed_10m, winddirection_10m, pressure_msl
        An empty DataFrame is returned when the response has no hourly data.

    Raises:
        requests.HTTPError: if the request returns an error status.
    """
    params = {
        "latitude": lat,
        "longitude": lon,
        "hourly": ",".join(
            ["temperature_2m", "relativehumidity_2m", "windspeed_10m", "winddirection_10m", "pressure_msl"]
        ),
        "start_date": start_date.isoformat(),
        "end_date": end_date.isoformat(),
        "timezone": timezone,  # can pass "UTC" or "auto"
    }
    r = requests.get(OPENMETEO_API, params=params, timeout=30)
    r.raise_for_status()
    j = r.json()
    hourly = j.get("hourly") or {}
    times = hourly.get("time")
    if not times:
        return pd.DataFrame()

    # The time strings become the index, so drop the duplicate column.
    df = pd.DataFrame(hourly).drop(columns=["time"], errors="ignore")

    idx = pd.DatetimeIndex(pd.to_datetime(times))
    if idx.tz is not None:
        # Offset-bearing timestamps: convert to UTC and drop the tz marker.
        idx = idx.tz_convert(None)
    else:
        # Timestamps without an offset are in the requested timezone; the
        # response's utc_offset_seconds (0 for "UTC") shifts them back to UTC.
        # Calling tz_convert on this naive index would raise TypeError.
        offset_s = j.get("utc_offset_seconds") or 0
        idx = idx - pd.Timedelta(seconds=offset_s)
    df.index = idx
    return df.sort_index()


In [None]:
# features.py
import pandas as pd
import numpy as np

def add_time_features(df):
    """
    Return a copy of df with 'hour', 'weekday' and 'month' columns taken from its index.
    The index is interpreted through pd.DatetimeIndex; the input frame is left untouched.
    """
    stamps = pd.DatetimeIndex(df.index)
    enriched = df.copy()
    # each calendar attribute of the index becomes a column of the same name
    for field in ("hour", "weekday", "month"):
        enriched[field] = getattr(stamps, field)
    return enriched

def make_lag_roll_features(df, target_col="pm25", max_lag=24):
    """
    Return a copy of df extended with lag and rolling features of target_col.

    Adds lag_1 .. lag_<max_lag>, roll_mean_3 / roll_mean_6 / roll_mean_24 and
    roll_std_24. Every rolling statistic is shifted by one row, so a row only
    sees values from earlier rows.
    """
    out = df.copy()
    series = out[target_col]

    # lagged copies of the target
    for step in range(1, max_lag + 1):
        out[f"lag_{step}"] = series.shift(step)

    # trailing means over several window lengths
    for window in (3, 6, 24):
        trailing_mean = series.rolling(window, min_periods=1).mean()
        out[f"roll_mean_{window}"] = trailing_mean.shift(1)

    # trailing standard deviation; undefined entries are replaced with 0.0
    trailing_std = series.rolling(24, min_periods=1).std()
    out["roll_std_24"] = trailing_std.shift(1).fillna(0.0)
    return out

def build_feature_matrix(pm_df, weather_df=None, max_lag=24):
    """
    Build the model inputs from hourly PM2.5 and optional weather data.

    pm_df: DataFrame with index hourly, column 'pm25'
    weather_df: DataFrame with hourly weather variables, left-joined onto
        pm_df's index (can be None or empty). If exactly one of the two
        indexes is timezone-aware, the weather index is converted to match
        pm_df's, treating timezone-naive timestamps as UTC.
    max_lag: number of lag columns passed on to make_lag_roll_features.

    Returns X (features), y (target).
    Rows whose target is NaN are NOT removed here; y keeps them as NaN, so
    callers must mask them before training.
    """
    df = pm_df.copy()
    if weather_df is not None and not weather_df.empty:
        weather = weather_df
        pm_idx, wx_idx = df.index, weather.index
        both_datetime = isinstance(pm_idx, pd.DatetimeIndex) and isinstance(wx_idx, pd.DatetimeIndex)
        if both_datetime and (pm_idx.tz is None) != (wx_idx.tz is None):
            # A tz-aware and a tz-naive index never match label-for-label, so
            # the join would yield no weather values (or fail outright).
            weather = weather.copy()
            if pm_idx.tz is None:
                weather.index = wx_idx.tz_convert(None)
            else:
                weather.index = wx_idx.tz_localize("UTC").tz_convert(pm_idx.tz)
        # join weather features (align by index)
        df = df.join(weather, how="left")
    # create time features
    df = add_time_features(df)
    # lags/rolling
    df = make_lag_roll_features(df, target_col="pm25", max_lag=max_lag)
    # anomaly flag might later be added outside
    y = df["pm25"].copy()
    X = df.drop(columns=["pm25"])
    # Fill gaps in numeric features: carry the last value forward, then the
    # first value backward, then 0.0 for columns that are entirely empty.
    # .ffill()/.bfill() replace the deprecated fillna(method=...) form.
    numeric_cols = X.select_dtypes(include=[np.number]).columns
    X[numeric_cols] = X[numeric_cols].ffill().bfill().fillna(0.0)
    return X, y
