<a href="https://colab.research.google.com/github/ArunK-ML/Project---Live-PM2.5-Nowcast-and-Forecast---Final-Project/blob/main/PM2_5_final_project_p2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's dependencies.
# Each requirement is quoted: without quotes the shell treats ">" as an output
# redirect, so `pip install streamlit>=1.20` installs an unconstrained
# streamlit and writes pip's output to a file named "=1.20".
!pip install "streamlit>=1.20"
!pip install "pandas>=2.0"
!pip install "numpy>=1.24"
!pip install "requests>=2.28"
!pip install "scikit-learn>=1.2"
!pip install "joblib>=1.3"
!pip install "plotly>=5.15"
!pip install python-dateutil
!pip install pytz



In [1]:

import requests
import pandas as pd
from datetime import datetime, timedelta, timezone

# Settings for the quick PM2.5 fetch in this cell.
TZ = timezone.utc             # request windows are computed from datetime.now(TZ), i.e. in UTC
LAT, LON = 13.0827, 80.2707   # Chennai (change as needed)
HOURS_BACK = 72               # how many past hours of PM2.5 to request
# NOTE(review): OpenAQ v2 documented a maximum search radius of 25 km — confirm 30 is accepted.
RADIUS_KM = 30                # OpenAQ search radius in km (converted to metres by fetch_openaq_pm25)

def fetch_openaq_pm25(lat: float, lon: float, hours: int = 168, radius_km: int = 30) -> pd.DataFrame:
    """Fetch recent PM2.5 measurements from OpenAQ and aggregate them per hour.

    Pages through the OpenAQ v2 ``/measurements`` endpoint for everything
    reported within ``radius_km`` of (``lat``, ``lon``) during the last
    ``hours`` hours, then takes the median of all measurements in each hour.

    Args:
        lat: Latitude of the search centre, in decimal degrees.
        lon: Longitude of the search centre, in decimal degrees.
        hours: Size of the look-back window ending now (UTC).
        radius_km: Search radius in kilometres (sent to the API in metres).

    Returns:
        DataFrame with columns ``datetime`` (tz-aware UTC, one row per hour)
        and ``pm25``. An empty DataFrame is returned when nothing could be
        fetched, so callers can fall back to another source.

    The fetch is best-effort: an HTTP error or an exception while paging stops
    the loop with a warning, and the pages already collected are still used.
    """
    import warnings

    end = datetime.now(TZ)
    start = end - timedelta(hours=hours)
    base = "https://api.openaq.org/v2/measurements"
    params = {
        "coordinates": f"{lat},{lon}",
        "radius": int(radius_km * 1000),
        "parameter": "pm25",
        "date_from": start.isoformat(),
        "date_to": end.isoformat(),
        "limit": 10000,
        "sort": "desc",
        "order_by": "datetime",
        "page": 1,
    }
    frames = []
    try:
        while True:
            r = requests.get(base, params=params, timeout=30)
            if r.status_code >= 400:
                warnings.warn(
                    f"OpenAQ returned HTTP {r.status_code}; using {len(frames)} page(s) fetched so far"
                )
                break
            js = r.json()
            items = js.get("results", [])
            if not items:
                break
            raw = pd.DataFrame(items)
            if "date" not in raw or "value" not in raw:
                break
            # Build a fresh frame instead of assigning into a column slice
            # (avoids SettingWithCopyWarning) and coerce odd values to NaN.
            frames.append(
                pd.DataFrame(
                    {
                        "value": pd.to_numeric(raw["value"], errors="coerce"),
                        "datetime": pd.to_datetime(
                            raw["date"].apply(lambda d: d.get("utc")), utc=True
                        ),
                    }
                )
            )
            found = js.get("meta", {}).get("found")
            page = params["page"]
            # Stop once the pages requested so far cover the reported total.
            if found is None or page * params["limit"] >= int(found):
                break
            params["page"] = page + 1
    except Exception as exc:
        # Deliberately broad: this is a best-effort fetch. Report the problem
        # and continue with whatever was collected before it occurred.
        warnings.warn(f"OpenAQ fetch stopped early: {exc!r}")

    if not frames:
        return pd.DataFrame()

    df = pd.concat(frames, ignore_index=True)
    df = df[(df["datetime"] >= start) & (df["datetime"] <= end)]
    # Hourly median across stations. "h" is the current hourly alias; the
    # upper-case "H" spelling is deprecated in recent pandas releases.
    df_hour = (
        df.set_index("datetime")
          .groupby(pd.Grouper(freq="h"))["value"]
          .median()
          .reset_index()
          .rename(columns={"value": "pm25"})
    )
    return df_hour


def fetch_openmeteo_aq_pm25(lat: float, lon: float, hours: int = 168) -> pd.DataFrame:
    """Fetch hourly PM2.5 from the Open-Meteo air-quality API.

    Requests enough past days to cover the last ``hours`` hours and keeps only
    rows whose timestamp falls inside that window (ending now, UTC).

    Returns:
        DataFrame with columns ``datetime`` (tz-aware UTC) and ``pm25``, or an
        empty DataFrame when the request fails or has no ``hourly`` section.
    """
    window_end = datetime.now(TZ)
    window_start = window_end - timedelta(hours=hours)
    url = (
        "https://air-quality-api.open-meteo.com/v1/air-quality?"
        f"latitude={lat}&longitude={lon}&hourly=pm2_5&past_days={(hours//24)+1}&timezone=UTC"
    )
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        hourly = response.json().get("hourly", {})
        if hourly:
            frame = pd.DataFrame(
                {"datetime": hourly.get("time", []), "pm25": hourly.get("pm2_5", [])}
            )
            frame["datetime"] = pd.to_datetime(frame["datetime"], utc=True)
            not_too_old = frame["datetime"] >= window_start
            not_in_future = frame["datetime"] <= window_end
            return frame[not_too_old & not_in_future].reset_index(drop=True)
        return pd.DataFrame()
    except Exception:
        # Best-effort source: any failure is reported as "no data".
        return pd.DataFrame()


if __name__ == "__main__":
    # Try OpenAQ first; when it yields nothing, use the Open-Meteo
    # air-quality series instead.
    pm25 = fetch_openaq_pm25(LAT, LON, hours=HOURS_BACK, radius_km=RADIUS_KM)
    pm25 = fetch_openmeteo_aq_pm25(LAT, LON, hours=HOURS_BACK) if pm25.empty else pm25




In [2]:
pm25

Unnamed: 0,datetime,pm25
0,2025-10-03 06:00:00+00:00,16.4
1,2025-10-03 07:00:00+00:00,17.7
2,2025-10-03 08:00:00+00:00,19.2
3,2025-10-03 09:00:00+00:00,20.7
4,2025-10-03 10:00:00+00:00,20.5
...,...,...
67,2025-10-06 01:00:00+00:00,19.3
68,2025-10-06 02:00:00+00:00,19.3
69,2025-10-06 03:00:00+00:00,17.2
70,2025-10-06 04:00:00+00:00,16.1


In [None]:
# config.py
# Small config defaults — change coordinates to your city.
# The radius and history values match the default arguments of
# fetch_openaq_pm25(radius_m=10000, days=14) in data_fetch.py.

DEFAULT_LAT = 13.0827   # Chennai example (decimal degrees)
DEFAULT_LON = 80.2707
DEFAULT_RADIUS_M = 10000  # OpenAQ radius in meters
DEFAULT_DAYS_HISTORY = 14  # how many days of historical data to pull by default


In [None]:
# data_fetch.py
import requests
import pandas as pd
from datetime import datetime, timedelta, date
from dateutil import parser

OPENAQ_API = "https://api.openaq.org/v2/measurements"
OPENMETEO_API = "https://api.open-meteo.com/v1/forecast"

def fetch_openaq_pm25(lat, lon, radius_m=10000, days=14, limit_per_page=10000):
    """
    Fetch PM2.5 measurements from OpenAQ (UTC timestamps) and aggregate to hourly means.

    Parameters:
      lat, lon: search centre in decimal degrees.
      radius_m: search radius in meters.
      days: how many days of history to request (the window starts at midnight
        UTC ``days`` days ago and ends now).
      limit_per_page: page size sent to the API; a page shorter than this ends
        the paging loop.

    Returns DataFrame with index = UTC hourly datetime (tz-aware) and column 'pm25'.
    When no usable measurement comes back, an empty frame with a float 'pm25'
    column is returned.

    Raises requests.HTTPError when the API answers with an error status.
    """
    # Timezone-aware "now": datetime.utcnow() is deprecated since Python 3.12.
    now_utc = pd.Timestamp.now(tz="UTC")
    date_from = now_utc.date() - timedelta(days=days)
    params = {
        "coordinates": f"{lat},{lon}",
        "radius": radius_m,
        "parameter": "pm25",
        "date_from": date_from.isoformat(),
        # Send the full timestamp rather than today's date so measurements
        # taken since midnight UTC are not cut off.
        "date_to": now_utc.isoformat(),
        "limit": limit_per_page,
        "page": 1,
        "sort": "desc"
    }

    rows = []
    while True:
        r = requests.get(OPENAQ_API, params=params, timeout=30)
        r.raise_for_status()
        results = r.json().get("results", [])
        if not results:
            break
        for rec in results:
            # date.utc exists like "2025-10-04T12:00:00+00:00"; tolerate a
            # missing or null "date" entry by skipping the record.
            dt = (rec.get("date") or {}).get("utc")
            if not dt:
                continue
            try:
                ts = parser.isoparse(dt)
            except Exception:
                continue
            rows.append({"timestamp": ts, "pm25": rec.get("value")})
        # paging
        params["page"] += 1
        # a short page means this was the last one
        if len(results) < limit_per_page:
            break

    if not rows:
        return pd.DataFrame(columns=["pm25"]).astype({"pm25": "float64"})

    df = pd.DataFrame(rows)
    df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
    # Null / non-numeric values become NaN so the hourly mean stays numeric.
    df["pm25"] = pd.to_numeric(df["pm25"], errors="coerce")
    df = df.set_index("timestamp")
    # Resample to hourly mean ("h" replaces the deprecated "H" alias).
    hourly = df.resample("h").mean().sort_index()
    return hourly

def fetch_open_meteo_weather(lat, lon, start_date: date, end_date: date, timezone="UTC"):
    """
    Pull hourly weather variables from Open-Meteo.

    start_date/end_date are date objects (YYYY-MM-DD), both inclusive.
    timezone is forwarded to the API ("UTC", "auto" or a tz database name).

    Returns DataFrame indexed by tz-aware UTC hourly timestamps (the same kind
    of index fetch_openaq_pm25 produces, so the two can be joined) with columns:
      temperature_2m, relativehumidity_2m, windspeed_10m, winddirection_10m, pressure_msl
    An empty DataFrame is returned when the response has no hourly data.

    Raises requests.HTTPError when the API answers with an error status.
    """
    params = {
        "latitude": lat,
        "longitude": lon,
        "hourly": ",".join(
            ["temperature_2m", "relativehumidity_2m", "windspeed_10m", "winddirection_10m", "pressure_msl"]
        ),
        "start_date": start_date.isoformat(),
        "end_date": end_date.isoformat(),
        "timezone": timezone  # can pass "UTC" or "auto"
    }
    r = requests.get(OPENMETEO_API, params=params, timeout=30)
    r.raise_for_status()
    j = r.json()
    hourly = j.get("hourly", {})
    if not hourly or not hourly.get("time"):
        return pd.DataFrame()

    df = pd.DataFrame(hourly).drop(columns=["time"], errors="ignore")
    idx = pd.DatetimeIndex(pd.to_datetime(hourly["time"]))
    if idx.tz is None:
        # Open-Meteo sends wall-clock strings without an offset, expressed in
        # the requested timezone, and reports the applied offset separately as
        # "utc_offset_seconds". Subtract it to get UTC, then mark the index as
        # UTC. (Calling tz_convert on such a tz-naive index raises TypeError.)
        offset = pd.Timedelta(seconds=j.get("utc_offset_seconds") or 0)
        idx = (idx - offset).tz_localize("UTC")
    else:
        idx = idx.tz_convert("UTC")
    df.index = idx
    return df.sort_index()


In [None]:
# features.py
import pandas as pd
import numpy as np

def add_time_features(df):
    """Return a copy of ``df`` with ``hour``, ``weekday`` and ``month`` columns.

    The values are read from the frame's index, which must be convertible to a
    ``DatetimeIndex``. ``weekday`` follows pandas' convention (Monday = 0).
    The input frame is left untouched.
    """
    stamps = pd.DatetimeIndex(df.index)
    return df.assign(
        hour=stamps.hour,
        weekday=stamps.weekday,
        month=stamps.month,
    )

def make_lag_roll_features(df, target_col="pm25", max_lag=24):
    """
    Return a copy of df extended with lag and rolling-window features of target_col.

    Adds lag_1 .. lag_<max_lag> (the target shifted by that many rows), then
    roll_mean_3 / roll_mean_6 / roll_mean_24 and roll_std_24. Every rolling
    statistic is shifted by one row so a row only sees earlier values; missing
    roll_std_24 values are replaced by 0.0.
    """
    target = df[target_col]
    lag_columns = {f"lag_{k}": target.shift(k) for k in range(1, max_lag + 1)}
    out = df.assign(**lag_columns)
    # rolling means over the previous 3 / 6 / 24 rows
    for window in (3, 6, 24):
        out[f"roll_mean_{window}"] = target.rolling(window, min_periods=1).mean().shift(1)
    # rolling spread over the previous 24 rows
    previous_std = target.rolling(24, min_periods=1).std().shift(1)
    out["roll_std_24"] = previous_std.fillna(0.0)
    return out

def build_feature_matrix(pm_df, weather_df=None, max_lag=24):
    """
    Build the model inputs from hourly PM2.5 (and optionally weather).

    pm_df: DataFrame with index hourly, column 'pm25'
    weather_df: DataFrame with hourly weather variables aligned to pm_df index (can be None)
    max_lag: number of lag_<k> columns to create.

    Returns X (features), y (target). Rows whose target is NaN are NOT dropped
    here; callers that train on the result must mask them out themselves.
    """
    df = pm_df.copy()
    if weather_df is not None and not weather_df.empty:
        # join weather features (align by index)
        df = df.join(weather_df, how="left")
    # create time features
    df = add_time_features(df)
    # lags/rolling
    df = make_lag_roll_features(df, target_col="pm25", max_lag=max_lag)
    y = df["pm25"].copy()
    X = df.drop(columns=["pm25"])
    # Fill gaps in numeric features: carry the last value forward, then back-fill
    # the leading rows (they have no earlier value), then fall back to 0.
    # .ffill()/.bfill() replace fillna(method=...), which is deprecated in
    # pandas 2.1+ and removed in later releases.
    numeric_cols = X.select_dtypes(include=[np.number]).columns
    X[numeric_cols] = X[numeric_cols].ffill().bfill().fillna(0.0)
    return X, y


In [None]:
# modeling.py
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest, HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
import joblib

def detect_anomalies(series, contamination=0.02, random_state=42):
    """
    Flag unusual PM2.5 values with an IsolationForest.

    series: pandas Series of pm25 values (indexed by time)
    contamination: expected fraction of anomalies, passed to IsolationForest.
      A value <= 0 means "flag nothing" (IsolationForest itself rejects 0).
    random_state: seed for the forest.

    Returns a boolean Series named 'anomaly' on the same index (True = anomaly).
    Missing values are never flagged: the forest is fitted on observed values
    only, so gaps neither count as anomalies nor use up the contamination share.
    """
    flags = np.zeros(len(series), dtype=bool)
    observed = series.notna().to_numpy()
    flag_nothing = not isinstance(contamination, str) and contamination <= 0
    if observed.any() and not flag_nothing:
        values = series.to_numpy(dtype=float)[observed].reshape(-1, 1)
        iso = IsolationForest(contamination=contamination, random_state=random_state)
        # fit_predict labels: -1 anomaly, 1 normal
        flags[observed] = iso.fit_predict(values) == -1
    return pd.Series(flags, index=series.index, name="anomaly")

def train_regressor(X_train, y_train, X_val=None, y_val=None, random_state=42):
    """
    Fit a HistGradientBoostingRegressor (up to 400 iterations, early stopping on).

    Returns (model, val_mae). val_mae is the mean absolute error on
    (X_val, y_val) when both are given, otherwise None.

    NOTE(review): X_val/y_val are only used for scoring; early stopping uses
    the estimator's own hold-out taken from the training data — confirm this
    is acceptable for time-ordered data.
    """
    booster = HistGradientBoostingRegressor(
        max_iter=400,
        early_stopping=True,
        random_state=random_state,
    )
    booster.fit(X_train, y_train)
    if X_val is None or y_val is None:
        return booster, None
    return booster, mean_absolute_error(y_val, booster.predict(X_val))

def recursive_forecast(model, last_known_pm, feature_builder_fn, future_weather_df=None,
                       horizon=24, max_lag=24):
    """
    Forecast ``horizon`` hourly steps, feeding each prediction back in as history.

    model: trained regressor accepting the features created by feature_builder_fn
    last_known_pm: pandas Series indexed by time (most recent value last) including available history for lags
    feature_builder_fn: function that takes (pm_series, weather_row, t_index) -> DataFrame with a single row of features (matching training X columns)
    future_weather_df: DataFrame indexed by future timestamps with weather columns (optional)
    horizon: number of hourly steps to predict.
    max_lag: accepted for interface compatibility; not used here (the feature
      builder decides how much history it reads).

    Returns pandas Series named 'pm25_pred' indexed by future timestamps.
    Raises ValueError when last_known_pm is empty. The input Series is not modified.
    """
    working_series = last_known_pm.copy().astype(float)
    if working_series.empty:
        raise ValueError("last_known_pm must contain at least one observation")

    step = pd.Timedelta(hours=1)
    # start forecast at next hour
    next_ts = working_series.index[-1] + step
    preds = []
    idx = []
    for _ in range(horizon):
        # select weather for this hour if provided
        weather_row = None
        if future_weather_df is not None and next_ts in future_weather_df.index:
            weather_row = future_weather_df.loc[next_ts]
        X_row = feature_builder_fn(working_series, weather_row, t_index=next_ts)
        yhat = float(model.predict(X_row)[0])
        preds.append(yhat)
        idx.append(next_ts)
        # Extend the history with the prediction so later lags can see it.
        # pd.concat is used because Series.append was removed in pandas 2.0.
        working_series = pd.concat([working_series, pd.Series([yhat], index=[next_ts])])
        next_ts = next_ts + step
    return pd.Series(preds, index=idx, name="pm25_pred")

def save_model(model, path="model.joblib"):
    """Write ``model`` to ``path`` using ``joblib.dump``."""
    joblib.dump(model, path)

def load_model(path="model.joblib"):
    """Read and return the object stored at ``path`` using ``joblib.load``.

    NOTE(review): joblib files are pickle-based, so loading one can execute
    arbitrary code — only load files from a trusted source.
    """
    return joblib.load(path)


In [None]:

%%writefile streamlit_app.py
import streamlit as st
import pandas as pd
import numpy as np
from datetime import date, timedelta, datetime
import plotly.graph_objs as go

# from config import DEFAULT_LAT, DEFAULT_LON, DEFAULT_RADIUS_M, DEFAULT_DAYS_HISTORY
# from data_fetch import fetch_openaq_pm25, fetch_open_meteo_weather
# from features import build_feature_matrix
# from modeling import detect_anomalies, train_regressor, recursive_forecast, save_model, load_model

# config.py
# Small config defaults — change coordinates to your city.
# These are the initial values of the sidebar inputs further down in this file.

DEFAULT_LAT = 13.0827   # Chennai example (decimal degrees)
DEFAULT_LON = 80.2707
DEFAULT_RADIUS_M = 10000  # OpenAQ radius in meters
DEFAULT_DAYS_HISTORY = 14  # how many days of historical data to pull by default


# data_fetch.py
import requests
import pandas as pd
from datetime import datetime, timedelta, date
from dateutil import parser

OPENAQ_API = "https://api.openaq.org/v2/measurements"
OPENMETEO_API = "https://api.open-meteo.com/v1/forecast"

def fetch_openaq_pm25(lat, lon, radius_m=10000, days=14, limit_per_page=10000):
    """
    Fetch PM2.5 measurements from OpenAQ (UTC timestamps) and aggregate to hourly means.

    Parameters:
      lat, lon: search centre in decimal degrees.
      radius_m: search radius in meters.
      days: how many days of history to request (the window starts at midnight
        UTC ``days`` days ago and ends now).
      limit_per_page: page size sent to the API; a page shorter than this ends
        the paging loop.

    Returns DataFrame with index = UTC hourly datetime (tz-aware) and column 'pm25'.
    When no usable measurement comes back, an empty frame with a float 'pm25'
    column is returned.

    Raises requests.HTTPError when the API answers with an error status.
    """
    # Timezone-aware "now": datetime.utcnow() is deprecated since Python 3.12.
    now_utc = pd.Timestamp.now(tz="UTC")
    date_from = now_utc.date() - timedelta(days=days)
    params = {
        "coordinates": f"{lat},{lon}",
        "radius": radius_m,
        "parameter": "pm25",
        "date_from": date_from.isoformat(),
        # Send the full timestamp rather than today's date so measurements
        # taken since midnight UTC are not cut off.
        "date_to": now_utc.isoformat(),
        "limit": limit_per_page,
        "page": 1,
        "sort": "desc"
    }

    rows = []
    while True:
        r = requests.get(OPENAQ_API, params=params, timeout=30)
        r.raise_for_status()
        results = r.json().get("results", [])
        if not results:
            break
        for rec in results:
            # date.utc exists like "2025-10-04T12:00:00+00:00"; tolerate a
            # missing or null "date" entry by skipping the record.
            dt = (rec.get("date") or {}).get("utc")
            if not dt:
                continue
            try:
                ts = parser.isoparse(dt)
            except Exception:
                continue
            rows.append({"timestamp": ts, "pm25": rec.get("value")})
        # paging
        params["page"] += 1
        # a short page means this was the last one
        if len(results) < limit_per_page:
            break

    if not rows:
        return pd.DataFrame(columns=["pm25"]).astype({"pm25": "float64"})

    df = pd.DataFrame(rows)
    df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
    # Null / non-numeric values become NaN so the hourly mean stays numeric.
    df["pm25"] = pd.to_numeric(df["pm25"], errors="coerce")
    df = df.set_index("timestamp")
    # Resample to hourly mean ("h" replaces the deprecated "H" alias).
    hourly = df.resample("h").mean().sort_index()
    return hourly

def fetch_open_meteo_weather(lat, lon, start_date: date, end_date: date, timezone="UTC"):
    """
    Pull hourly weather variables from Open-Meteo.

    start_date/end_date are date objects (YYYY-MM-DD), both inclusive.
    timezone is forwarded to the API ("UTC", "auto" or a tz database name).

    Returns DataFrame indexed by tz-aware UTC hourly timestamps (the same kind
    of index fetch_openaq_pm25 produces, so the two can be joined) with columns:
      temperature_2m, relativehumidity_2m, windspeed_10m, winddirection_10m, pressure_msl
    An empty DataFrame is returned when the response has no hourly data.

    Raises requests.HTTPError when the API answers with an error status.
    """
    params = {
        "latitude": lat,
        "longitude": lon,
        "hourly": ",".join(
            ["temperature_2m", "relativehumidity_2m", "windspeed_10m", "winddirection_10m", "pressure_msl"]
        ),
        "start_date": start_date.isoformat(),
        "end_date": end_date.isoformat(),
        "timezone": timezone  # can pass "UTC" or "auto"
    }
    r = requests.get(OPENMETEO_API, params=params, timeout=30)
    r.raise_for_status()
    j = r.json()
    hourly = j.get("hourly", {})
    if not hourly or not hourly.get("time"):
        return pd.DataFrame()

    df = pd.DataFrame(hourly).drop(columns=["time"], errors="ignore")
    idx = pd.DatetimeIndex(pd.to_datetime(hourly["time"]))
    if idx.tz is None:
        # Open-Meteo sends wall-clock strings without an offset, expressed in
        # the requested timezone, and reports the applied offset separately as
        # "utc_offset_seconds". Subtract it to get UTC, then mark the index as
        # UTC. (Calling tz_convert on such a tz-naive index raises TypeError.)
        offset = pd.Timedelta(seconds=j.get("utc_offset_seconds") or 0)
        idx = (idx - offset).tz_localize("UTC")
    else:
        idx = idx.tz_convert("UTC")
    df.index = idx
    return df.sort_index()

# features.py
import pandas as pd
import numpy as np

def add_time_features(df):
    """Return a copy of ``df`` with ``hour``, ``weekday`` and ``month`` columns.

    The values are read from the frame's index, which must be convertible to a
    ``DatetimeIndex``. ``weekday`` follows pandas' convention (Monday = 0).
    The input frame is left untouched.
    """
    stamps = pd.DatetimeIndex(df.index)
    return df.assign(
        hour=stamps.hour,
        weekday=stamps.weekday,
        month=stamps.month,
    )

def make_lag_roll_features(df, target_col="pm25", max_lag=24):
    """
    Return a copy of df extended with lag and rolling-window features of target_col.

    Adds lag_1 .. lag_<max_lag> (the target shifted by that many rows), then
    roll_mean_3 / roll_mean_6 / roll_mean_24 and roll_std_24. Every rolling
    statistic is shifted by one row so a row only sees earlier values; missing
    roll_std_24 values are replaced by 0.0.
    """
    target = df[target_col]
    lag_columns = {f"lag_{k}": target.shift(k) for k in range(1, max_lag + 1)}
    out = df.assign(**lag_columns)
    # rolling means over the previous 3 / 6 / 24 rows
    for window in (3, 6, 24):
        out[f"roll_mean_{window}"] = target.rolling(window, min_periods=1).mean().shift(1)
    # rolling spread over the previous 24 rows
    previous_std = target.rolling(24, min_periods=1).std().shift(1)
    out["roll_std_24"] = previous_std.fillna(0.0)
    return out

def build_feature_matrix(pm_df, weather_df=None, max_lag=24):
    """
    Build the model inputs from hourly PM2.5 (and optionally weather).

    pm_df: DataFrame with index hourly, column 'pm25'
    weather_df: DataFrame with hourly weather variables aligned to pm_df index (can be None)
    max_lag: number of lag_<k> columns to create.

    Returns X (features), y (target). Rows whose target is NaN are NOT dropped
    here; callers that train on the result must mask them out themselves.
    """
    df = pm_df.copy()
    if weather_df is not None and not weather_df.empty:
        # join weather features (align by index)
        df = df.join(weather_df, how="left")
    # create time features
    df = add_time_features(df)
    # lags/rolling
    df = make_lag_roll_features(df, target_col="pm25", max_lag=max_lag)
    y = df["pm25"].copy()
    X = df.drop(columns=["pm25"])
    # Fill gaps in numeric features: carry the last value forward, then back-fill
    # the leading rows (they have no earlier value), then fall back to 0.
    # .ffill()/.bfill() replace fillna(method=...), which is deprecated in
    # pandas 2.1+ and removed in later releases.
    numeric_cols = X.select_dtypes(include=[np.number]).columns
    X[numeric_cols] = X[numeric_cols].ffill().bfill().fillna(0.0)
    return X, y

# modeling.py
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest, HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
import joblib

def detect_anomalies(series, contamination=0.02, random_state=42):
    """
    Flag unusual PM2.5 values with an IsolationForest.

    series: pandas Series of pm25 values (indexed by time)
    contamination: expected fraction of anomalies, passed to IsolationForest.
      A value <= 0 means "flag nothing" (IsolationForest itself rejects 0).
    random_state: seed for the forest.

    Returns a boolean Series named 'anomaly' on the same index (True = anomaly).
    Missing values are never flagged: the forest is fitted on observed values
    only, so gaps neither count as anomalies nor use up the contamination share.
    """
    flags = np.zeros(len(series), dtype=bool)
    observed = series.notna().to_numpy()
    flag_nothing = not isinstance(contamination, str) and contamination <= 0
    if observed.any() and not flag_nothing:
        values = series.to_numpy(dtype=float)[observed].reshape(-1, 1)
        iso = IsolationForest(contamination=contamination, random_state=random_state)
        # fit_predict labels: -1 anomaly, 1 normal
        flags[observed] = iso.fit_predict(values) == -1
    return pd.Series(flags, index=series.index, name="anomaly")

def train_regressor(X_train, y_train, X_val=None, y_val=None, random_state=42):
    """
    Fit a HistGradientBoostingRegressor (up to 400 iterations, early stopping on).

    Returns (model, val_mae). val_mae is the mean absolute error on
    (X_val, y_val) when both are given, otherwise None.

    NOTE(review): X_val/y_val are only used for scoring; early stopping uses
    the estimator's own hold-out taken from the training data — confirm this
    is acceptable for time-ordered data.
    """
    booster = HistGradientBoostingRegressor(
        max_iter=400,
        early_stopping=True,
        random_state=random_state,
    )
    booster.fit(X_train, y_train)
    if X_val is None or y_val is None:
        return booster, None
    return booster, mean_absolute_error(y_val, booster.predict(X_val))

def recursive_forecast(model, last_known_pm, feature_builder_fn, future_weather_df=None,
                       horizon=24, max_lag=24):
    """
    Forecast ``horizon`` hourly steps, feeding each prediction back in as history.

    model: trained regressor accepting the features created by feature_builder_fn
    last_known_pm: pandas Series indexed by time (most recent value last) including available history for lags
    feature_builder_fn: function that takes (pm_series, weather_row, t_index) -> DataFrame with a single row of features (matching training X columns)
    future_weather_df: DataFrame indexed by future timestamps with weather columns (optional)
    horizon: number of hourly steps to predict.
    max_lag: accepted for interface compatibility; not used here (the feature
      builder decides how much history it reads).

    Returns pandas Series named 'pm25_pred' indexed by future timestamps.
    Raises ValueError when last_known_pm is empty. The input Series is not modified.
    """
    working_series = last_known_pm.copy().astype(float)
    if working_series.empty:
        raise ValueError("last_known_pm must contain at least one observation")

    step = pd.Timedelta(hours=1)
    # start forecast at next hour
    next_ts = working_series.index[-1] + step
    preds = []
    idx = []
    for _ in range(horizon):
        # select weather for this hour if provided
        weather_row = None
        if future_weather_df is not None and next_ts in future_weather_df.index:
            weather_row = future_weather_df.loc[next_ts]
        X_row = feature_builder_fn(working_series, weather_row, t_index=next_ts)
        yhat = float(model.predict(X_row)[0])
        preds.append(yhat)
        idx.append(next_ts)
        # Extend the history with the prediction so later lags can see it.
        # pd.concat is used because Series.append was removed in pandas 2.0.
        working_series = pd.concat([working_series, pd.Series([yhat], index=[next_ts])])
        next_ts = next_ts + step
    return pd.Series(preds, index=idx, name="pm25_pred")

def save_model(model, path="model.joblib"):
    """Write ``model`` to ``path`` using ``joblib.dump``."""
    joblib.dump(model, path)

def load_model(path="model.joblib"):
    """Read and return the object stored at ``path`` using ``joblib.load``.

    NOTE(review): joblib files are pickle-based, so loading one can execute
    arbitrary code — only load files from a trusted source.
    """
    return joblib.load(path)


st.set_page_config(page_title="PM2.5 Nowcast & Forecast", layout="wide")

st.title("Live PM2.5 Nowcast & Forecast")

# Sidebar config. Inputs are bounded so that values the downstream code cannot
# handle (out-of-range coordinates, a non-positive radius, a zero anomaly
# fraction) cannot be selected.
st.sidebar.header("Settings")
lat = st.sidebar.number_input("Latitude", min_value=-90.0, max_value=90.0, value=float(DEFAULT_LAT))
lon = st.sidebar.number_input("Longitude", min_value=-180.0, max_value=180.0, value=float(DEFAULT_LON))
radius = st.sidebar.number_input("OpenAQ Radius (m)", min_value=1, value=int(DEFAULT_RADIUS_M))
history_days = st.sidebar.slider("History days to fetch", min_value=3, max_value=60, value=int(DEFAULT_DAYS_HISTORY))
horizon = st.sidebar.selectbox("Forecast horizon (hours)", [24, 48])
# IsolationForest requires contamination in (0, 0.5], so the slider starts just
# above zero instead of at 0.0.
contamination = st.sidebar.slider(
    "Anomaly contamination (fraction)",
    min_value=0.001, max_value=0.1, value=0.02, step=0.001, format="%.3f",
)

if st.sidebar.button("Fetch & Train Model"):
    # Lag depth shared by the training features and the forecast feature builder.
    MAX_LAG = 24
    WEATHER_COLS = ["temperature_2m", "relativehumidity_2m", "windspeed_10m", "winddirection_10m", "pressure_msl"]

    def _fetch_weather_or_empty(start_date, end_date, label):
        """Best-effort weather fetch: warn and return an empty frame on failure."""
        try:
            wx = fetch_open_meteo_weather(lat, lon, start_date=start_date, end_date=end_date, timezone="UTC")
        except (requests.RequestException, TypeError, ValueError, KeyError) as exc:
            st.warning(f"Could not fetch {label} weather ({exc}); continuing without it.")
            return pd.DataFrame()
        # The request asks for UTC, so a tz-naive index holds UTC wall-clock
        # times; localize it so it lines up with the tz-aware PM2.5 index.
        if not wx.empty and getattr(wx.index, "tz", None) is None:
            wx = wx.tz_localize("UTC")
        return wx

    with st.spinner("Fetching data from APIs..."):
        # fetch PM2.5
        try:
            pm_df = fetch_openaq_pm25(lat, lon, radius_m=radius, days=history_days)
        except requests.RequestException as exc:
            st.error(f"OpenAQ request failed: {exc}")
            st.stop()
        if pm_df.empty:
            st.error("No PM2.5 data returned. Try increasing radius or days, or use different coordinates.")
            st.stop()
        # Weather for the historical range (Open-Meteo), from the first PM2.5
        # hour up to today (UTC). pd.Timestamp.now(tz="UTC") replaces the
        # deprecated datetime.utcnow().
        start = pm_df.index.min().date()
        end = pd.Timestamp.now(tz="UTC").date()
        weather_df = _fetch_weather_or_empty(start, end, "historical")
    st.success("Data fetched.")

    # Show recent data
    st.subheader("Recent PM2.5 (hourly)")
    st.write(pm_df.tail(24))

    # Anomaly detection
    st.subheader("Anomaly detection")
    anomalies = detect_anomalies(pm_df["pm25"], contamination=contamination)
    pm_df = pm_df.join(anomalies)
    st.metric("Most recent PM2.5 (µg/m³)", f"{pm_df['pm25'].iloc[-1]:.2f}", delta=None)
    st.dataframe(pm_df.tail(24))

    # Build features
    X, y = build_feature_matrix(pm_df[["pm25"]], weather_df=weather_df, max_lag=MAX_LAG)
    # align to remove rows with missing y
    mask = ~y.isna()
    X = X.loc[mask]
    y = y.loc[mask]

    # time split: last 20% as validation
    split_idx = int(len(X) * 0.8)
    X_train, X_val = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_val = y.iloc[:split_idx], y.iloc[split_idx:]

    st.write(f"Training rows: {len(X_train)} | Validation rows: {len(X_val)}")

    # Train model
    with st.spinner("Training model..."):
        model, val_mae = train_regressor(X_train, y_train, X_val=X_val, y_val=y_val)
    st.success(f"Model trained — validation MAE: {val_mae:.3f}")

    # Save model to file for later reuse
    save_model(model, "pm25_model.joblib")
    st.write("Model saved to pm25_model.joblib")

    # Prepare future-weather fetch for horizon (get forecast from now..horizon)
    now = pd.Timestamp.now(tz="UTC").floor("h")
    start_forecast = now.date()
    end_forecast = (now + pd.Timedelta(hours=horizon)).date()
    future_weather = _fetch_weather_or_empty(start_forecast, end_forecast, "forecast")

    # helper feature builder used by recursive_forecast (must match X columns and preprocessing)
    feature_cols = X.columns.tolist()  # order used during training
    # Fallback weather for hours without a forecast row: the most recent values
    # seen in training (a literal 0.0 is far outside the range of, e.g., pressure).
    last_weather = {col: X[col].iloc[-1] for col in WEATHER_COLS if col in feature_cols}

    def feature_builder(pm_series: pd.Series, weather_row, t_index):
        """
        Build a single-row DataFrame matching training features.
        pm_series: historical series including predictions appended; its last
          value is the hour immediately before t_index.
        weather_row: a Series of weather data for t_index (or None)
        t_index: timestamp for row
        """
        # time features (weekday is a method on Timestamp and must be called)
        row = {"hour": t_index.hour, "weekday": t_index.weekday(), "month": t_index.month}
        # weather: forecast row when available, otherwise last known values
        if weather_row is not None:
            row.update(weather_row.to_dict())
        else:
            row.update(last_weather)
        # Lags: pm_series ends one step before t_index, so lag_k is the k-th
        # value from the end. (shift(k).iloc[-1] would be one step too old and
        # disagree with the training features.)
        n_hist = len(pm_series)
        for lag in range(1, MAX_LAG + 1):
            row[f"lag_{lag}"] = pm_series.iloc[-lag] if lag <= n_hist else np.nan
        # rolling features over the most recent values (a shorter history
        # simply yields a shorter window)
        row["roll_mean_3"] = pm_series.iloc[-3:].mean()
        row["roll_mean_6"] = pm_series.iloc[-6:].mean()
        row["roll_mean_24"] = pm_series.iloc[-24:].mean()
        row["roll_std_24"] = pm_series.iloc[-24:].std() if n_hist >= 2 else 0.0

        # align columns to feature_cols (missing -> 0, extras dropped)
        df_temp = pd.DataFrame([row], index=[t_index]).reindex(columns=feature_cols, fill_value=0.0)
        # fill any NaNs (ffill/bfill replace the deprecated fillna(method=...))
        return df_temp.ffill().bfill().fillna(0.0)

    # do recursive forecast
    # NOTE(review): dropna() removes missing hours, so lags are taken by
    # position rather than by clock time when the history has gaps.
    last_pm_series = pm_df["pm25"].dropna()
    pred_series = recursive_forecast(model, last_pm_series, feature_builder, future_weather_df=future_weather, horizon=horizon, max_lag=MAX_LAG)

    # Display forecast plot
    st.subheader("Forecast (next hours)")
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=last_pm_series.tail(48).index, y=last_pm_series.tail(48).values,
                             mode="lines+markers", name="Observed PM2.5"))
    fig.add_trace(go.Scatter(x=pred_series.index, y=pred_series.values,
                             mode="lines+markers", name="Forecast PM2.5"))
    fig.update_layout(xaxis_title="Time (UTC)", yaxis_title="PM2.5 (µg/m³)", height=500)
    st.plotly_chart(fig, use_container_width=True)

    # show top features
    st.subheader("Feature importance (approx.)")
    try:
        importances = model.feature_importances_
        importance_df = pd.DataFrame({
            "feature": feature_cols,
            "importance": importances
        }).sort_values("importance", ascending=False).head(20)
        st.dataframe(importance_df)
    except Exception as e:
        st.write("Feature importance not available for this model type:", e)

    # show anomalies
    st.subheader("Anomalies flagged (recent)")
    st.dataframe(pm_df[pm_df["anomaly"]].tail(50))

    # allow download of forecast CSV
    csv = pd.concat([last_pm_series.tail(48), pred_series]).rename("pm25").to_csv()
    st.download_button("Download observed+forecast CSV", csv, file_name="pm25_observed_forecast.csv", mime="text/csv")

Writing streamlit_app.py


In [None]:
# Print the response of ipv4.icanhazip.com (this machine's public IPv4 address) to stdout.
# NOTE(review): presumably this is the value localtunnel asks for as the tunnel
# password — confirm. Clear this cell's output before sharing the notebook.
!wget -q -O - ipv4.icanhazip.com

34.169.210.57


In [None]:
# Start the Streamlit app in the background (&) and expose port 8501 — the
# port Streamlit reports in its "Local URL" — through localtunnel.
# The cell keeps running until it is interrupted.
!streamlit run streamlit_app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.169.210.57:8501[0m
[0m
[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0Kyour url is: https://all-boxes-act.loca.lt
[34m  Stopping...[0m
^C
