# TEMPO AQ Forecaster — MVP Notebook

Runs **offline** with synthetic data to demo the pipeline.

In [1]:
import os, json
from datetime import datetime, timedelta, timezone
import numpy as np, pandas as pd
from pathlib import Path
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from scripts.loaders import BBox, load_ground_observations, load_tempo_columns, load_weather
from scripts.fusion import simple_kalman, epa_aqi_pm25, feature_engineer
print('OK')

In [2]:
from scripts.fusion import simple_kalman, epa_aqi_pm25, feature_engineer

In [13]:
from sklearn.metrics import mean_absolute_error

In [3]:
from scripts.loaders import BBox, load_ground_observations, load_tempo_columns, load_weather

In [10]:
from sklearn.ensemble import GradientBoostingRegressor

In [5]:
start = datetime.now(timezone.utc) - timedelta(hours=72)
end   = datetime.now(timezone.utc)
bbox = BBox(-123.5, 47.0, -122.0, 48.0)
obs = load_ground_observations(bbox, start, end, pollutant='PM25')
sat = load_tempo_columns(bbox, start, end, var='NO2_column')
wx  = load_weather(bbox, start, end)
obs.head()

Unnamed: 0,station_id,ts,pollutant,value,units,qa_flag,source,source_url,ingested_at
0,STN001,2025-10-01 20:38:12.953187+00:00,PM25,6.586856,ug/m3,,synthetic,about:blank,2025-10-04 20:38:12.953630+00:00
1,STN001,2025-10-01 21:38:12.953187+00:00,PM25,10.32619,ug/m3,,synthetic,about:blank,2025-10-04 20:38:12.953632+00:00
2,STN001,2025-10-01 22:38:12.953187+00:00,PM25,9.656503,ug/m3,,synthetic,about:blank,2025-10-04 20:38:12.953633+00:00
3,STN001,2025-10-01 23:38:12.953187+00:00,PM25,8.785023,ug/m3,,synthetic,about:blank,2025-10-04 20:38:12.953633+00:00
4,STN001,2025-10-02 00:38:12.953187+00:00,PM25,9.478036,ug/m3,,synthetic,about:blank,2025-10-04 20:38:12.953634+00:00


In [16]:
obs

Unnamed: 0,station_id,ts,pollutant,value,units,qa_flag,source,source_url,ingested_at,tile_id
0,STN001,2025-10-01 20:38:12.953187+00:00,PM25,6.586856,ug/m3,,synthetic,about:blank,2025-10-04 20:38:12.953630+00:00,T001
1,STN001,2025-10-01 21:38:12.953187+00:00,PM25,10.326190,ug/m3,,synthetic,about:blank,2025-10-04 20:38:12.953632+00:00,T001
2,STN001,2025-10-01 22:38:12.953187+00:00,PM25,9.656503,ug/m3,,synthetic,about:blank,2025-10-04 20:38:12.953633+00:00,T001
3,STN001,2025-10-01 23:38:12.953187+00:00,PM25,8.785023,ug/m3,,synthetic,about:blank,2025-10-04 20:38:12.953633+00:00,T001
4,STN001,2025-10-02 00:38:12.953187+00:00,PM25,9.478036,ug/m3,,synthetic,about:blank,2025-10-04 20:38:12.953634+00:00,T001
...,...,...,...,...,...,...,...,...,...,...
214,STN003,2025-10-04 16:38:12.953187+00:00,PM25,10.503525,ug/m3,,synthetic,about:blank,2025-10-04 20:38:12.953888+00:00,T003
215,STN003,2025-10-04 17:38:12.953187+00:00,PM25,13.083869,ug/m3,,synthetic,about:blank,2025-10-04 20:38:12.953888+00:00,T003
216,STN003,2025-10-04 18:38:12.953187+00:00,PM25,12.738785,ug/m3,,synthetic,about:blank,2025-10-04 20:38:12.953889+00:00,T003
217,STN003,2025-10-04 19:38:12.953187+00:00,PM25,12.674446,ug/m3,,synthetic,about:blank,2025-10-04 20:38:12.953889+00:00,T003


In [6]:
sat_piv = sat.pivot_table(index=['tile_id','ts'], columns='var', values='value').reset_index()
wx_piv  = wx.pivot_table(index=['tile_id','ts'], columns='var', values='value').reset_index()
obs['tile_id'] = obs['station_id'].map({'STN001':'T001','STN002':'T002','STN003':'T003'})
merge = (obs.merge(sat_piv, on=['tile_id','ts'], how='left')
            .merge(wx_piv,  on=['tile_id','ts'], how='left'))
merge = merge.sort_values(['station_id','ts'])
merge.head()

Unnamed: 0,station_id,ts,pollutant,value,units,qa_flag,source,source_url,ingested_at,tile_id,NO2_column,PBLH,RH2,T2,U10,V10
0,STN001,2025-10-01 20:38:12.953187+00:00,PM25,6.586856,ug/m3,,synthetic,about:blank,2025-10-04 20:38:12.953630+00:00,T001,0.224594,450.0,55.0,293.0,2.0,-1.0
1,STN001,2025-10-01 21:38:12.953187+00:00,PM25,10.32619,ug/m3,,synthetic,about:blank,2025-10-04 20:38:12.953632+00:00,T001,0.25728,450.0,55.0,293.0,2.0,-1.0
2,STN001,2025-10-01 22:38:12.953187+00:00,PM25,9.656503,ug/m3,,synthetic,about:blank,2025-10-04 20:38:12.953633+00:00,T001,0.240259,450.0,55.0,293.0,2.0,-1.0
3,STN001,2025-10-01 23:38:12.953187+00:00,PM25,8.785023,ug/m3,,synthetic,about:blank,2025-10-04 20:38:12.953633+00:00,T001,0.298245,450.0,55.0,293.0,2.0,-1.0
4,STN001,2025-10-02 00:38:12.953187+00:00,PM25,9.478036,ug/m3,,synthetic,about:blank,2025-10-04 20:38:12.953634+00:00,T001,0.292169,450.0,55.0,293.0,2.0,-1.0


In [8]:
means, vars_ = [], []
for stn, g in merge.groupby('station_id'):
    m, v = g['value'].iloc[0], 4.0
    out_m, out_v = [], []
    for val in g['value']:
        m, v = simple_kalman(m, v, val, obs_var=1.5)
        out_m.append(m); out_v.append(v)
    means += out_m; vars_ += out_v
merge['nowcast_mean'] = means
merge['nowcast_std']  = np.sqrt(np.clip(vars_, 1e-6, None))
merge[['station_id','ts','value','nowcast_mean','nowcast_std']].tail()

Unnamed: 0,station_id,ts,value,nowcast_mean,nowcast_std
214,STN003,2025-10-04 16:06:30.187842+00:00,10.924299,8.289046,0.147043
215,STN003,2025-10-04 17:06:30.187842+00:00,12.458874,8.348298,0.145994
216,STN003,2025-10-04 18:06:30.187842+00:00,11.591706,8.39374,0.144968
217,STN003,2025-10-04 19:06:30.187842+00:00,11.17926,8.432227,0.143963
218,STN003,2025-10-04 20:06:30.187842+00:00,13.153301,8.496569,0.142979


In [14]:
feat = feature_engineer(merge)
X = feat[['no2_scaled','U10_norm','V10_norm','PBLH_norm','T2_norm','RH2_norm','hour','dow']]
y = feat['value']
cut = int(len(X)*0.8)
Xtr, Xte = X.iloc[:cut], X.iloc[cut:]
ytr, yte = y.iloc[:cut], y.iloc[cut:]
model = GradientBoostingRegressor(random_state=0)
model.fit(Xtr, ytr)
pred = model.predict(Xte)
mae = mean_absolute_error(yte, pred)
mae

1.1794247461391851

In [15]:
out = pd.DataFrame({'ts': feat.iloc[cut:]['ts'].values, 'pred_pm25': pred})
out['pred_aqi'] = out['pred_pm25'].apply(lambda v: epa_aqi_pm25(float(v)))
Path('artifacts').mkdir(exist_ok=True)
out.to_parquet('artifacts/sample_forecast.parquet', index=False)
out.head()

Unnamed: 0,ts,pred_pm25,pred_aqi
0,2025-10-03 01:38:12.953187,8.210398,34
1,2025-10-03 02:38:12.953187,7.223552,30
2,2025-10-03 03:38:12.953187,8.468001,35
3,2025-10-03 04:38:12.953187,7.507105,31
4,2025-10-03 05:38:12.953187,5.721175,24
