In [1]:
"""
Reproducible backtest harness:
- Loads outputs/live_dataset.csv (or synthesizes a small mock).
- Uses time-based 70/30 split.
- Evaluates current forecasters if available; otherwise uses a trivial baseline.
- Exports outputs/backtest/backtest_report.md and, when sentinel_runtime is importable, saves calibration charts via its save_all_calibration_artifacts().
"""
from __future__ import annotations
import os, json, math
from pathlib import Path
from datetime import datetime, timezone
import numpy as np
import pandas as pd

In [2]:
# Output root; the OUT environment variable overrides the default "outputs".
OUT = Path(os.environ.get("OUT", "outputs"))
OUT.mkdir(parents=True, exist_ok=True)

# Input dataset; defaults to <OUT>/live_dataset.csv unless LIVE_CSV is set.
LIVE_CSV = Path(os.environ.get("LIVE_CSV", OUT.joinpath("live_dataset.csv")))

# Backtest outputs get their own sub-directory under OUT.
BK_DIR = OUT.joinpath("backtest")
BK_DIR.mkdir(parents=True, exist_ok=True)
REPORT = BK_DIR.joinpath("backtest_report.md")

# One seeded generator shared by the notebook (used when synthesizing mock data).
SEED = int(os.environ.get("SEED", "42"))
rng = np.random.default_rng(seed=SEED)

In [3]:
def _ensure_live():
    """Return the live dataset, synthesizing and caching a mock when none exists.

    If ``LIVE_CSV`` exists it is read with the ``ts`` column parsed as dates.
    Otherwise a mock of 400 one-minute samples for each of two pools is drawn
    from the module-level ``rng``, written to ``LIVE_CSV`` and returned.

    Returns:
        pandas.DataFrame: the loaded frame, or the mock with columns
        ``ts``, ``pool``, ``dev``, ``anom_fused`` and ``feeds_fresh``.
    """
    if LIVE_CSV.exists():
        return pd.read_csv(LIVE_CSV, parse_dates=["ts"])

    ts = pd.date_range("2025-01-01", periods=400, freq="min", tz="UTC")
    rows = []
    for p in ["USDC/USDT-uni", "DAI/USDC-curve"]:
        dev = rng.normal(0, 0.0008, len(ts))
        # Shift five consecutive samples by +0.006 so the mock is not pure noise.
        dev[300:305] += 0.006
        rows += [
            {"ts": t, "pool": p, "dev": float(d), "anom_fused": 0.2, "feeds_fresh": True}
            for t, d in zip(ts, dev)
        ]
    df = pd.DataFrame(rows)

    # LIVE_CSV can be redirected through the environment to a directory that
    # nothing has created yet; make sure it exists before writing.
    LIVE_CSV.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(LIVE_CSV, index=False)
    return df

In [4]:
def _time_split_idx(ts, frac=0.70):
    ts = pd.to_datetime(ts, utc=True, errors="coerce")
    cut = ts.quantile(frac)
    tr = (ts <= cut).to_numpy().nonzero()[0]
    te = (ts >  cut).to_numpy().nonzero()[0]
    return tr, te
def _labels(df: pd.DataFrame, horizon=10, thr=0.005):
    y = []
    for p, g in df.sort_values(["pool","ts"]).groupby("pool"):
        v = g["dev"].abs().fillna(0).to_numpy()
        lab = np.zeros_like(v, dtype=int)
        for i in range(len(v)):
            lab[i] = 1 if (v[i+1:i+1+horizon] >= thr).any() else 0
        y += list(lab)
    return np.array(y, dtype=int)

In [5]:
def _predict(df, use_cols, model_loader=None):
    X = df[use_cols].replace([np.inf,-np.inf], np.nan).fillna(0.0).to_numpy()
    if model_loader:
        clf, calib = model_loader()
        if calib is not None:
            return calib.predict_proba(X)[:,1]
        if clf is not None and hasattr(clf, "predict_proba"):
            return clf.predict_proba(X)[:,1]
    z = (df["dev"].fillna(0) / (df["dev"].std() or 1)).clip(-5, 5)
    return 1/(1+np.exp(-z))
def _metric_ap(y_true, p):
    try:
        from sklearn.metrics import average_precision_score as AP
        return float(AP(y_true, p))
    except Exception:
        thr = np.linspace(0, 1, 11)
        prec = []
        for t in thr:
            pred = (p >= t).astype(int)
            tp = ((pred==1)&(y_true==1)).sum()
            fp = ((pred==1)&(y_true==0)).sum()
            fn = ((pred==0)&(y_true==1)).sum()
            prec.append(tp/(tp+fp+1e-9))
        return float(np.mean(prec))

In [6]:
def _metric_brier(y_true, p):
    return float(np.mean((p - y_true)**2))

In [7]:
def main():
    """Run the backtest and write the markdown report to ``REPORT``.

    Steps: load (or synthesize) the dataset, build 10- and 30-row-horizon
    labels, take the last 30% of rows by time as holdout, score the holdout
    with the ``sentinel_runtime`` forecasters when they can be imported and
    run (otherwise with the baseline in ``_predict``), compute average
    precision and Brier score, try to save calibration artifacts, then write
    the report and print its path.
    """
    df = _ensure_live()
    # Sort by (pool, ts) so row positions, labels and the time split all refer
    # to one and the same ordering of the rows.
    df = df.sort_values(["pool", "ts"]).reset_index(drop=True)

    candidate_cols = [
        "dev", "dev_roll_std", "tvl_outflow_rate", "spot_twap_gap_bps",
        "oracle_ratio", "anom_fused", "neighbor_max_dev", "neighbor_avg_anom",
        "corr_best", "lead_lag_best",
    ]
    use_cols = [c for c in candidate_cols if c in df.columns] or ["dev"]

    y10 = _labels(df, horizon=10, thr=0.005)
    y30 = _labels(df, horizon=30, thr=0.005)
    _train_idx, te = _time_split_idx(df["ts"], 0.70)
    holdout = df.iloc[te]

    try:
        from sentinel_runtime import _load_forecaster, _load_forecaster_30m
        p10 = _predict(holdout, use_cols, model_loader=_load_forecaster)
        p30 = _predict(holdout, use_cols, model_loader=_load_forecaster_30m)
    except Exception as exc:
        # Best effort: keep the backtest running, but say why the baseline was used.
        print(f"[backtest] forecasters unavailable ({type(exc).__name__}: {exc}); using baseline")
        p10 = _predict(holdout, use_cols, model_loader=None)
        p30 = _predict(holdout, use_cols, model_loader=None)

    ap10 = _metric_ap(y10[te], p10)
    ap30 = _metric_ap(y30[te], p30)
    br10 = _metric_brier(y10[te], p10)
    br30 = _metric_brier(y30[te], p30)

    try:
        from sentinel_runtime import save_all_calibration_artifacts
        save_all_calibration_artifacts()
    except Exception as exc:
        # Charts are optional; report the reason instead of hiding it.
        print(f"[backtest] calibration artifacts skipped ({type(exc).__name__}: {exc})")

    md = []
    md.append("# Backtest Report\n")
    md.append(f"- Generated: {datetime.now(timezone.utc).isoformat(timespec='seconds')}\n")
    md.append("## Holdout Metrics (30%)\n")
    md.append(f"- **10m**  AP={ap10:.3f}, Brier={br10:.4f}")
    md.append(f"- **30m**  AP={ap30:.3f}, Brier={br30:.4f}")
    md.append("\nArtifacts:\n")
    # Only list chart files that are actually present under OUT.
    for a in ["artifacts/calibration_10m.png", "artifacts/calibration_30m.png"]:
        if (OUT / a).exists():
            md.append(f"- {a}")
    # Explicit encoding so the report does not depend on the platform default.
    REPORT.write_text("\n".join(md), encoding="utf-8")
    print(f"[backtest] wrote {REPORT}")
# Entry point: run the backtest when this code is executed as the main module.
if __name__ == "__main__":
    main()

  return pd.read_csv(LIVE_CSV, parse_dates=["ts"])
  ts = pd.to_datetime(ts, utc=True, errors="coerce")


[backtest] wrote outputs\backtest\backtest_report.md


