In [1]:
# level41_roll_spread_estimator.py
# Python-only, free data (yfinance). Roll (1984) effective spread estimator from daily closes.
# Outputs:
#   - CSV : level41_roll_timeseries.csv (Close, dP, gamma1, spread_$, spread_bps, valid flags)
#   - JSON: level41_roll_metrics.json (summary stats)
# Usage:
#   python level41_roll_spread_estimator.py
# Notes:
#   - Works best on high-liquidity names. Daily closes are a proxy; tick data is superior but not free.

import numpy as np
import pandas as pd
import yfinance as yf
from dataclasses import dataclass, asdict
from typing import Optional, Dict
import json

# ----------------------------- Config -----------------------------
@dataclass
class Config:
    ticker: str = "SPY"
    start: str = "2005-01-01"
    end: Optional[str] = None
    auto_adjust: bool = True           # True -> 'Close' is adjusted
    window: int = 60                   # rolling window for autocovariance
    min_obs: int = 40                  # minimum obs to trust estimate
    clip_bps: float = 200.0            # clip extreme bps (sanity cap) for reporting only

CFG = Config()

# ----------------------------- Loader -----------------------------
def load_adjusted_close(tickers, start, end=None, auto_adjust=True) -> pd.DataFrame:
    raw = yf.download(tickers, start=start, end=end, auto_adjust=auto_adjust, progress=False)
    price_key = "Close" if auto_adjust else "Adj Close"
    if isinstance(raw.columns, pd.MultiIndex):
        out = raw[price_key].copy()
    else:
        name = tickers if isinstance(tickers, str) else tickers[0]
        out = raw[[price_key]].rename(columns={price_key: name})
    cols = tickers if isinstance(tickers, (list, tuple)) else [tickers]
    out = out.reindex(columns=cols).dropna(how="all")
    return out

# ----------------------------- Roll Core -----------------------------
def roll_autocov_lag1(dp: pd.Series, window: int, min_obs: int) -> pd.Series:
    """Rolling first-order autocovariance of price changes (dp vs dp.shift(1))."""
    x = dp
    xlag = dp.shift(1)
    m0 = x.rolling(window, min_periods=min_obs).mean()
    m1 = xlag.rolling(window, min_periods=min_obs).mean()
    cov = ((x - m0) * (xlag - m1)).rolling(window, min_periods=min_obs).mean()
    return cov

def roll_spread_from_cov(gamma1: pd.Series) -> pd.Series:
    """Roll (1984) effective spread s = 2*sqrt(-gamma1) when gamma1<0; else NaN."""
    neg = (-gamma1).clip(lower=0.0)
    return 2.0 * np.sqrt(neg)

# ----------------------------- KPIs -----------------------------
def summarize(close: pd.Series, spread_abs: pd.Series) -> Dict[str, float]:
    # Convert to bps for stats too
    bps = 1e4 * (spread_abs / close)
    valid = spread_abs.notna()
    out = {
        "avg_spread_bps": float(bps[valid].mean()) if valid.any() else np.nan,
        "median_spread_bps": float(bps[valid].median()) if valid.any() else np.nan,
        "p90_spread_bps": float(bps[valid].quantile(0.90)) if valid.any() else np.nan,
        "pct_valid_days": float(100 * valid.mean()),
        "n_days": int(len(close)),
        "window": int(CFG.window),
        "min_obs": int(CFG.min_obs),
    }
    return out

# ----------------------------- Pipeline -----------------------------
def build_roll_series(cfg: Config):
    px = load_adjusted_close(cfg.ticker, start=cfg.start, end=cfg.end, auto_adjust=cfg.auto_adjust)
    if isinstance(px, pd.DataFrame):
        close = px[cfg.ticker].dropna()
    else:
        close = px.dropna()
        close.name = cfg.ticker

    dp = close.diff()
    gamma1 = roll_autocov_lag1(dp, cfg.window, cfg.min_obs)

    # Roll spread (absolute $)
    spread_abs = roll_spread_from_cov(gamma1)

    # Validity flag: only when gamma1 < 0 and enough obs
    valid = (gamma1 < 0) & gamma1.notna()

    # Spread in bps
    spread_bps = 1e4 * (spread_abs / close)
    # For reporting (optional), cap crazy tails (won't affect CSV raw values)
    spread_bps_report = spread_bps.clip(upper=cfg.clip_bps)

    out = pd.DataFrame({
        "Close": close,
        "dP": dp,
        "gamma1": gamma1,
        "spread_$": spread_abs,
        "spread_bps": spread_bps,
        "valid": valid.astype(float)  # 1.0/0.0 for easy averaging
    }).dropna(subset=["Close"])

    metrics = summarize(close, spread_abs)

    return out, metrics, {
        "config": asdict(cfg),
        "ticker": cfg.ticker,
        "avg_spread_bps_report_capped": float(spread_bps_report.dropna().mean()) if spread_bps_report.notna().any() else np.nan
    }

# ----------------------------- Main -----------------------------
def main():
    out, metrics, diag = build_roll_series(CFG)
    ts_path = "level41_roll_timeseries.csv"
    m_path  = "level41_roll_metrics.json"
    d_path  = "level41_roll_diagnostics.json"

    out.to_csv(ts_path, index=True)
    with open(m_path, "w") as f:
        json.dump(metrics, f, indent=2)
    with open(d_path, "w") as f:
        json.dump(diag, f, indent=2)

    print(f"[OK] Saved timeseries  → {ts_path}")
    print(f"[OK] Saved metrics    → {m_path}")
    print(f"[OK] Saved diagnostics→ {d_path}")
    print("Metrics summary:", {k: round(v, 4) if v==v else None for k, v in metrics.items()})

if __name__ == "__main__":
    main()


[OK] Saved timeseries  → level41_roll_timeseries.csv
[OK] Saved metrics    → level41_roll_metrics.json
[OK] Saved diagnostics→ level41_roll_diagnostics.json
Metrics summary: {'avg_spread_bps': 46.1314, 'median_spread_bps': 32.4737, 'p90_spread_bps': 98.5705, 'pct_valid_days': 98.4753, 'n_days': 5247, 'window': 60, 'min_obs': 40}
