# Summary pipeline (multi-tenor arb outcomes)
This notebook runs the event-study pipeline for the `arb_*` outcomes across tenors, using only real data files stored under this repository's `data/` directory (no synthetic seeds).


In [None]:
from __future__ import annotations

import ast
import hashlib
import json
import logging
import shutil
from datetime import datetime, timezone
from pathlib import Path

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.diagnostic import acorr_ljungbox

from slr_bucket.econometrics.event_study import add_event_time, event_study_regression, jump_estimator
from slr_bucket.io import build_data_catalog, load_any_table, resolve_dataset_path


In [None]:
# Run configuration. It is hashed below so that every output directory name
# carries a fingerprint of the exact settings that produced it.
CONFIG = {
    "outcomes_source": "tips_treasury_implied_rf_2010",
    "outcome_pattern": "arb_",
    "tenors_required": [2, 5, 10],
    "events": ["2020-04-01", "2021-03-19", "2021-03-31"],
    "windows": [20, 60],
    "event_bins": [(-60, -41), (-40, -21), (-20, -1), (0, 0), (1, 20), (21, 40), (41, 60)],
    "total_controls": ["VIX", "HY_OAS", "BAA10Y", "issu_7_bil", "issu_14_bil", "issu_30_bil"],
    "direct_controls": ["VIX", "HY_OAS", "BAA10Y", "issu_7_bil", "issu_14_bil", "issu_30_bil", "SOFR", "spr_tgcr", "spr_effr"],
    "hac_lags": 5,
}
# All input and output paths are resolved relative to the current working directory.
repo_root = Path.cwd()
# First 12 hex characters of the SHA-256 of the key-sorted JSON dump of CONFIG.
cfg_hash = hashlib.sha256(json.dumps(CONFIG, sort_keys=True).encode()).hexdigest()[:12]
# UTC timestamp of the run. datetime.utcnow() is deprecated since Python 3.12;
# the timezone-aware call below formats to exactly the same string.
run_stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
run_dir = repo_root / "outputs" / "summary_pipeline" / f"{run_stamp}_{cfg_hash}"
for sub in ["figures", "tables", "data", "logs"]:
    (run_dir / sub).mkdir(parents=True, exist_ok=True)
latest_dir = repo_root / "outputs" / "summary_pipeline" / "latest"
# force=True replaces handlers left over from an earlier execution of this cell,
# so log records go to this run's log file only.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s - %(message)s",
    handlers=[logging.FileHandler(run_dir / "logs" / "pipeline.log"), logging.StreamHandler()],
    force=True,
)
logger = logging.getLogger("summary_pipeline_multi")
run_dir


## Data map for the new `/data` structure
Used in this run:
- Outcomes: `data/series/tips_treasury_implied_rf_2010.(parquet|csv)` (`arb_*` only).
- Controls (preferred): `data/intermediate/analysis_panel.csv` if all required columns are present.
- Controls (fallback):
  - `data/raw/event_inputs/controls_vix_creditspreads_fred.(parquet|csv)`
  - `data/raw/event_inputs/repo_rates_combined.(parquet|csv)`, or `repo_rates_fred` from the same directory if the combined file is not found
  - `data/raw/event_inputs/treasury_issuance_by_tenor_fiscaldata.(parquet|csv)`
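- Mechanism proxies (optional; both are looked up in `data/raw/event_inputs/`, and the mechanism step is skipped if either cannot be loaded):
  - `primary_dealer_stats_ofr_stfm_nypd_long`
  - `bank_exposure_y9c_agg_daily`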


In [None]:
# Build the data catalog for <repo_root>/data and store a copy of it, in three
# formats, inside this run's data folder.
catalog = build_data_catalog(repo_root / "data")
_catalog_stem = run_dir / "data" / "data_catalog"
catalog.to_csv(_catalog_stem.with_suffix(".csv"), index=False)
catalog.to_parquet(_catalog_stem.with_suffix(".parquet"), index=False)
catalog.to_markdown(_catalog_stem.with_suffix(".md"), index=False)
catalog.head(20)


In [None]:
# Load the outcome table and reshape its arb_<tenor> columns into a long frame
# with one row per (date, tenor): columns date, outcome, y, tenor.
outcome_path = resolve_dataset_path(CONFIG["outcomes_source"], expected_dir=repo_root / "data" / "series")
outcomes = load_any_table(outcome_path)
outcomes["date"] = pd.to_datetime(outcomes["date"], errors="coerce")

# Select outcome columns with the same pattern that is used below to extract the
# tenor, so a column such as "arb_2y" or "arb_spread" can no longer crash the
# integer sort key. Columns without a numeric tenor are left out; they would
# have produced no rows anyway because their tenor would be missing.
_str_cols = pd.Series([c for c in outcomes.columns if isinstance(c, str)], dtype="object")
_tenor_tokens = _str_cols.str.extract(r"^arb_(\d+)", expand=False)
_arb_matches = [(col, int(tok)) for col, tok in zip(_str_cols, _tenor_tokens) if pd.notna(tok)]
# sorted() is stable, so columns sharing a tenor keep their original order.
arb_cols = [col for col, _ in sorted(_arb_matches, key=lambda match: match[1])]
if not arb_cols:
    raise ValueError("No arb_* outcome columns found.")

# Report configured tenors that the data does not provide (warning only).
_missing_tenors = sorted(set(CONFIG.get("tenors_required", [])) - {tenor for _, tenor in _arb_matches})
if _missing_tenors:
    logging.getLogger("summary_pipeline_multi").warning(
        "Required tenors missing from %s: %s", outcome_path, _missing_tenors
    )

arb_long = outcomes[["date", *arb_cols]].melt(id_vars=["date"], var_name="outcome", value_name="y")
arb_long["tenor"] = arb_long["outcome"].str.extract(r"^arb_(\d+)", expand=False).astype(float).astype("Int64")
arb_long["y"] = pd.to_numeric(arb_long["y"], errors="coerce")
arb_long = arb_long.dropna(subset=["date", "tenor", "y"]).sort_values(["tenor", "date"]).reset_index(drop=True)

# Rough unit heuristic based on the median absolute value of y.
value_q = arb_long["y"].abs().quantile([0.5, 0.9, 0.99]).to_dict()
unit_note = "Likely basis points" if value_q.get(0.5, 0) > 0.2 else "Likely decimal units"
{"outcome_path": str(outcome_path), "arb_cols": arb_cols, "value_q": value_q, "unit_note": unit_note}


In [None]:
def _build_controls() -> tuple[pd.DataFrame, str]:
    """Assemble the control variables used by the regressions.

    The prebuilt ``analysis_panel`` under ``data/intermediate`` is preferred and
    is used when it contains every column in ``CONFIG["direct_controls"]``.
    Otherwise the controls are rebuilt from the raw inputs under
    ``data/raw/event_inputs``: credit/volatility controls, repo rates (with
    SOFR spreads derived when absent) and Treasury issuance pivoted to one
    column per tenor bucket, in billions.

    Returns:
        A ``(controls, source)`` pair. ``source`` is the analysis panel path
        when the panel was used, otherwise ``"raw/event_inputs fallback"``.

    Raises:
        Any error raised while locating or loading the fallback inputs. Errors
        on the analysis-panel path are logged and trigger the fallback instead.
    """
    required = set(CONFIG["direct_controls"])
    try:
        panel_path = resolve_dataset_path("analysis_panel", expected_dir=repo_root / "data" / "intermediate")
        panel = load_any_table(panel_path)
        panel["date"] = pd.to_datetime(panel["date"], errors="coerce")
        missing = sorted(required.difference(panel.columns))
        if not missing:
            return panel[["date", *sorted(required)]].copy(), str(panel_path)
        # Previously this case fell through silently; record why the fallback is taken.
        logger.warning("analysis_panel lacks required columns %s; using raw/event_inputs fallback", missing)
    except Exception as exc:  # noqa: BLE001 - deliberate best effort, fallback follows
        logger.warning("analysis_panel not usable: %s", exc)

    raw_dir = repo_root / "data" / "raw" / "event_inputs"

    macro = load_any_table(resolve_dataset_path("controls_vix_creditspreads_fred", expected_dir=raw_dir))
    macro["date"] = pd.to_datetime(macro["date"], errors="coerce")

    try:
        repo = load_any_table(resolve_dataset_path("repo_rates_combined", expected_dir=raw_dir))
    except FileNotFoundError:
        repo = load_any_table(resolve_dataset_path("repo_rates_fred", expected_dir=raw_dir))
    repo["date"] = pd.to_datetime(repo["date"], errors="coerce")
    # Derive the SOFR spreads only when the file does not already provide them.
    for spread_col, other_rate in (("spr_tgcr", "TGCR"), ("spr_effr", "EFFR")):
        if spread_col not in repo.columns and {"SOFR", other_rate}.issubset(repo.columns):
            repo[spread_col] = pd.to_numeric(repo["SOFR"], errors="coerce") - pd.to_numeric(repo[other_rate], errors="coerce")

    issuance = load_any_table(resolve_dataset_path("treasury_issuance_by_tenor_fiscaldata", expected_dir=raw_dir))
    issuance["date"] = pd.to_datetime(issuance["issue_date"], errors="coerce")
    issuance["tenor_bucket"] = pd.to_numeric(issuance["tenor_bucket"], errors="coerce")
    # Divide by 1e9 to match the "_bil" column names.
    issuance["issuance_amount"] = pd.to_numeric(issuance["issuance_amount"], errors="coerce") / 1e9
    wide = issuance.pivot_table(index="date", columns="tenor_bucket", values="issuance_amount", aggfunc="sum").reset_index()
    wide = wide.rename(columns={7.0: "issu_7_bil", 14.0: "issu_14_bil", 30.0: "issu_30_bil"})
    issuance_cols = ["issu_7_bil", "issu_14_bil", "issu_30_bil"]
    for col in issuance_cols:
        if col not in wide.columns:
            wide[col] = np.nan  # bucket absent from the data: keep the column, all-NaN

    repo_cols = [c for c in ["date", "SOFR", "spr_tgcr", "spr_effr"] if c in repo.columns]
    ctrl = macro.merge(repo[repo_cols], on="date", how="outer")
    # NOTE(review): dates with no issuance row stay NaN (not 0) after this left
    # merge, so any later dropna() on these controls removes those dates - confirm intent.
    ctrl = ctrl.merge(wide[["date", *issuance_cols]], on="date", how="left")
    return ctrl, "raw/event_inputs fallback"

# Build the controls once; `controls_source` records which input was used
# (the analysis panel path or the raw/event_inputs fallback label).
controls, controls_source = _build_controls()
controls.head()


In [None]:
# Attach the controls to every outcome row by date (left join: outcome rows are
# always kept), then persist the merged long panel for this run.
panel_long = pd.merge(arb_long, controls, how="left", on="date")
_present_controls = [name for name in CONFIG["direct_controls"] if name in panel_long.columns]
for _ctrl in _present_controls:
    # Values that cannot be parsed as numbers become NaN.
    panel_long[_ctrl] = pd.to_numeric(panel_long[_ctrl], errors="coerce")
panel_long.to_parquet(run_dir / "data" / "arb_panel_long.parquet", index=False)
panel_long.head()


In [None]:
# Descriptive statistics of y for each tenor within each regime window,
# written to tables/summary_stats.csv.
regimes = {
    "pre": (pd.Timestamp("2019-01-01"), pd.Timestamp("2020-03-31")),
    "relief": (pd.Timestamp("2020-04-01"), pd.Timestamp("2021-03-31")),
    "post": (pd.Timestamp("2021-04-01"), pd.Timestamp.max),
}
# Output column name -> quantile level.
_quantile_levels = {"p1": 0.01, "p5": 0.05, "p50": 0.50, "p95": 0.95, "p99": 0.99}
rows = []
for tenor, g in panel_long.groupby("tenor"):
    g = g.sort_values("date")
    for regime, (start, end) in regimes.items():
        sub = g[(g["date"] >= start) & (g["date"] <= end)][["date", "y"]].dropna()
        if sub.empty:
            continue
        y = sub["y"]
        lb_p = np.nan
        try:
            # Lag count grows with the sample (N // 5) but is kept within [1, 10].
            lb = acorr_ljungbox(y, lags=[min(10, max(1, len(sub) // 5))], return_df=True)
            lb_p = float(lb["lb_pvalue"].iloc[0])
        except Exception as exc:  # noqa: BLE001 - the statistic is optional
            # Keep the row with a NaN p-value, but record the failure instead of hiding it.
            logging.getLogger("summary_pipeline_multi").warning(
                "Ljung-Box test failed for tenor=%s regime=%s (N=%d): %s", tenor, regime, len(sub), exc
            )
        rows.append({
            "tenor": int(tenor), "regime": regime,
            "sample_start": sub["date"].min(), "sample_end": sub["date"].max(), "N": int(sub.shape[0]),
            "mean": y.mean(), "std": y.std(),
            **{name: y.quantile(level) for name, level in _quantile_levels.items()},
            "autocorr1": y.autocorr(1), "ljungbox_pvalue": lb_p,
        })
summary_stats = pd.DataFrame(rows)
summary_stats.to_csv(run_dir / "tables" / "summary_stats.csv", index=False)
summary_stats.head()


In [None]:
# Jump estimates for every event x window x tenor x control specification,
# written to tables/jump_results.csv with a +/- 1.96 * se interval.
_jump_specs = [("TOTAL", CONFIG["total_controls"]), ("DIRECT", CONFIG["direct_controls"])]
jump_rows = []
for event in CONFIG["events"]:
    for window in CONFIG["windows"]:
        for tenor, g in panel_long.groupby("tenor"):
            for spec, controls_set in _jump_specs:
                est, se, n = jump_estimator(
                    g,
                    y_col="y",
                    event_date=event,
                    window=window,
                    controls=controls_set,
                    hac_lags=CONFIG["hac_lags"],
                )
                # The interval is reported only when both the estimate and its SE exist.
                if pd.notna(est) and pd.notna(se):
                    ci_low, ci_high = est - 1.96 * se, est + 1.96 * se
                else:
                    ci_low = ci_high = np.nan
                jump_rows.append({
                    "event": event,
                    "window": window,
                    "tenor": int(tenor),
                    "spec": spec,
                    "estimate": est,
                    "se": se,
                    "ci_low": ci_low,
                    "ci_high": ci_high,
                    "N": n,
                })
jump_results = pd.DataFrame(jump_rows)
jump_results.to_csv(run_dir / "tables" / "jump_results.csv", index=False)
jump_results.head()


In [None]:
import matplotlib.pyplot as plt

# Binned event-study regressions per event x tenor x control specification.
# Each non-empty result is collected for the CSV and drawn as one PNG.
_bin_specs = [("TOTAL", CONFIG["total_controls"]), ("DIRECT", CONFIG["direct_controls"])]
bin_rows = []
for event in CONFIG["events"]:
    for tenor, g in panel_long.groupby("tenor"):
        for spec, controls_set in _bin_specs:
            es = event_study_regression(
                g,
                y_col="y",
                event_date=event,
                bins=CONFIG["event_bins"],
                controls=controls_set,
                hac_lags=CONFIG["hac_lags"],
            )
            if es.empty:
                continue
            es["event"] = event
            es["tenor"] = int(tenor)
            es["spec"] = spec
            bin_rows.append(es)

            # NOTE(review): points are ordered by sorting "term"; if the terms are
            # string labels this order is lexicographic, not chronological - confirm.
            plot_df = es.sort_values("term")
            fig, ax = plt.subplots(figsize=(8, 4))
            ax.plot(plot_df["term"], plot_df["estimate"], marker="o")
            ax.fill_between(plot_df["term"], plot_df["ci_low"], plot_df["ci_high"], alpha=0.2)
            ax.axhline(0, color="black", linewidth=1)
            ax.tick_params(axis="x", rotation=45)
            ax.set_title(f"Event={event} tenor={int(tenor)} spec={spec}")
            fig.tight_layout()
            _figure_path = run_dir / "figures" / f"event_path_arb_{int(tenor)}y_{event}_{spec.lower()}.png"
            fig.savefig(_figure_path, dpi=150)
            plt.close(fig)  # release the figure

if bin_rows:
    eventstudy_bins = pd.concat(bin_rows, ignore_index=True)
else:
    eventstudy_bins = pd.DataFrame()
eventstudy_bins.to_csv(run_dir / "tables" / "eventstudy_bins.csv", index=False)
eventstudy_bins.head()


In [None]:
# Pooled (all tenors) post-event regressions with tenor fixed effects, one per
# event x control specification, with HAC standard errors.
# NOTE(review): rows of several tenors are stacked in one sample, so the HAC lag
# structure runs across tenor boundaries - confirm this is intended.
pooled_rows = []
models = []
_pooled_specs = [("TOTAL", CONFIG["total_controls"]), ("DIRECT", CONFIG["direct_controls"])]
for event in CONFIG["events"]:
    sub = add_event_time(panel_long, event)
    sub = sub[sub["event_time"].between(-60, 60)].copy()
    sub["post"] = (sub["event_time"] >= 0).astype(int)
    for spec, controls_set in _pooled_specs:
        # Only controls that exist in the panel enter the formula.
        use_controls = [c for c in controls_set if c in sub.columns]
        reg = sub[["y", "post", "tenor", *use_controls]].dropna().copy()
        if reg.empty:
            continue
        rhs = " + ".join(["post", "C(tenor)", *use_controls])
        res = ols(f"y ~ {rhs}", data=reg).fit()
        robust = res.get_robustcov_results(cov_type="HAC", maxlags=CONFIG["hac_lags"])
        models.append(robust)
        exog_names = robust.model.exog_names
        if "post" not in exog_names:
            continue
        # Coefficients are read by position, looked up through the model's exog names.
        i = exog_names.index("post")
        pooled_rows.append({
            "event": event,
            "spec": spec,
            "post": robust.params[i],
            "se": robust.bse[i],
            "N": int(robust.nobs),
        })

_pooled_table = pd.DataFrame(pooled_rows)
_pooled_table.to_csv(run_dir / "tables" / "pooled_jump_results.csv", index=False)

# Regression table as HTML; a placeholder page is written when there are no
# models or when producing the table raises.
html_path = run_dir / "tables" / "regression_table.html"
try:
    from stargazer.stargazer import Stargazer
    if not models:
        _table_html = "<html><body><p>No pooled models available.</p></body></html>"
    else:
        sg = Stargazer(models)
        sg.title("Pooled jump regressions (HAC SE)")
        _table_html = sg.render_html()
    html_path.write_text(_table_html, encoding="utf-8")
except Exception as exc:  # noqa: BLE001
    html_path.write_text(f"<html><body><p>Stargazer unavailable: {exc}</p></body></html>", encoding="utf-8")

_pooled_table.head()


In [None]:
# Layer 2 (optional): weekly mechanism regression of the average outcome on a
# relief-period dummy interacted with standardized exemption share and lagged
# dealer utilization. Any failure skips the step; the reason is kept in
# `layer2_note` and logged.
layer2_note = ""
try:
    pd_long = load_any_table(resolve_dataset_path("primary_dealer_stats_ofr_stfm_nypd_long", expected_dir=repo_root / "data" / "raw" / "event_inputs"))
    bank = load_any_table(resolve_dataset_path("bank_exposure_y9c_agg_daily", expected_dir=repo_root / "data" / "raw" / "event_inputs"))

    pd_long["date"] = pd.to_datetime(pd_long["date"], errors="coerce")
    bank["date"] = pd.to_datetime(bank["date"], errors="coerce")

    # Weekly (Friday-ending) mean per mnemonic; the index is the row-wise sum
    # over all mnemonics, lagged by one week.
    util_w = pd_long.pivot_table(index="date", columns="mnemonic", values="value", aggfunc="mean").resample("W-FRI").mean()
    util_w["utilization_index"] = util_w.sum(axis=1, min_count=1)
    util_w["utilization_lag1w"] = util_w["utilization_index"].shift(1)

    # Select the column BEFORE resampling: averaging the whole frame raises
    # TypeError on pandas >= 2.0 if `bank` holds any non-numeric column, which
    # would skip Layer 2 for an unrelated reason.
    bank_w = bank.set_index("date")[["agg_exempt_share"]].resample("W-FRI").mean()
    # Weekly mean of y over all tenors and days in the week.
    y_w = panel_long.groupby([pd.Grouper(key="date", freq="W-FRI")])["y"].mean().to_frame("y")
    c_w = panel_long.set_index("date")[[c for c in CONFIG["direct_controls"] if c in panel_long.columns]].resample("W-FRI").mean()

    mech = y_w.join([bank_w, util_w[["utilization_lag1w"]], c_w], how="inner").dropna()
    mech["relief"] = ((mech.index >= "2020-04-01") & (mech.index <= "2021-03-31")).astype(int)
    # z-scores are computed over the weeks that survive the join and dropna.
    mech["z_exempt"] = (mech["agg_exempt_share"] - mech["agg_exempt_share"].mean()) / mech["agg_exempt_share"].std()
    mech["z_util_l1"] = (mech["utilization_lag1w"] - mech["utilization_lag1w"].mean()) / mech["utilization_lag1w"].std()
    mech["relief_x_exempt"] = mech["relief"] * mech["z_exempt"]
    mech["relief_x_util"] = mech["relief"] * mech["z_util_l1"]

    xcols = ["relief", "relief_x_exempt", "relief_x_util", *[c for c in CONFIG["direct_controls"] if c in mech.columns]]
    reg = mech[["y", *xcols]].dropna()
    X = sm.add_constant(reg[xcols], has_constant="add")
    # Weekly data: HAC with 2 lags here, independent of CONFIG["hac_lags"].
    res = sm.OLS(reg["y"], X).fit(cov_type="HAC", cov_kwds={"maxlags": 2})
    pd.DataFrame({"term": res.params.index, "coef": res.params.values, "se": res.bse.values}).to_csv(run_dir / "tables" / "layer2_mechanism_weekly.csv", index=False)
    layer2_note = "Layer 2 executed."
except Exception as exc:  # noqa: BLE001 - the step is optional by design
    layer2_note = f"Layer 2 skipped gracefully: {exc}"
    # Also record the reason in the run log, not only in the README note.
    logging.getLogger("summary_pipeline_multi").warning("Layer 2 skipped: %s", exc)

layer2_note


In [None]:
# Finalize the run: write the README into the run directory first, then refresh
# the `latest` snapshot. The previous order copied the run directory before the
# README existed, so `latest` never contained it.
notes = {
    "run_dir": str(run_dir),
    "controls_source": controls_source,
    "outcomes_source": str(outcome_path),
    "arb_columns": arb_cols,
    "unit_note": unit_note,
    "layer2_note": layer2_note,
}
(run_dir / "README.md").write_text("# Summary pipeline run\n\n```json\n" + json.dumps(notes, indent=2) + "\n```\n", encoding="utf-8")

# Replace any earlier snapshot with a full copy of this run.
if latest_dir.exists():
    shutil.rmtree(latest_dir)
shutil.copytree(run_dir, latest_dir)

notes
