# Summary pipeline: multi-strategy arbitrage event study

This notebook builds a stacked panel of arbitrage spreads (TIPS–Treasury, Treasury spot–futures, CIP, equity spot–futures), merges common controls, and runs Layer 1 + Layer 2 designs.


In [1]:
from __future__ import annotations

import hashlib
import json
import logging
import shutil
from datetime import datetime
from pathlib import Path
import sys, os
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.diagnostic import acorr_ljungbox
sys.path.insert(2, "../src")
if 'src' in os.getcwd():
    os.chdir(os.path.pardir)
    print(os.getcwd())
else:
    print(os.getcwd())
from slr_bucket.econometrics.event_study import add_event_time, event_study_regression, jump_estimator
from slr_bucket.io import build_data_catalog, load_any_table, resolve_dataset_path, as_daily_date, coerce_num, keep_controls_with_coverage


c:\Users\Owner\Box\Winter26\slr_bucket\notebooks


In [2]:
# CONFIG (multi-strategy)
CONFIG = {
    # Outcomes (loaded from data/series via slr_bucket.outcomes.stack_outcomes)
    "y_col": "y_abs_bps",   # analyze dislocation magnitude; set to "y_bps" for signed
    "events": ["2020-04-01", "2021-03-19", "2021-03-31"],
    "windows": [20, 60],
    "event_bins": [(-60,-41),(-40,-21),(-20,-1),(0,0),(1,20),(21,40),(41,60)],
    # Controls
    "total_controls": ["VIX", "HY_OAS", "BAA10Y", "issu_7_bil", "issu_14_bil", "issu_30_bil"],
    "direct_controls": ["VIX", "HY_OAS", "BAA10Y", "issu_7_bil", "issu_14_bil", "issu_30_bil", "SOFR", "spr_tgcr", "spr_effr"],
    "hac_lags": 5,
    "run_layer2": True,
    # panel keys
    "fe_col": "series",
    "strategy_col": "strategy",
    "group_col": "treasury_based",
}
cfg_hash = hashlib.sha256(json.dumps(CONFIG, sort_keys=True).encode()).hexdigest()[:12]
run_stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
repo_root = Path.cwd().parent
run_dir = repo_root / "outputs" / "summary_pipeline" / f"{run_stamp}_{cfg_hash}"
(run_dir / "tables").mkdir(parents=True, exist_ok=True)
(run_dir / "figures").mkdir(parents=True, exist_ok=True)
(run_dir / "data").mkdir(parents=True, exist_ok=True)
print(run_dir)


c:\Users\Owner\Box\Winter26\slr_bucket\outputs\summary_pipeline\20260227_051043_cd72a0e15615


  run_stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")


## Data catalog and dataset inventory

New `/data` structure uses layered folders (`raw`, `intermediate`, `series`). This run uses:
- Outcomes: `data/series/tips_treasury_implied_rf_2010.(parquet|csv)` (`arb_*`).
- Preferred merged controls: `data/intermediate/analysis_panel.csv` (if valid for required columns).
- Fallback controls from raw inputs:
  - `raw/event_inputs/controls_vix_creditspreads_fred`
  - `raw/event_inputs/repo_rates_combined` (or `repo_rates_fred`)
  - `raw/event_inputs/treasury_issuance_by_tenor_fiscaldata`
- Layer 2 proxies (optional):
  - `raw/event_inputs/primary_dealer_stats_ofr_stfm_nypd_long`
  - `raw/event_inputs/bank_exposure_y9c_agg_daily.csv`


In [3]:
catalog = build_data_catalog(repo_root / "data")
catalog.to_csv(run_dir / "data" / "data_catalog.csv", index=False)
catalog.to_parquet(run_dir / "data" / "data_catalog.parquet", index=False)
catalog.to_markdown(run_dir / "data" / "data_catalog.md", index=False)
catalog.head(10)


Unnamed: 0,path,layer,rows,columns,frequency,date_min,date_max,key_columns,join_hints
0,c:\Users\Owner\Box\Winter26\slr_bucket\data\in...,intermediate,5476,"date,spread_2y_bps,spread_5y_bps,spread_10y_bp...",daily,2010-01-04,2024-12-31,date,daily:date | keys:date | layer:intermediate
1,c:\Users\Owner\Box\Winter26\slr_bucket\data\in...,intermediate,420,"date,bid_ask_spread,pubout,n_issues",monthly,1980-01-31,2014-12-31,date,keys:date | layer:intermediate
2,c:\Users\Owner\Box\Winter26\slr_bucket\data\in...,intermediate,1209,"date,fed_assets",weekly,2002-12-18,2026-02-11,date,weekly:date | keys:date | layer:intermediate
3,c:\Users\Owner\Box\Winter26\slr_bucket\data\in...,intermediate,1209,"date,fed_treasury_holdings",weekly,2002-12-18,2026-02-11,date,weekly:date | keys:date | layer:intermediate
4,c:\Users\Owner\Box\Winter26\slr_bucket\data\in...,intermediate,751,"date,sofr,sofr_volume",daily,2019-01-02,2021-12-31,date,daily:date | keys:date | layer:intermediate
5,c:\Users\Owner\Box\Winter26\slr_bucket\data\in...,intermediate,3752,"date,spread_2y_bps,spread_5y_bps,spread_10y_bp...",daily,2010-01-04,2024-12-31,date,daily:date | keys:date | layer:intermediate
6,c:\Users\Owner\Box\Winter26\slr_bucket\data\ra...,raw,3955,"Date,AUD,CAD,CHF,EUR,GBP,JPY,NZD,SEK,USD",unknown,NaT,NaT,,layer:raw
7,c:\Users\Owner\Box\Winter26\slr_bucket\data\ra...,raw,3913,"('SPX Index', 'PX_LAST'),('SPX Index', 'IDX_ES...",unknown,NaT,NaT,,layer:raw
8,c:\Users\Owner\Box\Winter26\slr_bucket\data\ra...,raw,14,"report_date,total_assets,total_reserves,total_...",quarterly,NaT,NaT,report_date,quarterly:report_date | keys:report_date | lay...
9,c:\Users\Owner\Box\Winter26\slr_bucket\data\ra...,raw,14,"report_date,total_assets,total_reserves,total_...",quarterly,NaT,NaT,report_date,quarterly:report_date | keys:report_date | lay...


In [6]:
# Outcomes: build stacked arbitrage panel (TIPS–Treasury, Treasury SF, CIP, Equity SF)
from slr_bucket.outcomes import stack_outcomes
logger = logging.getLogger("summary_pipeline")

series_dir = repo_root / "data" / "series"
outcomes_long = stack_outcomes(series_dir)

# Canonical outcome used in regressions
y_col = CONFIG["y_col"]
outcomes_long["y"] = outcomes_long[y_col].astype(float)

# A stable identifier used for FE and stratification
outcomes_long["series"] = outcomes_long["series"].astype(str)
outcomes_long["strategy"] = outcomes_long["strategy"].astype(str)
outcomes_long["treasury_based"] = outcomes_long["treasury_based"].astype(int)

# quick unit check
unit_q = outcomes_long["y"].abs().quantile([0.5, 0.9, 0.99]).to_dict()
info = {
    "series_count": int(outcomes_long["series"].nunique()),
    "strategy_count": int(outcomes_long["strategy"].nunique()),
    "median_abs_bps": float(unit_q.get(0.5, np.nan)),
    "p90_abs_bps": float(unit_q.get(0.9, np.nan)),
    "p99_abs_bps": float(unit_q.get(0.99, np.nan)),
    "y_col": y_col,
}
(outcomes_long.head(), info)


(        date       strategy series  tenor      y_bps  treasury_based  \
 0 2010-01-04  TIPS_Treasury  arb_2    2.0  43.534609               1   
 1 2010-01-05  TIPS_Treasury  arb_2    2.0  39.490811               1   
 2 2010-01-06  TIPS_Treasury  arb_2    2.0  38.344764               1   
 3 2010-01-07  TIPS_Treasury  arb_2    2.0  30.474542               1   
 4 2010-01-08  TIPS_Treasury  arb_2    2.0  40.810449               1   
 
    y_abs_bps          y  
 0  43.534609  43.534609  
 1  39.490811  39.490811  
 2  38.344764  38.344764  
 3  30.474542  30.474542  
 4  40.810449  40.810449  ,
 {'series_count': 20,
  'strategy_count': 4,
  'median_abs_bps': 22.159000000000013,
  'p90_abs_bps': 63.76055393562226,
  'p99_abs_bps': 197.69811111111116,
  'y_col': 'y_abs_bps'})

In [7]:
# Controls: prefer intermediate analysis_panel if valid, else fallback build from raw.
# def build_controls_panel():

needed = set(CONFIG["direct_controls"])
try:
    p = resolve_dataset_path("analysis_panel", expected_dir=repo_root / "data" / "intermediate")
    panel = load_any_table(p)
    panel["date"] = pd.to_datetime(panel["date"], errors="coerce")
    if needed.issubset(set(panel.columns)):
        logger.info("Using controls from intermediate analysis_panel: %s", p)
        controls =  panel[["date", *sorted(needed)]].copy() #, str(p)
except Exception as exc:
    logger.warning("analysis_panel unavailable/invalid (%s), using raw fallback", exc)

fred = load_any_table(resolve_dataset_path("controls_vix_creditspreads_fred", expected_dir=repo_root / "data" / "raw" / "event_inputs"))
fred["date"] = pd.to_datetime(fred["date"], errors="coerce")
fred["date"] = as_daily_date(fred["date"])
try:
    repo = load_any_table(resolve_dataset_path("repo_rates_combined", expected_dir=repo_root / "data" / "raw" / "event_inputs"))
except FileNotFoundError:
    repo = load_any_table(resolve_dataset_path("repo_rates_fred", expected_dir=repo_root / "data" / "raw" / "event_inputs"))
repo["date"] = pd.to_datetime(repo["date"], errors="coerce")
repo["date"] = as_daily_date(repo["date"])
repo = repo.rename(columns={"TGCR":"tgcr", "EFFR":"effr"})
if "spr_tgcr" not in repo.columns and {"SOFR","tgcr"}.issubset(repo.columns):
    repo["spr_tgcr"] = pd.to_numeric(repo["tgcr"], errors="coerce") - pd.to_numeric(repo["SOFR"], errors="coerce")
if "spr_effr" not in repo.columns and {"SOFR","effr"}.issubset(repo.columns):
    repo["spr_effr"] = pd.to_numeric(repo["effr"], errors="coerce") - pd.to_numeric(repo["SOFR"], errors="coerce")

issu = load_any_table(resolve_dataset_path("treasury_issuance_by_tenor_fiscaldata", expected_dir=repo_root / "data" / "raw" / "event_inputs"))
issu["date"] = pd.to_datetime(issu.get("issue_date"), errors="coerce")
issu["date"] = as_daily_date(issu["date"])
issu["tenor_bucket"] = pd.to_numeric(issu["tenor_bucket"], errors="coerce")
issu["issuance_amount"] = pd.to_numeric(issu["issuance_amount"], errors="coerce") / 1e9
d = issu.pivot_table(index="date", columns="tenor_bucket", values="issuance_amount", aggfunc="sum").reset_index()

# Robustly rename tenor-bucket columns to issu_*_bil (handles int/float/str column labels)
rename_map = {}
for col in d.columns:
    if col == "date":
        continue
    try:
        v = float(col)
    except Exception:
        continue
    if abs(v - 7.0) < 1e-9:
        rename_map[col] = "issu_7_bil"
    elif abs(v - 10.0) < 1e-9:
        rename_map[col] = "issu_10_bil"
    elif abs(v - 14.0) < 1e-9:
        rename_map[col] = "issu_14_bil"
    elif abs(v - 20.0) < 1e-9:
        rename_map[col] = "issu_20_bil"
    elif abs(v - 30.0) < 1e-9:
        rename_map[col] = "issu_30_bil"
d = d.rename(columns=rename_map)

# Ensure required issuance controls exist (zeros if not present in file)
for c in ["issu_7_bil", "issu_14_bil", "issu_30_bil", "issu_10_bil", "issu_20_bil"]:
    if c not in d.columns:
        d[c] = 0.0

# If 14y bucket absent, approximate as 10y+20y (as in prior logic)
if d["issu_14_bil"].fillna(0.0).abs().sum() == 0.0:
    d["issu_14_bil"] = d.get("issu_10_bil", 0.0) + d.get("issu_20_bil", 0.0)

for c in ["issu_7_bil", "issu_14_bil", "issu_30_bil"]:
    d[c] = pd.to_numeric(d[c], errors="coerce").fillna(0.0)

# Keep only the issuance controls used in the design
d = d[["date", "issu_7_bil", "issu_14_bil", "issu_30_bil"]]
fred = fred.groupby("date", as_index=False).mean(numeric_only=True)
repo = repo.groupby("date", as_index=False).mean(numeric_only=True)
d    = d.groupby("date", as_index=False).sum(numeric_only=True)   # issuance is additive

for col in ["VIX","HY_OAS","BAA10Y","SOFR","spr_tgcr","spr_effr","tgcr","effr"]:
    if col in fred.columns: fred[col] = coerce_num(fred[col])
    if col in repo.columns: repo[col] = coerce_num(repo[col])


# If 'controls' was not set from intermediate analysis_panel, build it from raw sources.
if "controls" not in globals():
    controls = fred.merge(repo, on="date", how="outer").merge(d, on="date", how="outer").sort_values("date")
    # keep only needed controls (drop extras like tgcr/effr if not needed)
    keep = ["date"] + sorted(set(CONFIG["direct_controls"]) & set(controls.columns))
    controls = controls[keep].copy()
    logger.info("Built controls from raw sources. columns=%s", keep)


In [8]:
import re
import pandas as pd
import numpy as np

def _as_date(x):
    return pd.to_datetime(x, errors="coerce", utc=True).dt.tz_convert(None).dt.normalize()

import re
import pandas as pd

def _canon(x) -> str:
    # robust to non-string column names (float/int/None)
    if x is None:
        return ""
    s = str(x)
    return re.sub(r"[^a-z0-9]+", "", s.lower())

def _sanitize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [str(c) for c in df.columns]  # force all to strings
    return df

def _rename_to_canonical(df: pd.DataFrame, want: list[str]) -> pd.DataFrame:
    df = _sanitize_columns(df)
    m = {_canon(c): c for c in df.columns}
    ren = {}
    for w in want:
        key = _canon(w)
        if key in m:
            ren[m[key]] = w
    return df.rename(columns=ren)


def _collapse_daily(df: pd.DataFrame, how="mean") -> pd.DataFrame:
    df = df.copy()
    df["date"] = _as_date(df["date"])
    df = df.dropna(subset=["date"])
    num = [c for c in df.columns if c != "date"]
    if not num:
        return df[["date"]].drop_duplicates().sort_values("date")
    agg = "mean" if how == "mean" else "sum"
    return df.groupby("date", as_index=False)[num].agg(agg).sort_values("date")

# 0) Build a master date index from your OUTCOMES (arb panel)
# assumes you have arb_long/panel with a 'date' column
base_dates = _as_date(arb_long["date"]).dropna().drop_duplicates().sort_values()
controls = pd.DataFrame({"date": base_dates}).reset_index(drop=True)

# 1) FRED controls
fred_use = _rename_to_canonical(fred, ["date", "VIX", "HY_OAS", "BAA10Y"])
fred_use = _collapse_daily(fred_use[["date"] + [c for c in ["VIX","HY_OAS","BAA10Y"] if c in fred_use.columns]], how="mean")
controls = controls.merge(fred_use, on="date", how="left")

# 2) Repo / funding controls
repo_use = _rename_to_canonical(repo, ["date", "SOFR", "TGCR", "EFFR", "spr_tgcr", "spr_effr"])
repo_use = _collapse_daily(repo_use, how="mean")

# if spreads missing but levels exist, compute them
if "spr_tgcr" not in repo_use.columns and {"TGCR","SOFR"}.issubset(repo_use.columns):
    repo_use["spr_tgcr"] = repo_use["TGCR"] - repo_use["SOFR"]
if "spr_effr" not in repo_use.columns and {"EFFR","SOFR"}.issubset(repo_use.columns):
    repo_use["spr_effr"] = repo_use["EFFR"] - repo_use["SOFR"]

repo_keep = ["date"] + [c for c in ["SOFR","spr_tgcr","spr_effr"] if c in repo_use.columns]
controls = controls.merge(repo_use[repo_keep], on="date", how="left")

# 3) Issuance (event-based -> zeros on non-issuance days)
d_use = d.copy()
if "issue_date" in d_use.columns and "date" not in d_use.columns:
    d_use = d_use.rename(columns={"issue_date": "date"})
d_use = _rename_to_canonical(d_use, ["date", "issu_7_bil", "issu_14_bil", "issu_30_bil"])
issu_use = _collapse_daily(d_use[["date"] + [c for c in ["issu_7_bil","issu_14_bil","issu_30_bil"] if c in d_use.columns]], how="sum")
controls = controls.merge(issu_use, on="date", how="left")

for c in ["issu_7_bil","issu_14_bil","issu_30_bil"]:
    if c in controls.columns:
        controls[c] = controls[c].fillna(0.0)

# 4) Fill only market/funding gaps inside the outcome sample
fill_cols = [c for c in ["VIX","HY_OAS","BAA10Y","SOFR","spr_tgcr","spr_effr"] if c in controls.columns]
controls = controls.sort_values("date").reset_index(drop=True)
controls[fill_cols] = controls[fill_cols].ffill().bfill()

# 5) Quick diagnostic (within outcome sample only)
miss = controls[fill_cols + [c for c in ["issu_7_bil","issu_14_bil","issu_30_bil"] if c in controls.columns]].isna().mean().sort_values(ascending=False)
display(miss.to_frame("missing_share"))

NameError: name 'arb_long' is not defined

In [None]:
# Merge outcomes with controls
panel_long = outcomes_long.merge(controls, on="date", how="left")

# coverage diagnostics for DIRECT controls (including missing/absent)
need = CONFIG["direct_controls"]
present = [c for c in need if c in panel_long.columns]
miss_share = panel_long[present].isna().mean().to_frame("missing_share") if present else pd.DataFrame({"missing_share":[]})
display(miss_share)

print(panel_long.columns)

# Preview
panel_long.head()


In [None]:
# Diagnostics: missingness (including columns that are entirely absent)
def missingness_report(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    out = []
    for c in cols:
        if c not in df.columns:
            out.append({"var": c, "present": False, "missing_share": 1.0, "n_nonmissing": 0})
        else:
            s = pd.to_numeric(df[c], errors="coerce")
            out.append({"var": c, "present": True, "missing_share": float(s.isna().mean()), "n_nonmissing": int(s.notna().sum())})
    return pd.DataFrame(out).sort_values(["present","missing_share"], ascending=[True, False])

diag = missingness_report(panel_long, CONFIG["direct_controls"])
display(diag)

# Warn if any direct controls have less than 90% coverage in the sample used for event studies
coverage = diag.set_index("var")["missing_share"]
bad = coverage[coverage > 0.10]
if len(bad):
    logger.warning("Low coverage direct controls (>10%% missing): %s", bad.to_dict())


In [None]:
# Layer 1A: summary stats by strategy and regime (using y = CONFIG['y_col'])
regimes = {
    "pre": (pd.Timestamp("2019-01-01"), pd.Timestamp("2020-03-31")),
    "relief": (pd.Timestamp("2020-04-01"), pd.Timestamp("2021-03-31")),
    "post": (pd.Timestamp("2021-04-01"), pd.Timestamp.max),
}

rows = []
for (strategy, series), g in panel_long.groupby(["strategy","series"]):
    g = g.sort_values("date").set_index("date")
    for regime, (start, end) in regimes.items():
        s = g.loc[(g.index>=start) & (g.index<=end), "y"].dropna()
        if s.empty:
            continue
        lb_p = np.nan
        try:
            lb = acorr_ljungbox(s, lags=[min(10, max(1, len(s)//5))], return_df=True)
            lb_p = float(lb["lb_pvalue"].iloc[0])
        except Exception:
            pass
        rows.append({
            "strategy": strategy,
            "series": series,
            "treasury_based": int(g["treasury_based"].iloc[0]) if "treasury_based" in g.columns else np.nan,
            "regime": regime,
            "sample_start": s.index.min(),
            "sample_end": s.index.max(),
            "N": int(s.shape[0]),
            "mean": float(s.mean()),
            "std": float(s.std()),
            "p5": float(s.quantile(0.05)),
            "p50": float(s.quantile(0.50)),
            "p95": float(s.quantile(0.95)),
            "autocorr1": float(s.autocorr(lag=1)) if len(s) > 2 else np.nan,
            "ljungbox_pvalue": lb_p,
        })
summary_stats = pd.DataFrame(rows)
summary_stats.to_csv(run_dir / "tables" / "summary_stats_by_strategy.csv", index=False)
summary_stats.head()


In [None]:
panel_long = panel_long.reset_index(drop=True)
if "date" not in panel_long.columns and isinstance(panel_long.index, pd.DatetimeIndex):
    panel_long["date"] = panel_long.index

panel_long["date"] = pd.to_datetime(panel_long["date"], errors="coerce")
panel_long = panel_long.dropna(subset=["date"]).sort_values(["tenor", "date"])

In [None]:
# Layer 1B: jump regressions (series-level TOTAL vs DIRECT) + pooled group interaction
from slr_bucket.econometrics.event_study import pooled_jump_regression

jump_rows = []
for event in CONFIG["events"]:
    for w in CONFIG["windows"]:
        for (strategy, series), g in panel_long.groupby(["strategy","series"]):
            for spec, controls_list in [("TOTAL", CONFIG["total_controls"]), ("DIRECT", CONFIG["direct_controls"])]:
                est, se, n = jump_estimator(g, y_col="y", event_date=event, window=w, controls=controls_list, hac_lags=CONFIG["hac_lags"])
                jump_rows.append({
                    "event": event, "window": w, "strategy": strategy, "series": series,
                    "treasury_based": int(g["treasury_based"].iloc[0]),
                    "spec": spec, "estimate": est, "se": se,
                    "ci_low": est - 1.96*se if pd.notna(est) and pd.notna(se) else np.nan,
                    "ci_high": est + 1.96*se if pd.notna(est) and pd.notna(se) else np.nan,
                    "N": n,
                })
jump_results = pd.DataFrame(jump_rows)
jump_results.to_csv(run_dir / "tables" / "jump_results_by_series.csv", index=False)

# Pooled jump with group interaction (Treasury-based vs non) using series FE
pooled_rows = []
for event in CONFIG["events"]:
    work = panel_long.copy()
    for w in CONFIG["windows"]:
        for spec, controls_list in [("TOTAL", CONFIG["total_controls"]), ("DIRECT", CONFIG["direct_controls"])]:
            out = pooled_jump_regression(
                work, y_col="y", event_date=event, window=w,
                group_col=CONFIG["group_col"], fe_col=CONFIG["fe_col"],
                controls=controls_list, hac_lags=CONFIG["hac_lags"],
            )
            if out.empty:
                continue
            out["event"] = event
            out["window"] = w
            out["spec"] = spec
            pooled_rows.append(out)
pooled_jump = pd.concat(pooled_rows, ignore_index=True) if pooled_rows else pd.DataFrame()
pooled_jump.to_csv(run_dir / "tables" / "pooled_jump_group_interaction.csv", index=False)

(jump_results.head(), pooled_jump)


In [None]:
# Layer 1C: binned event-study paths by strategy + pooled Treasury-based interactions
from slr_bucket.econometrics.event_study import pooled_event_study
from slr_bucket.plotting.plots import plot_binned_event_overlay
import matplotlib.pyplot as plt

bins = CONFIG["event_bins"]

# (i) Event-time paths by STRATEGY (pooled within strategy, series FE; no group interaction)
strategy_paths = []
for event in CONFIG["events"]:
    for strategy, g in panel_long.groupby("strategy"):
        g = g.copy()
        g["__g0"] = 0  # force no group interaction; pooled_event_study will drop constant columns
        res_df, _ = pooled_event_study(
            g, y_col="y", event_date=event, bins=bins,
            group_col="__g0", fe_col=CONFIG["fe_col"],
            controls=CONFIG["direct_controls"], hac_lags=CONFIG["hac_lags"],
        )
        if res_df.empty:
            continue
        res_df["strategy"] = strategy
        strategy_paths.append(res_df)

        # Save overlay plot for this strategy/event (plots group0_effect == pooled path)
        plot_binned_event_overlay(
            res_df[res_df["kind"].isin(["group0_effect"])].copy(),
            title=f"Event-study path | {strategy} | event={event}",
            outpath=run_dir / "figures" / f"event_path_{strategy}_{event}.png",
        )

strategy_paths_df = pd.concat(strategy_paths, ignore_index=True) if strategy_paths else pd.DataFrame()
strategy_paths_df.to_csv(run_dir / "tables" / "eventstudy_paths_by_strategy.csv", index=False)

# (ii) Pooled Treasury-based vs non interactions across ALL strategies (series FE)
pooled_all = []
for event in CONFIG["events"]:
    res_df, _ = pooled_event_study(
        panel_long, y_col="y", event_date=event, bins=bins,
        group_col=CONFIG["group_col"], fe_col=CONFIG["fe_col"],
        controls=CONFIG["direct_controls"], hac_lags=CONFIG["hac_lags"],
    )
    if res_df.empty:
        continue
    pooled_all.append(res_df)

pooled_all_df = pd.concat(pooled_all, ignore_index=True) if pooled_all else pd.DataFrame()
pooled_all_df.to_csv(run_dir / "tables" / "eventstudy_pooled_treasury_interactions.csv", index=False)

# Plot the pooled comparison (group0 vs group1 + interaction bins)
for event in CONFIG["events"]:
    sub = pooled_all_df[pooled_all_df["event_date"] == event].copy()
    if sub.empty:
        continue
    plot_binned_event_overlay(
        sub[sub["kind"].isin(["group0_effect","group1_effect","interaction_bin"])].copy(),
        title=f"Pooled event-study | Treasury-based vs non | event={event}",
        outpath=run_dir / "figures" / f"event_path_pooled_treasury_vs_non_{event}.png",
    )

(strategy_paths_df.head(), pooled_all_df.head())


In [None]:
# Visual check: plot raw series around each event (per SERIES), with event date highlighted
import matplotlib.pyplot as plt

def plot_event_window_series(df: pd.DataFrame, series: str, event: str, window: int = 120):
    d = df[df["series"] == series].copy()
    d = d.sort_values("date").set_index("date")
    t0 = pd.Timestamp(event)
    sub = d.loc[(d.index >= t0 - pd.Timedelta(days=window)) & (d.index <= t0 + pd.Timedelta(days=window)), ["y"]].dropna()
    if sub.empty:
        return
    fig, ax = plt.subplots(figsize=(9,4))
    ax.plot(sub.index, sub["y"])
    ax.axvline(t0, color="black", ls="--", lw=1)
    ax.set_title(f"{series} | raw y around {event} (±{window}d)")
    ax.set_ylabel("bps")
    fig.tight_layout()
    fig.savefig(run_dir / "figures" / f"raw_{series}_window_{event}.png", dpi=150)
    plt.close(fig)

for event in CONFIG["events"]:
    for s in sorted(panel_long["series"].dropna().unique().tolist()):
        plot_event_window_series(panel_long, s, event, window=120)


In [None]:
# Layer 1D: overlay strategy paths in a single figure (per event)
import matplotlib.pyplot as plt

paths = pd.read_csv(run_dir / "tables" / "eventstudy_paths_by_strategy.csv") if (run_dir / "tables" / "eventstudy_paths_by_strategy.csv").exists() else strategy_paths_df.copy()
if not paths.empty:
    # keep pooled path rows
    paths = paths[paths["kind"] == "group0_effect"].copy()
    paths["bin_mid"] = pd.to_numeric(paths["bin_mid"], errors="coerce")
    for event in CONFIG["events"]:
        sub = paths[paths["event_date"] == event].copy()
        if sub.empty:
            continue
        fig, ax = plt.subplots(figsize=(10,5))
        for strategy, g in sub.groupby("strategy"):
            g = g.sort_values("bin_mid")
            ax.plot(g["bin_mid"], g["estimate"], marker="o", label=strategy)
        ax.axhline(0, color="black", lw=1)
        ax.axvline(0, color="black", lw=1, ls="--")
        ax.set_xlabel("Event time (bin midpoint)")
        ax.set_ylabel("Effect (bps)")
        ax.set_title(f"Event-study paths by strategy | event={event}")
        ax.grid(alpha=0.2)
        ax.legend()
        fig.tight_layout()
        fig.savefig(run_dir / "figures" / f"event_paths_all_strategies_{event}.png", dpi=150)
        plt.close(fig)

paths.head()


In [None]:
# Layer 2 mechanism (weekly): pooled panel with series FE + interactions by treasury_based
import statsmodels.api as sm
layer2_note = ""
mech_out = pd.DataFrame()
try:
    if not CONFIG.get("run_layer2", True):
        raise RuntimeError("CONFIG.run_layer2=False")

    # inputs
    pd_long = load_any_table(resolve_dataset_path("primary_dealer_stats_ofr_stfm_nypd_long", expected_dir=repo_root / "data" / "raw" / "event_inputs"))
    bank = load_any_table(resolve_dataset_path("bank_exposure_y9c_agg_daily", expected_dir=repo_root / "data" / "raw" / "event_inputs"))

    pd_long["date"] = pd.to_datetime(pd_long["date"], errors="coerce")
    bank["date"] = pd.to_datetime(bank["date"], errors="coerce")

    # Weekly dealer utilization proxy (lagged)
    pd_w = pd_long.pivot_table(index="date", columns="mnemonic", values="value", aggfunc="mean").resample("W-FRI").mean()
    pd_w["utilization_index"] = pd_w.sum(axis=1, min_count=1)
    pd_w["utilization_lag1w"] = pd_w["utilization_index"].shift(1)

    # Weekly bank exposure proxy
    if "agg_exempt_share" not in bank.columns:
        raise KeyError("bank_exposure_y9c_agg_daily missing 'agg_exempt_share'")
    b_w = bank.set_index("date").resample("W-FRI").mean()[["agg_exempt_share"]]

    # Weekly outcome per SERIES
    y_w = panel_long.set_index("date").groupby("series")["y"].resample("W-FRI").mean().reset_index()
    # attach strategy + treasury_based (time-invariant per series)
    meta = panel_long.groupby("series", as_index=False)[["strategy","treasury_based"]].first()
    y_w = y_w.merge(meta, on="series", how="left").set_index("date")

    # Weekly controls (take what is available)
    desired = list(CONFIG.get("direct_controls", []))
    present = [c for c in desired if c in panel_long.columns]
    missing = sorted(set(desired) - set(present))
    if missing:
        logger.warning("Layer 2: dropping missing controls: %s", missing)

    c_w = panel_long.set_index("date")[present].resample("W-FRI").mean() if present else pd.DataFrame(index=y_w.index.unique())

    # Merge: broadcast b_w and pd_w to all series dates
    mech = y_w.join([b_w, pd_w[["utilization_lag1w"]], c_w], how="inner").dropna()

    # Relief indicator (inclusive)
    mech["relief"] = ((mech.index >= "2020-04-01") & (mech.index <= "2021-03-31")).astype(int)

    # z-scores
    ex_std = mech["agg_exempt_share"].std()
    util_std = mech["utilization_lag1w"].std()
    mech["z_exempt"] = (mech["agg_exempt_share"] - mech["agg_exempt_share"].mean()) / (ex_std if ex_std and ex_std > 0 else 1.0)
    mech["z_util_l1"] = (mech["utilization_lag1w"] - mech["utilization_lag1w"].mean()) / (util_std if util_std and util_std > 0 else 1.0)

    mech["treasury_based"] = pd.to_numeric(mech["treasury_based"], errors="coerce").fillna(0).astype(int)

    # interactions
    mech["relief_x_exempt"] = mech["relief"] * mech["z_exempt"]
    mech["relief_x_util"]   = mech["relief"] * mech["z_util_l1"]
    mech["relief_x_treas"]  = mech["relief"] * mech["treasury_based"]
    mech["relief_x_exempt_x_treas"] = mech["relief"] * mech["z_exempt"] * mech["treasury_based"]
    mech["relief_x_util_x_treas"]   = mech["relief"] * mech["z_util_l1"] * mech["treasury_based"]

    # Series FE
    fe = pd.get_dummies(mech["series"].astype(str), prefix="fe", drop_first=True)

    xcols = ["relief", "relief_x_exempt", "relief_x_util",
             "relief_x_treas", "relief_x_exempt_x_treas", "relief_x_util_x_treas"] + present
    X = pd.concat([mech[xcols].apply(pd.to_numeric, errors="coerce"), fe], axis=1)
    reg = pd.concat([mech[["y"]], X], axis=1).dropna()

    if len(reg) < 50:
        raise RuntimeError(f"Layer 2 insufficient weekly observations after joins: n={len(reg)}")

    Y = reg["y"].astype(float)
    X = sm.add_constant(reg.drop(columns=["y"]).astype(float), has_constant="add")
    res = sm.OLS(Y, X).fit(cov_type="HAC", cov_kwds={"maxlags":2})

    mech_out = pd.DataFrame({"term": res.params.index, "coef": res.params.values, "se": res.bse.values})
    mech_out.to_csv(run_dir / "tables" / "layer2_mechanism_weekly_panel.csv", index=False)

    layer2_note = f"Layer 2 executed successfully. n={int(res.nobs)}; controls_used={present}; series_FE={fe.shape[1]}"
except Exception as exc:
    layer2_note = f"Layer 2 skipped gracefully due to missing/unusable inputs: {exc}"

print(layer2_note)
mech_out.head()


In [None]:
pd_w.head() if 'pd_w' in globals() else None

In [None]:
y_w.head() if 'y_w' in globals() else None

In [None]:
mech_out if 'mech_out' in globals() else None

In [None]:
# Refresh latest and write run metadata (works when CONFIG is a dict)

import json
import sys
import platform
import subprocess
import shutil
import hashlib
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd

def _safe_git_info(repo_root: Path) -> dict:
    try:
        sha = subprocess.check_output(
            ["git", "rev-parse", "HEAD"], cwd=str(repo_root), stderr=subprocess.STDOUT, text=True
        ).strip()
        branch = subprocess.check_output(
            ["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd=str(repo_root), stderr=subprocess.STDOUT, text=True
        ).strip()
        status = subprocess.check_output(
            ["git", "status", "--porcelain"], cwd=str(repo_root), stderr=subprocess.STDOUT, text=True
        )
        return {"commit": sha, "branch": branch, "dirty": bool(status.strip())}
    except Exception as e:
        return {"commit": None, "branch": None, "dirty": None, "error": str(e)}

def _safe_pkg_versions(pkgs: list[str]) -> dict:
    out = {}
    try:
        import importlib.metadata as md
        for p in pkgs:
            try:
                out[p] = md.version(p)
            except Exception:
                out[p] = None
    except Exception:
        for p in pkgs:
            out[p] = None
    return out

def _find_main_df() -> tuple[str | None, pd.DataFrame | None]:
    for name in ["analysis_panel", "arb_panel", "panel", "daily_long", "pivot"]:
        obj = globals().get(name, None)
        if isinstance(obj, pd.DataFrame):
            return name, obj
    return None, None

def _dataset_notes() -> str:
    name, df = _find_main_df()
    if df is None:
        return "No main dataframe found (looked for: analysis_panel, arb_panel, panel, daily_long, pivot)."
    notes = [f"Dataset: {name}", f"Rows: {len(df):,}", f"Columns: {df.shape[1]:,}"]
    if "date" in df.columns:
        d = pd.to_datetime(df["date"], errors="coerce")
        if d.notna().any():
            notes.append(f"Date range: {d.min().date()} → {d.max().date()}")
    if "tenor" in df.columns:
        try:
            notes.append(f"Tenors: {df['tenor'].nunique()}")
        except Exception:
            pass
    return "\n".join(notes)

def _stable_config_dict(cfg) -> dict:
    # If cfg is already dict-like, use it; else try to coerce.
    if isinstance(cfg, dict):
        return cfg
    try:
        return dict(cfg)
    except Exception:
        return {"CONFIG_repr": repr(cfg)}

def _config_hash(cfg_dict: dict) -> str:
    s = json.dumps(cfg_dict, sort_keys=True, default=str).encode("utf-8")
    return hashlib.sha1(s).hexdigest()[:10]

def _write_readme(run_dir: Path, cfg_dict: dict, notes: str):
    txt = []
    txt.append("# summary_pipeline run")
    txt.append("")
    txt.append("## Config")
    txt.append("```json")
    txt.append(json.dumps(cfg_dict, indent=2, default=str))
    txt.append("```")
    txt.append("")
    txt.append("## Notes")
    txt.append("```")
    txt.append(notes)
    txt.append("```")
    (run_dir / "README.md").write_text("\n".join(txt), encoding="utf-8")

# --- Preconditions
if "CONFIG" not in globals():
    raise RuntimeError("CONFIG not found. Run the CONFIG cell first.")

# Repo root (best-effort)
if "REPO_ROOT" in globals():
    repo_root = Path(REPO_ROOT)
else:
    repo_root = Path().cwd().resolve().parent  # assumes notebook is in notebooks/

# Output root (prefer CONFIG override if provided)
cfg_dict = _stable_config_dict(CONFIG)
out_root = cfg_dict.get("output_root", None)
if out_root:
    output_root = Path(out_root).expanduser()
    if not output_root.is_absolute():
        output_root = (repo_root / output_root).resolve()
else:
    output_root = (repo_root / "outputs" / "summary_pipeline").resolve()

output_root.mkdir(parents=True, exist_ok=True)

# Run folder naming
utc_now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
cfg_hash = _config_hash(cfg_dict)

run_dir = output_root / f"{ts}_{cfg_hash}"
figures_dir = run_dir / "figures"
tables_dir  = run_dir / "tables"
data_dir    = run_dir / "data"
logs_dir    = run_dir / "logs"

for p in [run_dir, figures_dir, tables_dir, data_dir, logs_dir]:
    p.mkdir(parents=True, exist_ok=True)

# Write README + metadata
notes = _dataset_notes()
_write_readme(run_dir, cfg_dict, notes)

metadata = {
    "utc_timestamp": utc_now,
    "run_dir": str(run_dir),
    "config_hash": cfg_hash,
    "config": cfg_dict,
    "git": _safe_git_info(repo_root),
    "python": {"version": sys.version, "executable": sys.executable},
    "platform": {
        "system": platform.system(),
        "release": platform.release(),
        "version": platform.version(),
        "machine": platform.machine(),
    },
    "packages": _safe_pkg_versions(["numpy", "pandas", "statsmodels", "matplotlib", "scipy"]),
    "notes": notes,
}
(run_dir / "run_metadata.json").write_text(json.dumps(metadata, indent=2), encoding="utf-8")

# Refresh latest/ (copy entire run folder)
latest_dir = output_root / "latest"
if latest_dir.exists():
    shutil.rmtree(latest_dir)
shutil.copytree(run_dir, latest_dir)

# Expose run_dirs for downstream cells (compatible with older code)
run_dirs = {
    "run": run_dir,
    "figures": figures_dir,
    "tables": tables_dir,
    "data": data_dir,
    "logs": logs_dir,
    "latest": latest_dir,
}

print("Run dir:", run_dir)
print("Latest refreshed:", latest_dir)
print("Config hash:", cfg_hash)

Run dir: C:\Users\Owner\Box\Winter26\slr_bucket\outputs\summary_pipeline\20260226_221836_278369efc0
Latest refreshed: C:\Users\Owner\Box\Winter26\slr_bucket\outputs\summary_pipeline\latest
Config hash: 278369efc0
