# Summary pipeline: arb
a outcomes and event-study design

This notebook rebuilds Layer 1 around real `arb_*` outcomes from `tips_treasury_implied_rf_2010` and aligns controls from the repo data structure.


In [1]:
from __future__ import annotations

import hashlib
import json
import logging
import shutil
from datetime import datetime
from pathlib import Path
import sys, os
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.diagnostic import acorr_ljungbox
sys.path.insert(2, "../src")
if 'src' in os.getcwd():
    os.chdir(os.path.pardir)
    print(os.getcwd())
else:
    print(os.getcwd())
from slr_bucket.econometrics.event_study import add_event_time, event_study_regression, jump_estimator
from slr_bucket.io import build_data_catalog, load_any_table, resolve_dataset_path, as_daily_date, coerce_num, keep_controls_with_coverage


c:\Users\Owner\Box\Winter26\slr_bucket\notebooks


In [2]:
# CONFIG
CONFIG = {
    "outcomes_source": "tips_treasury_implied_rf_2010",
    "outcome_pattern": "arb_",
    "tenors_required": [2, 5, 10],
    "events": ["2020-04-01", "2021-03-19", "2021-03-31"],
    "windows": [20, 60],
    "event_bins": [(-60,-41),(-40,-21),(-20,-1),(0,0),(1,20),(21,40),(41,60)],
    "total_controls": ["VIX", "HY_OAS", "BAA10Y", "issu_7_bil", "issu_14_bil", "issu_30_bil"],
    "direct_controls": ["VIX", "HY_OAS", "BAA10Y", "issu_7_bil", "issu_14_bil", "issu_30_bil", "SOFR", "spr_tgcr", "spr_effr"],
    "hac_lags": 5,
    "run_layer2": True,
}
cfg_hash = hashlib.sha256(json.dumps(CONFIG, sort_keys=True).encode()).hexdigest()[:12]
run_stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
repo_root = Path.cwd().parent
run_dir = repo_root / "outputs" / "summary_pipeline" / f"{run_stamp}_{cfg_hash}"
for sub in ["figures","tables","data","logs"]:
    (run_dir / sub).mkdir(parents=True, exist_ok=True)
latest_dir = repo_root / "outputs" / "summary_pipeline" / "latest"

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s - %(message)s", handlers=[logging.FileHandler(run_dir / "logs" / "pipeline.log"), logging.StreamHandler()], force=True)
logger = logging.getLogger("summary_pipeline")
print(run_dir)


c:\Users\Owner\Box\Winter26\slr_bucket\outputs\summary_pipeline\20260227_043044_eabefa1239bc


  run_stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")


## Data catalog and dataset inventory

New `/data` structure uses layered folders (`raw`, `intermediate`, `series`). This run uses:
- Outcomes: `data/series/tips_treasury_implied_rf_2010.(parquet|csv)` (`arb_*`).
- Preferred merged controls: `data/intermediate/analysis_panel.csv` (if valid for required columns).
- Fallback controls from raw inputs:
  - `raw/event_inputs/controls_vix_creditspreads_fred`
  - `raw/event_inputs/repo_rates_combined` (or `repo_rates_fred`)
  - `raw/event_inputs/treasury_issuance_by_tenor_fiscaldata`
- Layer 2 proxies (optional):
  - `raw/event_inputs/primary_dealer_stats_ofr_stfm_nypd_long`
  - `raw/event_inputs/bank_exposure_y9c_agg_daily.csv`


In [3]:
catalog = build_data_catalog(repo_root / "data")
catalog.to_csv(run_dir / "data" / "data_catalog.csv", index=False)
catalog.to_parquet(run_dir / "data" / "data_catalog.parquet", index=False)
catalog.to_markdown(run_dir / "data" / "data_catalog.md", index=False)
catalog.head(10)


Unnamed: 0,path,layer,rows,columns,frequency,date_min,date_max,key_columns,join_hints
0,c:\Users\Owner\Box\Winter26\slr_bucket\data\in...,intermediate,5476,"date,spread_2y_bps,spread_5y_bps,spread_10y_bp...",daily,2010-01-04,2024-12-31,date,daily:date | keys:date | layer:intermediate
1,c:\Users\Owner\Box\Winter26\slr_bucket\data\in...,intermediate,420,"date,bid_ask_spread,pubout,n_issues",monthly,1980-01-31,2014-12-31,date,keys:date | layer:intermediate
2,c:\Users\Owner\Box\Winter26\slr_bucket\data\in...,intermediate,1209,"date,fed_assets",weekly,2002-12-18,2026-02-11,date,weekly:date | keys:date | layer:intermediate
3,c:\Users\Owner\Box\Winter26\slr_bucket\data\in...,intermediate,1209,"date,fed_treasury_holdings",weekly,2002-12-18,2026-02-11,date,weekly:date | keys:date | layer:intermediate
4,c:\Users\Owner\Box\Winter26\slr_bucket\data\in...,intermediate,751,"date,sofr,sofr_volume",daily,2019-01-02,2021-12-31,date,daily:date | keys:date | layer:intermediate
5,c:\Users\Owner\Box\Winter26\slr_bucket\data\in...,intermediate,3752,"date,spread_2y_bps,spread_5y_bps,spread_10y_bp...",daily,2010-01-04,2024-12-31,date,daily:date | keys:date | layer:intermediate
6,c:\Users\Owner\Box\Winter26\slr_bucket\data\ra...,raw,3955,"Date,AUD,CAD,CHF,EUR,GBP,JPY,NZD,SEK,USD",unknown,NaT,NaT,,layer:raw
7,c:\Users\Owner\Box\Winter26\slr_bucket\data\ra...,raw,3913,"('SPX Index', 'PX_LAST'),('SPX Index', 'IDX_ES...",unknown,NaT,NaT,,layer:raw
8,c:\Users\Owner\Box\Winter26\slr_bucket\data\ra...,raw,14,"report_date,total_assets,total_reserves,total_...",quarterly,NaT,NaT,report_date,quarterly:report_date | keys:report_date | lay...
9,c:\Users\Owner\Box\Winter26\slr_bucket\data\ra...,raw,14,"report_date,total_assets,total_reserves,total_...",quarterly,NaT,NaT,report_date,quarterly:report_date | keys:report_date | lay...


In [4]:
# Outcomes: arb_* only
outcome_path = resolve_dataset_path(CONFIG["outcomes_source"], expected_dir=repo_root / "data" / "series")
out = load_any_table(outcome_path)
out["date"] = pd.to_datetime(out["date"], errors="coerce")
out["date"] = as_daily_date(out["date"])
arb_cols = sorted([c for c in out.columns if c.startswith(CONFIG["outcome_pattern"])], key=lambda c: int(c.split("_")[1]))
if not arb_cols:
    raise ValueError("No arb_* columns found in outcomes source")
arb_long = out[["date", *arb_cols]].melt(id_vars=["date"], var_name="outcome", value_name="y")
arb_long["tenor"] = arb_long["outcome"].str.extract(r"arb_(\d+)").astype(float).astype("Int64")
arb_long = arb_long.dropna(subset=["date","y","tenor"]).sort_values(["tenor","date"]).reset_index(drop=True)
val_abs_q = arb_long["y"].abs().quantile([0.5,0.9,0.99]).to_dict()
unit_note = "Values look like bps" if val_abs_q.get(0.5,0) > 0.2 else "Values look like decimals"

logger.info("Loaded outcomes from %s with tenors=%s", outcome_path, sorted(arb_long["tenor"].dropna().unique().tolist()))
{"outcome_path": str(outcome_path), "arb_columns": arb_cols, "value_quantiles_abs": val_abs_q, "unit_note": unit_note}


2026-02-26 22:30:49,842 INFO summary_pipeline - Loaded outcomes from c:\Users\Owner\Box\Winter26\slr_bucket\data\series\tips_treasury_implied_rf_2010.parquet with tenors=[2, 5, 10, 20]


{'outcome_path': 'c:\\Users\\Owner\\Box\\Winter26\\slr_bucket\\data\\series\\tips_treasury_implied_rf_2010.parquet',
 'arb_columns': ['arb_2', 'arb_5', 'arb_10', 'arb_20'],
 'value_quantiles_abs': {0.5: 22.171970346176465,
  0.9: 33.97164601022243,
  0.99: 40.397917461319494},
 'unit_note': 'Values look like bps'}

In [5]:
# Controls: prefer intermediate analysis_panel if valid, else fallback build from raw.
# def build_controls_panel():

needed = set(CONFIG["direct_controls"])
try:
    p = resolve_dataset_path("analysis_panel", expected_dir=repo_root / "data" / "intermediate")
    panel = load_any_table(p)
    panel["date"] = pd.to_datetime(panel["date"], errors="coerce")
    if needed.issubset(set(panel.columns)):
        logger.info("Using controls from intermediate analysis_panel: %s", p)
        controls =  panel[["date", *sorted(needed)]].copy() #, str(p)
except Exception as exc:
    logger.warning("analysis_panel unavailable/invalid (%s), using raw fallback", exc)

fred = load_any_table(resolve_dataset_path("controls_vix_creditspreads_fred", expected_dir=repo_root / "data" / "raw" / "event_inputs"))
fred["date"] = pd.to_datetime(fred["date"], errors="coerce")
fred["date"] = as_daily_date(fred["date"])
try:
    repo = load_any_table(resolve_dataset_path("repo_rates_combined", expected_dir=repo_root / "data" / "raw" / "event_inputs"))
except FileNotFoundError:
    repo = load_any_table(resolve_dataset_path("repo_rates_fred", expected_dir=repo_root / "data" / "raw" / "event_inputs"))
repo["date"] = pd.to_datetime(repo["date"], errors="coerce")
repo["date"] = as_daily_date(repo["date"])
repo = repo.rename(columns={"TGCR":"tgcr", "EFFR":"effr"})
if "spr_tgcr" not in repo.columns and {"SOFR","tgcr"}.issubset(repo.columns):
    repo["spr_tgcr"] = pd.to_numeric(repo["tgcr"], errors="coerce") - pd.to_numeric(repo["SOFR"], errors="coerce")
if "spr_effr" not in repo.columns and {"SOFR","effr"}.issubset(repo.columns):
    repo["spr_effr"] = pd.to_numeric(repo["effr"], errors="coerce") - pd.to_numeric(repo["SOFR"], errors="coerce")

issu = load_any_table(resolve_dataset_path("treasury_issuance_by_tenor_fiscaldata", expected_dir=repo_root / "data" / "raw" / "event_inputs"))
issu["date"] = pd.to_datetime(issu.get("issue_date"), errors="coerce")
issu["date"] = as_daily_date(issu["date"])
issu["tenor_bucket"] = pd.to_numeric(issu["tenor_bucket"], errors="coerce")
issu["issuance_amount"] = pd.to_numeric(issu["issuance_amount"], errors="coerce") / 1e9
d = issu.pivot_table(index="date", columns="tenor_bucket", values="issuance_amount", aggfunc="sum").reset_index()

# Robustly rename tenor-bucket columns to issu_*_bil (handles int/float/str column labels)
rename_map = {}
for col in d.columns:
    if col == "date":
        continue
    try:
        v = float(col)
    except Exception:
        continue
    if abs(v - 7.0) < 1e-9:
        rename_map[col] = "issu_7_bil"
    elif abs(v - 10.0) < 1e-9:
        rename_map[col] = "issu_10_bil"
    elif abs(v - 14.0) < 1e-9:
        rename_map[col] = "issu_14_bil"
    elif abs(v - 20.0) < 1e-9:
        rename_map[col] = "issu_20_bil"
    elif abs(v - 30.0) < 1e-9:
        rename_map[col] = "issu_30_bil"
d = d.rename(columns=rename_map)

# Ensure required issuance controls exist (zeros if not present in file)
for c in ["issu_7_bil", "issu_14_bil", "issu_30_bil", "issu_10_bil", "issu_20_bil"]:
    if c not in d.columns:
        d[c] = 0.0

# If 14y bucket absent, approximate as 10y+20y (as in prior logic)
if d["issu_14_bil"].fillna(0.0).abs().sum() == 0.0:
    d["issu_14_bil"] = d.get("issu_10_bil", 0.0) + d.get("issu_20_bil", 0.0)

for c in ["issu_7_bil", "issu_14_bil", "issu_30_bil"]:
    d[c] = pd.to_numeric(d[c], errors="coerce").fillna(0.0)

# Keep only the issuance controls used in the design
d = d[["date", "issu_7_bil", "issu_14_bil", "issu_30_bil"]]
fred = fred.groupby("date", as_index=False).mean(numeric_only=True)
repo = repo.groupby("date", as_index=False).mean(numeric_only=True)
d    = d.groupby("date", as_index=False).sum(numeric_only=True)   # issuance is additive

for col in ["VIX","HY_OAS","BAA10Y","SOFR","spr_tgcr","spr_effr","tgcr","effr"]:
    if col in fred.columns: fred[col] = coerce_num(fred[col])
    if col in repo.columns: repo[col] = coerce_num(repo[col])


# If 'controls' was not set from intermediate analysis_panel, build it from raw sources.
if "controls" not in globals():
    controls = fred.merge(repo, on="date", how="outer").merge(d, on="date", how="outer").sort_values("date")
    # keep only needed controls (drop extras like tgcr/effr if not needed)
    keep = ["date"] + sorted(set(CONFIG["direct_controls"]) & set(controls.columns))
    controls = controls[keep].copy()
    logger.info("Built controls from raw sources. columns=%s", keep)


2026-02-26 22:30:49,969 INFO summary_pipeline - Built controls from raw sources. columns=['date', 'BAA10Y', 'HY_OAS', 'SOFR', 'VIX', 'issu_14_bil', 'issu_30_bil', 'issu_7_bil', 'spr_tgcr']


In [6]:
import re
import pandas as pd
import numpy as np

def _as_date(x):
    return pd.to_datetime(x, errors="coerce", utc=True).dt.tz_convert(None).dt.normalize()

import re
import pandas as pd

def _canon(x) -> str:
    # robust to non-string column names (float/int/None)
    if x is None:
        return ""
    s = str(x)
    return re.sub(r"[^a-z0-9]+", "", s.lower())

def _sanitize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [str(c) for c in df.columns]  # force all to strings
    return df

def _rename_to_canonical(df: pd.DataFrame, want: list[str]) -> pd.DataFrame:
    df = _sanitize_columns(df)
    m = {_canon(c): c for c in df.columns}
    ren = {}
    for w in want:
        key = _canon(w)
        if key in m:
            ren[m[key]] = w
    return df.rename(columns=ren)


def _collapse_daily(df: pd.DataFrame, how="mean") -> pd.DataFrame:
    df = df.copy()
    df["date"] = _as_date(df["date"])
    df = df.dropna(subset=["date"])
    num = [c for c in df.columns if c != "date"]
    if not num:
        return df[["date"]].drop_duplicates().sort_values("date")
    agg = "mean" if how == "mean" else "sum"
    return df.groupby("date", as_index=False)[num].agg(agg).sort_values("date")

# 0) Build a master date index from your OUTCOMES (arb panel)
# assumes you have arb_long/panel with a 'date' column
base_dates = _as_date(arb_long["date"]).dropna().drop_duplicates().sort_values()
controls = pd.DataFrame({"date": base_dates}).reset_index(drop=True)

# 1) FRED controls
fred_use = _rename_to_canonical(fred, ["date", "VIX", "HY_OAS", "BAA10Y"])
fred_use = _collapse_daily(fred_use[["date"] + [c for c in ["VIX","HY_OAS","BAA10Y"] if c in fred_use.columns]], how="mean")
controls = controls.merge(fred_use, on="date", how="left")

# 2) Repo / funding controls
repo_use = _rename_to_canonical(repo, ["date", "SOFR", "TGCR", "EFFR", "spr_tgcr", "spr_effr"])
repo_use = _collapse_daily(repo_use, how="mean")

# if spreads missing but levels exist, compute them
if "spr_tgcr" not in repo_use.columns and {"TGCR","SOFR"}.issubset(repo_use.columns):
    repo_use["spr_tgcr"] = repo_use["TGCR"] - repo_use["SOFR"]
if "spr_effr" not in repo_use.columns and {"EFFR","SOFR"}.issubset(repo_use.columns):
    repo_use["spr_effr"] = repo_use["EFFR"] - repo_use["SOFR"]

repo_keep = ["date"] + [c for c in ["SOFR","spr_tgcr","spr_effr"] if c in repo_use.columns]
controls = controls.merge(repo_use[repo_keep], on="date", how="left")

# 3) Issuance (event-based -> zeros on non-issuance days)
d_use = d.copy()
if "issue_date" in d_use.columns and "date" not in d_use.columns:
    d_use = d_use.rename(columns={"issue_date": "date"})
d_use = _rename_to_canonical(d_use, ["date", "issu_7_bil", "issu_14_bil", "issu_30_bil"])
issu_use = _collapse_daily(d_use[["date"] + [c for c in ["issu_7_bil","issu_14_bil","issu_30_bil"] if c in d_use.columns]], how="sum")
controls = controls.merge(issu_use, on="date", how="left")

for c in ["issu_7_bil","issu_14_bil","issu_30_bil"]:
    if c in controls.columns:
        controls[c] = controls[c].fillna(0.0)

# 4) Fill only market/funding gaps inside the outcome sample
fill_cols = [c for c in ["VIX","HY_OAS","BAA10Y","SOFR","spr_tgcr","spr_effr"] if c in controls.columns]
controls = controls.sort_values("date").reset_index(drop=True)
controls[fill_cols] = controls[fill_cols].ffill().bfill()

# 5) Quick diagnostic (within outcome sample only)
miss = controls[fill_cols + [c for c in ["issu_7_bil","issu_14_bil","issu_30_bil"] if c in controls.columns]].isna().mean().sort_values(ascending=False)
display(miss.to_frame("missing_share"))

Unnamed: 0,missing_share
VIX,0.0
HY_OAS,0.0
BAA10Y,0.0
SOFR,0.0
spr_tgcr,0.0
issu_7_bil,0.0
issu_14_bil,0.0
issu_30_bil,0.0


In [7]:
panel_long = arb_long.merge(controls, on="date", how="left")
need = CONFIG["direct_controls"]
miss = (panel_long[ [c for c in need if c in panel_long.columns] ]
        .isna().mean()
        .sort_values(ascending=False))
display(miss.to_frame("missing_share"))

import importlib
import slr_bucket.econometrics.event_study as es
importlib.reload(es)

# rebind local names to the reloaded module functions
add_event_time = es.add_event_time
event_study_regression = es.event_study_regression
jump_estimator = es.jump_estimator
w = add_event_time(panel_long, "2020-04-01")
print(w.columns)
for c in CONFIG["direct_controls"]:
    if c in panel_long.columns:
        panel_long[c] = pd.to_numeric(panel_long[c], errors="coerce")
panel_long.to_parquet(run_dir / "data" / "arb_panel_long.parquet", index=False)
panel_long.head()


Unnamed: 0,missing_share
VIX,0.0
HY_OAS,0.0
BAA10Y,0.0
issu_7_bil,0.0
issu_14_bil,0.0
issu_30_bil,0.0
SOFR,0.0
spr_tgcr,0.0


Index(['date', 'outcome', 'y', 'tenor', 'VIX', 'HY_OAS', 'BAA10Y', 'SOFR',
       'spr_tgcr', 'issu_7_bil', 'issu_14_bil', 'issu_30_bil', 'event_time'],
      dtype='object')


Unnamed: 0,date,outcome,y,tenor,VIX,HY_OAS,BAA10Y,SOFR,spr_tgcr,issu_7_bil,issu_14_bil,issu_30_bil
0,2010-01-04,arb_2,43.534609,2,23.22,5.35,2.45,3.15,-0.05,0.0,0.0,0.0
1,2010-01-05,arb_2,39.490811,2,23.22,5.35,2.45,3.15,-0.05,0.0,0.0,0.0
2,2010-01-06,arb_2,38.344764,2,23.22,5.35,2.45,3.15,-0.05,0.0,0.0,0.0
3,2010-01-07,arb_2,30.474542,2,23.22,5.35,2.45,3.15,-0.05,0.0,0.0,0.0
4,2010-01-08,arb_2,40.810449,2,23.22,5.35,2.45,3.15,-0.05,0.0,0.0,0.0


In [8]:
# Diagnostics: missingness (including columns that are entirely absent)
def missingness_report(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    out = []
    for c in cols:
        if c not in df.columns:
            out.append({"var": c, "present": False, "missing_share": 1.0, "n_nonmissing": 0})
        else:
            s = pd.to_numeric(df[c], errors="coerce")
            out.append({"var": c, "present": True, "missing_share": float(s.isna().mean()), "n_nonmissing": int(s.notna().sum())})
    return pd.DataFrame(out).sort_values(["present","missing_share"], ascending=[True, False])

diag = missingness_report(panel_long, CONFIG["direct_controls"])
display(diag)

# Warn if any direct controls have less than 90% coverage in the sample used for event studies
coverage = diag.set_index("var")["missing_share"]
bad = coverage[coverage > 0.10]
if len(bad):
    logger.warning("Low coverage direct controls (>10%% missing): %s", bad.to_dict())


Unnamed: 0,var,present,missing_share,n_nonmissing
8,spr_effr,False,1.0,0
0,VIX,True,0.0,14973
1,HY_OAS,True,0.0,14973
2,BAA10Y,True,0.0,14973
3,issu_7_bil,True,0.0,14973
4,issu_14_bil,True,0.0,14973
5,issu_30_bil,True,0.0,14973
6,SOFR,True,0.0,14973
7,spr_tgcr,True,0.0,14973




In [9]:
# Layer 1A: summary stats by tenor and regime
regimes = {
    "pre": (pd.Timestamp("2019-01-01"), pd.Timestamp("2020-03-31")),
    "relief": (pd.Timestamp("2020-04-01"), pd.Timestamp("2021-03-31")),
    "post": (pd.Timestamp("2021-04-01"), pd.Timestamp.max),
}

rows = []
for tenor, g in panel_long.groupby("tenor"):
    g = g.sort_values("date").set_index("date")
    for regime, (start, end) in regimes.items():
        s = g.loc[(g.index>=start) & (g.index<=end), "y"].dropna()
        if s.empty:
            continue
        lb_p = np.nan
        try:
            lb = acorr_ljungbox(s, lags=[min(10, max(1, len(s)//5))], return_df=True)
            lb_p = float(lb["lb_pvalue"].iloc[0])
        except Exception:
            pass
        rows.append({
            "tenor": int(tenor), "regime": regime,
            "sample_start": s.index.min(), "sample_end": s.index.max(), "N": int(s.shape[0]),
            "mean": float(s.mean()), "std": float(s.std()),
            "p1": float(s.quantile(0.01)), "p5": float(s.quantile(0.05)), "p50": float(s.quantile(0.5)),
            "p95": float(s.quantile(0.95)), "p99": float(s.quantile(0.99)),
            "autocorr1": float(s.autocorr(lag=1)), "ljungbox_pvalue": lb_p,
        })
summary_stats = pd.DataFrame(rows)
summary_stats.to_csv(run_dir / "tables" / "summary_stats.csv", index=False)
summary_stats.head()


Unnamed: 0,tenor,regime,sample_start,sample_end,N,mean,std,p1,p5,p50,p95,p99,autocorr1,ljungbox_pvalue
0,2,pre,2019-01-02,2020-03-31,312,16.613717,16.17569,-12.828959,-9.685756,19.461887,36.839795,39.364466,0.976701,0.0
1,2,relief,2020-04-01,2021-03-31,250,16.823866,16.023149,-23.192073,-16.742639,22.276635,33.689752,37.580588,0.977701,0.0
2,2,post,2021-04-01,2024-12-31,939,16.844803,15.79243,-21.495883,-10.992797,18.949318,38.029563,43.260277,0.980527,0.0
3,5,pre,2019-01-02,2020-03-31,312,16.313539,6.100507,5.159764,6.436984,16.787261,24.270456,24.920967,0.931524,0.0
4,5,relief,2020-04-01,2021-03-31,250,13.653405,7.647012,-4.95483,-2.080223,16.151514,22.281229,23.59544,0.985644,0.0


In [10]:
panel_long = panel_long.reset_index(drop=True)
if "date" not in panel_long.columns and isinstance(panel_long.index, pd.DatetimeIndex):
    panel_long["date"] = panel_long.index

panel_long["date"] = pd.to_datetime(panel_long["date"], errors="coerce")
panel_long = panel_long.dropna(subset=["date"]).sort_values(["tenor", "date"])

In [11]:
# Layer 1B: jump regressions (TOTAL vs DIRECT)
jump_rows = []
for event in CONFIG["events"]:
    for w in CONFIG["windows"]:
        for tenor, g in panel_long.groupby("tenor"):
            for spec, controls_list in [("TOTAL", CONFIG["total_controls"]), ("DIRECT", CONFIG["direct_controls"])]:
                est, se, n = jump_estimator(g, y_col="y", event_date=event, window=w, controls=controls_list, hac_lags=CONFIG["hac_lags"])
                jump_rows.append({
                    "event": event, "window": w, "tenor": int(tenor), "spec": spec,
                    "estimate": est, "se": se, "ci_low": est - 1.96*se if pd.notna(est) and pd.notna(se) else np.nan,
                    "ci_high": est + 1.96*se if pd.notna(est) and pd.notna(se) else np.nan, "N": n,
                })
jump_results = pd.DataFrame(jump_rows)
jump_results.to_csv(run_dir / "tables" / "jump_results.csv", index=False)
jump_results.head()


Unnamed: 0,event,window,tenor,spec,estimate,se,ci_low,ci_high,N
0,2020-04-01,20,2,TOTAL,5.667645,3.569588,-1.328747,12.664038,41
1,2020-04-01,20,2,DIRECT,5.644767,3.66945,-1.547355,12.836889,41
2,2020-04-01,20,5,TOTAL,0.842554,2.057037,-3.189237,4.874346,41
3,2020-04-01,20,5,DIRECT,1.497202,1.958602,-2.341658,5.336061,41
4,2020-04-01,20,10,TOTAL,0.408443,2.741157,-4.964224,5.78111,41


In [12]:
# Layer 1C: binned event-study + plots (improved ordering + overlay TOTAL vs DIRECT)
import matplotlib.pyplot as plt

def parse_bin_term(term: str):
    # Expected: "bin_[a,b]" where a,b are ints
    m = re.search(r"\[\s*(-?\d+)\s*,\s*(-?\d+)\s*\]", term)
    if not m:
        return None
    a, b = int(m.group(1)), int(m.group(2))
    mid = 0.5*(a+b)
    return a, b, mid

bin_rows = []
for event in CONFIG["events"]:
    for tenor, g in panel_long.groupby("tenor"):
        series_by_spec = {}
        for spec, controls_list in [("TOTAL", CONFIG["total_controls"]), ("DIRECT", CONFIG["direct_controls"])]:
            es = event_study_regression(
                g, y_col="y", event_date=event, bins=CONFIG["event_bins"],
                controls=controls_list, hac_lags=CONFIG["hac_lags"]
            )
            if es.empty:
                continue
            es["event"] = event
            es["tenor"] = int(tenor)
            es["spec"] = spec
            # ordering helpers
            parsed = es["term"].apply(parse_bin_term)
            es["bin_lo"] = parsed.apply(lambda x: x[0] if x else np.nan)
            es["bin_hi"] = parsed.apply(lambda x: x[1] if x else np.nan)
            es["bin_mid"] = parsed.apply(lambda x: x[2] if x else np.nan)
            series_by_spec[spec] = es.sort_values("bin_mid")
            bin_rows.append(es)

        # Overlay plot (only if we have at least one spec)
        if not series_by_spec:
            continue
        fig, ax = plt.subplots(figsize=(8,4))
        for spec, df_es in series_by_spec.items():
            x = df_es["bin_mid"].to_numpy()
            ax.plot(x, df_es["estimate"], marker="o", label=spec)
            ax.fill_between(x, df_es["ci_low"], df_es["ci_high"], alpha=0.15)
        ax.axhline(0, color="black", lw=1)
        ax.axvline(0, color="black", lw=1, ls="--")
        ax.set_xlabel("Event time (bin midpoint)")
        ax.set_ylabel("Estimated effect (bps)")
        ax.set_title(f"Binned event study | event={event} tenor={int(tenor)}")
        ax.legend()
        fig.tight_layout()
        fig.savefig(run_dir / "figures" / f"event_path_arb_{int(tenor)}y_{event}_overlay.png", dpi=150)
        plt.close(fig)

eventstudy_bins = pd.concat(bin_rows, ignore_index=True) if bin_rows else pd.DataFrame()
eventstudy_bins.to_csv(run_dir / "tables" / "eventstudy_bins.csv", index=False)
eventstudy_bins.head()


Unnamed: 0,term,estimate,se,ci_low,ci_high,n,event,tenor,spec,bin_lo,bin_hi,bin_mid
0,"bin_[-40,-21]",-1.112639,5.628199,-12.14391,9.918632,121,2020-04-01,2,TOTAL,-40,-21,-30.5
1,"bin_[-60,-41]",12.507368,6.585999,-0.401191,25.415927,121,2020-04-01,2,TOTAL,-60,-41,-50.5
2,"bin_[0,0]",12.415109,2.028347,8.439548,16.39067,121,2020-04-01,2,TOTAL,0,0,0.0
3,"bin_[1,20]",1.011966,4.125137,-7.073302,9.097234,121,2020-04-01,2,TOTAL,1,20,10.5
4,"bin_[21,40]",2.943115,3.958067,-4.814697,10.700926,121,2020-04-01,2,TOTAL,21,40,30.5


In [13]:
# Visual check: plot raw spreads around each event (per tenor), with event date highlighted
import matplotlib.pyplot as plt

def plot_event_window(df: pd.DataFrame, tenor: int, event: str, window: int = 120):
    d = df[df["tenor"] == tenor].copy()
    d = d.sort_values("date").set_index("date")
    t0 = pd.Timestamp(event)
    sub = d.loc[(d.index >= t0 - pd.Timedelta(days=window)) & (d.index <= t0 + pd.Timedelta(days=window)), ["y"]].dropna()
    if sub.empty:
        return
    fig, ax = plt.subplots(figsize=(9,4))
    ax.plot(sub.index, sub["y"])
    ax.axvline(t0, color="black", ls="--", lw=1)
    ax.set_title(f"arb_{tenor} | raw series around event {event} (±{window}d)")
    ax.set_ylabel("bps")
    fig.tight_layout()
    fig.savefig(run_dir / "figures" / f"raw_arb_{tenor}y_window_{event}.png", dpi=150)
    plt.close(fig)

for event in CONFIG["events"]:
    for tenor in sorted(panel_long["tenor"].dropna().unique().tolist()):
        plot_event_window(panel_long, int(tenor), event, window=120)


In [14]:
# Layer 1D pooled regression with tenor FE + stargazer export
from statsmodels.formula.api import ols

pooled_rows = []
stargazer_models = []
for event in CONFIG["events"]:
    work = add_event_time(panel_long, event)
    work = work[work["event_time"].between(-60, 60)].copy()
    work["post"] = (work["event_time"] >= 0).astype(int)
    for spec, controls_list in [("TOTAL", CONFIG["total_controls"]), ("DIRECT", CONFIG["direct_controls"])]:
        use_cols = ["y","post","tenor", *[c for c in controls_list if c in work.columns]]
        reg = keep_controls_with_coverage(work, use_cols).copy()
        if reg.empty:
            continue
        rhs = "post + C(tenor)"
        if len(use_cols) > 3:
            rhs += " + " + " + ".join([c for c in use_cols if c not in {"y","post","tenor"}])
        
        reg = reg.copy()

        # Drop NAs in core columns first
        reg = reg.dropna(subset=["y", "post", "tenor"])

        # Patsy cannot handle pandas nullable dtypes like Int64 / boolean
        # Make them numpy dtypes (or category for FE variables)
        reg["y"] = pd.to_numeric(reg["y"], errors="coerce").astype("float64")
        reg["post"] = pd.to_numeric(reg["post"], errors="coerce").astype("int64")

        # If you use C(tenor) in the formula, make tenor categorical
        reg["tenor"] = pd.to_numeric(reg["tenor"], errors="coerce").astype("int64").astype("category")

        # Coerce any remaining RHS columns to numeric (safe default)
        for c in reg.columns:
            if c in {"y"}:
                continue
            if str(reg[c].dtype) in {"Int64", "Int32", "Int16", "Int8", "boolean"}:
                reg[c] = pd.to_numeric(reg[c], errors="coerce")

        # Final NA drop after coercions
        reg = reg.dropna()

        res = ols(f"y ~ {rhs}", data=reg).fit()
        robust = res.get_robustcov_results(cov_type="HAC", maxlags=CONFIG["hac_lags"])
        stargazer_models.append(robust)
        post_idx = robust.model.exog_names.index("post") if "post" in robust.model.exog_names else None
        pooled_rows.append({"event": event, "spec": spec, "N": int(robust.nobs), "post": robust.params[post_idx] if post_idx is not None else np.nan, "se": robust.bse[post_idx] if post_idx is not None else np.nan})

pooled_table = pd.DataFrame(pooled_rows)
pooled_table.to_csv(run_dir / "tables" / "pooled_jump_results.csv", index=False)

html_out = run_dir / "tables" / "regression_table.html"
try:
    from stargazer.stargazer import Stargazer
    if stargazer_models:
        sg = Stargazer(stargazer_models)
        sg.title("Pooled jump regressions (HAC SE)")
        html_out.write_text(sg.render_html(), encoding="utf-8")
    else:
        html_out.write_text("<html><body><p>No pooled models available.</p></body></html>", encoding="utf-8")
except Exception as exc:
    html_out.write_text(f"<html><body><p>Stargazer unavailable: {exc}</p></body></html>", encoding="utf-8")

pooled_table




Unnamed: 0,event,spec,N,post,se
0,2020-04-01,TOTAL,484,4.99718,1.549459
1,2020-04-01,DIRECT,484,1.016332,2.597885
2,2021-03-19,TOTAL,484,-5.042961,2.146488
3,2021-03-19,DIRECT,484,-1.044632,1.667183
4,2021-03-31,TOTAL,484,1.849296,2.486391
5,2021-03-31,DIRECT,484,3.898105,1.710445


In [15]:
# Layer 2 mechanism (weekly), skip gracefully if required data missing
layer2_note = ""
try:
    if not CONFIG.get("run_layer2", True):
        raise RuntimeError("CONFIG.run_layer2=False")

    pd_long = load_any_table(resolve_dataset_path("primary_dealer_stats_ofr_stfm_nypd_long", expected_dir=repo_root / "data" / "raw" / "event_inputs"))
    bank = load_any_table(resolve_dataset_path("bank_exposure_y9c_agg_daily", expected_dir=repo_root / "data" / "raw" / "event_inputs"))

    pd_long["date"] = pd.to_datetime(pd_long["date"], errors="coerce")
    bank["date"] = pd.to_datetime(bank["date"], errors="coerce")

    # Weekly dealer utilization proxy
    pd_w = pd_long.pivot_table(index="date", columns="mnemonic", values="value", aggfunc="mean").resample("W-FRI").mean()
    pd_w["utilization_index"] = pd_w.sum(axis=1, min_count=1)
    pd_w["utilization_lag1w"] = pd_w["utilization_index"].shift(1)

    # Weekly bank exposure proxy
    if "agg_exempt_share" not in bank.columns:
        raise KeyError("bank_exposure_y9c_agg_daily missing 'agg_exempt_share'")
    b_w = bank.set_index("date").resample("W-FRI").mean()[["agg_exempt_share"]]

    # Weekly outcome (average across tenors)
    y_w = panel_long.set_index("date").groupby("tenor")["y"].resample("W-FRI").mean().reset_index()
    y_w = y_w.groupby("date", as_index=False)["y"].mean().set_index("date")

    # Weekly controls: take whatever is available (do not hard-fail on missing columns)
    desired = list(CONFIG.get("direct_controls", []))
    present = [c for c in desired if c in panel_long.columns]
    missing = sorted(set(desired) - set(present))
    if missing:
        logger.warning("Layer 2: dropping missing controls: %s", missing)

    if present:
        c_w = panel_long.set_index("date")[present].resample("W-FRI").mean()
    else:
        c_w = pd.DataFrame(index=y_w.index)

    mech = y_w.join([b_w, pd_w[["utilization_lag1w"]], c_w], how="inner").dropna()

    # Relief indicator (inclusive)
    mech["relief"] = ((mech.index >= "2020-04-01") & (mech.index <= "2021-03-31")).astype(int)

    # z-scores (guard against zero std)
    ex_std = mech["agg_exempt_share"].std()
    util_std = mech["utilization_lag1w"].std()
    mech["z_exempt"] = (mech["agg_exempt_share"] - mech["agg_exempt_share"].mean()) / (ex_std if ex_std and ex_std > 0 else 1.0)
    mech["z_util_l1"] = (mech["utilization_lag1w"] - mech["utilization_lag1w"].mean()) / (util_std if util_std and util_std > 0 else 1.0)

    mech["relief_x_exempt"] = mech["relief"] * mech["z_exempt"]
    mech["relief_x_util"] = mech["relief"] * mech["z_util_l1"]

    xcols = ["relief", "relief_x_exempt", "relief_x_util"] + present
    reg = mech[["y"] + xcols].dropna()
    if len(reg) < 20:
        raise RuntimeError(f"Layer 2 insufficient weekly observations after joins: n={len(reg)}")

    X = sm.add_constant(reg[xcols], has_constant="add")
    res = sm.OLS(reg["y"], X).fit(cov_type="HAC", cov_kwds={"maxlags":2})
    mech_out = pd.DataFrame({"term": res.params.index, "coef": res.params.values, "se": res.bse.values})
    mech_out.to_csv(run_dir / "tables" / "layer2_mechanism_weekly.csv", index=False)

    layer2_note = f"Layer 2 executed successfully. n={int(res.nobs)}; controls_used={present}"
except Exception as exc:
    layer2_note = f"Layer 2 skipped gracefully due to missing/unusable inputs: {exc}"

print(layer2_note)




Layer 2 executed successfully. n=92; controls_used=['VIX', 'HY_OAS', 'BAA10Y', 'issu_7_bil', 'issu_14_bil', 'issu_30_bil', 'SOFR', 'spr_tgcr']


In [16]:
pd_w

mnemonic,NYPD-PD_AFtD_AG-A,NYPD-PD_AFtD_AG_MBS-A,NYPD-PD_AFtD_AG_eMBS-A,NYPD-PD_AFtD_CORS-A,NYPD-PD_AFtD_OMBS-A,NYPD-PD_AFtD_T-A,NYPD-PD_AFtD_TIPS-A,NYPD-PD_AFtD_TOT-A,NYPD-PD_AFtD_T_eTIPS-A,NYPD-PD_AFtR_AG-A,...,NYPD-PD_SL_T_GE30-A,NYPD-PD_SL_T_L30-A,NYPD-PD_SL_T_OO-A,NYPD-PD_SL_T_TOT-A,NYPD-PD_SL_T_eTIPS_GE30-A,NYPD-PD_SL_T_eTIPS_L30-A,NYPD-PD_SL_T_eTIPS_OO-A,NYPD-PD_SL_T_eTIPS_TOT-A,utilization_index,utilization_lag1w
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-04,1.189500e+10,8.792000e+09,3.103000e+09,1.167000e+10,8.360000e+08,8.239700e+10,5.491000e+09,1.067980e+11,7.690600e+10,9.385000e+09,...,783000000.0,,1.334390e+11,,7.830000e+08,,1.198630e+11,,2.229933e+13,
2019-01-11,7.089000e+09,5.471000e+09,1.618000e+09,1.243200e+10,1.611000e+09,8.616900e+10,7.143000e+09,1.073010e+11,7.902600e+10,5.472000e+09,...,987000000.0,,1.376700e+11,,9.870000e+08,,1.230360e+11,,2.270962e+13,2.229933e+13
2019-01-18,1.613500e+10,1.427400e+10,1.861000e+09,1.736100e+10,2.830000e+09,7.544300e+10,4.461000e+09,1.117690e+11,7.098200e+10,1.285700e+10,...,383000000.0,,1.261680e+11,,3.830000e+08,,1.131710e+11,,2.560165e+13,2.270962e+13
2019-01-25,2.440400e+10,2.061400e+10,3.790000e+09,1.998200e+10,3.817000e+09,7.205900e+10,2.946000e+09,1.202620e+11,6.911300e+10,1.986500e+10,...,,,1.316560e+11,,5.950000e+08,,1.216570e+11,,2.555242e+13,2.560165e+13
2019-02-01,1.329600e+10,1.138500e+10,1.911000e+09,2.557700e+10,1.089000e+09,5.552000e+10,2.864000e+09,9.548200e+10,5.265600e+10,1.214500e+10,...,,,1.268540e+11,,8.360000e+08,,1.164310e+11,,2.518926e+13,2.555242e+13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-03,2.330200e+10,2.083000e+10,2.472000e+09,3.298700e+10,4.045000e+09,1.019330e+11,6.064000e+09,1.622670e+11,9.586900e+10,2.310100e+10,...,,,1.540720e+11,,8.040000e+08,,1.218500e+11,,2.826699e+13,2.759531e+13
2021-12-10,1.254600e+10,9.303000e+09,3.243000e+09,3.179900e+10,1.815000e+09,1.040740e+11,7.308000e+09,1.502340e+11,9.676600e+10,1.126800e+10,...,,,1.671070e+11,,1.985000e+09,,1.355160e+11,,2.835845e+13,2.826699e+13
2021-12-17,5.462400e+10,5.122200e+10,3.402000e+09,3.945700e+10,3.218000e+09,1.096370e+11,7.205000e+09,2.069360e+11,1.024320e+11,3.932900e+10,...,,,1.667610e+11,,3.026000e+09,,1.365890e+11,,2.560639e+13,2.835845e+13
2021-12-24,5.839900e+10,5.594000e+10,2.459000e+09,3.361400e+10,2.813000e+09,1.136600e+11,5.064000e+09,2.084860e+11,1.085960e+11,4.147200e+10,...,,,1.638660e+11,,5.609000e+09,,1.354370e+11,,2.570947e+13,2.560639e+13


In [17]:
y_w

Unnamed: 0_level_0,y
date,Unnamed: 1_level_1
2010-01-08,30.789828
2010-01-15,27.843675
2010-01-22,24.549639
2010-01-29,26.007036
2010-02-05,24.475120
...,...
2024-12-06,12.940388
2024-12-13,11.906851
2024-12-20,9.184051
2024-12-27,9.242316


In [18]:
mech_out

Unnamed: 0,term,coef,se
0,const,11.50282,7.01437
1,relief,-11.85649,2.348246
2,relief_x_exempt,-19.74754,4.228363
3,relief_x_util,0.9040651,0.8473749
4,VIX,0.004960368,0.1280114
5,HY_OAS,4.47993,1.906205
6,BAA10Y,-12.85126,5.660369
7,issu_7_bil,-8.461262e-14,7.725573e-15
8,issu_14_bil,-0.08693218,0.0751941
9,issu_30_bil,0.206227,0.1507609


In [19]:
# Refresh latest and write run metadata (works when CONFIG is a dict)

import json
import sys
import platform
import subprocess
import shutil
import hashlib
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd

def _safe_git_info(repo_root: Path) -> dict:
    try:
        sha = subprocess.check_output(
            ["git", "rev-parse", "HEAD"], cwd=str(repo_root), stderr=subprocess.STDOUT, text=True
        ).strip()
        branch = subprocess.check_output(
            ["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd=str(repo_root), stderr=subprocess.STDOUT, text=True
        ).strip()
        status = subprocess.check_output(
            ["git", "status", "--porcelain"], cwd=str(repo_root), stderr=subprocess.STDOUT, text=True
        )
        return {"commit": sha, "branch": branch, "dirty": bool(status.strip())}
    except Exception as e:
        return {"commit": None, "branch": None, "dirty": None, "error": str(e)}

def _safe_pkg_versions(pkgs: list[str]) -> dict:
    out = {}
    try:
        import importlib.metadata as md
        for p in pkgs:
            try:
                out[p] = md.version(p)
            except Exception:
                out[p] = None
    except Exception:
        for p in pkgs:
            out[p] = None
    return out

def _find_main_df() -> tuple[str | None, pd.DataFrame | None]:
    for name in ["analysis_panel", "arb_panel", "panel", "daily_long", "pivot"]:
        obj = globals().get(name, None)
        if isinstance(obj, pd.DataFrame):
            return name, obj
    return None, None

def _dataset_notes() -> str:
    name, df = _find_main_df()
    if df is None:
        return "No main dataframe found (looked for: analysis_panel, arb_panel, panel, daily_long, pivot)."
    notes = [f"Dataset: {name}", f"Rows: {len(df):,}", f"Columns: {df.shape[1]:,}"]
    if "date" in df.columns:
        d = pd.to_datetime(df["date"], errors="coerce")
        if d.notna().any():
            notes.append(f"Date range: {d.min().date()} → {d.max().date()}")
    if "tenor" in df.columns:
        try:
            notes.append(f"Tenors: {df['tenor'].nunique()}")
        except Exception:
            pass
    return "\n".join(notes)

def _stable_config_dict(cfg) -> dict:
    # If cfg is already dict-like, use it; else try to coerce.
    if isinstance(cfg, dict):
        return cfg
    try:
        return dict(cfg)
    except Exception:
        return {"CONFIG_repr": repr(cfg)}

def _config_hash(cfg_dict: dict) -> str:
    s = json.dumps(cfg_dict, sort_keys=True, default=str).encode("utf-8")
    return hashlib.sha1(s).hexdigest()[:10]

def _write_readme(run_dir: Path, cfg_dict: dict, notes: str):
    txt = []
    txt.append("# summary_pipeline run")
    txt.append("")
    txt.append("## Config")
    txt.append("```json")
    txt.append(json.dumps(cfg_dict, indent=2, default=str))
    txt.append("```")
    txt.append("")
    txt.append("## Notes")
    txt.append("```")
    txt.append(notes)
    txt.append("```")
    (run_dir / "README.md").write_text("\n".join(txt), encoding="utf-8")

# --- Preconditions
if "CONFIG" not in globals():
    raise RuntimeError("CONFIG not found. Run the CONFIG cell first.")

# Repo root (best-effort)
if "REPO_ROOT" in globals():
    repo_root = Path(REPO_ROOT)
else:
    repo_root = Path().cwd().resolve().parent  # assumes notebook is in notebooks/

# Output root (prefer CONFIG override if provided)
cfg_dict = _stable_config_dict(CONFIG)
out_root = cfg_dict.get("output_root", None)
if out_root:
    output_root = Path(out_root).expanduser()
    if not output_root.is_absolute():
        output_root = (repo_root / output_root).resolve()
else:
    output_root = (repo_root / "outputs" / "summary_pipeline").resolve()

output_root.mkdir(parents=True, exist_ok=True)

# Run folder naming
utc_now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
cfg_hash = _config_hash(cfg_dict)

run_dir = output_root / f"{ts}_{cfg_hash}"
figures_dir = run_dir / "figures"
tables_dir  = run_dir / "tables"
data_dir    = run_dir / "data"
logs_dir    = run_dir / "logs"

for p in [run_dir, figures_dir, tables_dir, data_dir, logs_dir]:
    p.mkdir(parents=True, exist_ok=True)

# Write README + metadata
notes = _dataset_notes()
_write_readme(run_dir, cfg_dict, notes)

metadata = {
    "utc_timestamp": utc_now,
    "run_dir": str(run_dir),
    "config_hash": cfg_hash,
    "config": cfg_dict,
    "git": _safe_git_info(repo_root),
    "python": {"version": sys.version, "executable": sys.executable},
    "platform": {
        "system": platform.system(),
        "release": platform.release(),
        "version": platform.version(),
        "machine": platform.machine(),
    },
    "packages": _safe_pkg_versions(["numpy", "pandas", "statsmodels", "matplotlib", "scipy"]),
    "notes": notes,
}
(run_dir / "run_metadata.json").write_text(json.dumps(metadata, indent=2), encoding="utf-8")

# Refresh latest/ (copy entire run folder)
latest_dir = output_root / "latest"
if latest_dir.exists():
    shutil.rmtree(latest_dir)
shutil.copytree(run_dir, latest_dir)

# Expose run_dirs for downstream cells (compatible with older code)
run_dirs = {
    "run": run_dir,
    "figures": figures_dir,
    "tables": tables_dir,
    "data": data_dir,
    "logs": logs_dir,
    "latest": latest_dir,
}

print("Run dir:", run_dir)
print("Latest refreshed:", latest_dir)
print("Config hash:", cfg_hash)

Run dir: C:\Users\Owner\Box\Winter26\slr_bucket\outputs\summary_pipeline\20260226_223058_278369efc0
Latest refreshed: C:\Users\Owner\Box\Winter26\slr_bucket\outputs\summary_pipeline\latest
Config hash: 278369efc0
