# Chronos-2 SFT+LoRA — Statistical Significance Tests (Global vs Industry-Specific)

This notebook loads the evaluation dumps generated by `chronos2_sft_lora_eval_dump.ipynb` and tests whether performance differences between models are statistically robust.

We treat each **ticker** as one statistical unit (paired setting):
1. Aggregate metrics per ticker (averaged over sampled evaluation windows).
2. Compute paired deltas between models for the same tickers.
3. Run a Wilcoxon signed-rank test and a bootstrap 95% confidence interval for the mean delta.
4. For sector-level (per-category) results, where one test is run per sector, apply the Benjamini–Hochberg (FDR) correction across sectors, separately for each metric and model pair.

**Sign convention:** Δ = metric(model B) − metric(model A). For MAE/MQL, lower is better, so **negative Δ means model B improves over model A**.


In [None]:
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
from scipy.stats import wilcoxon

# Make the parent of the working directory importable; it is treated as the
# repository root so that paths resolve the same way on every machine.
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)
sys.path.append(project_root)


REPO_ROOT = Path(os.getcwd()).parent
repo_root_entry = str(REPO_ROOT)
if repo_root_entry not in sys.path:
    sys.path.insert(0, repo_root_entry)

# Two layouts are supported: outputs under <root>/notebooks/outputs (preferred
# when it exists) or directly under <root>/outputs.
cand1, cand2 = REPO_ROOT / "notebooks" / "outputs", REPO_ROOT / "outputs"
if cand1.exists():
    OUTPUTS_BASE = cand1
else:
    OUTPUTS_BASE = cand2

# Input: evaluation dumps. Fail early if they have not been generated yet.
DUMPS_DIR = OUTPUTS_BASE / "eval_dumps" / "sft_lora"
if not DUMPS_DIR.exists():
    raise FileNotFoundError(f"Dump directory not found: {DUMPS_DIR}")

# Output: statistical results, created on demand.
OUT_DIR = OUTPUTS_BASE / "stats_results" / "sft_lora"
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("REPO_ROOT:", REPO_ROOT)
print("DUMPS_DIR:", DUMPS_DIR)
print("OUT_DIR:", OUT_DIR)


In [None]:
# Read every Parquet dump in DUMPS_DIR and tag its rows with the file of origin

paths = sorted(DUMPS_DIR.glob("*.parquet"))
print("n_parquet:", len(paths))
if not paths:
    raise RuntimeError("No parquet files found in DUMPS_DIR")

# One frame per dump; `source_file` is used later to select subsets by file name.
dfs = [
    pd.read_parquet(dump_path).assign(source_file=dump_path.name)
    for dump_path in paths
]

df_all = pd.concat(dfs, ignore_index=True)
print("df_all:", df_all.shape)
df_all.head()


In [None]:
# -----------------------------
# 2) Aggregate per ticker (unit of analysis)
# -----------------------------
# Each (group, model, ticker) combination is collapsed to the mean of its rows,
# giving one value per metric per ticker.
metrics = ["mae", "mql"]
ticker_keys = ["group", "model", "ticker"]
agg = df_all.groupby(ticker_keys, as_index=False)[metrics].mean()
print("agg:", agg.shape)
agg.head()


agg: (549, 5)


Unnamed: 0,group,model,ticker,mae,mql
0,communication_services,baseline,CMCSA,0.014223,0.006565
1,communication_services,baseline,CRM,0.02221,0.00998
2,communication_services,baseline,GOOGL,0.018694,0.008304
3,communication_services,baseline,T,0.012219,0.005621
4,communication_services,baseline,TMUS,0.01258,0.005571


In [None]:
# Statistical helpers

def bootstrap_mean_ci(delta: np.ndarray, n_boot: int = 2000, ci: float = 0.95, seed: int = 123):
    """Percentile-bootstrap confidence interval for the mean of ``delta``.

    Parameters
    ----------
    delta : array of observations to resample.
    n_boot : number of bootstrap replicates.
    ci : coverage of the interval (0.95 gives the 2.5% and 97.5% quantiles).
    seed : seed for the NumPy random generator, so results are reproducible.

    Returns
    -------
    ``(lo, hi)`` as Python floats, or ``(nan, nan)`` when ``delta`` is empty.
    """
    generator = np.random.default_rng(seed)
    n_obs = len(delta)
    if n_obs == 0:
        return (np.nan, np.nan)

    # Each replicate resamples n_obs values with replacement and keeps the mean.
    replicate_means = np.array([
        generator.choice(delta, size=n_obs, replace=True).mean()
        for _ in range(n_boot)
    ])

    lower = np.quantile(replicate_means, (1-ci)/2)
    upper = np.quantile(replicate_means, 1-(1-ci)/2)
    return float(lower), float(upper)

def bh_fdr(pvals: np.ndarray):
    """Benjamini–Hochberg FDR correction.

    Parameters
    ----------
    pvals : 1-D array-like of raw p-values. NaN entries mark tests that could
        not be computed.

    Returns
    -------
    Array of BH-adjusted p-values in the input order, clipped to [0, 1].
    NaN inputs stay NaN in the output and are not counted in the number of
    tests, so a missing p-value neither shows up as an adjusted p of 1.0 nor
    makes the correction stricter for the tests that were actually run.
    """
    pvals = np.asarray(pvals, dtype=float)
    out = np.full(pvals.shape, np.nan, dtype=float)

    valid = ~np.isnan(pvals)
    m = int(valid.sum())
    if m == 0:
        return out

    p_valid = pvals[valid]
    order = np.argsort(p_valid)
    # Raw BH value for the p-value of rank k (1-based) is p * m / k.
    scaled = p_valid[order] * m / np.arange(1, m + 1)
    # Enforce monotonicity: each adjusted value is the minimum of the raw
    # values at its rank and every higher rank (running minimum from the right).
    adj_sorted = np.minimum.accumulate(scaled[::-1])[::-1]

    adj = np.empty(m, dtype=float)
    adj[order] = np.clip(adj_sorted, 0, 1)
    out[valid] = adj
    return out

def paired_test_group(agg_df: pd.DataFrame, group: str, model_a: str, model_b: str, metric: str):
    """Paired test on ticker-level metric averages (delta = B - A).

    Parameters
    ----------
    agg_df : frame with columns ``group``, ``model``, ``ticker`` and ``metric``.
    group : value of the ``group`` column to test within.
    model_a, model_b : values of the ``model`` column to compare.
    metric : name of the metric column.

    Returns
    -------
    ``None`` when fewer than two tickers have a finite delta for both models.
    Otherwise a dict with the comparison labels, the number of tickers used,
    the mean and median delta, a bootstrap 95% CI for the mean delta, and the
    Wilcoxon signed-rank p-value (NaN when the test cannot be computed).
    """
    import warnings  # local import: only needed for the fallback path below

    in_group = agg_df["group"] == group
    a = agg_df.loc[in_group & (agg_df["model"] == model_a), ["ticker", metric]]
    b = agg_df.loc[in_group & (agg_df["model"] == model_b), ["ticker", metric]]

    # Inner join keeps only tickers evaluated under both models.
    m = a.merge(b, on="ticker", suffixes=("_a", "_b"))

    delta = (m[f"{metric}_b"] - m[f"{metric}_a"]).to_numpy(dtype=float)
    # A single non-finite delta would turn every statistic below into NaN,
    # so tickers without a usable metric value are left out of the comparison.
    delta = delta[np.isfinite(delta)]
    if len(delta) < 2:
        return None

    mean_d = float(np.mean(delta))
    med_d = float(np.median(delta))
    ci_lo, ci_hi = bootstrap_mean_ci(delta, n_boot=5000, ci=0.95, seed=123)

    # Wilcoxon signed-rank test (paired, non-parametric).
    # Best-effort: a failure yields a NaN p-value, but the reason is surfaced
    # as a warning instead of being silently discarded.
    try:
        p = float(wilcoxon(delta).pvalue)
    except Exception as exc:
        warnings.warn(
            f"Wilcoxon test failed for group={group!r}, metric={metric!r}, "
            f"{model_a!r} vs {model_b!r}: {exc}",
            RuntimeWarning,
            stacklevel=2,
        )
        p = np.nan

    return {
        "group": group,
        "metric": metric,
        "model_a": model_a,
        "model_b": model_b,
        "n_tickers": int(len(delta)),
        "mean_delta_b_minus_a": mean_d,
        "median_delta_b_minus_a": med_d,
        "ci95_lo": ci_lo,
        "ci95_hi": ci_hi,
        "p_wilcoxon": p,
    }


In [None]:
# Build comparisons

comparisons = []

# GLOBAL: baseline vs LoRA general
for metric in metrics:
    r = paired_test_group(agg, "global", "baseline", "lora_general", metric)
    if r:
        comparisons.append(r)

# CATEGORY-LEVEL tests:
# We use the FAIR subset for categories:
#  - baseline:            __baseline.parquet
#  - lora_general (fair): __lora_general_ctx_cat.parquet
#  - lora_category:       __lora_category.parquet
# The file-name fragments are literals, so regex matching is switched off
# (otherwise "." would match any character).
source_file = df_all["source_file"]
cat_mask_general_fair = source_file.str.contains("__lora_general_ctx_cat.parquet", regex=False)
cat_mask_category     = source_file.str.contains("__lora_category.parquet", regex=False)
cat_mask_baseline     = source_file.str.contains("__baseline.parquet", regex=False) & (df_all["group"] != "global")

df_cat_fair = df_all[cat_mask_general_fair | cat_mask_category | cat_mask_baseline].copy()

agg_cat_fair = (
    df_cat_fair
    .groupby(["group", "model", "ticker"], as_index=False)[metrics]
    .mean()
)

cats = sorted([g for g in agg_cat_fair["group"].unique() if g != "global"])
print("n_categories in dumps:", len(cats))

# Per category and metric, the category-specific LoRA is compared first against
# the baseline and then against the general LoRA.
category_pairs = [
    ("baseline", "lora_category"),
    ("lora_general", "lora_category"),
]
for cat in cats:
    for metric in metrics:
        for model_a, model_b in category_pairs:
            r = paired_test_group(agg_cat_fair, cat, model_a, model_b, metric)
            if r:
                comparisons.append(r)

df_comp = pd.DataFrame(comparisons)
df_comp


In [None]:

# Multiple-testing correction (BH/FDR) for category comparisons

# Global rows are not part of any correction family and keep NaN here.
df_comp["p_adj_bh"] = np.nan

# One correction family per (metric, model_a, model_b): the p-values of all
# categories for that comparison are adjusted together.
family_keys = ["metric", "model_a", "model_b"]
category_rows = df_comp[df_comp["group"] != "global"]
for _, family in category_rows.groupby(family_keys):
    raw_pvalues = family["p_wilcoxon"].to_numpy()
    df_comp.loc[family.index, "p_adj_bh"] = bh_fdr(raw_pvalues)

sort_columns = family_keys + ["p_adj_bh", "p_wilcoxon"]
df_comp_sorted = df_comp.sort_values(sort_columns)
df_comp_sorted.head(20)


In [None]:
# Persist the sorted comparison table as CSV

out_csv = OUT_DIR / "paired_tests_ticker_level.csv"
df_comp_sorted.to_csv(out_csv, index=False)
print("Saved:", out_csv)

# Quick view: global results + category results for (baseline -> lora_category) on MQL
is_global = df_comp_sorted["group"] == "global"
display(df_comp_sorted[is_global])

is_mql_baseline_vs_category = (
    ~is_global
    & (df_comp_sorted["metric"] == "mql")
    & (df_comp_sorted["model_a"] == "baseline")
    & (df_comp_sorted["model_b"] == "lora_category")
)
display(df_comp_sorted[is_mql_baseline_vs_category].sort_values("p_adj_bh"))
