In [2]:
# level100_evt_es_risk_budget.py
# Level-100: End-to-End Risk Engine — EVT (POT-GPD) Expected Shortfall Risk Budgeting
#            + Monthly Rebalancing (TRADING DAY) + Turnover/TC + Walk-Forward Backtest (SciPy-free)
#
# Key repairs vs your version:
# - Rebalance dates are now ALWAYS actual trading dates from rets.index (no calendar month-end rows injected).
# - w_daily never expands beyond rets.index, so no broadcasting mismatch.
# - Rebalance mask uses indexer (safe) instead of searchsorted on possibly-nonexistent dates.
# - More robust yfinance close extraction + alignment + cleaning.
# - Safer POT logic and clearer diagnostics.
#
# Run:
#   python level100_evt_es_risk_budget.py
#   python level100_evt_es_risk_budget.py --alpha 0.01 --q_u 0.95 --window 1260 --rebalance ME
#   python level100_evt_es_risk_budget.py --symbols SPY QQQ IWM EFA EEM TLT LQD GLD --tc_bps 2.0

import os
import json
import math
import argparse
from dataclasses import dataclass, asdict
from typing import Tuple, Dict, Any

import numpy as np
import pandas as pd
import yfinance as yf


# ----------------------------- Config -----------------------------
@dataclass
class Config:
    symbols: Tuple[str, ...] = ("SPY", "QQQ", "IWM", "EFA", "EEM", "TLT", "LQD", "GLD")
    start: str = "2010-01-01"

    use_log_returns: bool = True
    dropna: bool = True

    # EVT tail settings (on losses)
    alpha: float = 0.05          # tail probability (ES at alpha)
    q_u: float = 0.90            # threshold quantile for POT on losses (0.90 to 0.95 common)
    window: int = 1000           # rolling window length (trading days)
    min_exc: int = 40            # minimum exceedances required for POT fit
    xi_clip: Tuple[float, float] = (-0.45, 0.45)  # stabilize MoM

    # Rebalancing
    rebalance: str = "ME"        # pandas offset alias; "ME" month-end bucket, "W-FRI", etc.
    max_w: float = 0.35          # per-asset cap
    min_w: float = 0.00          # per-asset floor (0 for long-only)
    allow_cash: bool = False     # if True, residual weight goes to CASH (0% return)
    cash_symbol: str = "CASH"

    # Correlation diagnostics (optional; no leverage applied, we just compute a hint)
    use_corr_scale: bool = True
    corr_shrink: float = 0.15    # shrink corr toward identity to reduce noise

    # Transaction costs
    tc_bps: float = 1.0          # cost per 100% turnover at rebalance (bps)
                                 # applied: cost = tc_bps/1e4 * turnover

    seed: int = 42

    out_daily_csv: str = "level100_evt_es_rb_daily.csv"
    out_weights_csv: str = "level100_evt_es_rb_weights.csv"
    out_json: str = "level100_evt_es_rb_summary.json"


# ----------------------------- yfinance loader -----------------------------
def _extract_close(px: pd.DataFrame, symbol: str) -> pd.Series:
    if px is None or px.empty:
        raise RuntimeError(f"No data returned for {symbol}")

    # yfinance: MultiIndex columns often like ('Adj Close','SPY') or ('Close','SPY')
    if isinstance(px.columns, pd.MultiIndex):
        # Try common layouts
        candidates = [
            ("Adj Close", symbol),
            ("Close", symbol),
            (symbol, "Adj Close"),
            (symbol, "Close"),
        ]
        for key in candidates:
            if key in px.columns:
                s = px[key]
                if isinstance(s, pd.DataFrame):
                    s = s.iloc[:, 0]
                s = s.copy()
                s.name = symbol
                return s
        raise RuntimeError(f"Could not extract Close/Adj Close for {symbol} from MultiIndex columns.")

    # Single-index columns: may contain Adj Close/Close
    for col in ("Adj Close", "Close"):
        if col in px.columns:
            s = px[col]
            if isinstance(s, pd.DataFrame):
                s = s.iloc[:, 0]
            s = s.copy()
            s.name = symbol
            return s

    raise RuntimeError(f"Missing Close/Adj Close for {symbol}. Columns={list(px.columns)}")


def load_prices(symbols: Tuple[str, ...], start: str) -> pd.DataFrame:
    syms = tuple(symbols)

    # Batch download
    try:
        px_all = yf.download(list(syms), start=start, progress=False, group_by="column", auto_adjust=False)
        if px_all is not None and not px_all.empty:
            series = []
            for s in syms:
                series.append(_extract_close(px_all, s))
            out = pd.concat(series, axis=1).sort_index()
            return out
    except Exception:
        pass

    # Fallback single-ticker downloads
    series = []
    for s in syms:
        px = yf.download(s, start=start, progress=False, auto_adjust=False)
        series.append(_extract_close(px, s))
    return pd.concat(series, axis=1).sort_index()


def compute_returns(prices: pd.DataFrame, use_log: bool) -> pd.DataFrame:
    prices = prices.replace([np.inf, -np.inf], np.nan)
    if use_log:
        rets = np.log(prices).diff()
    else:
        rets = prices.pct_change()
    rets = rets.replace([np.inf, -np.inf], np.nan)
    rets = rets.dropna(how="all")
    return rets


# ----------------------------- EVT POT-GPD (fast MoM) -----------------------------
def gpd_mom_fit(exceed: np.ndarray, xi_clip: Tuple[float, float]) -> Tuple[float, float]:
    """
    Method-of-moments fit for GPD on exceedances y>0.
      mean = beta/(1-xi)
      var  = beta^2/((1-xi)^2*(1-2xi))
    => var/mean^2 = 1/(1-2xi)  => xi = 0.5*(1 - mean^2/var)
       beta = mean*(1-xi)
    """
    y = exceed.astype(float)
    y = y[np.isfinite(y)]
    if y.size == 0:
        return 0.0, 1e-12

    m = float(y.mean())
    v = float(y.var(ddof=1)) if y.size >= 2 else float(y.var())
    v = max(v, 1e-12)

    xi = 0.5 * (1.0 - (m * m) / v)
    xi = float(np.clip(xi, xi_clip[0], xi_clip[1]))

    beta = m * (1.0 - xi)
    beta = float(max(beta, 1e-12))
    return xi, beta


def pot_es_loss(
    losses: np.ndarray,
    alpha: float,
    q_u: float,
    min_exc: int,
    xi_clip: Tuple[float, float],
) -> Dict[str, float]:
    """
    Estimate ES on LOSSES using POT-GPD. Losses should be positive-ish for bad days.
    Returns dict:
      es_loss, var_loss, u, k, method
    Falls back to empirical ES if POT invalid/insufficient.
    """
    L = losses.astype(float)
    L = L[np.isfinite(L)]
    n = int(L.size)
    if n < 20:
        return {"es_loss": float("nan"), "var_loss": float("nan"), "u": float("nan"), "k": 0.0, "method": "na"}

    # Empirical right-tail VaR/ES on losses:
    # P(L > VaR) = alpha  => VaR = quantile at (1-alpha)
    var_emp = float(np.quantile(L, 1.0 - alpha))
    tail = L[L >= var_emp]
    es_emp = float(tail.mean()) if tail.size > 0 else float("nan")

    # POT threshold
    u = float(np.quantile(L, q_u))
    exc = L[L > u] - u
    k = int(exc.size)

    # Not enough exceedances
    if k < min_exc:
        return {"es_loss": es_emp, "var_loss": var_emp, "u": u, "k": float(k), "method": "empirical"}

    p_u = k / n  # P(L > u)

    # If alpha is not deeper than u's exceedance probability, POT extrapolation isn't needed/valid
    # Need alpha < p_u for VaR beyond u.
    if not (alpha < p_u):
        return {"es_loss": es_emp, "var_loss": var_emp, "u": u, "k": float(k), "method": "empirical"}

    xi, beta = gpd_mom_fit(exc, xi_clip=xi_clip)

    # POT VaR for losses where P(L > VaR)=alpha
    # VaR = u + (beta/xi)*((alpha/p_u)^(-xi) - 1) ; limit xi->0 uses log
    if abs(xi) < 1e-8:
        var_pot = u + beta * math.log(p_u / alpha)
    else:
        var_pot = u + (beta / xi) * (((alpha / p_u) ** (-xi)) - 1.0)

    # POT ES for losses (finite if xi < 1)
    if xi >= 0.999:
        return {"es_loss": es_emp, "var_loss": var_emp, "u": u, "k": float(k), "method": "empirical"}

    yq = var_pot - u
    es_pot = var_pot + (beta + xi * yq) / (1.0 - xi)

    # Guardrails
    if not np.isfinite(es_pot) or es_pot <= 0:
        return {"es_loss": es_emp, "var_loss": var_emp, "u": u, "k": float(k), "method": "empirical"}

    return {"es_loss": float(es_pot), "var_loss": float(var_pot), "u": u, "k": float(k), "method": "pot"}


# ----------------------------- Portfolio mechanics -----------------------------
def shrink_corr(corr: np.ndarray, shrink: float) -> np.ndarray:
    n = corr.shape[0]
    s = float(np.clip(shrink, 0.0, 1.0))
    out = (1.0 - s) * corr + s * np.eye(n)
    np.fill_diagonal(out, 1.0)
    return out


def corr_scale_for_weights(w: np.ndarray, corr: np.ndarray) -> float:
    """
    Scalar diagnostic: 1/sqrt(w' C w) where C is correlation.
    NOT applied as leverage here; just reported as leverage_hint.
    """
    v = float(w @ corr @ w)
    v = max(v, 1e-12)
    return 1.0 / math.sqrt(v)


def clip_and_renorm_longonly(w: np.ndarray, lo: float, hi: float) -> np.ndarray:
    w = np.clip(w, lo, hi)
    s = float(w.sum())
    if s <= 0:
        return np.ones_like(w) / len(w)
    return w / s


def perf_stats(r: np.ndarray) -> Dict[str, float]:
    r = r[np.isfinite(r)]
    if r.size == 0:
        return {"ann_ret": float("nan"), "ann_vol": float("nan"), "sharpe": float("nan"), "max_dd": float("nan")}
    ann_ret = float(r.mean() * 252.0)
    ann_vol = float(r.std(ddof=1) * math.sqrt(252.0))
    sharpe = float(ann_ret / ann_vol) if ann_vol > 0 else float("nan")
    eq = np.cumprod(1.0 + r)
    peak = np.maximum.accumulate(eq)
    dd = (eq / peak) - 1.0
    max_dd = float(dd.min()) if dd.size else float("nan")
    return {"ann_ret": ann_ret, "ann_vol": ann_vol, "sharpe": sharpe, "max_dd": max_dd}


def trading_rebalance_dates(index: pd.DatetimeIndex, freq: str) -> pd.DatetimeIndex:
    """
    Build rebalance dates that are ALWAYS actual dates in `index`.
    We bucket by freq and take the LAST trading day in each bucket.
    """
    if len(index) == 0:
        return pd.DatetimeIndex([])

    # groupby with Grouper creates buckets labelled by period end, but we take df.index[-1]
    s = pd.Series(index=index, data=np.ones(len(index)))
    # groupby on the index itself
    grouped = s.groupby(pd.Grouper(freq=freq))
    last_dates = grouped.apply(lambda x: x.index[-1] if len(x) else pd.NaT).dropna()
    return pd.DatetimeIndex(last_dates.values)


# ----------------------------- Pipeline -----------------------------
def run_pipeline(cfg: Config) -> Dict[str, Any]:
    np.random.seed(cfg.seed)

    print(f"[INFO] Downloading prices for {cfg.symbols} from {cfg.start} ...")
    prices = load_prices(cfg.symbols, cfg.start)

    # Basic cleaning
    prices = prices.replace([np.inf, -np.inf], np.nan).dropna(how="all")
    if prices.empty:
        raise RuntimeError("No prices after cleaning (check tickers/start).")

    rets = compute_returns(prices, cfg.use_log_returns)

    if cfg.dropna:
        rets = rets.dropna(how="any")
    if rets.empty:
        raise RuntimeError("No returns after cleaning (check tickers/start).")

    dates = rets.index
    n, m = rets.shape
    if n <= cfg.window + 10:
        raise RuntimeError(f"Not enough rows for window={cfg.window}. rows={n}")

    # Rebalance dates must be trading dates
    rebal_dates = trading_rebalance_dates(dates, cfg.rebalance)
    # Start once we have enough lookback
    rebal_dates = rebal_dates[rebal_dates >= dates[cfg.window]]

    if len(rebal_dates) == 0:
        raise RuntimeError("No rebalance dates found after applying window constraint.")

    print(f"[INFO] rows={n}, assets={m}, rebalances={len(rebal_dates)} ({cfg.rebalance}), window={cfg.window}")
    print(f"[INFO] EVT alpha={cfg.alpha}, q_u={cfg.q_u}, min_exc={cfg.min_exc}, tc_bps={cfg.tc_bps}")

    syms = list(cfg.symbols)
    rets_np = rets.values

    # Storage: rebalance outputs
    w_hist = pd.DataFrame(index=rebal_dates, columns=syms, dtype=float)
    lev_hint = pd.Series(index=rebal_dates, dtype=float, name="leverage_hint")

    es_hist = pd.DataFrame(index=rebal_dates, columns=[f"ESloss_{s}" for s in syms], dtype=float)
    method_hist = pd.DataFrame(index=rebal_dates, columns=[f"method_{s}" for s in syms], dtype=str)
    exc_hist = pd.DataFrame(index=rebal_dates, columns=[f"k_{s}" for s in syms], dtype=float)

    # Daily weights aligned EXACTLY to returns index
    w_daily = pd.DataFrame(index=dates, columns=syms, dtype=float)

    # Initial weights
    prev_w = np.ones(m, dtype=float) / m

    # Precompute mapping dt -> integer location in returns
    rebal_locs = dates.get_indexer(rebal_dates)
    if np.any(rebal_locs < 0):
        # should never happen since trading_rebalance_dates guarantees membership
        raise RuntimeError("Internal error: some rebalance dates not found in returns index.")

    for dt, t in zip(rebal_dates, rebal_locs):
        if t <= cfg.window:
            continue

        win = rets_np[t - cfg.window:t, :]  # (window, assets)
        losses = -win  # right tail = big losses

        es = np.full(m, np.nan, dtype=float)

        for j, s in enumerate(syms):
            res = pot_es_loss(
                losses=losses[:, j],
                alpha=cfg.alpha,
                q_u=cfg.q_u,
                min_exc=cfg.min_exc,
                xi_clip=cfg.xi_clip,
            )
            es[j] = res["es_loss"]
            es_hist.loc[dt, f"ESloss_{s}"] = res["es_loss"]
            method_hist.loc[dt, f"method_{s}"] = res["method"]
            exc_hist.loc[dt, f"k_{s}"] = res["k"]

        # Risk budgeting: w ∝ 1 / ES_loss
        es_ok = np.isfinite(es) & (es > 1e-12)
        if not np.any(es_ok):
            raw = np.ones(m, dtype=float) / m
        else:
            inv = np.zeros(m, dtype=float)
            inv[es_ok] = 1.0 / es[es_ok]
            s_inv = float(inv.sum())
            raw = inv / s_inv if s_inv > 0 else (np.ones(m, dtype=float) / m)

        # Caps/floors (long-only)
        w = clip_and_renorm_longonly(raw, cfg.min_w, cfg.max_w)

        # Corr-based diagnostic leverage hint
        if cfg.use_corr_scale:
            C = np.corrcoef(win, rowvar=False)
            C = np.nan_to_num(C, nan=0.0, posinf=0.0, neginf=0.0)
            C = shrink_corr(C, cfg.corr_shrink)
            lev_hint.loc[dt] = corr_scale_for_weights(w, C)
        else:
            lev_hint.loc[dt] = float("nan")

        # Store
        w_hist.loc[dt, :] = w
        w_daily.iloc[t, :] = w  # IMPORTANT: writes by integer location => never expands index
        prev_w = w

    # Forward-fill weights over trading days
    w_daily = w_daily.ffill()

    # Backfill leading NaNs with first computed weights
    first_valid = w_daily.dropna(how="any").index.min()
    if first_valid is None:
        raise RuntimeError("No weights computed. Try smaller window or lower min_exc or different rebalance freq.")

    w_daily.loc[:first_valid, :] = w_daily.loc[first_valid, :].values

    # Portfolio gross returns
    port_gross = np.sum(rets_np * w_daily.values, axis=1)

    # Rebalance mask (on trading dates)
    rebal_mask = np.zeros(n, dtype=bool)
    rebal_mask[rebal_locs] = True

    # Turnover and TC
    w_vals = w_daily.values
    w_prev = np.vstack([w_vals[0:1, :], w_vals[:-1, :]])
    turnover = np.sum(np.abs(w_vals - w_prev), axis=1)
    tc = (cfg.tc_bps / 1e4) * turnover * rebal_mask.astype(float)

    port_net = port_gross - tc

    # Daily output
    daily = pd.DataFrame(index=dates)
    daily["port_ret_gross"] = port_gross
    daily["port_ret_net"] = port_net
    daily["turnover"] = turnover
    daily["tc_cost"] = tc
    for j, s in enumerate(syms):
        daily[f"w_{s}"] = w_daily.iloc[:, j].values
        daily[f"ret_{s}"] = rets.iloc[:, j].values

    # Summary stats
    stats_gross = perf_stats(daily["port_ret_gross"].values)
    stats_net = perf_stats(daily["port_ret_net"].values)

    avg_turn = float(np.mean(turnover[rebal_mask])) if rebal_mask.any() else float("nan")
    med_turn = float(np.median(turnover[rebal_mask])) if rebal_mask.any() else float("nan")

    # Quick rolling empirical VaR exceptions on net returns (sanity)
    r_net = daily["port_ret_net"].values
    var_roll = np.full(n, np.nan)
    for t in range(cfg.window, n):
        winp = r_net[t - cfg.window:t]
        if np.isfinite(winp).sum() > 50:
            var_roll[t] = float(np.quantile(winp[np.isfinite(winp)], cfg.alpha))
    exc = np.isfinite(var_roll) & (r_net <= var_roll)
    denom = int(np.isfinite(var_roll).sum())
    exc_rate = float(exc.sum() / denom) if denom > 0 else float("nan")

    summary = {
        "config": asdict(cfg),
        "data_window": {"start": str(dates.min().date()), "end": str(dates.max().date()), "n_returns": int(n)},
        "rebalancing": {"freq": cfg.rebalance, "n_rebalances": int(len(rebal_dates))},
        "performance_gross": stats_gross,
        "performance_net": stats_net,
        "turnover": {"avg_rebalance_turnover_L1": avg_turn, "median_rebalance_turnover_L1": med_turn},
        "quick_var_backtest": {
            "alpha": float(cfg.alpha),
            "exception_rate_vs_empirical_rolling_var": exc_rate,
            "note": "Quick sanity check using rolling empirical VaR on portfolio net returns."
        },
        "notes": [
            "Weights are long-only and sum to 1 (no leverage).",
            "Tail-risk model uses EVT POT-GPD ES on losses per asset; falls back to empirical ES when exceedances are too few.",
            "Rebalance dates are mapped to actual trading days (prevents index expansion bugs).",
        ],
    }

    # Weights output with diagnostics bundled
    weights_out = w_hist.copy()
    weights_out["leverage_hint"] = lev_hint

    diagnostics = pd.concat([es_hist, exc_hist, method_hist], axis=1)
    weights_out_full = pd.concat([weights_out, diagnostics], axis=1)

    return {
        "daily": daily,
        "weights": weights_out_full,
        "summary": summary,
    }


def save_outputs(res: Dict[str, Any], cfg: Config) -> None:
    daily: pd.DataFrame = res["daily"]
    weights: pd.DataFrame = res["weights"]
    summary: Dict[str, Any] = res["summary"]

    os.makedirs(os.path.dirname(cfg.out_daily_csv) or ".", exist_ok=True)
    os.makedirs(os.path.dirname(cfg.out_weights_csv) or ".", exist_ok=True)
    os.makedirs(os.path.dirname(cfg.out_json) or ".", exist_ok=True)

    daily.to_csv(cfg.out_daily_csv)
    weights.to_csv(cfg.out_weights_csv)

    with open(cfg.out_json, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print(f"[OK] Saved daily → {cfg.out_daily_csv}")
    print(f"[OK] Saved weights/EVT diagnostics → {cfg.out_weights_csv}")
    print(f"[OK] Saved summary → {cfg.out_json}")

    pg = summary["performance_gross"]
    pn = summary["performance_net"]
    print(f"[PERF gross] AnnRet={pg['ann_ret']:.2%} AnnVol={pg['ann_vol']:.2%} Sharpe={pg['sharpe']:.2f} MaxDD={pg['max_dd']:.2%}")
    print(f"[PERF net  ] AnnRet={pn['ann_ret']:.2%} AnnVol={pn['ann_vol']:.2%} Sharpe={pn['sharpe']:.2f} MaxDD={pn['max_dd']:.2%}")
    t = summary["turnover"]
    print(f"[TURN] Avg L1 turnover/rebal={t['avg_rebalance_turnover_L1']:.3f}  Median={t['median_rebalance_turnover_L1']:.3f}")


# ----------------------------- CLI -----------------------------
def parse_args() -> Config:
    p = argparse.ArgumentParser(description="Level-100: EVT ES Risk Budgeting (POT-GPD) + Rebalance + TC (SciPy-free)")

    p.add_argument("--start", type=str, default=Config.start)
    p.add_argument("--symbols", nargs="+", default=list(Config.symbols))

    p.add_argument("--alpha", type=float, default=Config.alpha)
    p.add_argument("--q_u", type=float, default=Config.q_u)
    p.add_argument("--window", type=int, default=Config.window)
    p.add_argument("--min-exc", dest="min_exc", type=int, default=Config.min_exc)

    p.add_argument("--rebalance", type=str, default=Config.rebalance)
    p.add_argument("--max-w", dest="max_w", type=float, default=Config.max_w)
    p.add_argument("--min-w", dest="min_w", type=float, default=Config.min_w)

    p.add_argument("--no-corr-scale", action="store_true")
    p.add_argument("--corr-shrink", type=float, default=Config.corr_shrink)

    p.add_argument("--tc_bps", type=float, default=Config.tc_bps)

    p.add_argument("--simple-returns", action="store_true")
    p.add_argument("--no-dropna", action="store_true")
    p.add_argument("--seed", type=int, default=Config.seed)

    p.add_argument("--daily-csv", type=str, default=Config.out_daily_csv)
    p.add_argument("--weights-csv", type=str, default=Config.out_weights_csv)
    p.add_argument("--json", type=str, default=Config.out_json)

    a = p.parse_args()

    return Config(
        symbols=tuple(a.symbols),
        start=a.start,
        alpha=float(a.alpha),
        q_u=float(a.q_u),
        window=int(a.window),
        min_exc=int(a.min_exc),
        rebalance=str(a.rebalance),
        max_w=float(a.max_w),
        min_w=float(a.min_w),
        use_corr_scale=(not a.no_corr_scale),
        corr_shrink=float(a.corr_shrink),
        tc_bps=float(a.tc_bps),
        use_log_returns=(not a.simple_returns),
        dropna=(not a.no_dropna),
        seed=int(a.seed),
        out_daily_csv=a.daily_csv,
        out_weights_csv=a.weights_csv,
        out_json=a.json,
    )


def main() -> None:
    cfg = parse_args()
    res = run_pipeline(cfg)
    save_outputs(res, cfg)


if __name__ == "__main__":
    # Jupyter/PyCharm shim: drop kernel args
    import sys
    sys.argv = [sys.argv[0]] + [
        arg for arg in sys.argv[1:]
        if arg != "-f" and not (arg.endswith(".json") and "kernel" in arg)
    ]
    main()


[INFO] Downloading prices for ('SPY', 'QQQ', 'IWM', 'EFA', 'EEM', 'TLT', 'LQD', 'GLD') from 2010-01-01 ...
[INFO] rows=4025, assets=8, rebalances=146 (ME), window=1000
[INFO] EVT alpha=0.05, q_u=0.9, min_exc=40, tc_bps=1.0
[OK] Saved daily → level100_evt_es_rb_daily.csv
[OK] Saved weights/EVT diagnostics → level100_evt_es_rb_weights.csv
[OK] Saved summary → level100_evt_es_rb_summary.json
[PERF gross] AnnRet=7.42% AnnVol=9.63% Sharpe=0.77 MaxDD=-26.90%
[PERF net  ] AnnRet=7.41% AnnVol=9.63% Sharpe=0.77 MaxDD=-26.90%
[TURN] Avg L1 turnover/rebal=0.009  Median=0.006
