In [1]:
# level78_copula_scenarios.py
# Multi-Asset Gaussian Copula Scenario Engine
# -------------------------------------------
# - Downloads daily prices via yfinance
# - Builds empirical daily return distribution for each asset
# - Calibrates a Gaussian copula on the joint returns
# - Simulates joint scenarios:
#       * Draw correlated normals ~ N(0, Corr)
#       * Map to uniforms via Φ (normal CDF)
#       * Map uniforms to empirical quantiles for each asset (non-parametric marginals)
# - Builds portfolio equity paths from simulated returns
# - Outputs:
#       * CSV: portfolio equity paths
#       * CSV: per-path risk stats
#       * JSON: overall summary
#
# Example
#   python level78_copula_scenarios.py \
#       --symbols SPY,QQQ,IWM,EFA,EEM,TLT,LQD,GLD \
#       --start 2010-01-01 \
#       --horizon-days 252 \
#       --n-paths 1000

import os
import json
import math
import argparse
from dataclasses import dataclass, asdict
from typing import Sequence, Tuple

import numpy as np
import pandas as pd
import yfinance as yf


# ----------------------------- Config -----------------------------

@dataclass
class Config:
    symbols: Tuple[str, ...] = (
        "SPY", "QQQ", "IWM", "EFA", "EEM", "TLT", "LQD", "GLD"
    )
    start: str = "2010-01-01"
    horizon_days: int = 252
    n_paths: int = 1000
    out_paths_csv: str = "level78_copula_paths.csv"
    out_stats_csv: str = "level78_copula_stats.csv"
    out_summary_json: str = "level78_copula_summary.json"
    seed: int = 42
    corr_shrink: float = 0.05  # small diagonal shrink for robustness


# ----------------------------- Helpers -----------------------------

def parse_symbol_string(s: str) -> Tuple[str, ...]:
    parts = [p.strip().upper() for p in s.split(",") if p.strip()]
    if not parts:
        raise ValueError("No symbols parsed from --symbols.")
    return tuple(parts)


def load_prices(symbols: Sequence[str], start: str) -> pd.DataFrame:
    """
    Download adjusted close prices for given symbols from yfinance.
    Returns a DataFrame with Date index and one column per symbol.
    """
    data = yf.download(list(symbols), start=start, auto_adjust=True, progress=False)
    if data.empty:
        raise RuntimeError("No price data returned from yfinance.")

    # Handle possible MultiIndex columns (e.g. ('Adj Close','SPY'), ...)
    if isinstance(data.columns, pd.MultiIndex):
        cols0 = data.columns.get_level_values(0)
        if "Adj Close" in cols0:
            px = data["Adj Close"]
        elif "Close" in cols0:
            px = data["Close"]
        else:
            raise RuntimeError("Could not find 'Adj Close' or 'Close' in yfinance data.")
    else:
        if "Adj Close" in data.columns:
            px = data["Adj Close"]
        elif "Close" in data.columns:
            px = data["Close"]
        else:
            raise RuntimeError("Could not find 'Adj Close' or 'Close' in yfinance data.")

    # Ensure all requested symbols present
    missing = [s for s in symbols if s not in px.columns]
    if missing:
        raise RuntimeError(f"Missing symbols in downloaded prices: {missing}")

    px = px[list(symbols)].dropna(how="all")
    px = px.ffill().dropna()
    return px


def compute_returns(prices: pd.DataFrame) -> pd.DataFrame:
    """
    Log returns of adjusted close prices.
    """
    rets = np.log(prices / prices.shift(1))
    rets = rets.replace([np.inf, -np.inf], np.nan).dropna(how="all")
    rets = rets.dropna()
    return rets


def normal_cdf(x: np.ndarray) -> np.ndarray:
    """
    Vectorized approximation of Φ(x), the standard normal CDF.
    Abramowitz-Stegun style rational approximation, good enough for Monte Carlo.
    """
    x = np.asarray(x, dtype=float)
    sign = np.sign(x)
    x_abs = np.abs(x) / math.sqrt(2.0)

    # erf approximation
    t = 1.0 / (1.0 + 0.3275911 * x_abs)
    # Coefficients
    a1 = 0.254829592
    a2 = -0.284496736
    a3 = 1.421413741
    a4 = -1.453152027
    a5 = 1.061405429
    erf_approx = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * np.exp(-x_abs * x_abs)

    erf_val = sign * erf_approx
    cdf = 0.5 * (1.0 + erf_val)
    return cdf


def ensure_pos_def(corr: np.ndarray, shrink: float = 0.0) -> np.ndarray:
    """
    Ensure correlation matrix is positive definite via diagonal shrinkage if needed.
    """
    corr = np.asarray(corr, dtype=float)
    n = corr.shape[0]
    # Symmetrize
    corr = 0.5 * (corr + corr.T)

    # Apply small shrink toward identity
    if shrink > 0.0:
        corr = (1.0 - shrink) * corr + shrink * np.eye(n)

    # Try Cholesky; if fails, keep increasing shrink.
    eps = 1e-8
    for _ in range(8):
        try:
            np.linalg.cholesky(corr)
            return corr
        except np.linalg.LinAlgError:
            corr = corr + eps * np.eye(n)
            eps *= 10.0

    raise RuntimeError("Failed to obtain positive-definite correlation matrix.")


def empirical_inv_cdf(sample: np.ndarray, u: np.ndarray) -> np.ndarray:
    """
    Empirical inverse CDF: map uniform(0,1) draws u to sample quantiles.
    sample: 1D array of historical returns for one asset.
    u: array of uniforms in [0,1], any shape.
    """
    sample = np.asarray(sample, dtype=float)
    u = np.asarray(u, dtype=float)
    # clip to avoid edge issues
    u_clipped = np.clip(u, 1e-6, 1.0 - 1e-6)
    flat_u = u_clipped.ravel()
    q = np.quantile(sample, flat_u)
    return q.reshape(u.shape)


def max_drawdown(equity: np.ndarray) -> float:
    """
    Max drawdown (negative number) given equity curve.
    """
    equity = np.asarray(equity, dtype=float)
    if equity.size == 0:
        return float("nan")
    roll_max = np.maximum.accumulate(equity)
    dd = equity / roll_max - 1.0
    return float(dd.min())


# ------------------- Copula Scenario Engine -----------------------

def simulate_copula_paths(
    hist_rets: pd.DataFrame,
    horizon_days: int,
    n_paths: int,
    corr_shrink: float,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Simulate multi-asset return paths using a Gaussian copula with empirical marginals.

    hist_rets : T x N DataFrame of daily returns
    horizon_days : number of days to simulate per path
    n_paths : number of Monte Carlo paths
    corr_shrink : diagonal shrinkage to stabilize correlation matrix

    Returns:
        asset_ret_paths: array (horizon_days, n_paths, N) of simulated daily returns
        port_ret_paths:  array (horizon_days, n_paths) for an equal-weight portfolio
    """
    r = hist_rets.values   # T x N
    T_hist, N = r.shape

    # Correlation matrix of historical returns
    corr = np.corrcoef(r, rowvar=False)
    corr_pd = ensure_pos_def(corr, shrink=corr_shrink)
    chol = np.linalg.cholesky(corr_pd)  # N x N

    # Draw independent normals and apply correlation
    H = horizon_days
    P = n_paths
    Z_iid = np.random.normal(size=(H * P, N))
    Z_corr_flat = Z_iid @ chol.T  # (H*P) x N
    Z_corr = Z_corr_flat.reshape(H, P, N)

    # Convert to uniforms via Φ
    U = normal_cdf(Z_corr)

    # Map uniforms to empirical marginals asset by asset
    asset_ret_paths = np.empty_like(U)
    for j in range(N):
        asset_ret_paths[:, :, j] = empirical_inv_cdf(r[:, j], U[:, :, j])

    # Equal-weight portfolio returns
    w = np.full(N, 1.0 / N)
    port_ret_paths = np.tensordot(asset_ret_paths, w, axes=([2], [0]))  # (H, P)

    return asset_ret_paths, port_ret_paths


def build_portfolio_equity(port_ret_paths: np.ndarray) -> np.ndarray:
    """
    Convert portfolio daily returns into equity curves.
    port_ret_paths: (H, P)
    Returns:
        equity_paths: (H, P)
    """
    H, P = port_ret_paths.shape
    equity = np.empty((H, P), dtype=float)
    # Start with 1.0 and compound
    equity[0, :] = 1.0 * (1.0 + port_ret_paths[0, :])
    for t in range(1, H):
        equity[t, :] = equity[t - 1, :] * (1.0 + port_ret_paths[t, :])
    return equity


def compute_path_stats(
    equity_paths: np.ndarray,
    port_ret_paths: np.ndarray,
    horizon_days: int,
) -> pd.DataFrame:
    """
    Per-path stats: final wealth, ann_ret, ann_vol, Sharpe, max_drawdown.
    """
    H, P = equity_paths.shape
    stats = {
        "final_wealth": np.empty(P, dtype=float),
        "ann_ret": np.empty(P, dtype=float),
        "ann_vol": np.empty(P, dtype=float),
        "sharpe": np.empty(P, dtype=float),
        "max_drawdown": np.empty(P, dtype=float),
    }

    for i in range(P):
        eq = equity_paths[:, i]
        rets = port_ret_paths[:, i]
        final_w = float(eq[-1])

        # Annualized metrics (assuming 252 trading days/year)
        if H > 0:
            ann_ret = final_w ** (252.0 / H) - 1.0
        else:
            ann_ret = float("nan")

        if H > 1:
            vol = float(np.std(rets, ddof=1)) * math.sqrt(252.0)
        else:
            vol = float("nan")

        if vol > 0 and math.isfinite(ann_ret):
            sharpe = ann_ret / vol
        else:
            sharpe = float("nan")

        mdd = max_drawdown(eq)

        stats["final_wealth"][i] = final_w
        stats["ann_ret"][i] = float(ann_ret)
        stats["ann_vol"][i] = float(vol)
        stats["sharpe"][i] = float(sharpe)
        stats["max_drawdown"][i] = float(mdd)

    idx = [f"path_{i}" for i in range(P)]
    stats_df = pd.DataFrame(stats, index=idx)
    return stats_df


# ------------------------- I/O + Orchestration --------------------

def save_outputs(
    equity_paths: np.ndarray,
    stats_df: pd.DataFrame,
    cfg: Config,
) -> None:
    os.makedirs(os.path.dirname(cfg.out_paths_csv) or ".", exist_ok=True)
    os.makedirs(os.path.dirname(cfg.out_stats_csv) or ".", exist_ok=True)
    os.makedirs(os.path.dirname(cfg.out_summary_json) or ".", exist_ok=True)

    H, P = equity_paths.shape

    # Equity paths
    paths_index = pd.RangeIndex(start=1, stop=H + 1, step=1, name="t")
    paths_cols = [f"path_{i}" for i in range(P)]
    paths_df = pd.DataFrame(equity_paths, index=paths_index, columns=paths_cols)
    paths_df.to_csv(cfg.out_paths_csv)

    # Stats
    stats_df.to_csv(cfg.out_stats_csv)

    # Summary JSON
    fw = stats_df["final_wealth"].values
    mdd = stats_df["max_drawdown"].values
    shr = stats_df["sharpe"].values

    summary = {
        "config": asdict(cfg),
        "n_paths": P,
        "horizon_days": cfg.horizon_days,
        "final_wealth": {
            "mean": float(np.mean(fw)),
            "median": float(np.median(fw)),
            "p05": float(np.percentile(fw, 5)),
            "p25": float(np.percentile(fw, 25)),
            "p75": float(np.percentile(fw, 75)),
            "p95": float(np.percentile(fw, 95)),
        },
        "max_drawdown": {
            "mean": float(np.mean(mdd)),
            "median": float(np.median(mdd)),
            "p05": float(np.percentile(mdd, 5)),
            "p25": float(np.percentile(mdd, 25)),
            "p75": float(np.percentile(mdd, 75)),
            "p95": float(np.percentile(mdd, 95)),
        },
        "sharpe": {
            "mean": float(np.nanmean(shr)),
            "median": float(np.nanmedian(shr)),
        },
    }

    with open(cfg.out_summary_json, "w") as f:
        json.dump(summary, f, indent=2)

    print(f"[OK] Saved equity paths -> {cfg.out_paths_csv}")
    print(f"[OK] Saved path stats -> {cfg.out_stats_csv}")
    print(f"[OK] Saved summary -> {cfg.out_summary_json}")
    print(
        "Final wealth median={:.3f}, p05={:.3f}, p95={:.3f}".format(
            summary["final_wealth"]["median"],
            summary["final_wealth"]["p05"],
            summary["final_wealth"]["p95"],
        )
    )
    print(
        "MaxDD median={:.1f}%, mean={:.1f}%".format(
            100.0 * summary["max_drawdown"]["median"],
            100.0 * summary["max_drawdown"]["mean"],
        )
    )
    print(
        "Sharpe mean={:.2f}, median={:.2f}".format(
            summary["sharpe"]["mean"],
            summary["sharpe"]["median"],
        )
    )


# ----------------------------- CLI -----------------------------

def parse_args() -> Config:
    p = argparse.ArgumentParser(description="Level-78: Gaussian Copula Scenario Engine")
    p.add_argument(
        "--symbols",
        type=str,
        default="SPY,QQQ,IWM,EFA,EEM,TLT,LQD,GLD",
        help="Comma-separated list of tickers.",
    )
    p.add_argument("--start", type=str, default="2010-01-01")
    p.add_argument("--horizon-days", type=int, default=252)
    p.add_argument("--n-paths", type=int, default=1000)
    p.add_argument("--paths-csv", type=str, default="level78_copula_paths.csv")
    p.add_argument("--stats-csv", type=str, default="level78_copula_stats.csv")
    p.add_argument("--summary-json", type=str, default="level78_copula_summary.json")
    p.add_argument("--seed", type=int, default=42)
    p.add_argument("--corr-shrink", type=float, default=0.05)

    a = p.parse_args()
    symbols = parse_symbol_string(a.symbols)
    return Config(
        symbols=symbols,
        start=a.start,
        horizon_days=a.horizon_days,
        n_paths=a.n_paths,
        out_paths_csv=a.paths_csv,
        out_stats_csv=a.stats_csv,
        out_summary_json=a.summary_json,
        seed=a.seed,
        corr_shrink=a.corr_shrink,
    )


def run_pipeline(cfg: Config) -> None:
    np.random.seed(cfg.seed)

    print(f"[INFO] Downloading prices for {cfg.symbols} from {cfg.start} ...")
    prices = load_prices(cfg.symbols, cfg.start)
    rets = compute_returns(prices)
    print(
        f"[INFO] Got {len(prices)} price rows, {len(rets)} return rows, "
        f"{rets.shape[1]} assets."
    )

    asset_ret_paths, port_ret_paths = simulate_copula_paths(
        hist_rets=rets,
        horizon_days=cfg.horizon_days,
        n_paths=cfg.n_paths,
        corr_shrink=cfg.corr_shrink,
    )

    equity_paths = build_portfolio_equity(port_ret_paths)
    stats_df = compute_path_stats(
        equity_paths=equity_paths,
        port_ret_paths=port_ret_paths,
        horizon_days=cfg.horizon_days,
    )

    save_outputs(equity_paths, stats_df, cfg)


def main() -> None:
    cfg = parse_args()
    run_pipeline(cfg)


if __name__ == "__main__":
    # Jupyter/PyCharm shim: strip kernel-related args
    import sys

    sys.argv = [sys.argv[0]] + [
        arg
        for arg in sys.argv[1:]
        if arg != "-f" and not (arg.endswith(".json") and "kernel" in arg)
    ]
    main()


[INFO] Downloading prices for ('SPY', 'QQQ', 'IWM', 'EFA', 'EEM', 'TLT', 'LQD', 'GLD') from 2010-01-01 ...
[INFO] Got 4013 price rows, 4012 return rows, 8 assets.
[OK] Saved equity paths -> level78_copula_paths.csv
[OK] Saved path stats -> level78_copula_stats.csv
[OK] Saved summary -> level78_copula_summary.json
Final wealth median=1.067, p05=0.903, p95=1.295
MaxDD median=-9.6%, mean=-10.3%
Sharpe mean=0.70, median=0.61
