In [2]:
# level88_granger_network.py
# Level-88: Granger-Causality Network + Influence Ranking (free-data)
#
# Outputs:
#   - level88_gc_pvalues.csv
#   - level88_gc_adjacency.csv
#   - level88_gc_centrality.csv
#   - level88_gc_centrality_rolling_net.csv   (only with --rolling)
#   - level88_gc_summary.json
#
# Run:
#   python level88_granger_network.py
#   python level88_granger_network.py --symbols SPY QQQ IWM EFA EEM TLT LQD GLD --start 2010-01-01
#   python level88_granger_network.py --p 2 --alpha 0.05
#   python level88_granger_network.py --rolling --window 756 --step 21   (optional, slower)

import os
import json
import argparse
from dataclasses import dataclass, asdict
from typing import Tuple, Dict, List, Optional

import numpy as np
import pandas as pd
import yfinance as yf

from statsmodels.tsa.api import VAR


# ----------------------------- Config -----------------------------
@dataclass
class Config:
    """Settings for one run of the Granger-causality network pipeline.

    Every field has a default, so ``Config()`` describes the stock run;
    ``parse_args`` builds an instance from command-line flags.
    """

    # Tickers to download; each one becomes a node of the network.
    symbols: Tuple[str, ...] = ("SPY", "QQQ", "IWM", "EFA", "EEM", "TLT", "LQD", "GLD")
    # Passed to yfinance as the ``start`` of the download range.
    start: str = "2010-01-01"

    p: int = 2                  # VAR lags
    alpha: float = 0.05         # significance threshold for edges
    # The pipeline refuses to run with fewer return rows than this.
    min_obs: int = 800

    # When True, net influence is also computed over rolling windows.
    rolling: bool = False
    # Rows per rolling window.
    window: int = 756           # ~3y
    # Rows to advance between consecutive rolling windows.
    step: int = 21              # monthly

    # Handed to ``np.random.seed`` at the start of the pipeline.
    seed: int = 42

    # Destination paths for the p-value matrix, adjacency matrix,
    # centrality table and JSON summary.
    out_pvals_csv: str = "level88_gc_pvalues.csv"
    out_adj_csv: str = "level88_gc_adjacency.csv"
    out_cent_csv: str = "level88_gc_centrality.csv"
    out_json: str = "level88_gc_summary.json"


# ----------------------------- yfinance loader -----------------------------
def _safe_close_series(px: pd.DataFrame, symbol: str) -> pd.Series:
    """Pull one symbol's close-price column out of a downloaded price frame.

    Works with MultiIndex columns in either ``(field, symbol)`` or
    ``(symbol, field)`` order, and with flat single-symbol columns.
    ``"Close"`` is preferred over ``"Adj Close"`` when both exist.

    Returns:
        A copy of the matching column, renamed to ``symbol``.

    Raises:
        RuntimeError: if no Close / Adj Close column can be found.
    """

    def _extract(column_key) -> pd.Series:
        # Copy so renaming never touches the caller's frame.
        series = px[column_key].copy()
        series.name = symbol
        return series

    columns = px.columns

    if not isinstance(columns, pd.MultiIndex):
        for field in ("Close", "Adj Close"):
            if field in columns:
                return _extract(field)
        raise RuntimeError(f"'Close' missing for {symbol}. Columns={list(px.columns)}")

    # Exact keys first, in priority order.
    exact_keys = (
        ("Close", symbol),
        ("Adj Close", symbol),
        (symbol, "Close"),
        (symbol, "Adj Close"),
    )
    for key in exact_keys:
        if key in columns:
            return _extract(key)

    # Fallback: first column tuple mentioning both the symbol and a close field.
    for col in columns:
        if not isinstance(col, tuple) or symbol not in col:
            continue
        if "Close" in col or "Adj Close" in col:
            return _extract(col)

    raise RuntimeError(f"Could not locate Close/Adj Close for {symbol} in MultiIndex columns.")


def load_prices(symbols: Tuple[str, ...], start: str) -> pd.DataFrame:
    """Download prices for ``symbols`` via yfinance and align them by date.

    Args:
        symbols: tickers to request, in the column order of the result.
        start: passed to ``yf.download`` as the first date of the range.

    Returns:
        One close-price column per symbol, sorted by index, keeping only
        rows where every symbol has a value.

    Raises:
        RuntimeError: if yfinance returns nothing, or a symbol's close
            column cannot be located.
    """
    download_options = dict(
        start=start,
        auto_adjust=True,
        progress=False,
        group_by="column",
        threads=True,
    )
    raw = yf.download(list(symbols), **download_options)
    if raw is None or raw.empty:
        raise RuntimeError("No price data returned from yfinance.")

    per_symbol = [_safe_close_series(raw, ticker) for ticker in symbols]
    aligned = pd.concat(per_symbol, axis=1).sort_index()
    # Keep only dates on which every symbol has a price.
    return aligned.dropna(how="any")


def compute_log_returns(prices: pd.DataFrame) -> pd.DataFrame:
    """Turn a price frame into row-over-row log returns.

    The first row (which has no predecessor) and any row containing a NaN
    or an infinite value in any column are removed.
    """
    log_changes = np.log(prices).diff()
    # Treat +/-inf like missing data so one pass drops every unusable row.
    cleaned = log_changes.replace([np.inf, -np.inf], np.nan)
    return cleaned.dropna(how="any")


# ----------------------------- Centrality (no external deps) -----------------------------
def pagerank_from_adjacency(adj: np.ndarray, damping: float = 0.85, iters: int = 200, tol: float = 1e-10) -> np.ndarray:
    """
    PageRank by power iteration on a directed adjacency matrix.

    ``adj[i, j]`` is the weight of edge i -> j. Each row is normalised so a
    node spreads its score over its outgoing edges; a row whose total is not
    positive (a dangling node) spreads uniformly over all nodes. The input
    array is not modified.

    Iteration stops after ``iters`` steps, or earlier once the largest
    absolute change between successive rank vectors falls below ``tol``.
    """
    n = adj.shape[0]
    transition = adj.astype(float)  # astype returns a fresh float copy
    row_totals = transition.sum(axis=1)
    has_out = row_totals > 0

    # Rows with outgoing weight become probability rows ...
    transition[has_out] /= row_totals[has_out, None]
    # ... and dangling rows jump anywhere with equal probability.
    if n:
        transition[~has_out] = 1.0 / n

    teleport = np.ones(n) / n
    rank = np.ones(n) / n
    jump_term = (1.0 - damping) * teleport

    for _ in range(iters):
        updated = damping * (transition.T @ rank) + jump_term
        converged = np.max(np.abs(updated - rank)) < tol
        rank = updated
        if converged:
            break
    return rank


# ----------------------------- Pairwise GC matrix -----------------------------
def granger_matrix(rets: pd.DataFrame, p: int) -> pd.DataFrame:
    """Fit one VAR(p) on all columns and collect pairwise Granger p-values.

    Args:
        rets: return series, one column per asset.
        p: VAR lag order; must be at least 1.

    Returns:
        Square DataFrame indexed and labelled by the columns of ``rets``.
        Entry ``[i, j]`` is the F-test p-value for "i Granger-causes j".
        The diagonal is NaN, as is any pair whose test failed.

    Raises:
        ValueError: if ``p`` is smaller than 1.
        RuntimeError: if ``rets`` has fewer than ``10 * p`` rows.
    """
    import warnings  # local import keeps this block self-contained

    # A non-positive lag order would slip past the length check below
    # (10 * p <= 0) and produce a meaningless all-NaN matrix.
    if p < 1:
        raise ValueError(f"VAR lag order p must be >= 1, got {p}.")
    if len(rets) < 10 * p:
        raise RuntimeError("Too few observations for VAR / Granger tests.")

    res = VAR(rets).fit(p)

    syms = list(rets.columns)
    n = len(syms)
    # Start from NaN: the diagonal and failed tests simply stay NaN.
    pvals = np.full((n, n), np.nan, dtype=float)

    # pvals[i, j] = p-value for i -> j (i causes j)
    for i, src in enumerate(syms):
        for j, tgt in enumerate(syms):
            if i == j:
                continue
            try:
                test = res.test_causality(caused=tgt, causing=[src], kind="f")
                pvals[i, j] = float(test.pvalue)
            except Exception as exc:
                # Best effort: one failed pair must not abort the whole
                # matrix, but the failure should be visible to the user.
                warnings.warn(
                    f"Granger test {src} -> {tgt} failed ({exc!r}); storing NaN.",
                    RuntimeWarning,
                    stacklevel=2,
                )

    return pd.DataFrame(pvals, index=syms, columns=syms)


def compute_centrality(pvals: pd.DataFrame, alpha: float) -> Dict[str, pd.DataFrame]:
    """Threshold a p-value matrix into a directed graph and score its nodes.

    An edge i -> j exists when ``pvals[i, j] < alpha``; NaN entries never
    produce an edge and the diagonal is forced to zero.

    Returns:
        A dict with two frames:
        ``"adjacency"`` - the 0/1 edge matrix, labelled like ``pvals``;
        ``"centrality"`` - per-node ``out_degree``, ``in_degree``,
        ``net_influence`` (out minus in) and ``pagerank``, sorted by
        ``net_influence`` then ``pagerank``, both descending.

    NOTE(review): PageRank is run on the i -> j edges, so score flows
    toward nodes with incoming edges; confirm that this is the intended
    tie-breaker for an influence ranking.
    """
    labels = list(pvals.index)

    edges = (pvals.values < alpha).astype(int)
    np.fill_diagonal(edges, 0)

    emitted = edges.sum(axis=1)   # row sums: edges leaving each node
    received = edges.sum(axis=0)  # column sums: edges entering each node

    scores = pd.DataFrame(
        {
            "out_degree": emitted,
            "in_degree": received,
            "net_influence": emitted - received,
            "pagerank": pagerank_from_adjacency(edges),
        },
        index=labels,
    )
    ranked = scores.sort_values(["net_influence", "pagerank"], ascending=[False, False])
    adjacency = pd.DataFrame(edges, index=labels, columns=labels)

    return {"adjacency": adjacency, "centrality": ranked}


# ----------------------------- Rolling total influence (optional) -----------------------------
def rolling_net_influence(rets: pd.DataFrame, cfg: Config) -> pd.DataFrame:
    """Net influence (out-degree minus in-degree) over rolling windows.

    Windows are ``cfg.window`` rows long and start every ``cfg.step`` rows.
    Each window gets its own Granger p-value matrix, thresholded at
    ``cfg.alpha``.

    Returns:
        One row per window, indexed by the window's last timestamp, with
        one float column per asset.
    """
    asset_names = list(rets.columns)
    stamps = []
    net_rows = []

    last_start = len(rets) - cfg.window
    for lo in range(0, last_start + 1, cfg.step):
        hi = lo + cfg.window
        window_pvals = granger_matrix(rets.iloc[lo:hi], cfg.p)

        edges = (window_pvals.values < cfg.alpha).astype(int)
        np.fill_diagonal(edges, 0)
        net = edges.sum(axis=1) - edges.sum(axis=0)

        # Label the window by its final observation.
        stamps.append(rets.index[hi - 1])
        net_rows.append(net.astype(float))

    return pd.DataFrame(net_rows, index=pd.DatetimeIndex(stamps), columns=asset_names)


# ----------------------------- Pipeline -----------------------------
def run_pipeline(cfg: Config) -> Dict[str, object]:
    """Run the full analysis: download, returns, Granger matrix, centrality.

    Returns:
        A dict with keys ``"returns"``, ``"pvalues"``, ``"adjacency"``,
        ``"centrality"``, ``"rolling_net"`` (``None`` unless
        ``cfg.rolling``) and ``"summary"`` (a JSON-serialisable dict).

    Raises:
        RuntimeError: if fewer than ``cfg.min_obs`` return rows are available.
    """
    np.random.seed(cfg.seed)

    print(f"[INFO] Downloading prices for {cfg.symbols} from {cfg.start} ...")
    prices = load_prices(cfg.symbols, cfg.start)
    rets = compute_log_returns(prices)

    n_rets = len(rets)
    if n_rets < cfg.min_obs:
        raise RuntimeError(f"Not enough observations: {n_rets} < min_obs={cfg.min_obs}")

    n_assets = rets.shape[1]
    print(f"[INFO] Got {len(prices)} price rows, {n_rets} return rows, assets={n_assets}")

    pvals = granger_matrix(rets, cfg.p)
    network = compute_centrality(pvals, cfg.alpha)
    adjacency = network["adjacency"]
    centrality = network["centrality"]

    roll = None
    if cfg.rolling:
        print(f"[INFO] Rolling influence: window={cfg.window}, step={cfg.step} ...")
        roll = rolling_net_influence(rets, cfg)

    data_window = {
        "start": str(rets.index.min().date()),
        "end": str(rets.index.max().date()),
        "n_returns": int(n_rets),
        "assets": int(n_assets),
    }
    # Key order is kept stable because it is the order written to the JSON file.
    summary = {
        "config": asdict(cfg),
        "data_window": data_window,
        "var": {"lags": int(cfg.p)},
        "alpha_edge_threshold": float(cfg.alpha),
        "top_emitters_by_net_influence": centrality.head(10).index.tolist(),
    }

    return {
        "returns": rets,
        "pvalues": pvals,
        "adjacency": adjacency,
        "centrality": centrality,
        "rolling_net": roll,
        "summary": summary,
    }


def save_outputs(result: Dict[str, object], cfg: Config) -> None:
    """Write the pipeline results to disk and print a short ranking.

    Writes the p-value, adjacency and centrality CSVs and the JSON summary
    to the paths in ``cfg``. When ``result["rolling_net"]`` is present it is
    written next to the centrality CSV as ``<stem>_rolling_net<ext>``, and
    that path is recorded in the summary under ``"rolling_net_csv"`` (this
    mutates ``result["summary"]``).

    Args:
        result: dict produced by ``run_pipeline``.
        cfg: supplies the output paths.
    """
    pvals: pd.DataFrame = result["pvalues"]  # type: ignore
    adj: pd.DataFrame = result["adjacency"]  # type: ignore
    cent: pd.DataFrame = result["centrality"]  # type: ignore
    summary: Dict = result["summary"]  # type: ignore
    roll = result.get("rolling_net", None)

    # Make sure every destination directory exists before writing anything.
    for target in (cfg.out_pvals_csv, cfg.out_adj_csv, cfg.out_cent_csv, cfg.out_json):
        os.makedirs(os.path.dirname(target) or ".", exist_ok=True)

    pvals.to_csv(cfg.out_pvals_csv)
    adj.to_csv(cfg.out_adj_csv)
    cent.to_csv(cfg.out_cent_csv)

    if roll is not None:
        # Split off the extension instead of str.replace(".csv", ...): replace
        # is a no-op when the name has no ".csv" (the rolling file would then
        # overwrite the centrality CSV) and it also rewrites ".csv" occurring
        # inside directory names.
        stem, ext = os.path.splitext(cfg.out_cent_csv)
        roll_path = f"{stem}_rolling_net{ext or '.csv'}"
        roll.to_csv(roll_path)
        summary["rolling_net_csv"] = roll_path
        print(f"[OK] Saved rolling net influence → {roll_path}")

    with open(cfg.out_json, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print(f"[OK] Saved p-values   → {cfg.out_pvals_csv}")
    print(f"[OK] Saved adjacency  → {cfg.out_adj_csv}")
    print(f"[OK] Saved centrality → {cfg.out_cent_csv}")
    print(f"[OK] Saved summary    → {cfg.out_json}")

    print("[TOP] Net influence (out - in) & PageRank:")
    # head() already caps at the number of available rows.
    for sym, r in cent.head(10).iterrows():
        print(
            f"  {sym:>5s}  net={int(r['net_influence']):>3d}  "
            f"out={int(r['out_degree']):>3d}  in={int(r['in_degree']):>3d}  "
            f"pagerank={r['pagerank']:.4f}"
        )


# ----------------------------- CLI -----------------------------
def parse_args(argv: Optional[List[str]] = None) -> Config:
    """Build a ``Config`` from command-line flags.

    Flag defaults are read from ``Config()`` so the CLI and the dataclass
    cannot drift apart.

    Args:
        argv: argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, exactly as before.

    Returns:
        The populated ``Config``.

    Raises:
        SystemExit: raised by argparse on unknown flags, or when ``--p`` is
            not positive, or when ``--rolling`` is combined with a
            non-positive ``--window`` / ``--step``.
    """
    defaults = Config()
    parser = argparse.ArgumentParser(description="Level-88: Granger-Causality Network + Influence Ranking")

    parser.add_argument("--start", type=str, default=defaults.start)
    parser.add_argument("--symbols", nargs="+", default=list(defaults.symbols))

    parser.add_argument("--p", type=int, default=defaults.p)
    parser.add_argument("--alpha", type=float, default=defaults.alpha)
    parser.add_argument("--min-obs", type=int, default=defaults.min_obs)

    parser.add_argument("--rolling", action="store_true")
    parser.add_argument("--window", type=int, default=defaults.window)
    parser.add_argument("--step", type=int, default=defaults.step)

    parser.add_argument("--seed", type=int, default=defaults.seed)

    parser.add_argument("--pvals-csv", type=str, default=defaults.out_pvals_csv)
    parser.add_argument("--adj-csv", type=str, default=defaults.out_adj_csv)
    parser.add_argument("--cent-csv", type=str, default=defaults.out_cent_csv)
    parser.add_argument("--json", type=str, default=defaults.out_json)

    args = parser.parse_args(argv)

    # Reject values that would otherwise fail late or yield empty results:
    # a zero step crashes range(), and a non-positive lag order cannot
    # describe a VAR with lagged terms.
    if args.p < 1:
        parser.error(f"--p must be a positive integer (got {args.p})")
    if args.rolling:
        for flag, value in (("--window", args.window), ("--step", args.step)):
            if value < 1:
                parser.error(f"{flag} must be a positive integer (got {value})")

    return Config(
        symbols=tuple(args.symbols),
        start=args.start,
        p=int(args.p),
        alpha=float(args.alpha),
        min_obs=int(args.min_obs),
        rolling=bool(args.rolling),
        window=int(args.window),
        step=int(args.step),
        seed=int(args.seed),
        out_pvals_csv=args.pvals_csv,
        out_adj_csv=args.adj_csv,
        out_cent_csv=args.cent_csv,
        out_json=args.json,
    )


def main() -> None:
    """Script entry point: parse flags, run the analysis, write the outputs."""
    settings = parse_args()
    save_outputs(run_pipeline(settings), settings)


if __name__ == "__main__":
    # Jupyter/PyCharm shim: notebook kernels launch with "-f <kernel>.json",
    # which argparse would reject. Drop "-f" and any argument that ends in
    # ".json" and mentions "kernel" before parsing.
    import sys

    sys.argv = [
        sys.argv[0],
        *(
            token for token in sys.argv[1:]
            if not (token == "-f" or (token.endswith(".json") and "kernel" in token))
        ),
    ]
    main()


[INFO] Downloading prices for ('SPY', 'QQQ', 'IWM', 'EFA', 'EEM', 'TLT', 'LQD', 'GLD') from 2010-01-01 ...
[INFO] Got 4021 price rows, 4020 return rows, assets=8
[OK] Saved p-values   → level88_gc_pvalues.csv
[OK] Saved adjacency  → level88_gc_adjacency.csv
[OK] Saved centrality → level88_gc_centrality.csv
[OK] Saved summary    → level88_gc_summary.json
[TOP] Net influence (out - in) & PageRank:
    LQD  net=  5  out=  7  in=  2  pagerank=0.1846
    TLT  net=  3  out=  4  in=  1  pagerank=0.0826
    SPY  net=  2  out=  4  in=  2  pagerank=0.1001
    GLD  net= -2  out=  0  in=  2  pagerank=0.1858
    IWM  net= -2  out=  1  in=  3  pagerank=0.1214
    EEM  net= -2  out=  1  in=  3  pagerank=0.1214
    QQQ  net= -2  out=  0  in=  2  pagerank=0.1039
    EFA  net= -2  out=  0  in=  2  pagerank=0.1001


  self._init_dates(dates, freq)
