# In [1]:
#!/usr/bin/env python
"""
Level-46 — Feature Stationarity Checker & Demeaning Pipeline

- Loads time-series features (default: derived from SPY daily prices via yfinance)
- Runs ADF + KPSS tests for each feature
- Classifies features as stationary / non-stationary / borderline
- Applies simple transformations (diff/logdiff + z-score) to improve stationarity
- Saves:
    * level46_features_raw.csv
    * level46_features_transformed.csv
    * level46_stationarity_report.csv
    * level46_stationarity_report.json
"""

import argparse
import json
import sys
from dataclasses import dataclass, asdict
from typing import Dict, Tuple, List

import numpy as np
import pandas as pd

import yfinance as yf
from statsmodels.tsa.stattools import adfuller, kpss


# ---------------------- Config ---------------------- #

@dataclass
class Config:
    """Settings for one pipeline run: data source, test thresholds, and output paths."""

    # Ticker symbol handed to yfinance when downloading prices.
    symbol: str = "SPY"
    # Start date handed to yfinance for the download window.
    start: str = "2010-01-01"
    # ADF p-values strictly below this threshold count as a rejection.
    adf_alpha: float = 0.05
    # KPSS p-values strictly below this threshold count as a rejection.
    kpss_alpha: float = 0.05
    # Destination of the untransformed numeric feature matrix.
    out_raw_csv: str = "level46_features_raw.csv"
    # Destination of the transformed feature matrix.
    out_transformed_csv: str = "level46_features_transformed.csv"
    # Destination of the per-feature test report (CSV form).
    out_report_csv: str = "level46_stationarity_report.csv"
    # Destination of the per-feature test report (JSON records form).
    out_report_json: str = "level46_stationarity_report.json"
    # NOTE(review): no transform code visible in this file reads this value — confirm it is still needed.
    max_diff_order: int = 1  # we keep it simple: at most 1 difference
    # When True, rows containing any NaN are dropped from the transformed matrix.
    dropna_after_transform: bool = True


# ---------------------- Helpers ---------------------- #

def load_price_data(cfg: Config) -> pd.DataFrame:
    """
    Load daily price data for cfg.symbol from yfinance and build a basic feature matrix.

    You can replace this with your own feature CSV, e.g.:

        df = pd.read_csv("features.csv", parse_dates=["timestamp"], index_col="timestamp")

    expected: DateTimeIndex, numeric feature columns

    Returns a DataFrame with the columns close, volume, ret_1, ret_5, sma_20,
    sma_50, sma_spread, rv_20 and log_vol; rows with any missing value (the
    rolling-window warm-up, zero-volume days) are dropped.

    Raises RuntimeError when the download yields no rows.
    """
    df = yf.download(cfg.symbol, start=cfg.start, auto_adjust=True, progress=False)
    if df is None or df.empty:
        raise RuntimeError(f"Failed to load data for symbol={cfg.symbol}")

    # Recent yfinance releases return (field, ticker) MultiIndex columns even for
    # a single symbol. Keep only the field level so df["close"] is a Series and
    # the column assignments below work.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)

    df = df[["Close", "Volume"]].rename(columns={"Close": "close", "Volume": "volume"})
    df = df.dropna()

    # Build simple features:
    #   - log returns
    #   - rolling volatility
    #   - moving averages / spreads
    log_close = np.log(df["close"])
    df["ret_1"] = log_close.diff()
    df["ret_5"] = log_close.diff(5)
    df["sma_20"] = df["close"].rolling(20).mean()
    df["sma_50"] = df["close"].rolling(50).mean()
    df["sma_spread"] = df["sma_20"] - df["sma_50"]
    df["rv_20"] = df["ret_1"].rolling(20).std()
    # log(0) is -inf, which dropna() would keep; treat zero volume as missing so
    # the final dropna() removes such rows.
    df["log_vol"] = np.log(df["volume"].replace(0, np.nan))

    return df.dropna()


def safe_adf(series: pd.Series) -> Tuple[float, float]:
    """
    Run ADF and return (statistic, p-value).

    Returns (nan, nan) instead of raising when the test cannot be run:
    fewer than 20 non-null observations, a constant series, or a failure
    inside adfuller itself. This mirrors how safe_kpss reports failures.
    """
    s = series.dropna()
    if len(s) < 20:
        return np.nan, np.nan
    # adfuller rejects constant input with ValueError; skip the call entirely.
    if s.nunique() <= 1:
        return np.nan, np.nan
    try:
        stat, pval, *_ = adfuller(s, autolag="AIC")
    except (ValueError, np.linalg.LinAlgError):
        return np.nan, np.nan
    return float(stat), float(pval)


def safe_kpss(series: pd.Series, regression: str = "c") -> Tuple[float, float]:
    """
    KPSS wrapper returning (statistic, p-value).

    Nulls are dropped first. A series with fewer than 20 remaining
    observations, or any error raised while running the test (e.g. on a
    constant series), yields (nan, nan) rather than an exception.
    """
    observations = series.dropna()
    too_short = len(observations) < 20
    if too_short:
        return np.nan, np.nan

    try:
        outcome = kpss(observations, regression=regression, nlags="auto")
        statistic, p_value, *_ = outcome
        return float(statistic), float(p_value)
    except Exception:
        # Deliberate best-effort: any failure is reported as "no result".
        return np.nan, np.nan


def classify_stationarity(
    adf_p: float,
    kpss_p: float,
    adf_alpha: float,
    kpss_alpha: float,
) -> str:
    """
    Combine the ADF and KPSS p-values into a single label.

    A test "rejects" when its p-value is not NaN and is strictly below its
    alpha. The label is then:

    - ADF rejects, KPSS does not      -> "stationary"
    - ADF does not reject, KPSS does  -> "non_stationary"
    - both reject                     -> "mixed"
    - neither rejects                 -> "borderline"

    A NaN p-value therefore counts as "does not reject".
    """
    adf_rejects = False if np.isnan(adf_p) else bool(adf_p < adf_alpha)
    kpss_rejects = False if np.isnan(kpss_p) else bool(kpss_p < kpss_alpha)

    # (ADF rejects?, KPSS rejects?) -> label
    verdicts = {
        (True, False): "stationary",
        (False, True): "non_stationary",
        (True, True): "mixed",
        (False, False): "borderline",
    }
    return verdicts[(adf_rejects, kpss_rejects)]


def recommend_transform(col_name: str, label: str) -> str:
    """
    Map a stationarity label (plus a column-name hint) to a transform rule.

    - "non_stationary" and the name contains close/price/level/index -> "logdiff"
    - "non_stationary" otherwise                                     -> "diff"
    - "mixed"                                                        -> "diff"
    - anything else (stationary, borderline, ...)                    -> "zscore"
    """
    lowered = col_name.lower()

    if label == "mixed":
        # Conflicting test results: difference to be on the safe side.
        return "diff"

    if label != "non_stationary":
        # Stationary or borderline: standardising is enough.
        return "zscore"

    # Non-stationary: price-like names get log-differences, the rest plain differences.
    for keyword in ("close", "price", "level", "index"):
        if keyword in lowered:
            return "logdiff"
    return "diff"


def apply_transform(series: pd.Series, rule: str) -> pd.Series:
    """
    Return `series` (cast to float) transformed according to `rule`.

    - "logdiff": first difference of the natural log, with zeros treated as missing
    - "diff":    first difference
    - "zscore":  (x - mean) / std over the whole series; an all-zero-times-input
                 series is returned when the std is zero or NaN
    - "none" or any unrecognised rule: the float-cast series unchanged
    """
    values = series.astype(float)

    if rule == "diff":
        return values.diff()

    if rule == "logdiff":
        nonzero = values.replace(0, np.nan)
        return np.log(nonzero).diff()

    if rule == "zscore":
        # global (in-sample) z-score; for live use, prefer rolling or expanding
        center = values.mean()
        spread = values.std()
        degenerate = np.isnan(spread) or spread == 0
        if degenerate:
            return values * 0.0
        return (values - center) / spread

    # "none" and unknown rules both fall through to the identity.
    return values


def run_stationarity_pipeline(cfg: Config, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Test every numeric column for stationarity, then transform it.

    Steps:
    - keep only the numeric columns of `df` and sort by index
    - run ADF & KPSS per column and classify the result
    - pick a transform rule from the label and apply it
    - optionally drop rows with NaNs from the transformed frame
      (cfg.dropna_after_transform)

    Returns (raw_df, transformed_df, report_df). raw_df is the numeric,
    index-sorted copy of the input; report_df is indexed by feature name and
    holds the test statistics, p-values, label and chosen rule. When `df` has
    no numeric columns the report is empty but still carries those columns.
    """
    df = df.copy()
    df = df.select_dtypes(include=[np.number])  # keep numeric columns only
    df = df.sort_index()

    # Fixed column order so an empty report still has a "feature" column to index on.
    report_columns = [
        "feature",
        "adf_stat",
        "adf_p",
        "kpss_stat",
        "kpss_p",
        "stationarity_label",
        "recommended_transform",
    ]

    transformed = pd.DataFrame(index=df.index)
    rows: List[Dict] = []

    for col in df.columns:
        s = df[col]

        adf_stat, adf_p = safe_adf(s)
        kpss_stat, kpss_p = safe_kpss(s, regression="c")

        label = classify_stationarity(adf_p, kpss_p, cfg.adf_alpha, cfg.kpss_alpha)
        rule = recommend_transform(col, label)

        rows.append({
            "feature": col,
            "adf_stat": adf_stat,
            "adf_p": adf_p,
            "kpss_stat": kpss_stat,
            "kpss_p": kpss_p,
            "stationarity_label": label,
            "recommended_transform": rule,
        })

        transformed[col] = apply_transform(s, rule)

    report_df = pd.DataFrame(rows, columns=report_columns).set_index("feature")

    if cfg.dropna_after_transform:
        # diff/logdiff leave a NaN in the first row; drop incomplete rows.
        transformed = transformed.dropna()

    return df, transformed, report_df


def save_outputs(
    raw_df: pd.DataFrame,
    transformed_df: pd.DataFrame,
    report_df: pd.DataFrame,
    cfg: Config,
) -> None:
    """
    Write the three frames to the paths configured in `cfg` and print a summary.

    The raw features, transformed features and report are written as CSV; the
    report is additionally written as a JSON list of records. Missing values
    in the report (e.g. NaN statistics for series the tests could not handle)
    are written as JSON null.
    """
    raw_df.to_csv(cfg.out_raw_csv)
    transformed_df.to_csv(cfg.out_transformed_csv)
    report_df.to_csv(cfg.out_report_csv)

    # json.dump would emit bare NaN tokens, which strict JSON parsers reject;
    # map missing values to None so they serialise as null.
    records = [
        {key: (None if pd.isna(value) else value) for key, value in record.items()}
        for record in report_df.reset_index().to_dict(orient="records")
    ]
    with open(cfg.out_report_json, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)

    print(f"[OK] Saved raw features      → {cfg.out_raw_csv}")
    print(f"[OK] Saved transformed feats → {cfg.out_transformed_csv}")
    print(f"[OK] Saved stationarity rpt  → {cfg.out_report_csv}")
    print(f"[OK] Saved JSON report       → {cfg.out_report_json}")


# ---------------------- CLI ---------------------- #

def parse_args() -> Config:
    """
    Parse command-line options from sys.argv into a Config.

    Every option defaults to the corresponding Config field default, so the
    CLI and the dataclass cannot drift apart. NaN-dropping follows
    Config.dropna_after_transform unless overridden with --dropna or
    --no-dropna.
    """
    defaults = Config()

    p = argparse.ArgumentParser(
        description="Level-46 Stationarity Checker & Demeaning Pipeline"
    )
    p.add_argument("--symbol", type=str, default=defaults.symbol, help="Ticker for demo (yfinance)")
    p.add_argument("--start", type=str, default=defaults.start, help="Start date for data")
    p.add_argument("--adf-alpha", type=float, default=defaults.adf_alpha, help="ADF significance level")
    p.add_argument("--kpss-alpha", type=float, default=defaults.kpss_alpha, help="KPSS significance level")
    p.add_argument("--raw-csv", type=str, default=defaults.out_raw_csv)
    p.add_argument("--transformed-csv", type=str, default=defaults.out_transformed_csv)
    p.add_argument("--report-csv", type=str, default=defaults.out_report_csv)
    p.add_argument("--report-json", type=str, default=defaults.out_report_json)
    # Paired flags sharing one destination: a lone store_true flag would force
    # the default to False and contradict Config.dropna_after_transform.
    p.add_argument("--dropna", dest="dropna", action="store_true", help="Drop NaNs after transforms")
    p.add_argument("--no-dropna", dest="dropna", action="store_false", help="Keep NaNs after transforms")
    p.set_defaults(dropna=defaults.dropna_after_transform)

    args = p.parse_args()

    cfg = Config(
        symbol=args.symbol,
        start=args.start,
        adf_alpha=args.adf_alpha,
        kpss_alpha=args.kpss_alpha,
        out_raw_csv=args.raw_csv,
        out_transformed_csv=args.transformed_csv,
        out_report_csv=args.report_csv,
        out_report_json=args.report_json,
        max_diff_order=defaults.max_diff_order,
        dropna_after_transform=args.dropna,
    )
    return cfg


def main() -> None:
    """Script entry point: parse options, load features, run the pipeline, save and summarise."""
    cfg = parse_args()

    print(f"[INFO] Loading data for {cfg.symbol} from {cfg.start}...")
    df = load_price_data(cfg)

    print(f"[INFO] Running stationarity pipeline on {df.shape[1]} features...")
    results = run_stationarity_pipeline(cfg, df)
    raw, transformed, report = results

    save_outputs(raw, transformed, report, cfg)

    # Console summary limited to the two decision columns.
    summary_columns = ["stationarity_label", "recommended_transform"]
    print("\n[SUMMARY] Stationarity report:")
    print(report[summary_columns])


if __name__ == "__main__":
    # When run inside Jupyter, sys.argv carries kernel arguments such as
    # "-f kernel-xxxx.json" that argparse would reject; filter them out first.
    if len(sys.argv) > 1:
        kept = [sys.argv[0]]
        for token in sys.argv[1:]:
            if token == "-f":
                continue
            if token.endswith(".json") and "kernel" in token:
                continue
            kept.append(token)
        sys.argv = kept
    main()


# Notebook output: ModuleNotFoundError: No module named 'statsmodels'  (install it with `pip install statsmodels`)