In [2]:
pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.14.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting patsy>=0.5.6
  Downloading patsy-1.0.1-py2.py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.9/232.9 KB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: patsy, statsmodels
Successfully installed patsy-1.0.1 statsmodels-0.14.4
Note: you may need to restart the kernel to use updated packages.


In [7]:
"""
Run this notebook to evaluate the stationarity of
1. **Yield levels**  – `Y_df.csv`
2. **Yield changes** – `Y_df_change_<h>.csv` for horizons 1, 5, 21, 63, 252

Outputs (saved automatically in `DATA_DIR`):
- `adf_results_levels.csv`
- `adf_results_h<1|5|21|63|252>.csv`
- `adf_stationarity_summary.csv`

All loops are wrapped with **tqdm** progress bars that render nicely in Jupyter Lab/Notebook.  
(The ADF test itself is CPU‑bound; GPU is detected only for information.)
"""

# ---------------------- Imports ---------------------- #
import os
from pathlib import Path
from typing import Dict, Any

import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller
from tqdm.notebook import tqdm  # Jupyter‑friendly progress bar
import torch                    # just for GPU info

# ---------------------- Config ---------------------- #
# Directory that holds the input CSVs; every result file is written back into it as well.
DATA_DIR   = Path("./")   # ← change this to your CSV directory if needed
# Horizons h for which a `Y_df_change_<h>.csv` file is looked up in DATA_DIR.
HORIZONS   = [1, 5, 21, 63, 252]
# Default `max_lag` of run_adf / adf_dataframe. It is handed to adfuller as `maxlag`
# together with `autolag=None`, i.e. no automatic lag selection is performed.
# Gotcha: both helpers capture LAG as a default argument when they are defined,
# so their definitions must be re-run after changing this value.
LAG        = 1            # fixed lag for ADF (speed-friendly)

# ---------------------- Helper ---------------------- #

def run_adf(series: pd.Series, max_lag: int = LAG) -> Dict[str, Any]:
    """Augmented Dickey-Fuller test on one series with a fixed number of lags.

    Infinite values are turned into NaN and all NaNs are dropped before testing.
    ``adfuller`` is called with ``maxlag=max_lag`` and ``autolag=None``.

    Parameters
    ----------
    series : pd.Series
        Series to test.
    max_lag : int
        Value passed to ``adfuller`` as ``maxlag``.

    Returns
    -------
    dict
        ``statistic``, ``pvalue``, ``stationary`` (``pvalue < 0.05``) and
        ``n_obs`` (observations left after cleaning). When the series is
        shorter than ``max_lag + 2`` or the test raises, ``statistic`` and
        ``pvalue`` are NaN and ``stationary`` is False; in the latter case an
        ``error`` entry carries the exception text.
    """
    cleaned = series.replace([np.inf, -np.inf], np.nan).dropna()
    n_obs = len(cleaned)

    # Shared shape of every "no usable test result" answer.
    failed: Dict[str, Any] = {
        "statistic": np.nan,
        "pvalue": np.nan,
        "stationary": False,
        "n_obs": n_obs,
    }

    # Too few points to even attempt the regression.
    if n_obs < max_lag + 2:
        return failed

    try:
        adf_stat, p_value = adfuller(cleaned, maxlag=max_lag, autolag=None)[:2]
        outcome = {
            "statistic": adf_stat,
            "pvalue": p_value,
            "stationary": p_value < 0.05,
            "n_obs": n_obs,
        }
    except Exception as exc:
        # Best effort: keep the batch going and record why this series failed.
        outcome = {**failed, "error": str(exc)}
    return outcome


def adf_dataframe(df: pd.DataFrame, max_lag: int = LAG) -> pd.DataFrame:
    """Run ``run_adf`` on every column of ``df`` and tabulate the results.

    Parameters
    ----------
    df : pd.DataFrame
        One series per column; each column is cast to float before testing.
    max_lag : int
        Forwarded unchanged to ``run_adf``.

    Returns
    -------
    pd.DataFrame
        One row per input column (indexed by the column label). The columns are
        the keys returned by ``run_adf``: ``statistic``, ``pvalue``,
        ``stationary``, ``n_obs`` and, if any column failed, ``error``.
        For an input without columns an empty frame with the four standard
        columns is returned.
    """
    results = {
        col: run_adf(df[col].astype(float), max_lag)
        for col in tqdm(df.columns, desc="Columns", leave=False)
    }
    if not results:
        # Nothing to test: still expose the expected schema so that callers
        # can index ``["stationary"]`` without a KeyError.
        return pd.DataFrame(columns=["statistic", "pvalue", "stationary", "n_obs"])
    # orient="index" assembles the frame field by field, so every result column
    # keeps its own dtype; transposing ``pd.DataFrame(results)`` instead would
    # leave all columns as ``object``.
    return pd.DataFrame.from_dict(results, orient="index")

# ---------------------- GPU Info (optional) ---------------------- #
# Informational only: `device` is not used by anything else in this cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"[INFO] GPU detected → {torch.cuda.get_device_name(0)}")
else:
    print("[INFO] Running on CPU (ADF is CPU‑bound).")

# ---------------------- 1. Levels ---------------------- #
levels_path = DATA_DIR / "Y_df.csv"
# The levels file is mandatory: the cell stops here with an AssertionError if it is absent
# (missing change files further down are only warned about and skipped).
assert levels_path.exists(), f"Missing file: {levels_path}"

print("\n[LEVELS] Loading yield level data …")
# First CSV column becomes the index; parse_dates=True asks pandas to parse it as dates.
levels_df = pd.read_csv(levels_path, index_col=0, parse_dates=True)

print("[LEVELS] Running ADF tests …")
adf_levels = adf_dataframe(levels_df)  # one result row per column of levels_df
levels_out = DATA_DIR / "adf_results_levels.csv"
adf_levels.to_csv(levels_out)
print(f"[LEVELS] Saved → {levels_out.name}")

# Summary accumulator: one dict per tested dataset, turned into `summary_df` in section 3.
summary_rows = [{
    "dataset"         : "levels",
    "n_cols"          : len(adf_levels),
    "stationary_cols" : int(adf_levels["stationary"].sum()),
}]

# ---------------------- 2. Changes ---------------------- #
for h in tqdm(HORIZONS, desc="Horizons"):
    change_path = DATA_DIR / f"Y_df_change_{h}.csv"
    if not change_path.exists():
        # A skipped horizon gets no row in the summary.
        print(f"[WARN] {change_path.name} not found – skipped.")
        continue
    df = pd.read_csv(change_path, index_col=0, parse_dates=True)
    # run_adf repeats this ±inf → NaN replacement per column; doing it here is redundant but harmless.
    df = df.replace([np.inf, -np.inf], np.nan)

    print(f"\n[ΔY] Horizon {h}: running ADF …")
    adf_df = adf_dataframe(df)

    out_name = DATA_DIR / f"adf_results_h{h}.csv"
    adf_df.to_csv(out_name)
    print(f"[ΔY] Horizon {h}: saved → {out_name.name}")

    summary_rows.append({
        "dataset"         : f"h{h}",
        "n_cols"          : len(adf_df),
        "stationary_cols" : int(adf_df["stationary"].sum()),
    })

# ---------------------- 3. Aggregate Summary ---------------------- #
summary_df = pd.DataFrame(summary_rows)
# Share of columns with p < 0.05. Columns whose test could not be run are reported by
# run_adf with stationary=False, so they count as non-stationary in this ratio.
summary_df["share_stationary"] = summary_df["stationary_cols"] / summary_df["n_cols"]
summary_csv = DATA_DIR / "adf_stationarity_summary.csv"
summary_df.to_csv(summary_csv, index=False)

print("\n✅ Diagnostic complete!")
print("Key outputs:")
# Lists every expected output file that exists on disk. This also picks up files left
# behind by an earlier run when the corresponding horizon was skipped this time.
for f in [levels_out, *[DATA_DIR / f"adf_results_h{h}.csv" for h in HORIZONS], summary_csv]:
    if f.exists():
        print(" •", f.name)

# ---------------------- 4. Display Summary ---------------------- #
# Last expression of the cell, so the notebook renders the summary table.
summary_df


[INFO] GPU detected → NVIDIA GeForce RTX 4090

[LEVELS] Loading yield level data …
[LEVELS] Running ADF tests …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[LEVELS] Saved → adf_results_levels.csv


Horizons:   0%|          | 0/5 [00:00<?, ?it/s]


[ΔY] Horizon 1: running ADF …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[ΔY] Horizon 1: saved → adf_results_h1.csv

[ΔY] Horizon 5: running ADF …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[ΔY] Horizon 5: saved → adf_results_h5.csv

[ΔY] Horizon 21: running ADF …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[ΔY] Horizon 21: saved → adf_results_h21.csv

[ΔY] Horizon 63: running ADF …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[ΔY] Horizon 63: saved → adf_results_h63.csv

[ΔY] Horizon 252: running ADF …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[ΔY] Horizon 252: saved → adf_results_h252.csv

✅ Diagnostic complete!
Key outputs:
 • adf_results_levels.csv
 • adf_results_h1.csv
 • adf_results_h5.csv
 • adf_results_h21.csv
 • adf_results_h63.csv
 • adf_results_h252.csv
 • adf_stationarity_summary.csv


Unnamed: 0,dataset,n_cols,stationary_cols,share_stationary
0,levels,6,0,0.0
1,h1,6,6,1.0
2,h5,6,6,1.0
3,h21,6,1,0.166667
4,h63,6,1,0.166667
5,h252,6,1,0.166667


In [6]:
"""
Run this notebook to evaluate the stationarity of
1. **Yield levels**  – `Y_df.csv`
2. **Yield changes** – `Y_df_change_<h>.csv` for horizons 1, 5, 21, 63, 252

Outputs (saved automatically in `DATA_DIR`):
- `adf_results_levels.csv`
- `adf_results_h<1|5|21|63|252>.csv`
- `adf_stationarity_summary.csv`

All loops are wrapped with **tqdm** progress bars that render nicely in Jupyter Lab/Notebook.  
(The ADF test itself is CPU‑bound; GPU is detected only for information.)
"""

# ---------------------- Imports ---------------------- #
import os
from pathlib import Path
from typing import Dict, Any

import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller
from tqdm.notebook import tqdm  # Jupyter‑friendly progress bar
import torch                    # just for GPU info

# ---------------------- Config ---------------------- #
# Directory that holds the input CSVs; every result file is written back into it as well.
DATA_DIR   = Path("./")   # ← change this to your CSV directory if needed
# Horizons h for which a `Y_df_change_<h>.csv` file is looked up in DATA_DIR.
HORIZONS   = [1, 5, 21, 63, 252]
# Default `max_lag` of run_adf / adf_dataframe. It is handed to adfuller as `maxlag`
# together with `autolag=None`, i.e. no automatic lag selection is performed.
# Gotcha: both helpers capture LAG as a default argument when they are defined,
# so their definitions must be re-run after changing this value.
LAG        = 8            # fixed lag for ADF (speed-friendly)

# ---------------------- Helper ---------------------- #

def run_adf(series: pd.Series, max_lag: int = LAG) -> Dict[str, Any]:
    """Augmented Dickey-Fuller test on one series with a fixed number of lags.

    Infinite values are turned into NaN and all NaNs are dropped before testing.
    ``adfuller`` is called with ``maxlag=max_lag`` and ``autolag=None``.

    Parameters
    ----------
    series : pd.Series
        Series to test.
    max_lag : int
        Value passed to ``adfuller`` as ``maxlag``.

    Returns
    -------
    dict
        ``statistic``, ``pvalue``, ``stationary`` (``pvalue < 0.05``) and
        ``n_obs`` (observations left after cleaning). When the series is
        shorter than ``max_lag + 2`` or the test raises, ``statistic`` and
        ``pvalue`` are NaN and ``stationary`` is False; in the latter case an
        ``error`` entry carries the exception text.
    """
    cleaned = series.replace([np.inf, -np.inf], np.nan).dropna()
    n_obs = len(cleaned)

    # Shared shape of every "no usable test result" answer.
    failed: Dict[str, Any] = {
        "statistic": np.nan,
        "pvalue": np.nan,
        "stationary": False,
        "n_obs": n_obs,
    }

    # Too few points to even attempt the regression.
    if n_obs < max_lag + 2:
        return failed

    try:
        adf_stat, p_value = adfuller(cleaned, maxlag=max_lag, autolag=None)[:2]
        outcome = {
            "statistic": adf_stat,
            "pvalue": p_value,
            "stationary": p_value < 0.05,
            "n_obs": n_obs,
        }
    except Exception as exc:
        # Best effort: keep the batch going and record why this series failed.
        outcome = {**failed, "error": str(exc)}
    return outcome


def adf_dataframe(df: pd.DataFrame, max_lag: int = LAG) -> pd.DataFrame:
    """Run ``run_adf`` on every column of ``df`` and tabulate the results.

    Parameters
    ----------
    df : pd.DataFrame
        One series per column; each column is cast to float before testing.
    max_lag : int
        Forwarded unchanged to ``run_adf``.

    Returns
    -------
    pd.DataFrame
        One row per input column (indexed by the column label). The columns are
        the keys returned by ``run_adf``: ``statistic``, ``pvalue``,
        ``stationary``, ``n_obs`` and, if any column failed, ``error``.
        For an input without columns an empty frame with the four standard
        columns is returned.
    """
    results = {
        col: run_adf(df[col].astype(float), max_lag)
        for col in tqdm(df.columns, desc="Columns", leave=False)
    }
    if not results:
        # Nothing to test: still expose the expected schema so that callers
        # can index ``["stationary"]`` without a KeyError.
        return pd.DataFrame(columns=["statistic", "pvalue", "stationary", "n_obs"])
    # orient="index" assembles the frame field by field, so every result column
    # keeps its own dtype; transposing ``pd.DataFrame(results)`` instead would
    # leave all columns as ``object``.
    return pd.DataFrame.from_dict(results, orient="index")

# ---------------------- GPU Info (optional) ---------------------- #
# Informational only: `device` is not used by anything else in this cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"[INFO] GPU detected → {torch.cuda.get_device_name(0)}")
else:
    print("[INFO] Running on CPU (ADF is CPU‑bound).")

# ---------------------- 1. Levels ---------------------- #
levels_path = DATA_DIR / "Y_df.csv"
# The levels file is mandatory: the cell stops here with an AssertionError if it is absent
# (missing change files further down are only warned about and skipped).
assert levels_path.exists(), f"Missing file: {levels_path}"

print("\n[LEVELS] Loading yield level data …")
# First CSV column becomes the index; parse_dates=True asks pandas to parse it as dates.
levels_df = pd.read_csv(levels_path, index_col=0, parse_dates=True)

print("[LEVELS] Running ADF tests …")
adf_levels = adf_dataframe(levels_df)  # one result row per column of levels_df
levels_out = DATA_DIR / "adf_results_levels.csv"
adf_levels.to_csv(levels_out)
print(f"[LEVELS] Saved → {levels_out.name}")

# Summary accumulator: one dict per tested dataset, turned into `summary_df` in section 3.
summary_rows = [{
    "dataset"         : "levels",
    "n_cols"          : len(adf_levels),
    "stationary_cols" : int(adf_levels["stationary"].sum()),
}]

# ---------------------- 2. Changes ---------------------- #
for h in tqdm(HORIZONS, desc="Horizons"):
    change_path = DATA_DIR / f"Y_df_change_{h}.csv"
    if not change_path.exists():
        # A skipped horizon gets no row in the summary.
        print(f"[WARN] {change_path.name} not found – skipped.")
        continue
    df = pd.read_csv(change_path, index_col=0, parse_dates=True)
    # run_adf repeats this ±inf → NaN replacement per column; doing it here is redundant but harmless.
    df = df.replace([np.inf, -np.inf], np.nan)

    print(f"\n[ΔY] Horizon {h}: running ADF …")
    adf_df = adf_dataframe(df)

    out_name = DATA_DIR / f"adf_results_h{h}.csv"
    adf_df.to_csv(out_name)
    print(f"[ΔY] Horizon {h}: saved → {out_name.name}")

    summary_rows.append({
        "dataset"         : f"h{h}",
        "n_cols"          : len(adf_df),
        "stationary_cols" : int(adf_df["stationary"].sum()),
    })

# ---------------------- 3. Aggregate Summary ---------------------- #
summary_df = pd.DataFrame(summary_rows)
# Share of columns with p < 0.05. Columns whose test could not be run are reported by
# run_adf with stationary=False, so they count as non-stationary in this ratio.
summary_df["share_stationary"] = summary_df["stationary_cols"] / summary_df["n_cols"]
summary_csv = DATA_DIR / "adf_stationarity_summary.csv"
summary_df.to_csv(summary_csv, index=False)

print("\n✅ Diagnostic complete!")
print("Key outputs:")
# Lists every expected output file that exists on disk. This also picks up files left
# behind by an earlier run when the corresponding horizon was skipped this time.
for f in [levels_out, *[DATA_DIR / f"adf_results_h{h}.csv" for h in HORIZONS], summary_csv]:
    if f.exists():
        print(" •", f.name)

# ---------------------- 4. Display Summary ---------------------- #
# Last expression of the cell, so the notebook renders the summary table.
summary_df


[INFO] GPU detected → NVIDIA GeForce RTX 4090

[LEVELS] Loading yield level data …
[LEVELS] Running ADF tests …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[LEVELS] Saved → adf_results_levels.csv


Horizons:   0%|          | 0/5 [00:00<?, ?it/s]


[ΔY] Horizon 1: running ADF …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[ΔY] Horizon 1: saved → adf_results_h1.csv

[ΔY] Horizon 5: running ADF …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[ΔY] Horizon 5: saved → adf_results_h5.csv

[ΔY] Horizon 21: running ADF …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[ΔY] Horizon 21: saved → adf_results_h21.csv

[ΔY] Horizon 63: running ADF …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[ΔY] Horizon 63: saved → adf_results_h63.csv

[ΔY] Horizon 252: running ADF …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[ΔY] Horizon 252: saved → adf_results_h252.csv

✅ Diagnostic complete!
Key outputs:
 • adf_results_levels.csv
 • adf_results_h1.csv
 • adf_results_h5.csv
 • adf_results_h21.csv
 • adf_results_h63.csv
 • adf_results_h252.csv
 • adf_stationarity_summary.csv


Unnamed: 0,dataset,n_cols,stationary_cols,share_stationary
0,levels,6,0,0.0
1,h1,6,6,1.0
2,h5,6,6,1.0
3,h21,6,1,0.166667
4,h63,6,1,0.166667
5,h252,6,1,0.166667


In [8]:
# ---------- Second difference for long horizons ----------
# For each long horizon, difference the stored change series once more (row to row)
# and save the result next to the inputs as Y_df_change2_<h>.csv.
for h in [21, 63, 252]:
    path = DATA_DIR / f"Y_df_change_{h}.csv"
    if not path.exists():
        # Same policy as the ADF loops of this notebook: warn and move on
        # instead of aborting the whole cell on one missing horizon file.
        print(f"[WARN] {path.name} not found – skipped.")
        continue
    df_long = pd.read_csv(path, index_col=0, parse_dates=True)
    df_dd = df_long.diff().dropna()  # Δ²Y; dropna() removes every row that contains a NaN
    df_dd.to_csv(DATA_DIR / f"Y_df_change2_{h}.csv")

# ---------- KPSS check (optional) ----------
from statsmodels.tsa.stattools import kpss
def kpss_test(series, alpha=0.05):
    """Return True when the KPSS test does NOT reject level stationarity.

    Unlike the ADF test, the KPSS null hypothesis is that the series is
    stationary (here: around a constant, ``regression='c'``), so a p-value
    *above* ``alpha`` is read as "stationary".

    Parameters
    ----------
    series : pd.Series
        Series to test. ±inf is treated as missing and NaNs are dropped, the
        same cleaning ``run_adf`` applies.
    alpha : float, default 0.05
        Significance level. Per the statsmodels documentation the reported
        p-value is interpolated from a table and clipped to its boundary
        values, so levels outside that range are not informative.

    Returns
    -------
    bool
        ``pvalue > alpha``.
    """
    cleaned = series.replace([np.inf, -np.inf], np.nan).dropna()
    pval = kpss(cleaned, regression='c', nlags='auto')[1]
    return bool(pval > alpha)  # True ⇒ stationary

In [9]:
# =============================================
# Stationarity Diagnostic (Notebook Edition)
# =============================================
# Author: ChatGPT (generated for Barak)
# Date: 2025-05-13 (rev. Δ² check)
"""
Run this notebook cell-by-cell to evaluate the stationarity of
1. **Yield levels**  – `Y_df.csv`
2. **Yield 1st-differences** – `Y_df_change_<h>.csv` where h ∈ {1, 5, 21, 63, 252}
3. **Yield 2nd-differences** (only for horizons 21, 63, 252) – generated on-the-fly and saved as `Y_df_change2_<h>.csv`

Outputs saved in `DATA_DIR`:
- `adf_results_levels.csv`
- `adf_results_h<1|5|21|63|252>.csv`                (1st-diff)
- `adf_results_h<21|63|252>_d2.csv`                 (2nd-diff)
- `adf_stationarity_2_summary.csv`                  (overall summary)

Progress bars (`tqdm.notebook`) render nicely in Jupyter. The ADF test itself is CPU-bound; GPU detection is for info only.
"""

# ---------------------- Imports ---------------------- #
import os
from pathlib import Path
from typing import Dict, Any

import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller
from tqdm.notebook import tqdm
import torch  # GPU info only

# ---------------------- Config ---------------------- #
# Directory that holds the input CSVs; every result file is written back into it as well.
DATA_DIR    = Path("./")            # ← adjust to your CSV directory
# Horizons h for which a `Y_df_change_<h>.csv` file is looked up in DATA_DIR.
HORIZONS    = [1, 5, 21, 63, 252]
# Only horizons that are also listed in HORIZONS are processed: the second
# difference is computed inside the loop over HORIZONS.
LONG_HORIZ  = [21, 63, 252]         # horizons to generate Δ²Y
# Default `max_lag` of run_adf / adf_dataframe. It is handed to adfuller as `maxlag`
# together with `autolag=None`, i.e. no automatic lag selection is performed.
# Gotcha: both helpers capture LAG as a default argument when they are defined,
# so their definitions must be re-run after changing this value.
LAG         = 1                     # fixed lag for ADF (speed-friendly)

# ---------------------- Helper ---------------------- #

def run_adf(series: pd.Series, max_lag: int = LAG) -> Dict[str, Any]:
    """Augmented Dickey-Fuller test on one series with a fixed number of lags.

    Infinite values are turned into NaN and all NaNs are dropped before testing.
    ``adfuller`` is called with ``maxlag=max_lag`` and ``autolag=None``.

    Parameters
    ----------
    series : pd.Series
        Series to test.
    max_lag : int
        Value passed to ``adfuller`` as ``maxlag``.

    Returns
    -------
    dict
        ``statistic``, ``pvalue``, ``stationary`` (``pvalue < 0.05``) and
        ``n_obs`` (observations left after cleaning). When the series is
        shorter than ``max_lag + 2`` or the test raises, ``statistic`` and
        ``pvalue`` are NaN and ``stationary`` is False; in the latter case an
        ``error`` entry carries the exception text.
    """
    cleaned = series.replace([np.inf, -np.inf], np.nan).dropna()
    n_obs = len(cleaned)

    # Shared shape of every "no usable test result" answer.
    failed: Dict[str, Any] = {
        "statistic": np.nan,
        "pvalue": np.nan,
        "stationary": False,
        "n_obs": n_obs,
    }

    # Too few points to even attempt the regression.
    if n_obs < max_lag + 2:
        return failed

    try:
        adf_stat, p_value = adfuller(cleaned, maxlag=max_lag, autolag=None)[:2]
        outcome = {
            "statistic": adf_stat,
            "pvalue": p_value,
            "stationary": p_value < 0.05,
            "n_obs": n_obs,
        }
    except Exception as exc:
        # Best effort: keep the batch going and record why this series failed.
        outcome = {**failed, "error": str(exc)}
    return outcome


def adf_dataframe(df: pd.DataFrame, max_lag: int = LAG) -> pd.DataFrame:
    """Run ``run_adf`` on every column of ``df`` and tabulate the results.

    Parameters
    ----------
    df : pd.DataFrame
        One series per column; each column is cast to float before testing.
    max_lag : int
        Forwarded unchanged to ``run_adf``.

    Returns
    -------
    pd.DataFrame
        One row per input column (indexed by the column label). The columns are
        the keys returned by ``run_adf``: ``statistic``, ``pvalue``,
        ``stationary``, ``n_obs`` and, if any column failed, ``error``.
        For an input without columns an empty frame with the four standard
        columns is returned.
    """
    results = {
        col: run_adf(df[col].astype(float), max_lag)
        for col in tqdm(df.columns, desc="Columns", leave=False)
    }
    if not results:
        # Nothing to test: still expose the expected schema so that callers
        # can index ``["stationary"]`` without a KeyError.
        return pd.DataFrame(columns=["statistic", "pvalue", "stationary", "n_obs"])
    # orient="index" assembles the frame field by field, so every result column
    # keeps its own dtype; transposing ``pd.DataFrame(results)`` instead would
    # leave all columns as ``object``.
    return pd.DataFrame.from_dict(results, orient="index")

# ---------------------- GPU Info ---------------------- #
# Informational only; nothing below depends on the outcome of this check.
if torch.cuda.is_available():
    print(f"[INFO] GPU detected → {torch.cuda.get_device_name(0)} (not used for ADF)")
else:
    print("[INFO] Running on CPU (ADF is CPU-bound).")

# ---------------------- 1. Levels ---------------------- #
levels_path = DATA_DIR / "Y_df.csv"
# The levels file is mandatory: the cell stops here with an AssertionError if it is absent
# (missing change files further down are only warned about and skipped).
assert levels_path.exists(), f"Missing file: {levels_path}"

print("\n[LEVELS] Loading yield level data …")
# First CSV column becomes the index; parse_dates=True asks pandas to parse it as dates.
levels_df = pd.read_csv(levels_path, index_col=0, parse_dates=True)
print("[LEVELS] Running ADF tests …")
adf_levels = adf_dataframe(levels_df)  # one result row per column of levels_df
levels_out = DATA_DIR / "adf_results_levels.csv"
adf_levels.to_csv(levels_out)
print(f"[LEVELS] Saved → {levels_out.name}")

# ------------- summary init ------------- #
# One dict per tested dataset; turned into `summary_df` in section 4.
summary_rows = [{
    "dataset": "levels",
    "n_cols": len(adf_levels),
    "stationary_cols": int(adf_levels["stationary"].sum()),
}]

# ---------------------- 2. First Differences ---------------------- #
for h in tqdm(HORIZONS, desc="1st-Difference Horizons"):
    change_path = DATA_DIR / f"Y_df_change_{h}.csv"
    if not change_path.exists():
        # Skipping here also skips the second-difference step for this horizon,
        # because that step lives further down in the same loop body.
        print(f"[WARN] {change_path.name} not found – skipped 1st-diff.")
        continue

    # ±inf is mapped to NaN right after loading, so the second difference below
    # is computed on the cleaned frame as well.
    df_1d = pd.read_csv(change_path, index_col=0, parse_dates=True).replace([np.inf, -np.inf], np.nan)

    print(f"\n[ΔY] Horizon {h}: running ADF …")
    adf_1d = adf_dataframe(df_1d)
    out_1d = DATA_DIR / f"adf_results_h{h}.csv"
    adf_1d.to_csv(out_1d)
    print(f"[ΔY] Horizon {h}: saved → {out_1d.name}")

    summary_rows.append({
        "dataset": f"h{h}",
        "n_cols": len(adf_1d),
        "stationary_cols": int(adf_1d["stationary"].sum()),
    })

    # ---------- 3. Second Differences for long horizons ---------- #
    # NOTE(review): how the Y_df_change_<h>.csv files were built is not visible here.
    # If they hold overlapping h-step changes, a lag-1 ADF on them may be
    # misleading — confirm before relying on the Δ² results.
    if h in LONG_HORIZ:
        print(f"[Δ²Y] Horizon {h}: computing second difference …")
        # Row-to-row difference of the loaded change series; dropna() removes
        # every row that contains a NaN in any column.
        df_2d = df_1d.diff().dropna()
        df_2d_path = DATA_DIR / f"Y_df_change2_{h}.csv"
        df_2d.to_csv(df_2d_path)

        print(f"[Δ²Y] Horizon {h}: running ADF …")
        adf_2d = adf_dataframe(df_2d)
        out_2d = DATA_DIR / f"adf_results_h{h}_d2.csv"
        adf_2d.to_csv(out_2d)
        print(f"[Δ²Y] Horizon {h}: saved → {out_2d.name}")

        summary_rows.append({
            "dataset": f"h{h}_d2",
            "n_cols": len(adf_2d),
            "stationary_cols": int(adf_2d["stationary"].sum()),
        })

# ---------------------- 4. Aggregate Summary ---------------------- #
summary_df = pd.DataFrame(summary_rows)
# Share of columns with p < 0.05. Columns whose test could not be run are reported by
# run_adf with stationary=False, so they count as non-stationary in this ratio.
summary_df["share_stationary"] = summary_df["stationary_cols"] / summary_df["n_cols"]
# NOTE: this name differs from the `adf_stationarity_summary.csv` written by the
# earlier cells of this notebook, so that file is left untouched.
summary_csv = DATA_DIR / "adf_stationarity_2_summary.csv"
summary_df.to_csv(summary_csv, index=False)

print("\n✅ Diagnostic complete!")
print("Key outputs:")
# Lists every expected output file that exists on disk. This also picks up files left
# behind by an earlier run when the corresponding horizon was skipped this time.
for f in [levels_out,
          *[DATA_DIR / f"adf_results_h{h}.csv" for h in HORIZONS],
          *[DATA_DIR / f"adf_results_h{h}_d2.csv" for h in LONG_HORIZ],
          summary_csv]:
    if f.exists():
        print(" •", f.name)

# ---------------------- 5. Display Summary ---------------------- #
# Last expression of the cell, so the notebook renders the summary table.
summary_df

[INFO] GPU detected → NVIDIA GeForce RTX 4090 (not used for ADF)

[LEVELS] Loading yield level data …
[LEVELS] Running ADF tests …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[LEVELS] Saved → adf_results_levels.csv


1st-Difference Horizons:   0%|          | 0/5 [00:00<?, ?it/s]


[ΔY] Horizon 1: running ADF …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[ΔY] Horizon 1: saved → adf_results_h1.csv

[ΔY] Horizon 5: running ADF …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[ΔY] Horizon 5: saved → adf_results_h5.csv

[ΔY] Horizon 21: running ADF …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[ΔY] Horizon 21: saved → adf_results_h21.csv
[Δ²Y] Horizon 21: computing second difference …
[Δ²Y] Horizon 21: running ADF …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[Δ²Y] Horizon 21: saved → adf_results_h21_d2.csv

[ΔY] Horizon 63: running ADF …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[ΔY] Horizon 63: saved → adf_results_h63.csv
[Δ²Y] Horizon 63: computing second difference …
[Δ²Y] Horizon 63: running ADF …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[Δ²Y] Horizon 63: saved → adf_results_h63_d2.csv

[ΔY] Horizon 252: running ADF …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[ΔY] Horizon 252: saved → adf_results_h252.csv
[Δ²Y] Horizon 252: computing second difference …
[Δ²Y] Horizon 252: running ADF …


Columns:   0%|          | 0/6 [00:00<?, ?it/s]

[Δ²Y] Horizon 252: saved → adf_results_h252_d2.csv

✅ Diagnostic complete!
Key outputs:
 • adf_results_levels.csv
 • adf_results_h1.csv
 • adf_results_h5.csv
 • adf_results_h21.csv
 • adf_results_h63.csv
 • adf_results_h252.csv
 • adf_results_h21_d2.csv
 • adf_results_h63_d2.csv
 • adf_results_h252_d2.csv
 • adf_stationarity_summary.csv


Unnamed: 0,dataset,n_cols,stationary_cols,share_stationary
0,levels,6,0,0.0
1,h1,6,6,1.0
2,h5,6,6,1.0
3,h21,6,1,0.166667
4,h21_d2,6,6,1.0
5,h63,6,1,0.166667
6,h63_d2,6,6,1.0
7,h252,6,1,0.166667
8,h252_d2,6,6,1.0
