In [2]:
# pip install yfinance pandas numpy
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime

START = "2018-11-01"
END = None  # None = up to today
CSV_OUT = "sp500_ft1.csv"

def realized_vol(returns, window):
    # annualized realiz vol from daily log returns
    return returns.rolling(window).std() * np.sqrt(252)

def rolling_slope(series, window=20):
    """
    Approx slope via simple OLS on [0..window-1] vs values.
    Returns per-day slope; use pct slope if desired: slope / series.rolling(window).mean()
    """
    idx = np.arange(window, dtype=float)
    def _s(x):
        y = x.values
        x_mean, y_mean = idx.mean(), y.mean()
        num = np.sum((idx - x_mean) * (y - y_mean))
        den = np.sum((idx - x_mean) ** 2)
        return num / den if den != 0 else np.nan
    return series.rolling(window).apply(_s, raw=False)

def build_sp500_features(start=START, end=END):
    # ^GSPC for index price/returns; SPY for OHLCV & volume-based features
    spx = yf.download("^GSPC", start=start, end=end, auto_adjust=True, progress=False)
    spy = yf.download("SPY", start=start, end=end, auto_adjust=True, progress=False)

    # Keep only needed cols, standardize names
    spx = spx.rename(columns=str.lower)
    spy = spy.rename(columns=lambda c: f"spy_{c.lower()}")

    # Align on dates (inner join on index)
    df = spx.join(spy[["spy_open","spy_high","spy_low","spy_close","spy_volume"]], how="inner")

    # Core prices
    df["spx_close"] = df["close"]

    # Returns
    df["spx_ret_1d"]    = df["spx_close"].pct_change()
    df["spx_logret_1d"] = np.log(df["spx_close"] / df["spx_close"].shift(1))

    # Trailing momentum (simple cumulative returns)
    for w in [5, 20, 60, 252]:
        df[f"spx_trailing_ret_{w}d"] = df["spx_close"].pct_change(periods=w)

    # Realized volatility (annualized) from log returns
    df["spx_realized_vol_20d_ann"] = realized_vol(df["spx_logret_1d"], 20)
    df["spx_realized_vol_60d_ann"] = realized_vol(df["spx_logret_1d"], 60)

    # Volatility regime (percentiles over trailing 252d)
    def rolling_percentile(x, w=252):
        # rank percentile of last value within rolling window
        def _p(window_vals):
            v = window_vals[-1]
            if np.isnan(v): return np.nan
            return (np.sum(window_vals <= v) - 1) / (len(window_vals) - 1) if len(window_vals) > 1 else np.nan
        return x.rolling(w).apply(_p, raw=False)

    df["spx_vol20_pctile_252d"] = rolling_percentile(df["spx_realized_vol_20d_ann"], 252)

    # Moving averages & z-scores
    df["spx_ma_50"]  = df["spx_close"].rolling(50).mean()
    df["spx_ma_200"] = df["spx_close"].rolling(200).mean()
    df["spx_sd_50"]  = df["spx_close"].rolling(50).std()

    # Z-score relative to MA(50): (price - MA50) / rolling std(50)
    df["spx_z_to_ma50"] = (df["spx_close"] - df["spx_ma_50"]) / df["spx_sd_50"]

    # Distance from 52w high/low & drawdown
    df["spx_rolling_max_252"] = df["spx_close"].rolling(252).max()
    df["spx_rolling_min_252"] = df["spx_close"].rolling(252).min()
    df["spx_pct_from_52w_high"] = df["spx_close"] / df["spx_rolling_max_252"] - 1.0
    df["spx_pct_from_52w_low"]  = df["spx_close"] / df["spx_rolling_min_252"] - 1.0

    # Expanding drawdown from all-time-to-date high
    df["spx_running_max"] = df["spx_close"].cummax()
    df["spx_drawdown"]    = df["spx_close"] / df["spx_running_max"] - 1.0

    # Slope of MA(50) and MA(200) (per-day slope); also pct slope if you want
    df["spx_ma50_slope"]  = rolling_slope(df["spx_ma_50"], 20)
    df["spx_ma200_slope"] = rolling_slope(df["spx_ma_200"], 20)
    # Optional normalized slopes (per-day pct): uncomment if desired
    # df["spx_ma50_slope_pct"]  = df["spx_ma50_slope"]  / df["spx_ma_50"].rolling(20).mean()
    # df["spx_ma200_slope_pct"] = df["spx_ma200_slope"] / df["spx_ma_200"].rolling(20).mean()

    # Volume-based features from SPY
    df["spy_rel_volume_20"] = df["spy_volume"] / df["spy_volume"].rolling(20).mean()
    df["spy_vol_pct_change_1d"] = df["spy_volume"].pct_change()

    # ATR(14) and ATR(20) from SPY (auto_adjust=True means adjusted OHLC)
    def calc_atr(high, low, close, window):
        prev_close = close.shift(1)
        tr = np.maximum(high - low, np.maximum(abs(high - prev_close), abs(low - prev_close)))
        return tr.rolling(window).mean()

    df["spy_atr_14"] = calc_atr(df["spy_high"], df["spy_low"], df["spy_close"], 14)
    df["spy_atr_20"] = calc_atr(df["spy_high"], df["spy_low"], df["spy_close"], 20)

    # Clean up temporary columns
    drop_cols = ["open","high","low","close","adj close","volume","spx_sd_50","spx_running_max"]
    df = df.drop(columns=[c for c in drop_cols if c in df.columns])

    # Final tidy: keep only business days present in both series; drop leading NaNs from rolling windows
    df = df.dropna().copy()
    df.index.name = "date"

    return df

if __name__ == "__main__":
    spx_df = build_sp500_features(START, END)
    # Flatten multi-index columns if they exist
    if isinstance(spx_df.columns, pd.MultiIndex):
        spx_df.columns = ["_".join(filter(None, map(str, col))).strip() for col in spx_df.columns]

    spx_df.to_csv(CSV_OUT)
    print(f"Saved {len(spx_df):,} rows x {spx_df.shape[1]} cols -> {CSV_OUT}")
    print(spx_df.tail(3))


  v = window_vals[-1]


Saved 1,433 rows x 29 cols -> sp500_ft1.csv
            spy_open_spy_spy  spy_high_spy_spy  spy_low_spy_spy  \
date                                                              
2025-08-12        638.289978        642.849976       636.789978   
2025-08-13        644.909973        646.190002       642.679993   
2025-08-14        642.789978        645.619995       642.340027   

            spy_close_spy_spy  spy_volume_spy_spy    spx_close  spx_ret_1d  \
date                                                                         
2025-08-12         642.690002            64730800  6445.759766    0.011345   
2025-08-13         644.890015            60092800  6466.580078    0.003230   
2025-08-14         644.950012            59274300  6468.540039    0.000303   

            spx_logret_1d  spx_trailing_ret_5d  spx_trailing_ret_20d  ...  \
date                                                                  ...   
2025-08-12       0.011282             0.023268              0.032352  ...  

  df = df.drop(columns=[c for c in drop_cols if c in df.columns])
