In [1]:
import yfinance as yf
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
spyTicker = yf.Ticker("SPY") # SPDR S&P 500 ETF Trust
vixTicker = yf.Ticker("^VIX") # VIX
df_spy = spyTicker.history(period="max")
df_vix = vixTicker.history(period="max")

In [3]:
df_spy

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1993-01-29 00:00:00-05:00,24.330315,24.330315,24.209269,24.313023,1003200,0.0,0.0,0.0
1993-02-01 00:00:00-05:00,24.330329,24.485960,24.330329,24.485960,480500,0.0,0.0,0.0
1993-02-02 00:00:00-05:00,24.468667,24.555128,24.416790,24.537836,201300,0.0,0.0,0.0
1993-02-03 00:00:00-05:00,24.572414,24.814507,24.555122,24.797215,529400,0.0,0.0,0.0
1993-02-04 00:00:00-05:00,24.883695,24.952865,24.607018,24.900988,531500,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
2025-11-10 00:00:00-05:00,677.239990,682.179993,675.030029,681.440002,75842900,0.0,0.0,0.0
2025-11-11 00:00:00-05:00,679.950012,683.570007,678.729980,683.000000,58953400,0.0,0.0,0.0
2025-11-12 00:00:00-05:00,684.789978,684.960022,680.950012,683.380005,62312500,0.0,0.0,0.0
2025-11-13 00:00:00-05:00,680.500000,680.859985,670.520020,672.039978,103457800,0.0,0.0,0.0


In [4]:
df_vix

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1990-01-02 00:00:00-06:00,17.240000,17.240000,17.240000,17.240000,0,0.0,0.0
1990-01-03 00:00:00-06:00,18.190001,18.190001,18.190001,18.190001,0,0.0,0.0
1990-01-04 00:00:00-06:00,19.219999,19.219999,19.219999,19.219999,0,0.0,0.0
1990-01-05 00:00:00-06:00,20.110001,20.110001,20.110001,20.110001,0,0.0,0.0
1990-01-08 00:00:00-06:00,20.260000,20.260000,20.260000,20.260000,0,0.0,0.0
...,...,...,...,...,...,...,...
2025-11-10 00:00:00-06:00,18.580000,18.820000,17.600000,17.600000,0,0.0,0.0
2025-11-11 00:00:00-06:00,17.900000,18.010000,17.250000,17.280001,0,0.0,0.0
2025-11-12 00:00:00-06:00,17.209999,18.059999,17.100000,17.510000,0,0.0,0.0
2025-11-13 00:00:00-06:00,17.610001,21.309999,17.510000,20.000000,0,0.0,0.0


In [5]:
# SPY features
# daily log price and log-returns for SPY
df_spy["log_close"] = np.log(df_spy["Close"])
df_spy["ret_1d"] = df_spy["log_close"].diff(1)   # daily log-return

# lagged returns (1 and 2 days)
df_spy["ret_lag1"] = df_spy["ret_1d"].shift(1)
df_spy["ret_lag2"] = df_spy["ret_1d"].shift(2)

# rolling statistics of returns
ret_window = 10
df_spy["ret_roll_mean_10"] = df_spy["ret_1d"].rolling(window=ret_window).mean()
df_spy["ret_roll_std_10"] = df_spy["ret_1d"].rolling(window=ret_window).std()

# Technical indicators: moving averages on Close (5, 10, 20 days)
df_spy["ma_5"] = df_spy["Close"].rolling(window=5).mean()
df_spy["ma_10"] = df_spy["Close"].rolling(window=10).mean()
df_spy["ma_20"] = df_spy["Close"].rolling(window=20).mean()

# Rate of change (ROC) – 5 and 10 day
df_spy["roc_5"] = df_spy["Close"].pct_change(periods=5)
df_spy["roc_10"] = df_spy["Close"].pct_change(periods=10)

# Volume-related feature (log-volume + 10-day rolling mean)
df_spy["log_volume"] = np.log(df_spy["Volume"].replace(0, np.nan))
df_spy["vol_roll_mean_10"] = df_spy["log_volume"].rolling(window=10).mean()

In [None]:
# # VIX features
# df_vix["vix_close"] = df_vix["Close"]
# df_vix["vix_log_close"] = np.log(df_vix["vix_close"])
# df_vix["vix_ret_1d"] = df_vix["vix_log_close"].diff(1)
# df_vix["vix_lag1"] = df_vix["vix_log_close"].shift(1)
# df_vix["vix_lag2"] = df_vix["vix_log_close"].shift(2)

# df_vix

# # Align VIX to SPY dates
# df_vix_aligned = df_vix[["vix_close", "vix_log_close", "vix_ret_1d", "vix_lag1", "vix_lag2"]]
# df_vix_aligned = df_vix_aligned.reindex(df_spy.index).ffill()

# df_vix_aligned.tail()

Unnamed: 0_level_0,vix_close,vix_log_close,vix_ret_1d,vix_lag1,vix_lag2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-11-10 00:00:00-05:00,,,,,
2025-11-11 00:00:00-05:00,,,,,
2025-11-12 00:00:00-05:00,,,,,
2025-11-13 00:00:00-05:00,,,,,
2025-11-14 00:00:00-05:00,,,,,


In [7]:
df = pd.concat([df_spy, df_vix_aligned], axis=1)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains,log_close,ret_1d,...,ma_20,roc_5,roc_10,log_volume,vol_roll_mean_10,vix_close,vix_log_close,vix_ret_1d,vix_lag1,vix_lag2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-01-29 00:00:00-05:00,24.330315,24.330315,24.209269,24.313023,1003200,0.0,0.0,0.0,3.191012,,...,,,,13.818705,,,,,,
1993-02-01 00:00:00-05:00,24.330329,24.485960,24.330329,24.485960,480500,0.0,0.0,0.0,3.198100,0.007088,...,,,,13.082583,,,,,,
1993-02-02 00:00:00-05:00,24.468667,24.555128,24.416790,24.537836,201300,0.0,0.0,0.0,3.200216,0.002116,...,,,,12.212552,,,,,,
1993-02-03 00:00:00-05:00,24.572414,24.814507,24.555122,24.797215,529400,0.0,0.0,0.0,3.210731,0.010515,...,,,,13.179500,,,,,,
1993-02-04 00:00:00-05:00,24.883695,24.952865,24.607018,24.900988,531500,0.0,0.0,0.0,3.214907,0.004176,...,,,,13.183458,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-11-10 00:00:00-05:00,677.239990,682.179993,675.030029,681.440002,75842900,0.0,0.0,0.0,6.524208,0.015484,...,674.614499,-0.002780,-0.005545,18.144175,18.163432,,,,,
2025-11-11 00:00:00-05:00,679.950012,683.570007,678.729980,683.000000,58953400,0.0,0.0,0.0,6.526495,0.002287,...,675.653000,0.011492,-0.005909,17.892258,18.158817,,,,,
2025-11-12 00:00:00-05:00,684.789978,684.960022,680.950012,683.380005,62312500,0.0,0.0,0.0,6.527051,0.000556,...,676.563501,0.008560,-0.005834,17.947673,18.126998,,,,,
2025-11-13 00:00:00-05:00,680.500000,680.859985,670.520020,672.039978,103457800,0.0,0.0,0.0,6.510318,-0.016733,...,677.133499,0.002581,-0.011459,18.454674,18.157400,,,,,
