In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    mean_absolute_error,
    mean_absolute_percentage_error,
)
import warnings
from pandas.errors import SettingWithCopyWarning

# suppress pandas copy warnings
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [2]:

# 1. LOAD EXPANDED DATASET & FILTER YEARS
# ---------------------------------------------------
# Read the market-risk CSV with "Date" parsed as datetimes, order the rows
# chronologically, then keep calendar years 2015 through 2023 (inclusive)
# and renumber the index from 0.
mr = pd.read_csv("updated_QQQ_MarketRisk.csv", parse_dates=["Date"])
mr = mr.sort_values("Date")
mr = mr[mr["Date"].dt.year.between(2015, 2023)].reset_index(drop=True)

In [3]:
# Peek at 10 random rows. The seed is fixed so that Restart & Run All shows
# the same sample every time instead of a different draw per execution.
print(mr.sample(10, random_state=42))

           Date  QQQ_Return  QQQ_Realized_Volatility  QQQ_Volume        VIX  \
1164 2019-10-09    0.009762                 0.152848    20147400  18.639999   
1953 2022-11-25   -0.006579                 0.366570    15498800  20.500000   
286  2016-04-13    0.012784                 0.128893    25356700  13.840000   
1189 2019-11-13    0.000248                 0.092099    15774600  13.000000   
83   2015-06-23    0.000813                 0.125085    18127900  12.110000   
118  2015-08-12    0.003450                 0.136909    41889000  13.610000   
1972 2022-12-22   -0.024465                 0.269826    60278400  21.969999   
164  2015-10-16    0.004180                 0.189346    27091100  15.050000   
357  2016-07-25    0.000088                 0.213713    12968100  12.870000   
744  2018-02-07   -0.012938                 0.215796    91028300  27.730000   

         RSI_14        VXN  FedRate  
1164  40.075998  21.879999    1.628  
1953  68.995505  26.790001    4.175  
286   66.140339 

In [4]:
# 2. ENGINEER NEXT-DAY REALIZED VOLATILITY
# ---------------------------------------------------
# Each row's target is the realized volatility of the row that follows it;
# the final row has no successor, so its target is NaN.
mr = mr.assign(NextVol=mr["QQQ_Realized_Volatility"].shift(periods=-1))

In [5]:
# 3. SELECT FEATURES & CLEAN
# ---------------------------------------------------
# Model inputs, in the column order handed to the regressor.
features = [
    "QQQ_Return", "QQQ_Realized_Volatility", "QQQ_Volume",
    "VIX", "RSI_14", "VXN", "FedRate",
]

# Every column the model needs: the inputs plus the target.
required_cols = features + ["NextVol"]

# Force each required column to a numeric dtype (unparseable entries become
# NaN), then discard rows missing any input or the target and renumber.
mr = (
    mr.assign(**{name: pd.to_numeric(mr[name], errors="coerce") for name in required_cols})
    .dropna(subset=required_cols)
    .reset_index(drop=True)
)

In [6]:
# 4. ROLLING 3-YEAR WINDOW → PREDICT ONLY FOR 2020–2023
# ---------------------------------------------------
# Walk-forward evaluation: for every eligible day, fit a fresh XGBoost model on
# the trailing 3 years of rows (strictly before that day) and forecast that
# day's NextVol from that day's features.
TRAIN_WINDOW   = pd.DateOffset(years=3)   # length of the trailing training window
TEST_YEARS     = (2020, 2023)             # inclusive calendar years to score
MIN_TRAIN_ROWS = 200                      # skip days with too little history

results = []
first_date = mr["Date"].min()

# Both eligibility checks are loop-invariant per row, so evaluate them once,
# vectorised, instead of re-testing inside the loop.
in_test_window   = mr["Date"].dt.year.between(*TEST_YEARS)
has_full_history = mr["Date"] >= (first_date + TRAIN_WINDOW)

for idx in mr.index[in_test_window & has_full_history]:
    today = mr.at[idx, "Date"]

    # build train window [today - 3y, today)
    window_start = today - TRAIN_WINDOW
    train_df     = mr[(mr["Date"] >= window_start) & (mr["Date"] < today)]

    # skip if too few training rows
    if len(train_df) < MIN_TRAIN_ROWS:
        continue

    X_train = train_df[features]
    y_train = train_df["NextVol"]

    # Today's features as a one-row DataFrame. Slicing the frame (rather than
    # taking `.values` from an iterrows() row, which is object-dtype because of
    # the Timestamp column) keeps numeric dtypes and the same column names the
    # model was fitted with.
    X_today = mr.loc[[idx], features]
    y_true  = mr.at[idx, "NextVol"]

    # train & predict
    model = xgb.XGBRegressor(
        objective="reg:squarederror",
        eval_metric="rmse",
        n_estimators=50,
        learning_rate=0.05,
        random_state=42,
    )
    model.fit(X_train, y_train, verbose=False)
    y_pred = model.predict(X_today)[0]

    results.append({
        "Date":              today,
        "PredictedNextVol":  y_pred,
        "ActualNextVol":     y_true
    })

In [8]:
# 5. ASSEMBLE & SAVE
# ---------------------------------------------------
# One path variable feeds both the write and the message, so the reported
# filename can never drift from the file actually written. The previous code
# wrote "rolling_4yr_..." while reporting "rolling_3yr_..."; the window is
# 3 years, so the 3yr name is used.
# NOTE(review): anything that reads the old "rolling_4yr_preds_2020_23.csv"
# must be pointed at the new name.
out_path = "rolling_3yr_preds_2020_23.csv"

out_df = pd.DataFrame(results)
out_df.to_csv(out_path, index=False)
print(f"Saved {len(out_df)} predictions → {out_path}")

Saved 1005 predictions → rolling_3yr_preds_2020_23.csv


In [9]:
# 6. COMPUTE TEST METRICS (2020–2023)
# ---------------------------------------------------
# Score the saved walk-forward forecasts against the realised targets.
y_true, y_pred = out_df["ActualNextVol"], out_df["PredictedNextVol"]

mse  = mean_squared_error(y_true, y_pred)
mae  = mean_absolute_error(y_true, y_pred)
mape = mean_absolute_percentage_error(y_true, y_pred)
r2   = r2_score(y_true, y_pred)
rmse = np.sqrt(mse)  # RMSE is derived from MSE rather than recomputed

# Assemble the report as lines and emit it in a single print call.
report_lines = [
    "\n--- Test Metrics (2020–2023) ---",
    f"Test MSE : {mse:.6f}",
    f"Test RMSE: {rmse:.6f}",
    f"Test R²  : {r2:.4f}",
    f"Test MAE : {mae:.6f}",
    f"Test MAPE: {mape:.2%}",
]
print("\n".join(report_lines))


--- Test Metrics (2020–2023) ---
Test MSE : 0.000778
Test RMSE: 0.027893
Test R²  : 0.9521
Test MAE : 0.014613
Test MAPE: 5.91%
