# Model Prototyping

The goal is to test ideas, debug model architectures, and choose hyperparameters before formalizing the code

## Key Activities:

1. Sanity Check Models

2. Test Data Shapes

3. Hyperparameter Tuning

4. Iterate Rapidly

## 1. Sanity Check Models

In [1]:
import sys
sys.path.append('..')
from src.config import data_paths, model_params
from src.models import fit_arima_model, fit_garch_model, fit_xgboost_model, fit_catboost_model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load final dataset
final_data = pd.read_csv(f'{data_paths["processed_data"]}/final_feature_dataset.csv', parse_dates=['Date'])
final_data.set_index('Date', inplace=True)

# Select one ticker for prototyping
ticker = 'AAPL'
data = final_data[final_data['Ticker'] == ticker].copy()

# Prepare target: next day's volatility
data['Target'] = data['Realized_Volatility'].shift(-1)
data = data.dropna()

# Simple sanity check with ARIMA on returns
returns = data['Returns']
arima_model = fit_arima_model(returns, (5, 1, 0))
print('ARIMA model summary:')
print(arima_model.summary())

# Sanity check with GARCH
garch_model = fit_garch_model(returns, 1, 1)
print('GARCH model summary:')
print(garch_model.summary())

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
INFO:root:ARIMA model fitted with order (5, 1, 0)
estimating the model parameters. The scale of y is 0.000333. Parameter
estimation work better when this value is between 1 and 1000. The recommended
rescaling is 100 * y.

model or by setting rescale=False.

INFO:root:GARCH model fitted with p=1, q=1


ARIMA model summary:
                               SARIMAX Results                                
Dep. Variable:                Returns   No. Observations:                 2960
Model:                 ARIMA(5, 1, 0)   Log Likelihood                7414.250
Date:                Thu, 18 Sep 2025   AIC                         -14816.500
Time:                        19:51:03   BIC                         -14780.544
Sample:                             0   HQIC                        -14803.558
                               - 2960                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.8829      0.013    -66.938      0.000      -0.909      -0.857
ar.L2         -0.7035      0.018    -38.041      0.000      -0.740      -0.667
ar.L3         -0.5424      0.02

## 2. Test Data Shapes

In [2]:
# Prepare features and target
features = [col for col in data.columns if col not in ['Ticker', 'Target', 'Realized_Volatility']]
X = data[features]
y = data['Target']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')
print(f'Features: {features}')

X_train shape: (2368, 41)
X_test shape: (592, 41)
y_train shape: (2368,)
y_test shape: (592,)
Features: ['Close', 'High', 'Low', 'Open', 'Volume', 'SMA_25', 'SMA_50', 'EMA_25', 'EMA_50', 'Returns', 'Close_Lag_1', 'Returns_Lag_1', 'Volatility_Lag_1', 'Close_Lag_2', 'Returns_Lag_2', 'Volatility_Lag_2', 'Close_Lag_3', 'Returns_Lag_3', 'Volatility_Lag_3', 'Close_Lag_5', 'Returns_Lag_5', 'Volatility_Lag_5', 'Close_Lag_10', 'Returns_Lag_10', 'Volatility_Lag_10', 'Rolling_Mean_5', 'Rolling_Std_5', 'Rolling_Skew_5', 'Rolling_Kurt_5', 'Rolling_Mean_10', 'Rolling_Std_10', 'Rolling_Skew_10', 'Rolling_Kurt_10', 'Rolling_Mean_20', 'Rolling_Std_20', 'Rolling_Skew_20', 'Rolling_Kurt_20', 'Rolling_Mean_50', 'Rolling_Std_50', 'Rolling_Skew_50', 'Rolling_Kurt_50']


## 3. Hyperparameter Tuning

In [3]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Simple hyperparameter tuning for XGBoost
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1]
}

xgb = XGBRegressor()
grid_search = GridSearchCV(xgb, param_grid, cv=3, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')

# Fit best model
best_xgb = grid_search.best_estimator_
y_pred = best_xgb.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Test MSE: {mse}')

Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Best score: -0.0004029930511135966
Test MSE: 0.0006535169166552385


## 4. Iterate Rapidly

In [4]:
# Quick iteration with CatBoost
catboost_model = fit_catboost_model(X_train.values, y_train.values, params={'iterations': 50, 'verbose': 0})
y_pred_cat = catboost_model.predict(X_test.values)
mse_cat = mean_squared_error(y_test, y_pred_cat)
print(f'CatBoost Test MSE: {mse_cat}')

# Compare with XGBoost
print(f'XGBoost MSE: {mse}')
print(f'CatBoost MSE: {mse_cat}')
print('Better model:', 'XGBoost' if mse < mse_cat else 'CatBoost')

INFO:root:CatBoost model fitted


CatBoost Test MSE: 0.0008417286213650223
XGBoost MSE: 0.0006535169166552385
CatBoost MSE: 0.0008417286213650223
Better model: XGBoost
