In [19]:
from pathlib import Path

import numpy as np

from ETL.LinearRegressionETL import LinearRegressionETL
from ETL.utils import rmspe

In [20]:
# Input locations, relative to the notebook's working directory: a glob over the
# order-book parquet directory and the CSV passed as the training path.
ORDERBOOK_GLOB = "optiver-realized-volatility-prediction/book_train.parquet/*"
TRAIN_CSV = 'optiver-realized-volatility-prediction/train.csv'

# ETL helper that the following cells use to denormalize prices and compute features.
etl = LinearRegressionETL(orderbook_path=ORDERBOOK_GLOB, train_path=TRAIN_CSV)

In [21]:
%%time

# Run the ETL's price-denormalization step (progress is reported per file).
# The call is the cell's last expression, so whatever it returns is rendered
# as the cell output.
etl.denormalize_prices()

  0%|          | 0/112 [00:40<?, ?it/s]0/112 [00:00<?, ?file/s]
Denormalizing prices:   0%|          | 0/112 [00:40<?, ?file/s]

CPU times: user 167 ms, sys: 189 ms, total: 355 ms
Wall time: 40.7 s





Unnamed: 0,time_id,price,stock_id
0,5,354.254181,17
1,11,336.246613,17
2,16,305.269318,17
3,31,342.323792,17
4,62,331.955383,17
...,...,...,...
3825,32751,449.804657,98
3826,32753,450.801147,98
3827,32758,461.180573,98
3828,32763,386.533691,98


In [22]:
%%time
# Build the feature table consumed by the later cells. Those cells read it as a
# frame indexed by stock id with (at least) `target` and `rel_vol` columns.
features = etl.compute_features()

Computing features: 100%|██████████| 112/112 [01:01<00:00,  1.83file/s]


CPU times: user 1min 10s, sys: 4.63 s, total: 1min 15s
Wall time: 1min 1s


In [23]:
# Baseline: score the raw `rel_vol` feature against `target` with RMSPE, before
# any model is fitted.
# NOTE(review): the message contains a typo ("prediciton"); it is kept verbatim
# so the printed text matches the recorded output.
baseline_rmspe = rmspe(features["target"], features["rel_vol"])
print(f'The RMSPE score of the native prediciton for the training set is {baseline_rmspe}')

The RMSPE score of the native prediciton for the training set is 0.3412102154423102


## OLS (per-stock linear regression, fitted as weighted least squares with 5-fold CV)

In [24]:
import statsmodels.api as sm
from sklearn.model_selection import KFold

In [25]:
# Per-stock weighted least squares, evaluated with 5-fold cross-validation.
#
# For every distinct stock id in the index of `features`, a separate linear model
# (with explicit intercept) is fitted on each CV training split and used to
# predict the held-out split. Results of this cell:
#   * features["pred"]  - out-of-fold prediction for every row
#   * preds             - the same predictions as a flat list (row order of `features`)
#   * cv_scores         - one mean-over-folds RMSPE per stock
#   * ols_summary/stock_<id>_summary.txt - statsmodels summary of the LAST fold's fit
# Note that the printed score is the mean of per-stock mean fold RMSPEs, not the
# RMSPE pooled over all rows.

FEATURE_COLS = ['price', 'rel_vol', 'vol_gk', 'imbalance', 'bidask']
SUMMARY_DIR = Path("ols_summary")

# Initialize KFold CV (fixed seed so the folds are reproducible)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create the output directory up front; writing a summary into a missing
# directory would otherwise raise FileNotFoundError on a fresh checkout.
SUMMARY_DIR.mkdir(parents=True, exist_ok=True)

cv_scores = []

# Out-of-fold predictions, addressed by row position in `features`. Filling by
# position keeps every prediction on its own row even if the rows of one stock
# are not stored contiguously.
oof_preds = np.full(len(features), np.nan, dtype=float)
stock_ids = features.index

# Iterate through each stock
for stock_id in stock_ids.unique():
    # Positional row selection always yields a 2-D X and 1-D y, including for
    # a stock that has a single row.
    row_pos = np.flatnonzero(stock_ids == stock_id)
    stock_rows = features.iloc[row_pos]
    X = stock_rows[FEATURE_COLS].values
    y = stock_rows['target'].values

    # Sample weights 1 / target^2 turn the squared-error objective into a
    # squared *percentage* error objective, matching the RMSPE metric.
    # Only the training rows' weights are ever passed to the model.
    weights = 1 / np.square(y)
    # weights = np.ones(X.shape[0])  # Uncomment this line to use equal weights

    stock_preds = np.zeros_like(y, dtype=float)
    fold_scores = []

    # KFold CV for each stock
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        w_train = weights[train_idx]

        # Add intercept term explicitly; has_constant='add' forces the column
        # even when a feature happens to be constant within the split.
        X_train = sm.add_constant(X_train, has_constant='add')
        X_test = sm.add_constant(X_test, has_constant='add')

        # Train the model with sample weights
        model = sm.WLS(y_train, X_train, weights=w_train)
        results = model.fit()

        # Predict the held-out rows and store them at their within-stock positions
        y_pred = results.predict(X_test)
        stock_preds[test_idx] = y_pred

        # RMSPE on the held-out fold
        fold_rmspe = np.sqrt(np.mean(np.square((y_test - y_pred) / y_test)))
        fold_scores.append(fold_rmspe)

    # Store the stock's CV score and scatter its predictions back to its rows
    cv_scores.append(np.mean(fold_scores))
    oof_preds[row_pos] = stock_preds

    # Save an R-style summary for this stock. `results` is the model fitted in
    # the last CV fold (not a refit on all of the stock's rows).
    with open(SUMMARY_DIR / f"stock_{stock_id}_summary.txt", "w") as f:
        f.write(results.summary().as_text())

# Assign predictions back to DataFrame (positionally aligned with its rows)
features["pred"] = oof_preds

# Flat list of predictions, kept for any later cell that reads `preds`
preds = list(oof_preds)

# Print average CV RMSPE across all stocks
print(f"\nAverage CV RMSPE: {np.mean(cv_scores):.5f}")


Average CV RMSPE: 0.24679
