In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import r2_score

In [2]:
def qlike_loss(actual, pred, eps=1e-12):
    """Mean QLIKE loss between realised and predicted values.

    The loss is ``mean(r - log(r) - 1)`` with ``r = actual / pred``; it is zero
    when the two inputs agree element-wise and positive otherwise.

    Parameters
    ----------
    actual, pred : array-like
        Realised and predicted values. They are paired by position (normal
        NumPy broadcasting applies), never by pandas index label.
    eps : float, default 1e-12
        Lower bound applied to both inputs so that the ratio and its logarithm
        stay finite when a value is zero or negative.

    Returns
    -------
    float
        The average loss over all elements.
    """
    # Convert to plain float64 arrays first: dividing pandas objects aligns on
    # index labels, which yields NaN (or silently re-paired values) whenever the
    # two inputs do not share an identical index.
    actual_arr = np.maximum(np.asarray(actual, dtype=np.float64), eps)
    pred_arr = np.maximum(np.asarray(pred, dtype=np.float64), eps)
    ratio = actual_arr / pred_arr
    return np.mean(ratio - np.log(ratio) - 1.0)

In [3]:
# Columns used as regressors, in the order they enter the design matrix.
# NOTE(review): 'stock_id' is included as an ordinary numeric regressor —
# confirm that treating the identifier as a continuous value is intended.
feature_cols = [
    'stock_id',
    'mid_price', 'spread', 'imbalance', 'book_pressure', 'LOB_entropy',
    'log_return', 'bipower_var', 'log_wap_return',
    'imbalance_lag1', 'imbalance_lag2',
    'book_pressure_lag1', 'book_pressure_lag2',
    'log_return_lag1', 'log_return_lag2',
    'rolling_vol_30', 'rolling_imbalance_mean_30',
    'sec_sin', 'sec_cos',
    'bid_size1_log', 'ask_size1_log', 'bid_size2_log', 'ask_size2_log',
]

# Column the model is fitted to predict.
target_col = 'rv_future'

In [4]:
# Location of the input parquet file. Set the FE30_PARQUET_PATH environment
# variable to run the notebook on another machine; when it is unset the
# original hard-coded local path is used, so existing runs are unaffected.
DATA_PATH = Path(os.environ.get(
    "FE30_PARQUET_PATH",
    "/Users/ayush/Documents/University/Year 03/Sem 01/DATA3888/Optiver-07/Data/FE30Stocks.parquet",
))
df = pd.read_parquet(DATA_PATH)

In [5]:
# Regressors and target, both cast to float32.
X = df[feature_cols].astype('float32')
y = df[target_col].astype('float32')

# WLS observation weights: the reciprocal of the trailing 2000-row sample
# variance of the target.
# NOTE(review): the window runs over rows in frame order; if the frame stacks
# several stocks, windows will span stock boundaries — confirm this is intended.
_overall_var = y.var()
_rolling_var = y.rolling(2000, min_periods=1).var()

# Fall back to the overall variance wherever the rolling value is unusable:
#   * NaN  — the very first row (the variance of one observation is undefined);
#   * zero — a window of identical values, which would otherwise produce an
#            infinite weight and break the weighted fit.
# Every row with a positive rolling variance keeps exactly the same weight.
w = 1.0 / _rolling_var.where(_rolling_var > 0, _overall_var)

In [6]:
# Positional hold-out split: the leading 80 % of rows are used for fitting and
# the remaining rows for evaluation. Rows are not shuffled.
split_idx = int(0.8 * len(df))

X_train, y_train, w_train = (part.iloc[:split_idx] for part in (X, y, w))
X_test, y_test, w_test = (part.iloc[split_idx:] for part in (X, y, w))

In [7]:
# Add an intercept column to both design matrices. has_constant='add' makes
# statsmodels add it even if an existing column already happens to be constant.
X_train_c, X_test_c = (
    sm.add_constant(features, has_constant='add')
    for features in (X_train, X_test)
)

In [8]:
# Fit weighted least squares on the training rows and show the fit report.
model = sm.WLS(endog=y_train, exog=X_train_c, weights=w_train)
results = model.fit()

fit_report = results.summary()
print(fit_report)

                            WLS Regression Results                            
Dep. Variable:              rv_future   R-squared:                       0.168
Model:                            WLS   Adj. R-squared:                  0.168
Method:                 Least Squares   F-statistic:                 3.108e+05
Date:                Sat, 17 May 2025   Prob (F-statistic):               0.00
Time:                        22:36:08   Log-Likelihood:             1.9092e+08
No. Observations:            35512899   AIC:                        -3.818e+08
Df Residuals:                35512875   BIC:                        -3.818e+08
Df Model:                          23                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                 

In [9]:
# Predictions of the fitted model for the held-out rows.
y_pred = results.predict(exog=X_test_c)

In [10]:
# Score the held-out predictions: QLIKE (defined earlier in this notebook)
# and the coefficient of determination.
qlike = qlike_loss(y_test.values, y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f"Out-of-sample R²   : {r2:0.4f}",
    f"Out-of-sample QLIKE: {qlike:0.6f}",
    sep="\n",
)

Out-of-sample R²   : 0.1759
Out-of-sample QLIKE: 0.220704
