In [21]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import r2_score

In [22]:
EPS = 1e-10     

In [23]:
def qlike(actual, pred, eps=EPS):
    actual = np.maximum(actual, eps)
    pred   = np.maximum(pred, eps)
    ratio  = actual / pred
    return np.mean(ratio - np.log(ratio) - 1.0)

In [24]:
def make_har_features(df, rv_col="rv", d=1, w=5, m=22):
    out = df.copy()
    out["rv_d"] = out[rv_col].shift(d)
    out["rv_w"] = out[rv_col].rolling(w).mean().shift(1)
    out["rv_m"] = out[rv_col].rolling(m).mean().shift(1)
    return out.dropna()

In [25]:
def fit_har_rv(df, rv_col="rv", test_size=0.2, weighted=False):
    df_feat = make_har_features(df, rv_col)

    df_feat = df_feat.sort_values(["stock_id", "time_id"]).reset_index(drop=True)

    y = df_feat[rv_col]
    X = sm.add_constant(df_feat[["rv_d", "rv_w", "rv_m"]])

    if "stock_id" in df_feat.columns:
        test_idx = []
        for stock_id, group in df_feat.groupby("stock_id"):
            n = len(group)
            cutoff = int(n * (1.0 - test_size))
            test_idx.extend(group.index[cutoff:])
        test_idx = sorted(test_idx)
        train_idx = sorted(set(df_feat.index) - set(test_idx))
    else:
        split = int(len(df_feat) * (1.0 - test_size))
        train_idx = range(split)
        test_idx = range(split, len(df_feat))

    X_train, X_test = X.loc[train_idx], X.loc[test_idx]
    y_train, y_test = y.loc[train_idx], y.loc[test_idx]

    if weighted:
        w = 1.0 / np.square(np.clip(y_train, EPS, None))
        mask = np.isfinite(w)
        X_train, y_train, w = X_train[mask], y_train[mask], w[mask]
        model = sm.WLS(y_train, X_train, weights=w).fit()
    else:
        model = sm.OLS(y_train, X_train).fit()

    y_pred_log = model.predict(X_test)
    metrics = {
        "R2"   : r2_score(np.exp(y_test).clip(EPS), np.exp(y_pred_log).clip(EPS)),
        "QLIKE": qlike(np.exp(y_test).clip(EPS), np.exp(y_pred_log).clip(EPS)),
        "coef" : model.params,
    }
    return model, metrics, y_test, y_pred_log

In [26]:
df = pd.read_parquet("/Users/ayush/Documents/University/Year 03/Sem 01/DATA3888/Optiver-07/Data/FE30Stocks.parquet")

In [27]:
df = df.rename(columns={"rv_future": "rv"})
df["log_rv"] = np.log(df["rv"].clip(lower=EPS))

In [None]:
model, metrics, y_test_log, y_pred_log = fit_har_rv(
    df,
    rv_col="log_rv",
    test_size=0.2,
    weighted=False # OLS
)

In [29]:
print(f"β̂ (const, daily, weekly, monthly):\n{metrics['coef']}")

β̂ (const, daily, weekly, monthly):
const    0.000050
rv_d     0.795511
rv_w     0.213496
rv_m    -0.055662
dtype: float64


In [34]:
print("Out-of-sample R²   :", metrics["R2"])
print("Out-of-sample QLIKE:", metrics["QLIKE"])

Out-of-sample R²   : 0.9070958975241714
Out-of-sample QLIKE: 1183.3793396368822
