In [2]:
import itertools
import math
import numpy as np
import pandas as pd

try:
    from scipy.stats import norm
    _has_scipy = True
except Exception:
    _has_scipy = False


In [3]:
base_path       = "../Data/btc_final_df.csv"
lstm_path       = "../Results/btc_lstm_prediction.csv"
egarch_path     = "../Results/btc_egarch_prediction.csv"
xgb_path        = "../Results/btc_xgb_prediction.csv"
vanilla_t_path  = "../Results/btc_transformer_prediction.csv"
# dense_t_path = 
# random_t_path = 


H = 1  # forecast horizon for DM (h-step ahead)
ASSUME_INPUTS_ARE_VOL = True  # True if columns are volatility (sigma), False if already variance 
SAVE_DM_CSV = "../Results/btc_dm_results.csv"  

def load_actual(base_path):
    df = pd.read_csv(base_path, parse_dates=["timestamp"])
    need = ["timestamp", "vol_future"]
    missing = [c for c in need if c not in df.columns]
    if missing:
        raise ValueError("Missing columns in base file: " + ", ".join(missing))
    df = df[need].copy()
    if df["timestamp"].dt.tz is None:
        df["timestamp"] = df["timestamp"].dt.tz_localize("UTC")
    df = df.drop_duplicates(subset="timestamp").sort_values("timestamp")
    return df.rename(columns={"vol_future": "actual_vol"})

def load_pred(path, new_col_name):
    d = pd.read_csv(path, parse_dates=["timestamp"])
    if d["timestamp"].dt.tz is None:
        d["timestamp"] = d["timestamp"].dt.tz_localize("UTC")
    pred_col = None
    for c in d.columns:
        if c.lower() in {"pred_vol_future", "predicted_volatility"}:
            pred_col = c
            break
    if pred_col is None:
        raise ValueError("Could not find prediction column in " + path)
    d = d[["timestamp", pred_col]].rename(columns={pred_col: new_col_name})
    return d.drop_duplicates(subset="timestamp").sort_values("timestamp")


In [4]:
actual = load_actual(base_path)

pred_dfs = []
for path, name in [
    (lstm_path, "pred_lstm"),
    (egarch_path, "pred_egarch"),
    (xgb_path, "pred_xgb"),
    (vanilla_t_path, "pred_vanilla_t"),
    #(dense_t_path, "pred_dense_t"),
    #(random_t_path, "pred_random_t")
]:
    pred_dfs.append(load_pred(path, name))
    

df = actual.copy()
for p in pred_dfs:
    df = df.merge(p, on="timestamp", how="inner")

df = df.sort_values("timestamp").drop_duplicates(subset=["timestamp"]).reset_index(drop=True)

print("=== Joined Data Preview (first 10 rows) ===")
display(df.head(10))
print("[info] shape:", df.shape)
print("[info] columns:", list(df.columns))


=== Joined Data Preview (first 10 rows) ===


Unnamed: 0,timestamp,actual_vol,pred_lstm,pred_egarch,pred_xgb,pred_vanilla_t
0,2025-08-23 16:00:00+00:00,-6.557003,0.002926,0.002871,-5.891117,0.002761
1,2025-08-23 17:00:00+00:00,-6.819089,0.002479,0.00271,-6.006941,0.002204
2,2025-08-23 18:00:00+00:00,-6.679108,0.002281,0.002592,-6.18061,0.002341
3,2025-08-23 19:00:00+00:00,-6.530043,0.002172,0.002433,-6.333597,0.002042
4,2025-08-23 20:00:00+00:00,-6.561964,0.001964,0.002287,-6.383061,0.002099
5,2025-08-23 21:00:00+00:00,-6.342669,0.001962,0.002212,-6.356463,0.001603
6,2025-08-23 22:00:00+00:00,-6.549995,0.002017,0.002135,-6.294103,0.001521
7,2025-08-23 23:00:00+00:00,-6.508754,0.002176,0.002149,-6.299031,0.001675
8,2025-08-24 00:00:00+00:00,-5.929077,0.002242,0.002128,-6.289393,0.001695
9,2025-08-24 01:00:00+00:00,-6.038523,0.002154,0.001987,-6.2803,0.001638


[info] shape: (1208, 6)
[info] columns: ['timestamp', 'actual_vol', 'pred_lstm', 'pred_egarch', 'pred_xgb', 'pred_vanilla_t']


In [5]:
def squared_error(y, yhat):
    return (y - yhat) ** 2

def qlike(y, yhat, inputs_are_volatility=True, eps=1e-12):
    if inputs_are_volatility:
        y_var = (y ** 2).astype(float)
        yhat_var = (yhat ** 2).astype(float)
    else:
        y_var = y.astype(float)
        yhat_var = yhat.astype(float)
    yhat_var = np.clip(yhat_var, eps, None)
    return np.log(yhat_var) + (y_var / yhat_var)

model_cols = [c for c in df.columns if c.startswith("pred_")]
for mc in model_cols:
    se_col = mc.replace("pred_", "") + "_se"
    ql_col = mc.replace("pred_", "") + "_qlike"
    df[se_col] = squared_error(df["actual_vol"], df[mc])
    df[ql_col] = qlike(df["actual_vol"], df[mc], inputs_are_volatility=ASSUME_INPUTS_ARE_VOL)

print("=== Loss Columns Preview (first 10 rows) ===")
loss_cols = ["timestamp", "actual_vol"] + model_cols + \
            [mc.replace("pred_", "") + "_se" for mc in model_cols] + \
            [mc.replace("pred_", "") + "_qlike" for mc in model_cols]
display(df[loss_cols].head(10))


=== Loss Columns Preview (first 10 rows) ===


Unnamed: 0,timestamp,actual_vol,pred_lstm,pred_egarch,pred_xgb,pred_vanilla_t,lstm_se,egarch_se,xgb_se,vanilla_t_se,lstm_qlike,egarch_qlike,xgb_qlike,vanilla_t_qlike
0,2025-08-23 16:00:00+00:00,-6.557003,0.002926,0.002871,-5.891117,0.002761,43.032668,43.031951,0.443405,43.030506,5022866.0,5215784.0,4.785732,5640270.0
1,2025-08-23 17:00:00+00:00,-6.819089,0.002479,0.00271,-6.006941,0.002204,46.533788,46.536938,0.659585,46.53004,7567796.0,6333098.0,4.874514,9572092.0
2,2025-08-23 18:00:00+00:00,-6.679108,0.002281,0.002592,-6.18061,0.002341,44.640949,44.645116,0.2485,44.641758,8577005.0,6637901.0,4.810649,8138928.0
3,2025-08-23 19:00:00+00:00,-6.530043,0.002172,0.002433,-6.333597,0.002042,42.669828,42.673238,0.038591,42.668139,9040582.0,7204950.0,4.754732,10221610.0
4,2025-08-23 20:00:00+00:00,-6.561964,0.001964,0.002287,-6.383061,0.002099,43.085157,43.089391,0.032006,43.086924,11160150.0,8234207.0,4.764137,9774480.0
5,2025-08-23 21:00:00+00:00,-6.342669,0.001962,0.002212,-6.356463,0.001603,40.254351,40.257514,0.00019,40.249786,10446990.0,8224783.0,4.694609,15663020.0
6,2025-08-23 22:00:00+00:00,-6.549995,0.002017,0.002135,-6.294103,0.001521,42.928866,42.93041,0.065481,42.922364,10542720.0,9411243.0,4.762191,18543090.0
7,2025-08-23 23:00:00+00:00,-6.508754,0.002176,0.002149,-6.299031,0.001675,42.392208,42.391861,0.043984,42.38568,8947713.0,9170825.0,4.748489,15107060.0
8,2025-08-24 00:00:00+00:00,-5.929077,0.002242,0.002128,-6.289393,0.001695,35.180542,35.17919,0.129828,35.174047,6993273.0,7762337.0,4.566432,12242660.0
9,2025-08-24 01:00:00+00:00,-6.038523,0.002154,0.001987,-6.2803,0.001638,36.489782,36.487766,0.058456,36.483542,7857119.0,9231528.0,4.599322,13594320.0


In [6]:
def _nw_longrun_var(d, h):
    d = np.asarray(d, dtype=float)
    T = d.size
    d = d - d.mean()
    def acov(k):
        return np.dot(d[:T - k], d[k:]) / T
    gamma0 = acov(0)
    q = max(h - 1, 0)
    lrv = gamma0
    for k in range(1, q + 1):
        w = 1.0 - k / (q + 1.0)
        lrv += 2.0 * w * acov(k)
    return lrv

def dm_test_from_losses(loss_a, loss_b, h=1):
    d = (loss_a - loss_b).dropna().to_numpy()
    T = d.size
    if T < 5:
        raise ValueError("Not enough overlapping observations for DM test.")
    dbar = d.mean()
    lrv = _nw_longrun_var(d, h)
    if lrv <= 0:
        lrv = np.var(d, ddof=1)
    dm = dbar / math.sqrt(lrv / T)
    hln = math.sqrt((T + 1 - 2 * h + (h * (h - 1)) / T) / T)
    dm_hln = dm * hln
    if _has_scipy:
        pval = 2.0 * (1.0 - norm.cdf(abs(dm_hln)))
    else:
        def norm_cdf(x):
            return 0.5 * (1.0 + math.erf(x / math.sqrt(2)))
        pval = 2.0 * (1.0 - norm_cdf(abs(dm_hln)))
    return dm_hln, pval, dbar


In [7]:
# build maps
name_map = {mc: mc.replace("pred_", "") for mc in model_cols}
se_map = {name_map[mc]: name_map[mc] + "_se" for mc in model_cols}
ql_map = {name_map[mc]: name_map[mc] + "_qlike" for mc in model_cols}

# pairwise comparisons
results = []
pairs = list(itertools.combinations(name_map.values(), 2))
for a, b in pairs:
    stat_se, p_se, mean_diff_se = dm_test_from_losses(df[se_map[a]], df[se_map[b]], h=H)
    stat_ql, p_ql, mean_diff_ql = dm_test_from_losses(df[ql_map[a]], df[ql_map[b]], h=H)
    results.append({
        "loss": "SE",
        "model_A": a, "model_B": b,
        "T": int(df[[se_map[a], se_map[b]]].dropna().shape[0]),
        "dm_stat": stat_se, "p_value": p_se, "mean_diff_A_minus_B": mean_diff_se,
        "better_model": b if mean_diff_se > 0 else a if mean_diff_se < 0 else "tie"
    })
    results.append({
        "loss": "QLIKE",
        "model_A": a, "model_B": b,
        "T": int(df[[ql_map[a], ql_map[b]]].dropna().shape[0]),
        "dm_stat": stat_ql, "p_value": p_ql, "mean_diff_A_minus_B": mean_diff_ql,
        "better_model": b if mean_diff_ql > 0 else a if mean_diff_ql < 0 else "tie"
    })

dm_df = pd.DataFrame(results)

# save combined 
if SAVE_DM_CSV:
    dm_df.sort_values(["loss", "p_value"]).reset_index(drop=True).to_csv(SAVE_DM_CSV, index=False)
    print("[info] saved DM test summary to", SAVE_DM_CSV)


dm_se = dm_df[dm_df["loss"] == "SE"].sort_values("p_value").reset_index(drop=True)
dm_ql = dm_df[dm_df["loss"] == "QLIKE"].sort_values("p_value").reset_index(drop=True)

print("=== Diebold–Mariano Results (SE / RMSE basis) ===")
display(dm_se)

print("=== Diebold–Mariano Results (QLIKE basis) ===")
display(dm_ql)


[info] saved DM test summary to ../Results/btc_dm_results.csv
=== Diebold–Mariano Results (SE / RMSE basis) ===


Unnamed: 0,loss,model_A,model_B,T,dm_stat,p_value,mean_diff_A_minus_B,better_model
0,SE,lstm,xgb,1208,167.972486,0.0,35.314581,xgb
1,SE,egarch,xgb,1208,167.884103,0.0,35.317851,xgb
2,SE,xgb,vanilla_t,1208,-167.986288,0.0,-35.315442,xgb
3,SE,lstm,egarch,1208,-8.213606,2.220446e-16,-0.003269,lstm
4,SE,egarch,vanilla_t,1208,5.717744,1.079477e-08,0.002408,vanilla_t
5,SE,lstm,vanilla_t,1208,-3.361894,0.0007740982,-0.000861,lstm


=== Diebold–Mariano Results (QLIKE basis) ===


Unnamed: 0,loss,model_A,model_B,T,dm_stat,p_value,mean_diff_A_minus_B,better_model
0,QLIKE,lstm,egarch,1208,13.57042,0.0,1336923.0,egarch
1,QLIKE,lstm,xgb,1208,35.32306,0.0,5543392.0,xgb
2,QLIKE,egarch,xgb,1208,40.121981,0.0,4206469.0,xgb
3,QLIKE,egarch,vanilla_t,1208,-11.730473,0.0,-1142924.0,egarch
4,QLIKE,xgb,vanilla_t,1208,-36.398931,0.0,-5349393.0,xgb
5,QLIKE,lstm,vanilla_t,1208,2.869157,0.004116,193998.4,vanilla_t
