In [1]:
import itertools
import math
import numpy as np
import pandas as pd

try:
    from scipy.stats import norm
    _has_scipy = True
except Exception:
    _has_scipy = False


Load dataframe and join data 

In [2]:
base_path       = "../Data/btc_final_df.csv"
egarch_path     = "../Results/btc_egarch_prediction.csv"
lstm_path       = "../Results/btc_lstm_prediction.csv"
xgb_path        = "../Results/btc_xgb_prediction.csv"
vanilla_t_path  = "../Results/btc_transformer_prediction.csv"
dense_t_path = "../Results/btc_transformer_dense_prediction.csv"
random_t_path = "../Results/btc_transformer_random_prediction.csv"


H = 1  # forecast horizon for DM (h-step ahead)
ASSUME_INPUTS_ARE_VOL = True  # True if columns are volatility (sigma), False if already variance 
SAVE_DM_CSV = "../Results/btc_dm_results.csv"  

def load_actual(base_path):
    df = pd.read_csv(base_path, parse_dates=["timestamp"])
    need = ["timestamp", "vol_future"]
    missing = [c for c in need if c not in df.columns]
    if missing:
        raise ValueError("Missing columns in base file: " + ", ".join(missing))
    df = df[need].copy()
    if df["timestamp"].dt.tz is None:
        df["timestamp"] = df["timestamp"].dt.tz_localize("UTC")
    df = df.drop_duplicates(subset="timestamp").sort_values("timestamp")
    return df.rename(columns={"vol_future": "actual_vol"})

def load_pred(path, new_col_name):
    d = pd.read_csv(path, parse_dates=["timestamp"])
    if d["timestamp"].dt.tz is None:
        d["timestamp"] = d["timestamp"].dt.tz_localize("UTC")
    pred_col = None
    for c in d.columns:
        if c.lower() in {"pred_vol_future", "predicted_volatility", "predicted"}:
            pred_col = c
            break
    if pred_col is None:
        raise ValueError("Could not find prediction column in " + path)
    d = d[["timestamp", pred_col]].rename(columns={pred_col: new_col_name})
    return d.drop_duplicates(subset="timestamp").sort_values("timestamp")

actual = load_actual(base_path)

pred_dfs = []
for path, name in [
    (egarch_path, "pred_egarch"),
    (xgb_path, "pred_xgb"),
    (lstm_path, "pred_lstm"),
    (vanilla_t_path, "pred_vanilla_t"),
    (dense_t_path, "pred_dense_t"),
    (random_t_path, "pred_random_t")
]:
    pred_dfs.append(load_pred(path, name))
    

df = actual.copy()
for p in pred_dfs:
    df = df.merge(p, on="timestamp", how="inner")

df = df.sort_values("timestamp").drop_duplicates(subset=["timestamp"]).reset_index(drop=True)

print("=== Joined Data Preview (first 10 rows) ===")
display(df.head(10))
print("[info] shape:", df.shape)
print("[info] columns:", list(df.columns))

=== Joined Data Preview (first 10 rows) ===


Unnamed: 0,timestamp,actual_vol,pred_egarch,pred_xgb,pred_lstm,pred_vanilla_t,pred_dense_t,pred_random_t
0,2025-08-22 17:00:00+00:00,-4.97706,0.009963,-5.077758,-5.275604,0.006482,0.006052,1.0
1,2025-08-22 18:00:00+00:00,-5.685779,0.008877,-5.30043,-5.425174,0.004094,0.004495,1.0
2,2025-08-22 19:00:00+00:00,-5.964853,0.007697,-5.21711,-5.316171,0.005164,0.005559,1.0
3,2025-08-22 20:00:00+00:00,-5.934881,0.006712,-5.439029,-5.512246,0.003481,0.003988,1.0
4,2025-08-22 21:00:00+00:00,-5.818036,0.006079,-5.692475,-5.676215,0.003132,0.003309,1.0
5,2025-08-22 22:00:00+00:00,-6.370835,0.005446,-5.666158,-5.68663,0.002885,0.003076,1.0
6,2025-08-22 23:00:00+00:00,-5.801953,0.005306,-5.595378,-5.726196,0.00291,0.00294,1.0
7,2025-08-23 00:00:00+00:00,-5.839813,0.004814,-5.901203,-5.916484,0.002316,0.002447,1.0
8,2025-08-23 01:00:00+00:00,-5.34304,0.004268,-5.722793,-5.78728,0.002842,0.002963,1.0
9,2025-08-23 02:00:00+00:00,-5.839386,0.00442,-5.810834,-5.814778,0.002759,0.002848,1.0


[info] shape: (1231, 8)
[info] columns: ['timestamp', 'actual_vol', 'pred_egarch', 'pred_xgb', 'pred_lstm', 'pred_vanilla_t', 'pred_dense_t', 'pred_random_t']


Define Squared Error and QLIKE 

In [3]:
def squared_error(y, yhat):
    return (y - yhat) ** 2

def qlike(y, yhat, inputs_are_volatility=True, eps=1e-12):
    if inputs_are_volatility:
        y_var = (y ** 2).astype(float)
        yhat_var = (yhat ** 2).astype(float)
    else:
        y_var = y.astype(float)
        yhat_var = yhat.astype(float)
    yhat_var = np.clip(yhat_var, eps, None)
    return np.log(yhat_var) + (y_var / yhat_var)

model_cols = [c for c in df.columns if c.startswith("pred_")]
for mc in model_cols:
    se_col = mc.replace("pred_", "") + "_se"
    ql_col = mc.replace("pred_", "") + "_qlike"
    df[se_col] = squared_error(df["actual_vol"], df[mc])
    df[ql_col] = qlike(df["actual_vol"], df[mc], inputs_are_volatility=ASSUME_INPUTS_ARE_VOL)

print("=== Loss Columns Preview (first 10 rows) ===")
loss_cols = ["timestamp", "actual_vol"] + model_cols + \
            [mc.replace("pred_", "") + "_se" for mc in model_cols] + \
            [mc.replace("pred_", "") + "_qlike" for mc in model_cols]
display(df[loss_cols].head(10))


=== Loss Columns Preview (first 10 rows) ===


Unnamed: 0,timestamp,actual_vol,pred_egarch,pred_xgb,pred_lstm,pred_vanilla_t,pred_dense_t,pred_random_t,egarch_se,xgb_se,lstm_se,vanilla_t_se,dense_t_se,random_t_se,egarch_qlike,xgb_qlike,lstm_qlike,vanilla_t_qlike,dense_t_qlike,random_t_qlike
0,2025-08-22 17:00:00+00:00,-4.97706,0.009963,-5.077758,-5.275604,0.006482,0.006052,1.0,24.870402,0.01014,0.089129,24.835686,24.831406,35.725245,249521.5,4.21047,4.21621,589611.5,676251.5,24.771124
1,2025-08-22 18:00:00+00:00,-5.685779,0.008877,-5.30043,-5.425174,0.004094,0.004495,1.0,32.42911,0.148494,0.067915,32.374653,32.37922,44.699643,410226.8,4.486264,4.48048,1929012.0,1599948.0,32.328083
2,2025-08-22 19:00:00+00:00,-5.964853,0.007697,-5.21711,-5.316171,0.005164,0.005559,1.0,35.671352,0.559119,0.420789,35.641099,35.645822,48.509176,600546.5,4.61108,4.600437,1334303.0,1151188.0,35.579468
3,2025-08-22 20:00:00+00:00,-5.934881,0.006712,-5.439029,-5.512246,0.003481,0.003988,1.0,35.302535,0.24587,0.178621,35.264141,35.270173,48.092579,781753.4,4.577843,4.573167,2907590.0,2214232.0,35.222815
4,2025-08-22 21:00:00+00:00,-5.818036,0.006079,-5.692475,-5.676215,0.003132,0.003309,1.0,33.920321,0.015766,0.020113,33.886,33.888061,46.48562,915938.8,4.522892,4.523164,3450967.0,3091506.0,33.849546
5,2025-08-22 22:00:00+00:00,-6.370835,0.005446,-5.666158,-5.68663,0.002885,0.003076,1.0,40.656955,0.496569,0.468136,40.624299,40.626738,54.329204,1368478.0,4.733221,4.731348,4877412.0,4289535.0,40.587533
6,2025-08-22 23:00:00+00:00,-5.801953,0.005306,-5.595378,-5.726196,0.00291,0.00294,1.0,33.724263,0.042673,0.005739,33.696438,33.696785,46.266566,1195480.0,4.519083,4.516738,3974612.0,3894095.0,33.662658
7,2025-08-23 00:00:00+00:00,-5.839813,0.004814,-5.901203,-5.916484,0.002316,0.002447,1.0,34.159658,0.003769,0.005879,34.130463,34.131995,46.783037,1471680.0,4.529615,4.529735,6359510.0,5696214.0,34.10341
8,2025-08-23 01:00:00+00:00,-5.34304,0.004268,-5.722793,-5.78728,0.002842,0.002963,1.0,28.593702,0.144212,0.197349,28.578453,28.579752,40.23416,1567446.0,4.360601,4.363694,3535578.0,3251568.0,28.548078
9,2025-08-23 02:00:00+00:00,-5.839386,0.00442,-5.810834,-5.814778,0.002759,0.002848,1.0,34.150067,0.000815,0.000606,34.13066,34.1317,46.777205,1745681.0,4.5293,4.529287,4479970.0,4204105.0,34.098431


Create DM helpers 

In [4]:
# Compute a robust variance of that difference series (accounting for autocorrelation)
def _nw_longrun_var(d, h):
    d = np.asarray(d, dtype=float)
    T = d.size
    d = d - d.mean()
    def acov(k):
        return np.dot(d[:T - k], d[k:]) / T
    gamma0 = acov(0)
    q = max(h - 1, 0)
    lrv = gamma0
    for k in range(1, q + 1):
        w = 1.0 - k / (q + 1.0)
        lrv += 2.0 * w * acov(k)
    return lrv

#Takes two loss series (e.g. squared error or QLIKE) from two models and computes DM statistic and p-value

def dm_test_from_losses(loss_a, loss_b, h=1):
    d = (loss_a - loss_b).dropna().to_numpy()
    T = d.size
    if T < 5:
        raise ValueError("Not enough overlapping observations for DM test.")
    dbar = d.mean()
    lrv = _nw_longrun_var(d, h)
    if lrv <= 0:
        lrv = np.var(d, ddof=1)
    dm = dbar / math.sqrt(lrv / T)
    hln = math.sqrt((T + 1 - 2 * h + (h * (h - 1)) / T) / T)
    dm_hln = dm * hln
    if _has_scipy:
        pval = 2.0 * (1.0 - norm.cdf(abs(dm_hln)))
    else:
        def norm_cdf(x):
            return 0.5 * (1.0 + math.erf(x / math.sqrt(2)))
        pval = 2.0 * (1.0 - norm_cdf(abs(dm_hln)))
    return dm_hln, pval, dbar


Run pairwise DM tests

In [6]:
# build maps
name_map = {mc: mc.replace("pred_", "") for mc in model_cols}
se_map = {name_map[mc]: name_map[mc] + "_se" for mc in model_cols}
ql_map = {name_map[mc]: name_map[mc] + "_qlike" for mc in model_cols}

# pairwise comparisons
results = []
pairs = list(itertools.combinations(name_map.values(), 2))
for a, b in pairs:
    stat_se, p_se, mean_diff_se = dm_test_from_losses(df[se_map[a]], df[se_map[b]], h=H)
    stat_ql, p_ql, mean_diff_ql = dm_test_from_losses(df[ql_map[a]], df[ql_map[b]], h=H)
    results.append({
        "loss": "SE",
        "model_A": a, "model_B": b,
        "T": int(df[[se_map[a], se_map[b]]].dropna().shape[0]),
        "dm_stat": stat_se, "p_value": p_se, "mean_diff_A_minus_B": mean_diff_se,
        "better_model": b if mean_diff_se > 0 else a if mean_diff_se < 0 else "tie"
    })
    results.append({
        "loss": "QLIKE",
        "model_A": a, "model_B": b,
        "T": int(df[[ql_map[a], ql_map[b]]].dropna().shape[0]),
        "dm_stat": stat_ql, "p_value": p_ql, "mean_diff_A_minus_B": mean_diff_ql,
        "better_model": b if mean_diff_ql > 0 else a if mean_diff_ql < 0 else "tie"
    })

dm_df = pd.DataFrame(results)

# save combined 
if SAVE_DM_CSV:
    dm_df.sort_values(["loss", "p_value"]).reset_index(drop=True).to_csv(SAVE_DM_CSV, index=False)
    print("[info] saved DM test summary to", SAVE_DM_CSV)


dm_se = dm_df[dm_df["loss"] == "SE"].sort_values("p_value").reset_index(drop=True)
dm_ql = dm_df[dm_df["loss"] == "QLIKE"].sort_values("p_value").reset_index(drop=True)

print("=== Diebold–Mariano Results (Squared Error basis) ===")
display(dm_se)

print("=== Diebold–Mariano Results (QLIKE basis) ===")
display(dm_ql)


[info] saved DM test summary to ../Results/btc_dm_results.csv
=== Diebold–Mariano Results (Squared Error basis) ===


Unnamed: 0,loss,model_A,model_B,T,dm_stat,p_value,mean_diff_A_minus_B,better_model
0,SE,egarch,xgb,1231,170.487802,0.0,35.320531,xgb
1,SE,egarch,lstm,1231,168.673214,0.0,35.338923,lstm
2,SE,egarch,vanilla_t,1231,21.041578,0.0,0.008127,vanilla_t
3,SE,egarch,dense_t,1231,19.477587,0.0,0.007183,dense_t
4,SE,egarch,random_t,1231,-361.030537,0.0,-12.808495,egarch
5,SE,xgb,vanilla_t,1231,-170.529317,0.0,-35.312403,xgb
6,SE,xgb,dense_t,1231,-170.536315,0.0,-35.313348,xgb
7,SE,xgb,random_t,1231,-198.391594,0.0,-48.129026,xgb
8,SE,lstm,vanilla_t,1231,-168.712747,0.0,-35.330795,lstm
9,SE,lstm,dense_t,1231,-168.719661,0.0,-35.33174,lstm


=== Diebold–Mariano Results (QLIKE basis) ===


Unnamed: 0,loss,model_A,model_B,T,dm_stat,p_value,mean_diff_A_minus_B,better_model
0,QLIKE,egarch,xgb,1231,40.373753,0.0,4170668.0,xgb
1,QLIKE,egarch,lstm,1231,40.373753,0.0,4170668.0,lstm
2,QLIKE,egarch,vanilla_t,1231,-26.566978,0.0,-2489534.0,egarch
3,QLIKE,egarch,dense_t,1231,-23.756141,0.0,-2292151.0,egarch
4,QLIKE,egarch,random_t,1231,40.373501,0.0,4170637.0,random_t
5,QLIKE,xgb,vanilla_t,1231,-47.345211,0.0,-6660202.0,xgb
6,QLIKE,xgb,dense_t,1231,-42.375467,0.0,-6462819.0,xgb
7,QLIKE,xgb,random_t,1231,-153.543437,0.0,-30.93699,xgb
8,QLIKE,lstm,vanilla_t,1231,-47.345211,0.0,-6660202.0,lstm
9,QLIKE,lstm,dense_t,1231,-42.375467,0.0,-6462819.0,lstm
