In [1]:
import itertools
import math
import numpy as np
import pandas as pd

try:
    from scipy.stats import norm
    _has_scipy = True
except Exception:
    _has_scipy = False


In [2]:
base_path = "../Data/eth_final_df.csv"
lstm_path = "../Results/eth_lstm_prediction.csv"
egarch_path = "../Results/eth_egarch_prediction.csv"
xgb_path = "../Results/eth_xgb_prediction.csv"
vanilla_t_path  = "../Results/eth_transformer_prediction.csv"
# dense_t_path = 
# random_t_path = 

H = 1  # forecast horizon for DM (h-step ahead)
ASSUME_INPUTS_ARE_VOL = True  # True if columns are volatility (sigma), False if already variance 
SAVE_DM_CSV = "../Results/eth_dm_results.csv"  

def load_actual(base_path):
    df = pd.read_csv(base_path, parse_dates=["timestamp"])
    need = ["timestamp", "vol_future"]
    missing = [c for c in need if c not in df.columns]
    if missing:
        raise ValueError("Missing columns in base file: " + ", ".join(missing))
    df = df[need].copy()
    if df["timestamp"].dt.tz is None:
        df["timestamp"] = df["timestamp"].dt.tz_localize("UTC")
    df = df.drop_duplicates(subset="timestamp").sort_values("timestamp")
    return df.rename(columns={"vol_future": "actual_vol"})

def load_pred(path, new_col_name):
    d = pd.read_csv(path, parse_dates=["timestamp"])
    if d["timestamp"].dt.tz is None:
        d["timestamp"] = d["timestamp"].dt.tz_localize("UTC")
    pred_col = None
    for c in d.columns:
        if c.lower() in {"pred_vol_future", "predicted_volatility"}:
            pred_col = c
            break
    if pred_col is None:
        raise ValueError("Could not find prediction column in " + path)
    d = d[["timestamp", pred_col]].rename(columns={pred_col: new_col_name})
    return d.drop_duplicates(subset="timestamp").sort_values("timestamp")

actual = load_actual(base_path)

pred_dfs = []
for path, name in [
    (egarch_path, "pred_egarch"),
    (xgb_path, "pred_xgb"),
    (lstm_path, "pred_lstm"),
    (vanilla_t_path, "pred_vanilla_t"),
    #(dense_t_path, "pred_dense_t"),
    #(random_t_path, "pred_random_t")
]:
    pred_dfs.append(load_pred(path, name))
    

df = actual.copy()
for p in pred_dfs:
    df = df.merge(p, on="timestamp", how="inner")

df = df.sort_values("timestamp").drop_duplicates(subset=["timestamp"]).reset_index(drop=True)

print("=== Joined Data Preview (first 10 rows) ===")
display(df.head(10))
print("[info] shape:", df.shape)
print("[info] columns:", list(df.columns))

=== Joined Data Preview (first 10 rows) ===


Unnamed: 0,timestamp,actual_vol,pred_egarch,pred_xgb,pred_lstm,pred_vanilla_t
0,2025-08-23 16:00:00+00:00,-5.533552,0.006138,-5.356733,0.007318,0.004824
1,2025-08-23 17:00:00+00:00,-6.066282,0.006046,-5.528158,0.004697,0.003909
2,2025-08-23 18:00:00+00:00,-6.17332,0.00564,-5.497612,0.004786,0.003884
3,2025-08-23 19:00:00+00:00,-5.852506,0.005354,-5.617051,0.004175,0.00357
4,2025-08-23 20:00:00+00:00,-6.283275,0.005124,-5.640494,0.004121,0.003557
5,2025-08-23 21:00:00+00:00,-5.948457,0.004994,-5.674194,0.00428,0.003776
6,2025-08-23 22:00:00+00:00,-5.222315,0.004719,-5.700621,0.00433,0.003487
7,2025-08-23 23:00:00+00:00,-5.326897,0.004878,-5.794,0.003494,0.003267
8,2025-08-24 00:00:00+00:00,-5.210163,0.00471,-5.56985,0.005892,0.00584
9,2025-08-24 01:00:00+00:00,-5.403565,0.005031,-5.576059,0.004427,0.006546


[info] shape: (1208, 6)
[info] columns: ['timestamp', 'actual_vol', 'pred_egarch', 'pred_xgb', 'pred_lstm', 'pred_vanilla_t']


In [3]:
def squared_error(y, yhat):
    return (y - yhat) ** 2

def qlike(y, yhat, inputs_are_volatility=True, eps=1e-12):
    if inputs_are_volatility:
        y_var = (y ** 2).astype(float)
        yhat_var = (yhat ** 2).astype(float)
    else:
        y_var = y.astype(float)
        yhat_var = yhat.astype(float)
    yhat_var = np.clip(yhat_var, eps, None)
    return np.log(yhat_var) + (y_var / yhat_var)

model_cols = [c for c in df.columns if c.startswith("pred_")]
for mc in model_cols:
    se_col = mc.replace("pred_", "") + "_se"
    ql_col = mc.replace("pred_", "") + "_qlike"
    df[se_col] = squared_error(df["actual_vol"], df[mc])
    df[ql_col] = qlike(df["actual_vol"], df[mc], inputs_are_volatility=ASSUME_INPUTS_ARE_VOL)

print("=== Loss Columns Preview (first 10 rows) ===")
loss_cols = ["timestamp", "actual_vol"] + model_cols + \
            [mc.replace("pred_", "") + "_se" for mc in model_cols] + \
            [mc.replace("pred_", "") + "_qlike" for mc in model_cols]
display(df[loss_cols].head(10))


=== Loss Columns Preview (first 10 rows) ===


Unnamed: 0,timestamp,actual_vol,pred_egarch,pred_xgb,pred_lstm,pred_vanilla_t,egarch_se,xgb_se,lstm_se,vanilla_t_se,egarch_qlike,xgb_qlike,lstm_qlike,vanilla_t_qlike
0,2025-08-23 16:00:00+00:00,-5.533552,0.006138,-5.356733,0.007318,0.004824,30.688172,0.031265,30.70125,30.673618,812706.8,4.423816,571707.8,1315594.0
1,2025-08-23 17:00:00+00:00,-6.066282,0.006046,-5.528158,0.004697,0.003909,36.873168,0.289578,36.856789,36.847215,1006749.0,4.62387,1667993.0,2408907.0
2,2025-08-23 18:00:00+00:00,-6.17332,0.00564,-5.497612,0.004786,0.003884,38.179549,0.456581,38.16899,38.157849,1197859.0,4.669553,1663799.0,2525896.0
3,2025-08-23 19:00:00+00:00,-5.852506,0.005354,-5.617051,0.004175,0.00357,34.314524,0.055439,34.300722,34.29363,1195079.0,4.537207,1964701.0,2687726.0
4,2025-08-23 20:00:00+00:00,-6.283275,0.005124,-5.640494,0.004121,0.003557,39.543957,0.413167,39.531352,39.524251,1503938.0,4.700846,2324402.0,3121189.0
5,2025-08-23 21:00:00+00:00,-5.948457,0.004994,-5.674194,0.00428,0.003776,35.443577,0.07522,35.435078,35.429081,1418878.0,4.570864,1931647.0,2481297.0
6,2025-08-23 22:00:00+00:00,-5.222315,0.004719,-5.700621,0.00433,0.003487,27.321887,0.228777,27.317818,27.309002,1224481.0,4.320382,1454479.0,2243347.0
7,2025-08-23 23:00:00+00:00,-5.326897,0.004878,-5.794,0.003494,0.003267,28.427823,0.218185,28.413065,28.410644,1192479.0,4.358908,2324451.0,2658862.0
8,2025-08-24 00:00:00+00:00,-5.210163,0.00471,-5.56985,0.005892,0.00584,27.194899,0.129375,27.207223,27.206684,1223561.0,4.309751,782013.8,795927.4
9,2025-08-24 01:00:00+00:00,-5.403565,0.005031,-5.576059,0.004427,0.006546,29.252912,0.029754,29.246377,29.269297,1153501.0,4.376052,1489789.0,681456.8


In [4]:
# Compute a robust variance of that difference series (accounting for autocorrelation)
def _nw_longrun_var(d, h):
    d = np.asarray(d, dtype=float)
    T = d.size
    d = d - d.mean()
    def acov(k):
        return np.dot(d[:T - k], d[k:]) / T
    gamma0 = acov(0)
    q = max(h - 1, 0)
    lrv = gamma0
    for k in range(1, q + 1):
        w = 1.0 - k / (q + 1.0)
        lrv += 2.0 * w * acov(k)
    return lrv

#Takes two loss series (e.g. squared error or QLIKE) from two models and computes DM statistic and p-value

def dm_test_from_losses(loss_a, loss_b, h=1):
    d = (loss_a - loss_b).dropna().to_numpy()
    T = d.size
    if T < 5:
        raise ValueError("Not enough overlapping observations for DM test.")
    dbar = d.mean()
    lrv = _nw_longrun_var(d, h)
    if lrv <= 0:
        lrv = np.var(d, ddof=1)
    dm = dbar / math.sqrt(lrv / T)
    hln = math.sqrt((T + 1 - 2 * h + (h * (h - 1)) / T) / T)
    dm_hln = dm * hln
    if _has_scipy:
        pval = 2.0 * (1.0 - norm.cdf(abs(dm_hln)))
    else:
        def norm_cdf(x):
            return 0.5 * (1.0 + math.erf(x / math.sqrt(2)))
        pval = 2.0 * (1.0 - norm_cdf(abs(dm_hln)))
    return dm_hln, pval, dbar


In [5]:
# build maps
name_map = {mc: mc.replace("pred_", "") for mc in model_cols}
se_map = {name_map[mc]: name_map[mc] + "_se" for mc in model_cols}
ql_map = {name_map[mc]: name_map[mc] + "_qlike" for mc in model_cols}

# pairwise comparisons
results = []
pairs = list(itertools.combinations(name_map.values(), 2))
for a, b in pairs:
    stat_se, p_se, mean_diff_se = dm_test_from_losses(df[se_map[a]], df[se_map[b]], h=H)
    stat_ql, p_ql, mean_diff_ql = dm_test_from_losses(df[ql_map[a]], df[ql_map[b]], h=H)
    results.append({
        "loss": "SE",
        "model_A": a, "model_B": b,
        "T": int(df[[se_map[a], se_map[b]]].dropna().shape[0]),
        "dm_stat": stat_se, "p_value": p_se, "mean_diff_A_minus_B": mean_diff_se,
        "better_model": b if mean_diff_se > 0 else a if mean_diff_se < 0 else "tie"
    })
    results.append({
        "loss": "QLIKE",
        "model_A": a, "model_B": b,
        "T": int(df[[ql_map[a], ql_map[b]]].dropna().shape[0]),
        "dm_stat": stat_ql, "p_value": p_ql, "mean_diff_A_minus_B": mean_diff_ql,
        "better_model": b if mean_diff_ql > 0 else a if mean_diff_ql < 0 else "tie"
    })

dm_df = pd.DataFrame(results)

# save combined 
if SAVE_DM_CSV:
    dm_df.sort_values(["loss", "p_value"]).reset_index(drop=True).to_csv(SAVE_DM_CSV, index=False)
    print("[info] saved DM test summary to", SAVE_DM_CSV)


dm_se = dm_df[dm_df["loss"] == "SE"].sort_values("p_value").reset_index(drop=True)
dm_ql = dm_df[dm_df["loss"] == "QLIKE"].sort_values("p_value").reset_index(drop=True)

print("=== Diebold–Mariano Results (SE / RMSE basis) ===")
display(dm_se)

print("=== Diebold–Mariano Results (QLIKE basis) ===")
display(dm_ql)


[info] saved DM test summary to ../Results/eth_dm_results.csv
=== Diebold–Mariano Results (SE / RMSE basis) ===


Unnamed: 0,loss,model_A,model_B,T,dm_stat,p_value,mean_diff_A_minus_B,better_model
0,SE,egarch,xgb,1208,171.841618,0.0,30.496846,xgb
1,SE,egarch,lstm,1208,25.565908,0.0,0.013184,lstm
2,SE,egarch,vanilla_t,1208,28.051093,0.0,0.016306,vanilla_t
3,SE,xgb,lstm,1208,-171.876709,0.0,-30.483662,xgb
4,SE,xgb,vanilla_t,1208,-171.704374,0.0,-30.48054,xgb
5,SE,lstm,vanilla_t,1208,6.554415,5.586043e-11,0.003122,vanilla_t


=== Diebold–Mariano Results (QLIKE basis) ===


Unnamed: 0,loss,model_A,model_B,T,dm_stat,p_value,mean_diff_A_minus_B,better_model
0,QLIKE,egarch,xgb,1208,51.857301,0.0,1003680.0,xgb
1,QLIKE,egarch,lstm,1208,-20.821166,0.0,-795813.9,egarch
2,QLIKE,egarch,vanilla_t,1208,-29.519518,0.0,-745802.0,egarch
3,QLIKE,xgb,lstm,1208,-36.139909,0.0,-1799494.0,xgb
4,QLIKE,xgb,vanilla_t,1208,-49.395366,0.0,-1749482.0,xgb
5,QLIKE,lstm,vanilla_t,1208,1.733336,0.083036,50011.85,vanilla_t
