In [1]:
import math, random, time
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

<torch._C.Generator at 0x145244fb0>

In [2]:
CSV_PATH = "../../Data/eth_final_df.csv"

df = pd.read_csv(CSV_PATH)
if "timestamp" not in df.columns:
    raise ValueError("Expected a datetime column named 'timestamp' in the CSV.")
df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True, errors="coerce")
df = df.sort_values("timestamp").dropna(subset=["timestamp"]).reset_index(drop=True)

TARGET_CANDIDATES = ["vol_future", "target", "y"]
target_col = None
for c in TARGET_CANDIDATES:
    if c in df.columns:
        target_col = c
        break
if target_col is None:
    raise ValueError(f"Could not find a target column among {TARGET_CANDIDATES}. "
                     "Please rename your target to one of these or edit the list.")

feature_cols = [c for c in df.columns if c not in ["timestamp", target_col]]
num_cols = [c for c in feature_cols if pd.api.types.is_numeric_dtype(df[c])]
X_raw = df[["timestamp"] + num_cols].copy()
y_raw = df[["timestamp", target_col]].copy()


In [3]:

# -----------------------------
# Random Synthesizer Attention
# -----------------------------

class MultiheadRandomSynthesizer(nn.Module):
    def __init__(self, embed_dim: int, num_heads: int, attn_dropout: float = 0.1, causal: bool = False):
        super().__init__()
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(attn_dropout)
        self.causal = causal

        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=False)

    def _shape(self, x: torch.Tensor, B: int, L: int) -> torch.Tensor:
        return x.view(B, L, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        x: torch.Tensor,
        key_padding_mask: Optional[torch.Tensor] = None,
        need_weights: bool = False,
        attn_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        B, L, D = x.size()
        V = self._shape(self.v_proj(x), B, L) 

        rand_logits = torch.rand(B, self.num_heads, L, L, device=x.device, dtype=x.dtype)

        if self.causal:
            causal_mask = torch.triu(torch.ones(L, L, device=x.device, dtype=torch.bool), diagonal=1)
            rand_logits = rand_logits.masked_fill(causal_mask, float("-inf"))

        if attn_mask is not None:
            rand_logits = rand_logits.masked_fill(attn_mask.bool().unsqueeze(0).unsqueeze(0), float("-inf"))

        if key_padding_mask is not None:
            kpm = key_padding_mask.bool().unsqueeze(1).unsqueeze(1)  # [B,1,1,L]
            rand_logits = rand_logits.masked_fill(kpm, float("-inf"))

        attn = F.softmax(rand_logits, dim=-1) 
        attn = self.dropout(attn)

        out = torch.matmul(attn, V)  
        out = out.transpose(1, 2).contiguous().view(B, L, D) 
        out = self.out_proj(out)

        return out, (attn if need_weights else None)


# -------------------------------------
# Encoder Layer using Random Synthesizer
# -------------------------------------

class RandomSynthesizerEncoderLayer(nn.Module):
    def __init__(
        self,
        d_model: int,
        nhead: int,
        dim_feedforward: int = 256,
        dropout: float = 0.1,
        attn_dropout: float = 0.1,
        causal: bool = False,
        layer_norm_eps: float = 1e-5,
    ):
        super().__init__()
        self.self_attn = MultiheadRandomSynthesizer(
            embed_dim=d_model,
            num_heads=nhead,
            attn_dropout=attn_dropout,
            causal=causal,
        )
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.activation = nn.ReLU()

    def forward(
        self,
        src: torch.Tensor,                     
        src_key_padding_mask: Optional[torch.Tensor] = None,  
        need_weights: bool = False,
    ):
        attn_out, attn_w = self.self_attn(src, key_padding_mask=src_key_padding_mask, need_weights=need_weights)
        src = src + self.dropout1(attn_out)
        src = self.norm1(src)

        ffn = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(ffn)
        src = self.norm2(src)

        return src, attn_w


# -----------------------------
# Random Synthesizer Transformer
# -----------------------------

class RandomSynthesizerTransformer(nn.Module):
    def __init__(
        self,
        input_dim: int,
        d_model: int = 96,
        nhead: int = 2,
        num_layers: int = 1,
        dim_feedforward: int = 384,
        dropout: float = 0.1,
        attn_dropout: float = 0.1,
        causal: bool = False,
        out_dim: int = 1,
    ):
        super().__init__()
        self.in_proj = nn.Linear(input_dim, d_model)
        self.layers = nn.ModuleList([
            RandomSynthesizerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                attn_dropout=attn_dropout,
                causal=causal,
            )
            for _ in range(num_layers)
        ])
        self.dropout = nn.Dropout(dropout)
        self.out_head = nn.Linear(d_model, out_dim)

    def forward(
        self,
        x: torch.Tensor,                        
        key_padding_mask: Optional[torch.Tensor] = None,  
        return_attn: bool = False,
    ):
        attns = []
        h = self.in_proj(x)                      
        for layer in self.layers:
            h, a = layer(h, src_key_padding_mask=key_padding_mask, need_weights=return_attn)
            if return_attn:
                attns.append(a)

        last = h[:, -1, :]                       
        y = self.out_head(self.dropout(last))    
        return (y, attns) if return_attn else y


# -----------------------------
# Tiny glue to integrate quickly
# -----------------------------

@dataclass
class TrainConfig:
    seq_len: int = 12
    batch_size: int = 8
    lr: float = 3e-4
    weight_decay: float = 1e-4
    max_epochs: int = 60
    patience: int = 6
    d_model: int = 96
    nhead: int = 2
    num_layers: int = 1
    dim_feedforward: int = 384
    dropout: float = 0.3
    attn_dropout: float = 0.1
    causal: bool = False

def build_model_random_synth(input_dim: int, cfg: TrainConfig, out_dim: int = 1) -> nn.Module:
    return RandomSynthesizerTransformer(
        input_dim=input_dim,
        d_model=cfg.d_model,
        nhead=cfg.nhead,
        num_layers=cfg.num_layers,
        dim_feedforward=cfg.dim_feedforward,
        dropout=cfg.dropout,
        attn_dropout=cfg.attn_dropout,
        causal=cfg.causal,
        out_dim=out_dim,
    )


In [4]:
# ---------- Metrics ----------
def rmse(y_true, y_pred):
    y_true = np.asarray(y_true, float); y_pred = np.asarray(y_pred, float)
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

def rmpse(y_true, y_pred, eps=1e-12):
    y_true = np.asarray(y_true, float); y_pred = np.asarray(y_pred, float)
    denom = np.maximum(np.abs(y_true), eps)
    return float(np.sqrt(np.mean(((y_pred - y_true) / denom) ** 2)) * 100.0)

def qlike_variance(v_true, v_pred, eps=1e-12):
    v_true = np.maximum(np.asarray(v_true, float), eps)
    v_pred = np.maximum(np.asarray(v_pred, float), eps)
    m = np.isfinite(v_true) & np.isfinite(v_pred)
    v_true = np.maximum(v_true[m], eps)
    v_pred = np.maximum(v_pred[m], eps)
    ratio = v_true / v_pred
    return float(np.mean(ratio - np.log(ratio) - 1.0))

# ---------- Dataset ----------
class SeqDataset(Dataset):
    def __init__(self, X_seq: np.ndarray, y_seq: np.ndarray):
        self.X = torch.tensor(X_seq, dtype=torch.float32)
        self.y = torch.tensor(y_seq, dtype=torch.float32).view(-1, 1)
    def __len__(self): return len(self.X)
    def __getitem__(self, i): return self.X[i], self.y[i]

# ---------- Windowing ----------
def make_sequences(X_df: pd.DataFrame, y_df: pd.DataFrame, seq_len: int
                  ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    assert np.all(X_df["timestamp"].values == y_df["timestamp"].values)
    X_values = X_df.drop(columns=["timestamp"]).values
    y_values = y_df.drop(columns=["timestamp"]).values.squeeze(-1)
    times = X_df["timestamp"].values

    X_seq, y_seq, t_seq = [], [], []
    for i in range(seq_len, len(X_values)):
        X_seq.append(X_values[i-seq_len:i, :])
        y_seq.append(y_values[i])
        t_seq.append(times[i])
    return np.array(X_seq), np.array(y_seq), np.array(t_seq)

# ---------- Time splits with purge ----------
def build_rolling_folds(t_seq: np.ndarray,
                        n_folds: int = 5,
                        purge_hours: int = 24
                       ) -> List[Dict[str, np.ndarray]]:

    t = pd.to_datetime(t_seq)
    N = len(t)
    cuts = np.linspace(0, N, n_folds + 1, dtype=int)  
    folds = []
    for k in range(n_folds):
        train_end_idx = cuts[k]  
        valid_end_idx = cuts[k+1]
        if train_end_idx == 0: 
            continue

        purge_until_time = t[train_end_idx - 1] + pd.Timedelta(hours=purge_hours)
        j = train_end_idx
        while j < valid_end_idx and t[j] < purge_until_time:
            j += 1

        train_idx = np.arange(0, train_end_idx)
        valid_idx = np.arange(j, valid_end_idx)

        if len(valid_idx) == 0:
            continue

        folds.append({"train_idx": train_idx, "valid_idx": valid_idx})
    return folds[:n_folds]

# ---------- Training helpers ----------
def train_one_epoch(model, loader, optim, loss_fn):
    model.train()
    total = 0.0; n = 0
    for xb, yb in loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optim.zero_grad(); loss.backward(); optim.step()
        total += loss.item() * len(xb); n += len(xb)
    return total / max(n,1)

@torch.no_grad()
def eval_epoch(model, loader, loss_fn):
    model.eval()
    y_true, y_pred = [], []
    total = 0.0; n = 0
    for xb, yb in loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        pred = model(xb)
        loss = loss_fn(pred, yb)
        total += loss.item() * len(xb); n += len(xb)
        y_true.append(yb.detach().cpu().numpy().ravel())
        y_pred.append(pred.detach().cpu().numpy().ravel())
    y_true = np.concatenate(y_true) if y_true else np.array([])
    y_pred = np.concatenate(y_pred) if y_pred else np.array([])
    return (total / max(n,1)), y_true, y_pred

def early_stop_train(model, train_loader, val_loader, cfg, verbose=False):
    loss_fn = nn.MSELoss()
    optim = torch.optim.AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)
    best_val = float("inf"); best_state = None; patience_left = cfg.patience
    for epoch in range(cfg.max_epochs):
        _ = train_one_epoch(model, train_loader, optim, loss_fn)
        val_loss, yv_true, yv_pred = eval_epoch(model, val_loader, loss_fn)
        if val_loss < best_val - 1e-8:
            best_val = val_loss; best_state = {k: v.detach().cpu() for k, v in model.state_dict().items()}
            patience_left = cfg.patience
        else:
            patience_left -= 1
            if patience_left <= 0:
                break
    if best_state is not None:
        model.load_state_dict(best_state)
    return model

# ---------- Config + Search Space ----------
@dataclass
class TrainConfig:
    seq_len: int = 12
    batch_size: int = 8
    lr: float = 3e-4
    weight_decay: float = 1e-4
    max_epochs: int = 60
    patience: int = 6
    d_model: int = 96
    nhead: int = 2
    num_layers: int = 1
    dim_feedforward: int = 384
    dropout: float = 0.3
    attn_dropout: float = 0.1
    causal: bool = False

SEARCH_SPACE = {
    "seq_len":         [1, 2, 6, 12, 24, 36, 48, 72],
    "batch_size":      [8, 32, 64, 128],
    "lr":              [1e-3, 3e-4, 1e-4],
    "weight_decay":    [1e-5, 1e-4, 3e-4],
    "d_model":         [64, 96, 128],
    "nhead":           [2, 4, 8],
    "num_layers":      [1, 2, 3],
    "dim_feedforward": [128, 256, 384],
    "dropout":         [0.1, 0.2, 0.3],
    "max_epochs":      [40, 60],
    "patience":        [6, 8],
    "attn_dropout":    [0.0, 0.1],
    "nhead":           [1, 2, 4, 7],

}

def sample_cfg() -> TrainConfig:
    def s(k): return random.choice(SEARCH_SPACE[k])
    return TrainConfig(
        seq_len=s("seq_len"),
        batch_size=s("batch_size"),
        lr=s("lr"),
        weight_decay=s("weight_decay"),
        d_model=s("d_model"),
        nhead=s("nhead"),
        num_layers=s("num_layers"),
        dim_feedforward=s("dim_feedforward"),
        dropout=s("dropout"),
        attn_dropout=s("attn_dropout"),
        patience=s("patience"),
        max_epochs=s("max_epochs"),
        causal=False,
    )

# ---------- Orchestration ----------
def run_cv_random_synth(X_df: pd.DataFrame,
                        y_df: pd.DataFrame,
                        n_folds: int = 5,
                        purge_hours: int = 24,
                        n_trials: int = 25,
                        metric_for_early_select: str = "rmse"  
                       ):
    cv_records = []
    best_score = float("inf")
    best_cfg = None
    best_trial_metrics = None

    for trial in range(1, n_trials+1):
        cfg = sample_cfg()

        X_seq, y_seq, t_seq = make_sequences(X_df, y_df, seq_len=cfg.seq_len)
        folds = build_rolling_folds(t_seq, n_folds=n_folds, purge_hours=purge_hours)
        if len(folds) == 0:
            raise ValueError("No valid folds produced. Check timestamps / seq_len.")

        fold_metrics = []
        for k, f in enumerate(folds, 1):
            tr_idx, va_idx = f["train_idx"], f["valid_idx"]
            X_tr, y_tr = X_seq[tr_idx], y_seq[tr_idx]
            X_va, y_va = X_seq[va_idx], y_seq[va_idx]

            tr_loader = DataLoader(SeqDataset(X_tr, y_tr), batch_size=cfg.batch_size, shuffle=True, drop_last=False)
            va_loader = DataLoader(SeqDataset(X_va, y_va), batch_size=cfg.batch_size, shuffle=False, drop_last=False)

            model = build_model_random_synth(input_dim=X_seq.shape[-1], cfg=cfg, out_dim=1).to(DEVICE)
            model = early_stop_train(model, tr_loader, va_loader, cfg)

            _, yv_true, yv_pred = eval_epoch(model, va_loader, nn.MSELoss())
            v_true = yv_true
            v_pred = np.maximum(yv_pred, 1e-8)
            v_true = np.square(v_true)
            v_pred = np.square(v_pred)

            fold_res = {
                "rmse": rmse(v_true, v_pred),
                "rmpse": rmpse(v_true, v_pred),
                "qlike": qlike_variance(v_true, v_pred),
            }
            fold_metrics.append(fold_res)

        avg = {m: float(np.mean([fm[m] for fm in fold_metrics])) for m in ["rmse","rmpse","qlike"]}
        rec = {"trial": trial, **avg, **cfg.__dict__}
        cv_records.append(rec)

        key = metric_for_early_select
        score = avg[key]
        if score < best_score:
            best_score = score
            best_cfg = cfg
            best_trial_metrics = {"per_fold": fold_metrics, "avg": avg}

        print(f"[Trial {trial:02d}] mean RMSE={avg['rmse']:.6f} QLIKE={avg['qlike']:.6f}  <- {'BEST' if score==best_score else ''}")

    cv_summary = pd.DataFrame(cv_records).sort_values(metric_for_early_select, ascending=True).reset_index(drop=True)
    return best_cfg, cv_summary, best_trial_metrics

# ---------- Final Train on full train span (optional test split by time) ----------
def train_on_timespan(X_df, y_df, cfg: TrainConfig, train_until_time: Optional[pd.Timestamp]=None):
    X_seq, y_seq, t_seq = make_sequences(X_df, y_df, seq_len=cfg.seq_len)
    t_seq = pd.to_datetime(t_seq)

    if train_until_time is None:
        split_idx = int(len(X_seq) * 0.85)
        tr_idx = np.arange(0, split_idx)
        te_idx = np.arange(split_idx, len(X_seq))
    else:
        tr_idx = np.where(t_seq <= pd.Timestamp(train_until_time))[0]
        te_idx = np.where(t_seq >  pd.Timestamp(train_until_time))[0]

        if len(tr_idx) > 0 and len(te_idx) > 0:
            purge_until = t_seq[tr_idx[-1]] + pd.Timedelta(hours=24)
            te_idx = te_idx[t_seq[te_idx] >= purge_until]

    X_tr, y_tr = X_seq[tr_idx], y_seq[tr_idx]
    X_te, y_te = X_seq[te_idx], y_seq[te_idx]

    tr_loader = DataLoader(SeqDataset(X_tr, y_tr), batch_size=cfg.batch_size, shuffle=True)
    te_loader = DataLoader(SeqDataset(X_te, y_te), batch_size=cfg.batch_size, shuffle=False)

    model = build_model_random_synth(input_dim=X_seq.shape[-1], cfg=cfg, out_dim=1).to(DEVICE)
    model = early_stop_train(model, tr_loader, te_loader, cfg)

    _, y_true, y_pred = eval_epoch(model, te_loader, nn.MSELoss())
    v_true = y_true; v_pred = np.maximum(y_pred, 1e-8)
    v_true = np.square(v_true)
    v_pred = np.square(v_pred)
    metrics = {
        "RMSE": rmse(v_true, v_pred),
        "RMPSE": rmpse(v_true, v_pred),
        "QLIKE": qlike_variance(v_true, v_pred),
        "n_test": int(len(v_true)),
    }
    return model, (v_true, v_pred, t_seq[te_idx]), metrics

In [5]:
# Hyperparameter tuning with 5-fold purged rolling CV (24h)
cutoff_time = pd.Timestamp("2025-08-23 16:00:00+00:00")
X_raw["timestamp"] = pd.to_datetime(X_raw["timestamp"], utc=True)
y_raw["timestamp"] = pd.to_datetime(y_raw["timestamp"], utc=True)
X_raw = X_raw.sort_values("timestamp").reset_index(drop=True)
y_raw = y_raw.sort_values("timestamp").reset_index(drop=True)
X_train = X_raw[X_raw["timestamp"] < cutoff_time]
y_train = y_raw[y_raw["timestamp"] < cutoff_time]

best_cfg, cv_table, best_cv = run_cv_random_synth(
    X_df=X_train, y_df=y_train,
    n_folds=5, purge_hours=24,
    n_trials=20,                  
    metric_for_early_select="rmse" 
)

print("Best config:", best_cfg)
display(cv_table.head(10))

[Trial 01] mean RMSE=0.000214 QLIKE=22964.311073  <- BEST
[Trial 02] mean RMSE=0.000222 QLIKE=293363.290762  <- 
[Trial 03] mean RMSE=0.000258 QLIKE=29470609.192472  <- 
[Trial 04] mean RMSE=0.000217 QLIKE=420769.760768  <- 
[Trial 05] mean RMSE=0.000275 QLIKE=22008162.514865  <- 
[Trial 06] mean RMSE=0.000329 QLIKE=18258235.606144  <- 
[Trial 07] mean RMSE=0.000256 QLIKE=16912795.996858  <- 
[Trial 08] mean RMSE=0.000209 QLIKE=1255651.135352  <- BEST
[Trial 09] mean RMSE=0.000219 QLIKE=59029.292100  <- 
[Trial 10] mean RMSE=0.000255 QLIKE=3800477.780690  <- 
[Trial 11] mean RMSE=0.000274 QLIKE=26813670.223014  <- 
[Trial 12] mean RMSE=0.000258 QLIKE=11719735.758099  <- 
[Trial 13] mean RMSE=0.000620 QLIKE=14661189.277899  <- 
[Trial 14] mean RMSE=0.000226 QLIKE=24386016.050515  <- 


AssertionError: embed_dim must be divisible by num_heads

In [None]:
cutoff_time = pd.Timestamp("2025-08-23 16:00:00+00:00")
final_model, (y_true, y_pred, t_test), test_metrics = train_on_timespan(
    X_df=X_raw, y_df=y_raw, cfg=best_cfg, train_until_time=cutoff_time
)
print("Test metrics:", test_metrics)


Saving predicted result for backtesting

In [None]:
pred_df = pd.DataFrame({"timestamp": pd.to_datetime(t_test), "observed": y_true, "predicted": y_pred})
pred_df.to_csv("../../Results/predictions_random_synth_test.csv", index=False)