In [1]:
pip install numpy pandas tqdm torch scikit-learn optuna

Note: you may need to restart the kernel to use updated packages.


In [40]:
# lstm_dns_kf_error_regression.py

"""
LSTM Regression on DNS_KF Forecast Errors
=========================================
Forecasts one-step-ahead MAE of Kalman DNS model errors over a rolling LSTM.
"""

# ---------------------- Imports ---------------------- #
import os, ast, time, random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch import amp
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler

# ---------------------- Reproducibility ---------------------- #
RNG_SEED = 42
random.seed(RNG_SEED); np.random.seed(RNG_SEED); torch.manual_seed(RNG_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RNG_SEED)

# ---------------------- Device & CuDNN ---------------------- #
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"[INFO] Device: {device}")
if device.type == "cuda":
    import torch.backends.cudnn as cudnn
    print(f"  • GPU: {torch.cuda.get_device_name(0)}")
    cudnn.benchmark = True

# ---------------------- Constants ---------------------- #
BUSINESS_DAYS_YEAR = 252  # trading days ~1y
ROLL_YEARS         = 3    # sequence length in years
SEQ_LEN            = BUSINESS_DAYS_YEAR * ROLL_YEARS   # 756
VAL_WINDOW         = BUSINESS_DAYS_YEAR * 2            # 504
HOLDOUT_WINDOW     = BUSINESS_DAYS_YEAR * 3            # 756
EARLY_STOP_PATIENCE= 20

# Optuna search‑space
HSPACE = {
    "hidden_dim"   : (32, 192),
    "num_layers"   : [1, 2, 3],
    "dropout"      : (0.0, 0.6),
    "learning_rate": (1e-4, 5e-3),
    "batch_size"   : [32, 64, 128],
    "epochs"       : (40, 80),
}

# ---------------------- Model ---------------------- #
class LSTMRegressor(nn.Module):
    def __init__(self, in_dim: int, hid: int, layers: int, out_dim: int = 1, drop: float = 0.0):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=in_dim,
            hidden_size=hid,
            num_layers=layers,
            batch_first=True,
            dropout=(drop if layers > 1 else 0.0)
        )
        self.drop = nn.Dropout(drop)
        self.norm = nn.LayerNorm(hid)
        self.fc   = nn.Linear(hid, out_dim, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        _, (h_n, _) = self.lstm(x)
        return self.fc(self.norm(self.drop(h_n[-1])))

# ---------------------- Data Utilities ---------------------- #

def _parse_vec(col: str) -> np.ndarray:
    return np.asarray(ast.literal_eval(col), dtype=np.float32)

def load_target(horizon: int) -> pd.DataFrame:
    path = fr"C:\\Users\\azorb\\PycharmProjects\\Predicting the Yield Curve\\Model Fit\\Output\\DNS_Full_Forecast\\dns_kf_total_h{horizon}_full_dataset.csv"
    if not os.path.exists(path):
        raise FileNotFoundError(path)
    df = pd.read_csv(path, parse_dates=["eval_date"]).sort_values("eval_date")

    true = df["true_yields"].apply(_parse_vec)
    pred = df["forecast_yields"].apply(_parse_vec)
    errors = pred.subtract(true)

    return pd.DataFrame(errors.tolist(),
                        index=df["eval_date"],
                        columns=[f"err_{i}" for i in range(6)])

def gen_sequences(X_df: pd.DataFrame, y_ser: pd.Series, seq_len: int = SEQ_LEN):
    X_arr = X_df.values.astype(np.float32)
    y_arr = y_ser.values.astype(np.float32)
    X_seq, y_seq = [], []
    for i in range(seq_len, len(X_df)):
        X_seq.append(X_arr[i - seq_len:i])
        y_seq.append(y_arr[i])
    if len(X_seq) == 0:
        return np.empty((0, seq_len, X_df.shape[1]), dtype=np.float32), np.empty((0, 1), dtype=np.float32)
    return np.stack(X_seq), np.asarray(y_seq)[:, None]

def trim_y_to_X(X_df: pd.DataFrame, y_ser: pd.Series, seq_len: int, horizon: int) -> pd.Series:
    x_dates = X_df.index
    y_dates = y_ser.index
    earliest_predictable_idx = seq_len + horizon - 1
    if earliest_predictable_idx >= len(x_dates):
        raise ValueError("Not enough X data to allow for any predictions at this sequence+horizon.")

    min_y_date = x_dates[earliest_predictable_idx]
    max_y_date = x_dates[-1]
    valid_y_dates = y_dates[(y_dates >= min_y_date) & (y_dates <= max_y_date)]
    return y_ser.loc[valid_y_dates]

# ---------------------- Splitting ---------------------- #
def create_folds(N, seq_len=756, val_window=504, holdout=756):
    trainable = N - holdout
    if trainable <= seq_len + val_window:
        raise ValueError("Not enough data for the requested scheme.")

    initial_train = 504
    residue = (trainable - initial_train) % val_window
    first_train_end = initial_train + residue

    folds = []
    i = first_train_end
    while i + val_window <= trainable:
        folds.append((0, i, i, i + val_window))
        i += val_window

    holdout_slice = slice(trainable, N)
    return folds, holdout_slice

# ---------------------- Objective Function ---------------------- #
def objective(trial, X_df, y_ser, folds):
    params = {
        "hidden_dim"   : trial.suggest_int("hidden_dim", *HSPACE["hidden_dim"]),
        "num_layers"   : trial.suggest_categorical("num_layers", HSPACE["num_layers"]),
        "dropout"      : trial.suggest_float("dropout", *HSPACE["dropout"]),
        "learning_rate": trial.suggest_float("learning_rate", *HSPACE["learning_rate"], log=True),
        "batch_size"   : trial.suggest_categorical("batch_size", HSPACE["batch_size"]),
        "epochs"       : trial.suggest_int("epochs", *HSPACE["epochs"])
    }

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X_df), index=X_df.index, columns=X_df.columns)

    all_preds, all_trues = [], []
    for train_start, train_end, val_start, val_end in folds:
        X_train, y_train = X_scaled.iloc[train_start:train_end], y_ser.iloc[train_start:train_end]
        X_val, y_val     = X_scaled.iloc[val_start:val_end], y_ser.iloc[val_start:val_end]

        Xt, yt = gen_sequences(X_train, y_train)
        Xv, yv = gen_sequences(X_val, y_val)

        if len(Xt) == 0 or len(Xv) == 0:
            continue

        model = LSTMRegressor(Xt.shape[-1], params["hidden_dim"], params["num_layers"], drop=params["dropout"]).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=params["learning_rate"])
        loss_fn = nn.MSELoss()

        train_loader = DataLoader(TensorDataset(torch.tensor(Xt), torch.tensor(yt)),
                                  batch_size=params["batch_size"], shuffle=True)
        best_loss = float("inf")
        patience = EARLY_STOP_PATIENCE

        for epoch in range(params["epochs"]):
            model.train()
            for xb, yb in train_loader:
                xb, yb = xb.to(device), yb.to(device)
                optimizer.zero_grad()
                pred = model(xb)
                loss = loss_fn(pred, yb)
                loss.backward()
                optimizer.step()

            model.eval()
            with torch.no_grad():
                val_pred = model(torch.tensor(Xv).to(device)).cpu().numpy()
                val_loss = mean_squared_error(yv, val_pred)

            if val_loss < best_loss:
                best_loss = val_loss
                patience = EARLY_STOP_PATIENCE
            else:
                patience -= 1
                if patience == 0:
                    break

        all_preds.append(val_pred.flatten())
        all_trues.append(yv.flatten())

    if not all_preds:
        raise ValueError("No valid training/validation split produced any predictions.")

    return mean_squared_error(np.concatenate(all_trues), np.concatenate(all_preds))

# ---------------------- Main (Notebook-compatible) ---------------------- #
def main_notebook(horizon, trials=30, n_jobs=1):
    X_df  = pd.read_csv("X_df_filtered_shap.csv", index_col=0, parse_dates=True)
    y_df  = load_target(horizon)
    y_ser = y_df.mean(axis=1).rename("err")  # Use raw directional error, not absolute
    y_ser = trim_y_to_X(X_df, y_ser, seq_len=SEQ_LEN, horizon=horizon)

    folds, hold_out = create_folds(len(y_ser))
    print(f"Expanding CV folds: {len(folds)}, hold‑out length: {hold_out.stop - hold_out.start}")

    study = optuna.create_study(direction="minimize",
                                sampler=TPESampler(seed=RNG_SEED),
                                pruner=MedianPruner(n_startup_trials=8, n_warmup_steps=15))
    t0 = time.time()
    study.optimize(lambda tr: objective(tr, X_df, y_ser, folds),
                   n_trials=trials, n_jobs=n_jobs, show_progress_bar=True)
    minutes = (time.time() - t0) / 60

    print("=== Best trial (CV) ===")
    print(f"MSE : {study.best_value:.6f}")
    print("Params:")
    for k, v in study.best_trial.params.items():
        print(f"  {k}: {v}")
    print(f"Optimisation time: {minutes:.1f} min")

    # Optionally add: train_on_full()

main_notebook(1)

[INFO] Device: cpu


[I 2025-05-15 19:41:36,383] A new study created in memory with name: no-name-9f5e2578-e1e3-42dc-853d-a86672d13fc5


Expanding CV folds: 6, hold‑out length: 756


  0%|          | 0/30 [00:00<?, ?it/s]

[W 2025-05-15 19:41:36,881] Trial 0 failed with parameters: {'hidden_dim': 92, 'num_layers': 1, 'dropout': 0.0936111842654619, 'learning_rate': 0.00018408992080552527, 'batch_size': 64, 'epochs': 69} because of the following error: ValueError('No valid training/validation split produced any predictions.').
Traceback (most recent call last):
  File "C:\Users\azorb\PycharmProjects\Predicting the Yield Curve\.venv\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\azorb\AppData\Local\Temp\ipykernel_253940\1142337683.py", line 216, in <lambda>
    study.optimize(lambda tr: objective(tr, X_df, y_ser, folds),
                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\azorb\AppData\Local\Temp\ipykernel_253940\1142337683.py", line 198, in objective
    raise ValueError("No valid training/validation split produced any predictions.")
ValueError: No valid training/validati




ValueError: No valid training/validation split produced any predictions.

[I 2025-05-15 18:51:25,841] A new study created in memory with name: no-name-799123d5-d0de-4e7f-93f3-1443279853f2


Expanding CV folds: 6, hold‑out length: 756


  0%|          | 0/30 [00:00<?, ?it/s]

[W 2025-05-15 18:51:25,854] Trial 0 failed with parameters: {'hidden_dim': 92, 'num_layers': 1, 'dropout': 0.0936111842654619, 'learning_rate': 0.00018408992080552527, 'batch_size': 64, 'epochs': 69} because of the following error: ValueError('need at least one array to stack').
Traceback (most recent call last):
  File "C:\Users\azorb\PycharmProjects\Predicting the Yield Curve\.venv\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\azorb\AppData\Local\Temp\ipykernel_253940\2407196840.py", line 149, in <lambda>
    study.optimize(lambda tr: objective(tr, X_df, y_ser, folds),
                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\azorb\AppData\Local\Temp\ipykernel_253940\4097518514.py", line 229, in objective
    Xv, yv = gen_sequences(X_va_s, y_va)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\azorb\AppData\Local\Temp\ipykernel_253940\2407196840




ValueError: need at least one array to stack

In [12]:
X_df  = pd.read_csv(r"C:\Users\azorb\PycharmProjects\Predicting the Yield Curve\Data Processing\Output\Independent\X_df_filtered_shap.csv", index_col=0, parse_dates=True)
y_ser = load_target(horizon)


In [13]:
X_df

Unnamed: 0,RX1 Index,.EURGBP02 U Index,HSI Index,GJTB3MO Index,SX7E Index,Y_df_change_dir_63_US_6m,EUR009M Index,CS,NG1 Index,Y_df_change_dir_252_US_5y,...,EUDR1T Index,LEI CHNG Index,.EUR1030Y U Index,Y_df_change_dir_21_US_5y,Y_df_change_dir_252_US_3y,ER3 Index,USURTOT Index,Y_df_change_dir_21_US_3m,INF,BULL
2004-09-13,430.0,-1.99750,13139.57,0.0090,260.92,1,2.302,1.34,4.850,1,...,2.0375,0.6,-0.8056,0,1,185,5.4,1,2.7,50.50
2004-09-14,445.0,-2.00100,13148.06,0.0080,261.07,1,2.298,1.34,4.928,1,...,2.0375,0.6,-0.8091,0,1,185,5.4,1,2.7,50.50
2004-09-15,451.0,-2.00150,13084.40,0.0080,261.01,1,2.289,1.33,4.824,1,...,2.0375,0.6,-0.7951,0,1,185,5.4,1,2.7,50.50
2004-09-16,451.0,-2.07000,13209.84,0.0080,261.88,1,2.304,1.35,4.719,1,...,2.0475,0.6,-0.8012,0,1,185,5.4,1,2.7,45.50
2004-09-17,451.0,-2.07650,13224.93,0.0080,263.87,1,2.288,1.33,5.108,1,...,2.0375,0.6,-0.8031,0,1,185,5.4,1,2.7,45.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-02-27,579.0,-2.02400,23718.29,0.3000,184.49,0,-0.194,0.92,3.934,0,...,2.6500,-0.2,-0.2711,0,0,564,4.0,1,3.0,19.38
2025-02-28,579.0,-2.03600,22941.32,0.3300,184.23,0,-0.194,0.97,3.834,0,...,2.7202,-0.2,-0.2812,0,0,564,4.1,1,3.0,19.38
2025-03-03,579.0,-2.03700,23006.27,0.3353,187.20,0,-0.194,1.00,4.122,0,...,2.7200,-0.2,-0.2942,0,0,564,4.1,1,3.0,19.38
2025-03-04,579.0,-2.02800,22941.77,0.3403,179.59,0,-0.194,1.00,4.350,0,...,2.6974,-0.2,-0.2934,0,0,564,4.1,0,3.0,19.38


In [14]:
y_ser

Unnamed: 0_level_0,err_0,err_1,err_2,err_3,err_4,err_5
eval_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2006-08-21,0.037889,-0.074640,-0.033638,0.106929,0.090232,0.046057
2006-08-22,0.055971,-0.060338,-0.035661,0.090449,0.068072,0.020281
2006-08-23,0.068861,-0.060926,-0.041979,0.051814,0.045365,-0.003775
2006-08-24,0.069610,-0.061383,-0.044437,0.054951,0.026934,-0.002941
2006-08-25,0.051137,-0.049746,-0.032708,0.066100,0.047075,0.015302
...,...,...,...,...,...,...
2025-02-27,-0.055035,-0.059044,0.027544,0.048352,0.070113,0.046477
2025-02-28,-0.047761,-0.028886,0.065750,0.068886,0.079419,0.038070
2025-03-03,-0.053037,-0.067601,0.101380,0.132421,0.139024,0.116499
2025-03-04,-0.063268,-0.062399,0.089305,0.076784,0.051101,0.012029


In [16]:
y_ser

Unnamed: 0_level_0,err_0,err_1,err_2,err_3,err_4,err_5
eval_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2007-08-07,-0.037180,-0.147252,-0.115564,0.001923,-0.041472,-0.087331
2007-08-08,-0.033905,-0.167879,-0.162297,-0.095216,-0.139183,-0.159733
2007-08-09,0.178331,0.053061,0.056316,0.096766,0.028895,-0.056236
2007-08-10,0.462340,0.160074,0.158193,0.158394,0.083001,-0.012737
2007-08-13,0.156232,-0.071388,-0.024745,0.081188,0.022104,-0.043261
...,...,...,...,...,...,...
2025-02-27,-0.055035,-0.059044,0.027544,0.048352,0.070113,0.046477
2025-02-28,-0.047761,-0.028886,0.065750,0.068886,0.079419,0.038070
2025-03-03,-0.053037,-0.067601,0.101380,0.132421,0.139024,0.116499
2025-03-04,-0.063268,-0.062399,0.089305,0.076784,0.051101,0.012029


In [32]:
# ---------------------------------------------------------
# DEBUG: inspect expanding-CV fold boundaries for any horizon
# ---------------------------------------------------------
# --- adjust this line to the horizon you want to inspect ---
horizon = 5     # choose from 1, 5, 21, 63, 252
# ---------------------------------------------------------

# load features & target (MAE from DNS_KF forecast errors)
X_df  = pd.read_csv(r"C:\Users\azorb\PycharmProjects\Predicting the Yield Curve\Data Processing\Output\Independent\X_df_filtered_shap.csv", index_col=0, parse_dates=True)
y_ser = load_target(horizon)

# Trim y_ser so that LSTM can always construct a sequence ending at y_t - horizon
y_ser = trim_y_to_X(X_df, y_ser, seq_len=SEQ_LEN, horizon=horizon)

# build folds + hold-out
folds, hold_slice = create_folds(len(y_ser))

date_index = y_ser.index  # convenience alias

print(f"\n=== Horizon h={horizon}  |  Total aligned obs.: {len(y_ser):,} ===")
for i, (tr_s, tr_e, va_s, va_e) in enumerate(folds, 1):
    print(f"\nFold {i}")
    print(f"  Train : {date_index[tr_s]}  →  {date_index[tr_e-1]}   "
          f"(n={tr_e-tr_s})")
    print(f"  Val   : {date_index[va_s]}  →  {date_index[va_e-1]}   "
          f"(n={va_e-va_s})")

print("\nHold-out")
print(f"  Test  : {date_index[hold_slice.start]}  →  "
      f"{date_index[hold_slice.stop-1]}   "
      f"(n={hold_slice.stop - hold_slice.start})")


=== Horizon h=5  |  Total aligned obs.: 4,583 ===

Fold 1
  Train : 2007-08-13 00:00:00  →  2010-09-08 00:00:00   (n=803)
  Val   : 2010-09-09 00:00:00  →  2012-08-14 00:00:00   (n=504)

Fold 2
  Train : 2007-08-13 00:00:00  →  2012-08-14 00:00:00   (n=1307)
  Val   : 2012-08-15 00:00:00  →  2014-07-21 00:00:00   (n=504)

Fold 3
  Train : 2007-08-13 00:00:00  →  2014-07-21 00:00:00   (n=1811)
  Val   : 2014-07-22 00:00:00  →  2016-06-24 00:00:00   (n=504)

Fold 4
  Train : 2007-08-13 00:00:00  →  2016-06-24 00:00:00   (n=2315)
  Val   : 2016-06-27 00:00:00  →  2018-05-31 00:00:00   (n=504)

Fold 5
  Train : 2007-08-13 00:00:00  →  2018-05-31 00:00:00   (n=2819)
  Val   : 2018-06-01 00:00:00  →  2020-05-06 00:00:00   (n=504)

Fold 6
  Train : 2007-08-13 00:00:00  →  2020-05-06 00:00:00   (n=3323)
  Val   : 2020-05-07 00:00:00  →  2022-04-12 00:00:00   (n=504)

Hold-out
  Test  : 2022-04-13 00:00:00  →  2025-03-05 00:00:00   (n=756)


In [3]:
    print(f"MSE   : {study.best_value:.6f}")
    print(f"Params: {study.best_trial.params}")
    print(f"Total run time: {dur:.1f} s")

MSE   : 0.018671
Params: {'hidden_dim': 156, 'num_layers': 3, 'dropout': 0.4241144063085703, 'learning_rate': 0.001732053535845956, 'batch_size': 32, 'epochs': 44}
Total run time: 6340.3 s


In [17]:
len(y_true)

756