In [1]:
# ============================================================
# STEP 4 â€” Final LightGBM Backtest using Top Feature Subset
# ============================================================

import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import os

TRAIN_PATH = "/kaggle/input/hull-tactical-market-prediction/train.csv"
TARGET_NAME = "market_forward_excess_returns"

print("=== STEP 4: Training + Backtest ===")

df = pd.read_csv(TRAIN_PATH).sort_values("date_id")

df = df.dropna(subset=[TARGET_NAME, "forward_returns", "risk_free_rate"])

interest_cols  = [c for c in df.columns if c.startswith("I")]
valuation_cols = [c for c in df.columns if c.startswith("P")]
feature_cols   = sorted(interest_cols + valuation_cols)

X_all = df[feature_cols].fillna(0).values
y_all = df[TARGET_NAME].values

# -----------------------------
# Train/test split
# -----------------------------
unique_dates = np.sort(df["date_id"].unique())
n_dates = len(unique_dates)

n_test = max(180, int(0.2 * n_dates))
test_dates  = unique_dates[-n_test:]
train_dates = unique_dates[:-n_test]

train_mask = df["date_id"].isin(train_dates)
test_mask  = df["date_id"].isin(test_dates)

X_train, y_train = X_all[train_mask], y_all[train_mask]
X_test,  y_test  = X_all[test_mask],  y_all[test_mask]

df_train = df[train_mask].copy()
df_test  = df[test_mask].copy()

# -----------------------------
# Train model
# -----------------------------
params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.02,
    "num_leaves": 64,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "min_data_in_leaf": 50,
    "verbosity": -1,
}

dtrain = lgb.Dataset(X_train, label=y_train)
model = lgb.train(params, dtrain, num_boost_round=1200)

y_mean = y_train.mean()
y_std  = y_train.std() + 1e-9

def pred_to_allocation(pred, mean, std, alpha=0.5):
    z = (pred - mean) / (std + 1e-9)
    w = 1 + alpha * z
    return np.clip(w, 0, 2)

# -----------------------------
# Predict & backtest
# -----------------------------
pred = model.predict(X_test)
alloc = pred_to_allocation(pred, y_mean, y_std)

df_test["alloc"] = alloc

fwd = df_test["forward_returns"].values
rf  = df_test["risk_free_rate"].values

strategy_ret = alloc * fwd
strategy_excess = strategy_ret - rf

df_test["strategy_ret"] = strategy_ret
df_test["strategy_excess"] = strategy_excess

df_test["cum_strategy"] = (1 + strategy_ret).cumprod()
df_test["cum_market"]   = (1 + fwd).cumprod()

# -----------------------------
# Metrics
# -----------------------------
vol_s = strategy_excess.std()
vol_m = df_test["market_forward_excess_returns"].std() + 1e-9

mu_s = strategy_excess.mean()
mu_m = df_test["market_forward_excess_returns"].mean()

sharpe_s = mu_s / vol_s
vol_ratio = vol_s / vol_m

def max_drawdown(series):
    m = series.cummax()
    dd = (series - m) / m
    return dd.min()

mdd_s = max_drawdown(df_test["cum_strategy"])
mdd_m = max_drawdown(df_test["cum_market"])

print("\n=== BACKTEST RESULTS ===")
print(f"Cumulative Return (Strategy): {df_test['cum_strategy'].iloc[-1]:.4f}")
print(f"Cumulative Return (Market):   {df_test['cum_market'].iloc[-1]:.4f}")
print(f"Sharpe-like: {sharpe_s:.4f}")
print(f"Vol Ratio:   {vol_ratio:.4f}")
print(f"MaxDD Strat: {mdd_s:.4f}")
print(f"MaxDD Mkt:   {mdd_m:.4f}")

=== STEP 4: Training + Backtest ===

=== BACKTEST RESULTS ===
Cumulative Return (Strategy): 2.7420
Cumulative Return (Market):   2.7309
Sharpe-like: 0.0457
Vol Ratio:   1.0274
MaxDD Strat: -0.2900
MaxDD Mkt:   -0.2413
