# 2.8 Hyperparameter Optimization (Simple Returns)

Based on the best model from 2.5 (Sharpe 25).

**Key Change: Simple Returns Instead of Log Returns**
- Training target: `simple_return = exp(log_return) - 1`
- Evaluation: NO clipping (realistic assessment)
- This matches what backtrader actually computes

**Optimizes:**
- Architecture: latent dims, hidden sizes, news alpha
- Training: learning rate, weight decay, label smoothing

**Fixed (from 2.5 best):**
- Dropout: fund=0.8, price=0.4, news=0.2
- Single-pair-per-symbol training

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from itertools import product
from dataclasses import dataclass
from datetime import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from scipy.stats import spearmanr
import matplotlib.pyplot as plt

# Seed the RNGs this notebook draws from so that Restart & Run All repeats the
# same search: NumPy drives the pair shuffling and the random side swap in
# SinglePairDataset; torch drives weight init, dropout and DataLoader shuffling.
# NOTE(review): some GPU kernels may still be non-deterministic — confirm if
# bit-exact reproducibility on CUDA is required.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [2]:
@dataclass
class ModelConfig:
    """Settings for a single MultiBranchRanker training run.

    Every field carries a default, so the search cells only need to pass the
    values they want to override (by keyword).
    """

    # -- Input widths (fixed) --
    n_fundamental_features: int = 19
    n_price_features: int = 9
    n_embedding_dim: int = 768

    # -- Hidden-layer width of each encoder (tunable) --
    fund_hidden: int = 64
    price_hidden: int = 32
    news_hidden: int = 128

    # -- Output (latent) width of each encoder (tunable) --
    fundamental_latent: int = 32
    price_latent: int = 16
    news_latent: int = 32

    # -- Per-encoder dropout (held fixed at the 2.5 best values) --
    fundamental_dropout: float = 0.8
    price_dropout: float = 0.4
    news_dropout: float = 0.2

    # -- Multiplier applied to the news latent before fusion (tunable) --
    news_alpha: float = 0.8

    # -- Optimisation settings (tunable) --
    batch_size: int = 512
    learning_rate: float = 1e-3
    weight_decay: float = 1e-3
    label_smoothing: float = 0.1
    n_epochs: int = 15  # kept small so the grid search stays affordable

## 1. Load Data

In [3]:
# Load the modelling dataset and make sure the date column is a real datetime.
df = pd.read_parquet("data/ml_dataset.pqt")
df["feature_date"] = pd.to_datetime(df["feature_date"])

# Convert log returns to simple returns for training.
# target_return is log(next_close/close); we need (next_close/close - 1).
# np.expm1 evaluates exp(x) - 1 directly, avoiding the cancellation error the
# naive form suffers when x is close to zero.
df["simple_return"] = np.expm1(df["target_return"])

# Quick sanity summary of what was loaded and how the two return scales compare.
print(f"Dataset: {len(df):,} rows")
print(f"Date range: {df['feature_date'].min().date()} to {df['feature_date'].max().date()}")
print(f"\nReturn comparison:")
print(f"  Log return range:    [{df['target_return'].min():.3f}, {df['target_return'].max():.3f}]")
print(f"  Simple return range: [{df['simple_return'].min():.3f}, {df['simple_return'].max():.3f}]")

Dataset: 2,092,929 rows
Date range: 2021-01-13 to 2025-12-18

Return comparison:
  Log return range:    [-10.765, 10.633]
  Simple return range: [-1.000, 41469.588]


In [4]:
# Column groups fed to the three encoders.
price_feat_cols = [
    "overnight_gap_z",
    "intraday_ret_z",
    "ret_1d_z",
    "ret_2d_z",
    "ret_3d_z",
    "ret_5d_z",
    "vol_5d_z",
    "dist_from_high_5d_z",
    "dist_from_low_5d_z",
]

# Fundamentals = every remaining z-scored column except the news-count feature.
_non_fundamental = set(price_feat_cols) | {"news_count_z"}
fund_feat_cols = [
    col for col in df.columns
    if col.endswith("_z") and col not in _non_fundamental
]
emb_cols = [col for col in df.columns if col.startswith("emb_")]

for _group_label, _group_cols in (
    ("Price features", price_feat_cols),
    ("Fundamental features", fund_feat_cols),
    ("Embedding dims", emb_cols),
):
    print(f"{_group_label}: {len(_group_cols)}")

Price features: 9
Fundamental features: 19
Embedding dims: 768


In [5]:
# Chronological 70 / 10 / 20 split over the distinct feature dates.
dates = sorted(df["feature_date"].unique())
n_dates = len(dates)
train_end_idx, val_end_idx = int(n_dates * 0.7), int(n_dates * 0.8)

train_dates = set(dates[:train_end_idx])
val_dates = set(dates[train_end_idx:val_end_idx])
test_dates = set(dates[val_end_idx:])

# Materialise each split as its own frame so later cells can modify it freely.
_feature_date = df["feature_date"]
train_df = df[_feature_date.isin(train_dates)].copy()
val_df = df[_feature_date.isin(val_dates)].copy()
test_df = df[_feature_date.isin(test_dates)].copy()

for _split_name, _split_df, _split_dates in (
    ("Train", train_df, train_dates),
    ("Val", val_df, val_dates),
    ("Test", test_df, test_dates),
):
    print(f"{_split_name}: {len(_split_df):,} rows ({min(_split_dates).date()} to {max(_split_dates).date()})")

# News-only filtering for evaluation
def filter_news_only(df_in, emb_cols):
    """Return a copy of the rows whose embedding columns are not all zero.

    A row counts as having news when the sum of absolute values across
    ``emb_cols`` is strictly positive.
    """
    embedding_mass = df_in[emb_cols].abs().sum(axis=1)
    return df_in.loc[embedding_mass.gt(0)].copy()

# Validation / test frames restricted to rows that carry a news embedding.
val_df_news, test_df_news = (
    filter_news_only(_part, emb_cols) for _part in (val_df, test_df)
)
print(f"\nVal (news-only): {len(val_df_news):,}")
print(f"Test (news-only): {len(test_df_news):,}")

Train: 1,418,494 rows (2021-01-13 to 2024-05-01)
Val: 210,247 rows (2024-05-02 to 2024-10-21)
Test: 464,188 rows (2024-10-22 to 2025-12-18)

Val (news-only): 58,882
Test (news-only): 128,502


## 2. Dataset and Model

In [6]:
class SinglePairDataset(Dataset):
    """Pairwise dataset in which a symbol-row is used in at most one pair per day.

    Only rows with a non-zero news embedding are kept. Pair labels compare the
    ``simple_return`` column (simple, not log, returns).
    """

    def __init__(self, df, price_cols, fund_cols, emb_cols, verbose=True):
        # Keep rows that have news, with a fresh 0..n-1 index so that row
        # labels double as positions into the numpy arrays built below.
        news_mask = df[emb_cols].abs().sum(axis=1) > 0
        news_rows = df[news_mask].copy().reset_index(drop=True)
        if verbose:
            print(f"Filtered to news-only: {len(news_rows):,} rows")

        self.df = news_rows
        self.price_cols = price_cols
        self.fund_cols = fund_cols
        self.emb_cols = emb_cols

        # Row positions per date; a date needs at least two rows to form a pair.
        self.date_groups = {
            date: group.index.tolist()
            for date, group in self.df.groupby("feature_date")
            if len(group) >= 2
        }
        self.dates = list(self.date_groups)

        def as_float32(columns):
            return self.df[columns].values.astype(np.float32)

        self.price_arr = as_float32(price_cols)
        self.fund_arr = as_float32(fund_cols)
        self.emb_arr = as_float32(emb_cols)

        # Targets are simple returns, not the log-return column.
        self.target_arr = as_float32("simple_return")

        self.pairs = []
        self._generate_pairs(verbose=verbose)

    def _generate_pairs(self, verbose=False):
        """Shuffle each day's rows and pair them off consecutively."""
        fresh_pairs = []
        for date in self.dates:
            day_rows = list(self.date_groups[date])
            np.random.shuffle(day_rows)
            # (0,1), (2,3), ... — a trailing unpaired row is dropped.
            fresh_pairs.extend(zip(day_rows[0::2], day_rows[1::2]))
        self.pairs = fresh_pairs
        if verbose:
            print(f"Generated {len(self.pairs):,} pairs")

    def resample_pairs(self):
        """Draw a new random pairing (same number of pairs)."""
        self._generate_pairs()

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        first, second = self.pairs[idx]
        first_beats_second = self.target_arr[first] > self.target_arr[second]

        # Randomly swap which side is "i" so the label carries no positional
        # bias; the label is flipped together with the features.
        if np.random.random() < 0.5:
            first, second = second, first
            label = 0.0 if first_beats_second else 1.0
        else:
            label = 1.0 if first_beats_second else 0.0

        return {
            "price_i": torch.tensor(self.price_arr[first]),
            "price_j": torch.tensor(self.price_arr[second]),
            "fund_i": torch.tensor(self.fund_arr[first]),
            "fund_j": torch.tensor(self.fund_arr[second]),
            "emb_i": torch.tensor(self.emb_arr[first]),
            "emb_j": torch.tensor(self.emb_arr[second]),
            "label": torch.tensor(label),
        }

In [7]:
class MultiBranchRanker(nn.Module):
    """Scores a row from three feature groups (fundamentals, price, news).

    Each group has its own two-layer encoder; the latents are concatenated
    (news latent scaled by ``config.news_alpha``) and mapped to a scalar score.
    """

    @staticmethod
    def _encoder(in_dim, hidden_dim, latent_dim, dropout):
        """Linear -> ReLU -> Dropout -> Linear -> ReLU."""
        return nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU(),
        )

    def __init__(self, config):
        super().__init__()
        self.config = config
        cfg = config

        # Encoders are created in fund -> price -> news order.
        self.fund_encoder = self._encoder(
            cfg.n_fundamental_features, cfg.fund_hidden,
            cfg.fundamental_latent, cfg.fundamental_dropout,
        )
        self.price_encoder = self._encoder(
            cfg.n_price_features, cfg.price_hidden,
            cfg.price_latent, cfg.price_dropout,
        )
        self.news_encoder = self._encoder(
            cfg.n_embedding_dim, cfg.news_hidden,
            cfg.news_latent, cfg.news_dropout,
        )

        fused_dim = sum((cfg.fundamental_latent, cfg.price_latent, cfg.news_latent))
        self.output_head = nn.Sequential(
            nn.Linear(fused_dim, 32),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(32, 1),
        )

    def forward(self, price, fund, emb):
        """Return one score per row (the trailing singleton dim is squeezed)."""
        latent_parts = (
            self.fund_encoder(fund),
            self.price_encoder(price),
            self.config.news_alpha * self.news_encoder(emb),
        )
        fused = torch.cat(latent_parts, dim=-1)
        return self.output_head(fused).squeeze(-1)

    def forward_pair(self, price_i, fund_i, emb_i, price_j, fund_j, emb_j):
        """Return sigmoid(score_i - score_j), the probability that i outranks j."""
        score_i = self.forward(price_i, fund_i, emb_i)
        score_j = self.forward(price_j, fund_j, emb_j)
        margin = score_i - score_j
        return torch.sigmoid(margin)

## 3. Training and Evaluation

In [8]:
def pairwise_ranking_loss(pred_prob, label, smoothing=0.1):
    """Binary cross-entropy between ``pred_prob`` and label-smoothed targets.

    Hard labels (0 or 1) are pulled towards 0.5: a label of 1 becomes
    ``1 - smoothing / 2`` and a label of 0 becomes ``smoothing / 2``.
    """
    keep_weight = 1 - smoothing
    uniform_mass = 0.5 * smoothing
    soft_target = label * keep_weight + uniform_mass
    return F.binary_cross_entropy(input=pred_prob, target=soft_target)


def train_epoch(model, loader, optimizer, device, label_smoothing=0.1):
    """Run one optimisation pass over ``loader``.

    Returns the sample-weighted mean of the per-batch losses.
    """
    # Order matches the positional signature of model.forward_pair.
    input_keys = ("price_i", "fund_i", "emb_i", "price_j", "fund_j", "emb_j")

    model.train()
    loss_sum = 0
    n_seen = 0

    for batch in loader:
        pair_inputs = [batch[key].to(device) for key in input_keys]
        label = batch["label"].to(device)

        optimizer.zero_grad()
        pred_prob = model.forward_pair(*pair_inputs)
        loss = pairwise_ranking_loss(pred_prob, label, smoothing=label_smoothing)
        loss.backward()
        optimizer.step()

        # Weight by batch size so a short final batch does not skew the mean.
        batch_n = len(label)
        loss_sum += loss.item() * batch_n
        n_seen += batch_n

    return loss_sum / n_seen


@torch.no_grad()
def get_scores(model, df, price_cols, fund_cols, emb_cols, device, batch_size=1024):
    """Score every row of ``df`` with ``model`` and return a 1-D numpy array.

    The model is switched to eval mode (and left there). Features are read from
    the given column lists, cast to float32, and pushed through the model in
    chunks of ``batch_size`` rows with gradients disabled.

    Returns:
        Scores in the same row order as ``df``. An empty ``df`` yields an empty
        float32 array rather than an error.
    """
    model.eval()

    n_rows = len(df)
    if n_rows == 0:
        # np.concatenate([]) raises ValueError; return a well-formed empty result.
        return np.empty(0, dtype=np.float32)

    price_arr = torch.tensor(df[price_cols].values.astype(np.float32))
    fund_arr = torch.tensor(df[fund_cols].values.astype(np.float32))
    emb_arr = torch.tensor(df[emb_cols].values.astype(np.float32))

    scores = []
    for i in range(0, n_rows, batch_size):
        # Only the current chunk is moved to the device, to bound memory use.
        price = price_arr[i:i+batch_size].to(device)
        fund = fund_arr[i:i+batch_size].to(device)
        emb = emb_arr[i:i+batch_size].to(device)
        score = model(price, fund, emb)
        scores.append(score.cpu().numpy())

    return np.concatenate(scores)


def evaluate_model(model, df, price_cols, fund_cols, emb_cols, device):
    """Compute IC Sharpe and short strategy Sharpe using SIMPLE returns (no clipping).

    For every ``feature_date`` with at least 10 rows:
      * IC = Spearman rank correlation between model score and ``simple_return``
        (NaN correlations are skipped);
      * short return = minus the mean ``simple_return`` of the 5 lowest-scored rows.

    Both series are annualised with sqrt(252).

    Returns:
        dict with keys ``mean_ic``, ``ic_sharpe`` and ``short_sharpe``. A Sharpe
        whose denominator is exactly zero (or that has too few observations) is
        reported as 0.
    """
    df_eval = df.copy()
    df_eval["score"] = get_scores(model, df_eval, price_cols, fund_cols, emb_cols, device)

    ics = []
    returns = []
    # One pass over the dates collects both the IC and the short-leg return.
    for _date, group in df_eval.groupby("feature_date"):
        if len(group) < 10:
            continue

        # IC - use simple_return for correlation
        ic, _pvalue = spearmanr(group["score"], group["simple_return"])
        if not np.isnan(ic):
            ics.append(ic)

        # Short strategy - use simple_return, NO clipping (realistic)
        bottom = group.nsmallest(5, "score")
        returns.append(-bottom["simple_return"].mean())

    mean_ic = np.mean(ics) if ics else 0
    ic_std = np.std(ics) if ics else 1
    ic_sharpe = mean_ic / ic_std * np.sqrt(252) if ic_std > 0 else 0

    short_sharpe = 0
    if len(returns) > 1:
        ret_std = np.std(returns)
        # Guard the division like the IC Sharpe above; only an exact zero is
        # masked, so a NaN (bad return data) still propagates and stays visible.
        if ret_std != 0:
            short_sharpe = np.mean(returns) / ret_std * np.sqrt(252)

    return {"mean_ic": mean_ic, "ic_sharpe": ic_sharpe, "short_sharpe": short_sharpe}

## 4. Hyperparameter Search Space

In [9]:
# Candidate values for every tunable hyperparameter.
SEARCH_SPACE = {
    # Architecture: multiplier on the default latent dims (32, 16, 32)
    "latent_scale": [0.5, 1.0, 2.0],
    # Architecture: multiplier on the default hidden dims (64, 32, 128)
    "hidden_scale": [0.5, 1.0, 1.5],
    # Scaling of the news latent
    "news_alpha": [0.5, 0.8, 1.0],
    # Optimiser / loss settings
    "learning_rate": [5e-4, 1e-3, 2e-3],
    "weight_decay": [1e-4, 1e-3, 1e-2],
    "label_smoothing": [0.05, 0.1, 0.15],
}

# Size of the exhaustive grid = product of the number of options per knob.
total_full = 1
for _choices in SEARCH_SPACE.values():
    total_full *= len(_choices)
print(f"Full grid search would be: {total_full} combinations")
print("\nWe'll use a staged approach instead.")
print("\nWe'll use a staged approach instead.")

Full grid search would be: 729 combinations

We'll use a staged approach instead.


In [10]:
# Build the pairwise training set a single time; every search run reuses it.
train_dataset = SinglePairDataset(
    df=train_df,
    price_cols=price_feat_cols,
    fund_cols=fund_feat_cols,
    emb_cols=emb_cols,
    verbose=True,
)

Filtered to news-only: 339,872 rows
Generated 169,737 pairs


## 5. Stage 1: Architecture Search (Latent & Hidden Dims)

In [11]:
def train_config(config, train_dataset, val_df_news, n_epochs=15, verbose=False):
    """Train a single configuration and return validation metrics.

    Trains a fresh MultiBranchRanker with AdamW and cosine LR annealing for
    ``n_epochs`` epochs (this argument, not ``config.n_epochs``, sets the
    length), re-drawing the training pairs before each epoch. The model is
    evaluated on ``val_df_news`` every 5th epoch and after the last one, and
    the weights with the highest validation IC Sharpe are restored at the end.

    Relies on the notebook-level ``device``, ``price_feat_cols``,
    ``fund_feat_cols`` and ``emb_cols``.

    Returns:
        (metrics, model): validation metrics of the restored weights and the model.
    """
    model = MultiBranchRanker(config).to(device)
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config.learning_rate,
        weight_decay=config.weight_decay
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epochs)

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)

    best_ic_sharpe = -float('inf')
    best_state = None

    for epoch in range(n_epochs):
        train_dataset.resample_pairs()
        train_loss = train_epoch(model, train_loader, optimizer, device, config.label_smoothing)
        scheduler.step()

        # Evaluate every 5 epochs
        if (epoch + 1) % 5 == 0 or epoch == n_epochs - 1:
            metrics = evaluate_model(model, val_df_news, price_feat_cols, fund_feat_cols, emb_cols, device)
            if metrics["ic_sharpe"] > best_ic_sharpe:
                best_ic_sharpe = metrics["ic_sharpe"]
                # state_dict() hands back references to the live parameters and
                # dict.copy() is shallow, so each tensor must be cloned;
                # otherwise later epochs overwrite the "best" snapshot in place.
                best_state = {
                    name: tensor.detach().clone()
                    for name, tensor in model.state_dict().items()
                }
            if verbose:
                print(f"  Epoch {epoch+1}: IC_sharpe={metrics['ic_sharpe']:.2f}")

    # No snapshot exists if no evaluation ever improved on -inf (n_epochs == 0
    # or a NaN metric); keep the current weights instead of crashing.
    if best_state is not None:
        model.load_state_dict(best_state)
    final_metrics = evaluate_model(model, val_df_news, price_feat_cols, fund_feat_cols, emb_cols, device)

    return final_metrics, model

In [None]:
# Stage 1: sweep the architecture knobs with the training settings held at their defaults.
print("STAGE 1: Architecture Search")
print("=" * 70)

arch_configs = list(product(
    *(SEARCH_SPACE[knob] for knob in ("latent_scale", "hidden_scale", "news_alpha"))
))
n_arch_configs = len(arch_configs)

print(f"Testing {n_arch_configs} architecture configurations...\n")

arch_results = []
best_arch_sharpe = -float('inf')
best_arch_config = None

for run_no, (latent_scale, hidden_scale, news_alpha) in enumerate(arch_configs, start=1):
    config = ModelConfig(
        n_fundamental_features=len(fund_feat_cols),
        n_price_features=len(price_feat_cols),
        n_embedding_dim=len(emb_cols),
        # Hidden widths scale together ...
        fund_hidden=int(64 * hidden_scale),
        price_hidden=int(32 * hidden_scale),
        news_hidden=int(128 * hidden_scale),
        # ... and so do the latent widths.
        fundamental_latent=int(32 * latent_scale),
        price_latent=int(16 * latent_scale),
        news_latent=int(32 * latent_scale),
        news_alpha=news_alpha,
        # Dropout fixed at the 2.5 best values
        fundamental_dropout=0.8,
        price_dropout=0.4,
        news_dropout=0.2,
        # Default training params
        learning_rate=1e-3,
        weight_decay=1e-3,
        label_smoothing=0.1,
    )

    started_at = datetime.now()
    metrics, _ = train_config(config, train_dataset, val_df_news, n_epochs=15)
    elapsed = (datetime.now() - started_at).total_seconds()

    arch_results.append({
        "latent_scale": latent_scale,
        "hidden_scale": hidden_scale,
        "news_alpha": news_alpha,
        **metrics,
    })

    # Track the winner by validation IC Sharpe.
    if metrics["ic_sharpe"] > best_arch_sharpe:
        best_arch_sharpe = metrics["ic_sharpe"]
        best_arch_config = (latent_scale, hidden_scale, news_alpha)

    print(f"[{run_no:2d}/{n_arch_configs}] latent={latent_scale:.1f} hidden={hidden_scale:.1f} alpha={news_alpha:.1f} | "
          f"IC_sharpe={metrics['ic_sharpe']:5.2f} short_sharpe={metrics['short_sharpe']:5.2f} | {elapsed:.0f}s")

print(f"\nBest architecture: latent_scale={best_arch_config[0]}, hidden_scale={best_arch_config[1]}, news_alpha={best_arch_config[2]}")
print(f"Best IC Sharpe: {best_arch_sharpe:.2f}")

STAGE 1: Architecture Search
Testing 27 architecture configurations...



In [None]:
# Rank the architecture runs by validation IC Sharpe, best first.
arch_df = pd.DataFrame(arch_results)
arch_df = arch_df.sort_values(by="ic_sharpe", ascending=False)
print("\nTop 10 Architecture Configurations:")
_arch_top10 = arch_df.head(10)
print(_arch_top10.to_string(index=False))

## 6. Stage 2: Training Hyperparameter Search

In [None]:
# Stage 2: sweep the optimisation knobs with the Stage 1 architecture frozen.
print("\nSTAGE 2: Training Hyperparameter Search")
print("=" * 70)

best_latent, best_hidden, best_alpha = best_arch_config

train_configs = list(product(
    *(SEARCH_SPACE[knob] for knob in ("learning_rate", "weight_decay", "label_smoothing"))
))
n_train_configs = len(train_configs)

print(f"Testing {n_train_configs} training configurations with best architecture...\n")

train_results = []
best_train_sharpe = -float('inf')
best_train_config = None
best_model = None

for run_no, (lr, wd, smoothing) in enumerate(train_configs, start=1):
    config = ModelConfig(
        n_fundamental_features=len(fund_feat_cols),
        n_price_features=len(price_feat_cols),
        n_embedding_dim=len(emb_cols),
        # Architecture chosen in Stage 1
        fund_hidden=int(64 * best_hidden),
        price_hidden=int(32 * best_hidden),
        news_hidden=int(128 * best_hidden),
        fundamental_latent=int(32 * best_latent),
        price_latent=int(16 * best_latent),
        news_latent=int(32 * best_latent),
        news_alpha=best_alpha,
        # Dropout stays fixed
        fundamental_dropout=0.8,
        price_dropout=0.4,
        news_dropout=0.2,
        # The values under test
        learning_rate=lr,
        weight_decay=wd,
        label_smoothing=smoothing,
    )

    started_at = datetime.now()
    metrics, model = train_config(config, train_dataset, val_df_news, n_epochs=15)
    elapsed = (datetime.now() - started_at).total_seconds()

    train_results.append({
        "learning_rate": lr,
        "weight_decay": wd,
        "label_smoothing": smoothing,
        **metrics,
    })

    # Keep the best run's settings and its trained model.
    if metrics["ic_sharpe"] > best_train_sharpe:
        best_train_sharpe = metrics["ic_sharpe"]
        best_train_config = (lr, wd, smoothing)
        best_model = model

    print(f"[{run_no:2d}/{n_train_configs}] lr={lr:.0e} wd={wd:.0e} smooth={smoothing:.2f} | "
          f"IC_sharpe={metrics['ic_sharpe']:5.2f} short_sharpe={metrics['short_sharpe']:5.2f} | {elapsed:.0f}s")

print(f"\nBest training config: lr={best_train_config[0]:.0e}, wd={best_train_config[1]:.0e}, smooth={best_train_config[2]}")
print(f"Best IC Sharpe: {best_train_sharpe:.2f}")

In [None]:
# Rank the training-hyperparameter runs by validation IC Sharpe, best first.
train_df_results = pd.DataFrame(train_results)
train_df_results = train_df_results.sort_values(by="ic_sharpe", ascending=False)
print("\nTop 10 Training Configurations:")
_train_top10 = train_df_results.head(10)
print(_train_top10.to_string(index=False))

## 7. Final Model with Best Config

In [None]:
# Retrain the winning configuration from scratch with a longer schedule.
print("\nTRAINING FINAL MODEL")
print("=" * 70)

best_lr, best_wd, best_smooth = best_train_config

final_config = ModelConfig(
    n_fundamental_features=len(fund_feat_cols),
    n_price_features=len(price_feat_cols),
    n_embedding_dim=len(emb_cols),
    # Architecture from Stage 1
    fund_hidden=int(64 * best_hidden),
    price_hidden=int(32 * best_hidden),
    news_hidden=int(128 * best_hidden),
    fundamental_latent=int(32 * best_latent),
    price_latent=int(16 * best_latent),
    news_latent=int(32 * best_latent),
    news_alpha=best_alpha,
    # Dropout stays fixed
    fundamental_dropout=0.8,
    price_dropout=0.4,
    news_dropout=0.2,
    # Training settings from Stage 2
    learning_rate=best_lr,
    weight_decay=best_wd,
    label_smoothing=best_smooth,
    n_epochs=25,  # longer than the 15 epochs used during the search
)

fc = final_config
print("Final config:")
print(f"  Architecture: fund_hidden={fc.fund_hidden}, price_hidden={fc.price_hidden}, news_hidden={fc.news_hidden}")
print(f"  Latent: fund={fc.fundamental_latent}, price={fc.price_latent}, news={fc.news_latent}")
print(f"  News alpha: {fc.news_alpha}")
print(f"  Training: lr={fc.learning_rate:.0e}, wd={fc.weight_decay:.0e}, smooth={fc.label_smoothing}")

# train_config takes its epoch count from the argument, so pass the config's value along.
final_metrics, final_model = train_config(
    final_config,
    train_dataset,
    val_df_news,
    n_epochs=final_config.n_epochs,
    verbose=True,
)

In [None]:
# Score the final model on the held-out, news-only test rows.
test_metrics = evaluate_model(
    final_model, test_df_news, price_feat_cols, fund_feat_cols, emb_cols, device
)

_rule = "=" * 60
print("\n" + _rule)
print("FINAL MODEL - TEST SET RESULTS (NEWS-ONLY)")
print(_rule)
print(f"Mean IC:       {test_metrics['mean_ic']:.4f}")
print(f"IC Sharpe:     {test_metrics['ic_sharpe']:.2f}")
print(f"Short Sharpe:  {test_metrics['short_sharpe']:.2f}")

## 8. Compare with Baseline

In [None]:
# Train baseline (original 2.5 config) for comparison
print("\nTraining baseline (2.5 config) for comparison...")

baseline_config = ModelConfig(
    n_fundamental_features=len(fund_feat_cols),
    n_price_features=len(price_feat_cols),
    n_embedding_dim=len(emb_cols),
    # Original 2.5 architecture
    fund_hidden=64,
    price_hidden=32,
    news_hidden=128,
    fundamental_latent=32,
    price_latent=16,
    news_latent=32,
    news_alpha=0.8,
    # Same dropout
    fundamental_dropout=0.8,
    price_dropout=0.4,
    news_dropout=0.2,
    # Original training params
    learning_rate=1e-3,
    weight_decay=1e-3,
    label_smoothing=0.1,
)

# Same 25-epoch budget as the final optimised model, so the comparison is like for like.
# The trained model gets a real name: `_` is IPython's last-output variable and
# reads as "discarded", yet the value is needed for the test-set evaluation.
baseline_metrics, baseline_model = train_config(baseline_config, train_dataset, val_df_news, n_epochs=25)
baseline_test = evaluate_model(baseline_model, test_df_news, price_feat_cols, fund_feat_cols, emb_cols, device)

In [None]:
# Side-by-side test-set metrics: baseline vs. optimised, plus the difference.
_rule = "=" * 60
print("\n" + _rule)
print("COMPARISON: OPTIMIZED vs BASELINE")
print(_rule)
print(f"{'Metric':<20} {'Baseline':>12} {'Optimized':>12} {'Improvement':>12}")
print("-" * 60)
for _row_label, _metric_key, _num_fmt in (
    ("IC Sharpe", "ic_sharpe", ".2f"),
    ("Short Sharpe", "short_sharpe", ".2f"),
    ("Mean IC", "mean_ic", ".4f"),
):
    _base_val = baseline_test[_metric_key]
    _opt_val = test_metrics[_metric_key]
    print(f"{_row_label:<20} {_base_val:>12{_num_fmt}} {_opt_val:>12{_num_fmt}} {_opt_val - _base_val:>+12{_num_fmt}}")

## 9. Save Results

In [None]:
# Bundle everything the search produced.
_best_config = {
    "latent_scale": best_latent,
    "hidden_scale": best_hidden,
    "news_alpha": best_alpha,
    "learning_rate": best_lr,
    "weight_decay": best_wd,
    "label_smoothing": best_smooth,
}
all_results = {
    "architecture_search": arch_results,
    "training_search": train_results,
    "best_config": _best_config,
    "test_metrics": test_metrics,
    "baseline_metrics": baseline_test,
}

# Persist the two search tables.
for _records, _out_path in (
    (arch_results, "data/hyperparam_arch_results.pqt"),
    (train_results, "data/hyperparam_train_results.pqt"),
):
    pd.DataFrame(_records).to_parquet(_out_path)
print("Saved search results to data/hyperparam_*.pqt")

# Persist the final model together with what is needed to rebuild and audit it.
_checkpoint = {
    "model_state_dict": final_model.state_dict(),
    "config": final_config,
    "price_cols": price_feat_cols,
    "fund_cols": fund_feat_cols,
    "emb_cols": emb_cols,
    "training_approach": "simple_returns_no_clip",  # marks the simple-return variant
    "search_results": all_results,
    "test_metrics": test_metrics,
}
torch.save(_checkpoint, "data/model_simple_returns.pt")

print("Saved best model to data/model_simple_returns.pt")
print("\nNote: This model was trained on SIMPLE returns (not log) and evaluated WITHOUT clipping")