# 2.7 Hyperparameter Optimization

Unified optimization of architecture, training, and dropout hyperparameters.

**Selection Metric: IC Sharpe (not Short Sharpe)**
- IC Sharpe is bounded and robust to outliers
- Short Sharpe is highly sensitive to single extreme trades
- We track both but SELECT on IC Sharpe

**Search Space:**
1. Architecture: latent dims, hidden sizes, news alpha
2. Training: learning rate, weight decay, label smoothing
3. Dropout: fundamental, price, news dropout rates

**Approach:** Staged search to reduce combinations

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from itertools import product
from dataclasses import dataclass, asdict
from datetime import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from scipy.stats import spearmanr
import matplotlib.pyplot as plt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (14, 5)

Using device: cpu


In [2]:
@dataclass
class ModelConfig:
    """Full model configuration."""
    # Feature dimensions (fixed by data)
    n_fundamental_features: int = 19
    n_price_features: int = 9
    n_embedding_dim: int = 768
    
    # Hidden layer sizes
    fund_hidden: int = 64
    price_hidden: int = 32
    news_hidden: int = 128
    
    # Encoder latent dimensions
    fundamental_latent: int = 32
    price_latent: int = 16
    news_latent: int = 32
    
    # Dropout rates
    fundamental_dropout: float = 0.5
    price_dropout: float = 0.3
    news_dropout: float = 0.2
    
    # News influence
    news_alpha: float = 0.8
    
    # Training
    batch_size: int = 512
    learning_rate: float = 1e-3
    weight_decay: float = 1e-3
    label_smoothing: float = 0.1
    n_epochs: int = 15

## 1. Load Data

In [3]:
df = pd.read_parquet("data/ml_dataset.pqt")
df["feature_date"] = pd.to_datetime(df["feature_date"])

# Convert log returns to simple returns
df["simple_return"] = np.exp(df["target_return"]) - 1

# Clip extreme values (data errors) - only for TRAINING
CLIP_LIMIT = 1.0  # Â±100%
n_clipped = ((df["simple_return"] < -CLIP_LIMIT) | (df["simple_return"] > CLIP_LIMIT)).sum()
df["simple_return_clipped"] = df["simple_return"].clip(-CLIP_LIMIT, CLIP_LIMIT)

print(f"Dataset: {len(df):,} rows")
print(f"Date range: {df['feature_date'].min().date()} to {df['feature_date'].max().date()}")
print(f"Clipped {n_clipped:,} extreme values for training")

Dataset: 2,092,929 rows
Date range: 2021-01-13 to 2025-12-18
Clipped 81 extreme values for training


In [4]:
# Feature columns
price_feat_cols = [
    "overnight_gap_z", "intraday_ret_z",
    "ret_1d_z", "ret_2d_z", "ret_3d_z", "ret_5d_z",
    "vol_5d_z", "dist_from_high_5d_z", "dist_from_low_5d_z"
]
fund_feat_cols = [c for c in df.columns if c.endswith("_z") and c not in price_feat_cols and c != "news_count_z"]
emb_cols = [c for c in df.columns if c.startswith("emb_")]

print(f"Price features: {len(price_feat_cols)}")
print(f"Fundamental features: {len(fund_feat_cols)}")
print(f"Embedding dims: {len(emb_cols)}")

Price features: 9
Fundamental features: 19
Embedding dims: 768


In [5]:
# Time-based split: 70% train, 10% val, 20% test
dates = sorted(df["feature_date"].unique())
n_dates = len(dates)
train_end_idx = int(n_dates * 0.7)
val_end_idx = int(n_dates * 0.8)

train_dates = set(dates[:train_end_idx])
val_dates = set(dates[train_end_idx:val_end_idx])
test_dates = set(dates[val_end_idx:])

train_df = df[df["feature_date"].isin(train_dates)].copy()
val_df = df[df["feature_date"].isin(val_dates)].copy()
test_df = df[df["feature_date"].isin(test_dates)].copy()

print(f"Train: {len(train_df):,} rows ({min(train_dates).date()} to {max(train_dates).date()})")
print(f"Val:   {len(val_df):,} rows ({min(val_dates).date()} to {max(val_dates).date()})")
print(f"Test:  {len(test_df):,} rows ({min(test_dates).date()} to {max(test_dates).date()})")

# News-only filtering for evaluation
def filter_news_only(df_in, emb_cols):
    has_news = (df_in[emb_cols].abs().sum(axis=1) > 0)
    return df_in[has_news].copy()

val_df_news = filter_news_only(val_df, emb_cols)
test_df_news = filter_news_only(test_df, emb_cols)
print(f"\nVal (news-only):  {len(val_df_news):,}")
print(f"Test (news-only): {len(test_df_news):,}")

Train: 1,418,494 rows (2021-01-13 to 2024-05-01)
Val:   210,247 rows (2024-05-02 to 2024-10-21)
Test:  464,188 rows (2024-10-22 to 2025-12-18)

Val (news-only):  58,882
Test (news-only): 128,502


## 2. Dataset and Model

In [6]:
class SinglePairDataset(Dataset):
    """Dataset where each symbol appears in exactly one pair per day."""

    def __init__(self, df, price_cols, fund_cols, emb_cols, use_clipped=True, verbose=True):
        has_news = (df[emb_cols].abs().sum(axis=1) > 0)
        df_news = df[has_news].copy().reset_index(drop=True)
        if verbose:
            print(f"Filtered to news-only: {len(df_news):,} rows")

        self.df = df_news
        self.price_cols = price_cols
        self.fund_cols = fund_cols
        self.emb_cols = emb_cols

        self.date_groups = {}
        for date, group in self.df.groupby("feature_date"):
            indices = group.index.tolist()
            if len(indices) >= 2:
                self.date_groups[date] = indices

        self.dates = list(self.date_groups.keys())

        self.price_arr = self.df[price_cols].values.astype(np.float32)
        self.fund_arr = self.df[fund_cols].values.astype(np.float32)
        self.emb_arr = self.df[emb_cols].values.astype(np.float32)
        
        # Use clipped returns for training (avoid extreme outlier influence)
        target_col = "simple_return_clipped" if use_clipped else "simple_return"
        self.target_arr = self.df[target_col].values.astype(np.float32)

        self.pairs = []
        self._generate_pairs(verbose=verbose)

    def _generate_pairs(self, verbose=False):
        pairs = []
        for date in self.dates:
            indices = list(self.date_groups[date])
            np.random.shuffle(indices)
            for i in range(0, len(indices) - 1, 2):
                pairs.append((indices[i], indices[i + 1]))
        self.pairs = pairs
        if verbose:
            print(f"Generated {len(self.pairs):,} pairs")

    def resample_pairs(self):
        self._generate_pairs()

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        i, j = self.pairs[idx]
        price_i, price_j = self.price_arr[i], self.price_arr[j]
        fund_i, fund_j = self.fund_arr[i], self.fund_arr[j]
        emb_i, emb_j = self.emb_arr[i], self.emb_arr[j]
        actual_label = 1.0 if self.target_arr[i] > self.target_arr[j] else 0.0

        # Random swap for symmetry
        if np.random.random() < 0.5:
            price_i, price_j = price_j, price_i
            fund_i, fund_j = fund_j, fund_i
            emb_i, emb_j = emb_j, emb_i
            actual_label = 1.0 - actual_label

        return {
            "price_i": torch.tensor(price_i), "price_j": torch.tensor(price_j),
            "fund_i": torch.tensor(fund_i), "fund_j": torch.tensor(fund_j),
            "emb_i": torch.tensor(emb_i), "emb_j": torch.tensor(emb_j),
            "label": torch.tensor(actual_label),
        }

In [7]:
class MultiBranchRanker(nn.Module):
    """Multi-branch ranking model with configurable architecture."""
    
    def __init__(self, config):
        super().__init__()
        self.config = config
        
        self.fund_encoder = nn.Sequential(
            nn.Linear(config.n_fundamental_features, config.fund_hidden),
            nn.ReLU(),
            nn.Dropout(config.fundamental_dropout),
            nn.Linear(config.fund_hidden, config.fundamental_latent),
            nn.ReLU(),
        )
        
        self.price_encoder = nn.Sequential(
            nn.Linear(config.n_price_features, config.price_hidden),
            nn.ReLU(),
            nn.Dropout(config.price_dropout),
            nn.Linear(config.price_hidden, config.price_latent),
            nn.ReLU(),
        )
        
        self.news_encoder = nn.Sequential(
            nn.Linear(config.n_embedding_dim, config.news_hidden),
            nn.ReLU(),
            nn.Dropout(config.news_dropout),
            nn.Linear(config.news_hidden, config.news_latent),
            nn.ReLU(),
        )
        
        fused_dim = config.fundamental_latent + config.price_latent + config.news_latent
        self.output_head = nn.Sequential(
            nn.Linear(fused_dim, 32),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(32, 1),
        )
    
    def forward(self, price, fund, emb):
        h_f = self.fund_encoder(fund)
        h_p = self.price_encoder(price)
        h_n = self.news_encoder(emb)
        h_n_scaled = self.config.news_alpha * h_n
        h = torch.cat([h_f, h_p, h_n_scaled], dim=-1)
        return self.output_head(h).squeeze(-1)
    
    def forward_pair(self, price_i, fund_i, emb_i, price_j, fund_j, emb_j):
        score_i = self.forward(price_i, fund_i, emb_i)
        score_j = self.forward(price_j, fund_j, emb_j)
        return torch.sigmoid(score_i - score_j)

## 3. Training and Evaluation

In [8]:
def pairwise_ranking_loss(pred_prob, label, smoothing=0.1):
    smoothed_label = label * (1 - smoothing) + 0.5 * smoothing
    return F.binary_cross_entropy(pred_prob, smoothed_label)


def train_epoch(model, loader, optimizer, device, label_smoothing=0.1):
    model.train()
    total_loss = 0
    total_samples = 0
    
    for batch in loader:
        price_i = batch["price_i"].to(device)
        price_j = batch["price_j"].to(device)
        fund_i = batch["fund_i"].to(device)
        fund_j = batch["fund_j"].to(device)
        emb_i = batch["emb_i"].to(device)
        emb_j = batch["emb_j"].to(device)
        label = batch["label"].to(device)
        
        optimizer.zero_grad()
        pred_prob = model.forward_pair(price_i, fund_i, emb_i, price_j, fund_j, emb_j)
        loss = pairwise_ranking_loss(pred_prob, label, smoothing=label_smoothing)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item() * len(label)
        total_samples += len(label)
    
    return total_loss / total_samples


@torch.no_grad()
def get_scores(model, df, price_cols, fund_cols, emb_cols, device, batch_size=1024):
    model.eval()
    price_arr = torch.tensor(df[price_cols].values.astype(np.float32))
    fund_arr = torch.tensor(df[fund_cols].values.astype(np.float32))
    emb_arr = torch.tensor(df[emb_cols].values.astype(np.float32))
    
    scores = []
    for i in range(0, len(df), batch_size):
        price = price_arr[i:i+batch_size].to(device)
        fund = fund_arr[i:i+batch_size].to(device)
        emb = emb_arr[i:i+batch_size].to(device)
        score = model(price, fund, emb)
        scores.append(score.cpu().numpy())
    
    return np.concatenate(scores)


def evaluate_model(model, df, price_cols, fund_cols, emb_cols, device, k=5):
    """Compute IC Sharpe and short strategy Sharpe.
    
    IC Sharpe: Uses ranks, bounded, robust to outliers
    Short Sharpe: Uses actual returns, sensitive to outliers (for reporting only)
    """
    df_eval = df.copy()
    df_eval["score"] = get_scores(model, df_eval, price_cols, fund_cols, emb_cols, device)
    
    # IC Sharpe (rank-based, robust)
    ics = []
    for date, group in df_eval.groupby("feature_date"):
        if len(group) < 10:
            continue
        ic, _ = spearmanr(group["score"], group["simple_return"])
        if not np.isnan(ic):
            ics.append(ic)
    
    mean_ic = np.mean(ics) if ics else 0
    ic_std = np.std(ics) if ics else 1
    ic_sharpe = mean_ic / ic_std * np.sqrt(252) if ic_std > 0 else 0
    
    # Short strategy Sharpe (for reporting, sensitive to outliers)
    returns = []
    for date, group in df_eval.groupby("feature_date"):
        if len(group) < 10:
            continue
        bottom = group.nsmallest(k, "score")
        short_ret = -bottom["simple_return"].mean()
        returns.append(short_ret)
    
    if len(returns) > 1:
        short_sharpe = np.mean(returns) / np.std(returns) * np.sqrt(252)
    else:
        short_sharpe = 0
    
    return {"mean_ic": mean_ic, "ic_sharpe": ic_sharpe, "short_sharpe": short_sharpe}

In [9]:
def train_and_evaluate(config, train_dataset, val_df_news, n_epochs=15, verbose=False):
    """Train a model and return validation metrics.
    
    Selects best checkpoint by IC Sharpe (robust to outliers).
    """
    model = MultiBranchRanker(config).to(device)
    optimizer = torch.optim.AdamW(
        model.parameters(), 
        lr=config.learning_rate, 
        weight_decay=config.weight_decay
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epochs)
    
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    
    best_ic_sharpe = -float('inf')
    best_state = None
    
    for epoch in range(n_epochs):
        train_dataset.resample_pairs()
        train_loss = train_epoch(model, train_loader, optimizer, device, config.label_smoothing)
        scheduler.step()
        
        # Evaluate every 5 epochs
        if (epoch + 1) % 5 == 0 or epoch == n_epochs - 1:
            metrics = evaluate_model(model, val_df_news, price_feat_cols, fund_feat_cols, emb_cols, device)
            # Select by IC Sharpe (robust to outliers)
            if metrics["ic_sharpe"] > best_ic_sharpe:
                best_ic_sharpe = metrics["ic_sharpe"]
                best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            if verbose:
                print(f"  Epoch {epoch+1}: IC={metrics['ic_sharpe']:.2f}, Short={metrics['short_sharpe']:.2f}")
    
    model.load_state_dict(best_state)
    final_metrics = evaluate_model(model, val_df_news, price_feat_cols, fund_feat_cols, emb_cols, device)
    
    return final_metrics, model

In [10]:
# Create training dataset
train_dataset = SinglePairDataset(
    train_df, price_feat_cols, fund_feat_cols, emb_cols, 
    use_clipped=True,  # Use clipped returns for training
    verbose=True
)

Filtered to news-only: 339,872 rows
Generated 169,737 pairs


## 4. Search Space Definition

In [11]:
# Full search space (staged to reduce combinations)
SEARCH_SPACE = {
    # Stage 1: Architecture
    "latent_scale": [0.5, 1.0, 2.0],
    "hidden_scale": [0.5, 1.0, 1.5],
    "news_alpha": [0.6, 0.8, 1.0],
    
    # Stage 2: Training
    "learning_rate": [5e-4, 1e-3, 2e-3],
    "weight_decay": [1e-4, 1e-3, 1e-2],
    "label_smoothing": [0.05, 0.1, 0.15],
    
    # Stage 3: Dropout
    "fund_dropout": [0.4, 0.5, 0.6, 0.7],
    "price_dropout": [0.2, 0.3, 0.4],
    "news_dropout": [0.1, 0.2, 0.3],
}

n_arch = len(SEARCH_SPACE["latent_scale"]) * len(SEARCH_SPACE["hidden_scale"]) * len(SEARCH_SPACE["news_alpha"])
n_train = len(SEARCH_SPACE["learning_rate"]) * len(SEARCH_SPACE["weight_decay"]) * len(SEARCH_SPACE["label_smoothing"])
n_dropout = len(SEARCH_SPACE["fund_dropout"]) * len(SEARCH_SPACE["price_dropout"]) * len(SEARCH_SPACE["news_dropout"])

print(f"Stage 1 (Architecture): {n_arch} configs")
print(f"Stage 2 (Training):     {n_train} configs")
print(f"Stage 3 (Dropout):      {n_dropout} configs")
print(f"Total (staged):         {n_arch + n_train + n_dropout} configs")
print(f"Full grid would be:     {n_arch * n_train * n_dropout // (3*3*3):,} configs")

Stage 1 (Architecture): 27 configs
Stage 2 (Training):     27 configs
Stage 3 (Dropout):      36 configs
Total (staged):         90 configs
Full grid would be:     972 configs


## 5. Stage 1: Architecture Search

In [13]:
print("STAGE 1: ARCHITECTURE SEARCH")
print("Selection metric: IC Sharpe (robust to outliers)")
print("=" * 70)

arch_configs = list(product(
    SEARCH_SPACE["latent_scale"],
    SEARCH_SPACE["hidden_scale"],
    SEARCH_SPACE["news_alpha"],
))

arch_results = []
best_ic = -float('inf')
best_arch = None

for i, (latent_scale, hidden_scale, news_alpha) in enumerate(arch_configs):
    config = ModelConfig(
        n_fundamental_features=len(fund_feat_cols),
        n_price_features=len(price_feat_cols),
        n_embedding_dim=len(emb_cols),
        fund_hidden=int(64 * hidden_scale),
        price_hidden=int(32 * hidden_scale),
        news_hidden=int(128 * hidden_scale),
        fundamental_latent=int(32 * latent_scale),
        price_latent=int(16 * latent_scale),
        news_latent=int(32 * latent_scale),
        news_alpha=news_alpha,
        # Default dropout and training
        fundamental_dropout=0.8,
        price_dropout=0.4,
        news_dropout=0.2,
    )
    
    start = datetime.now()
    metrics, _ = train_and_evaluate(config, train_dataset, val_df_news, n_epochs=15)
    elapsed = (datetime.now() - start).total_seconds()
    
    arch_results.append({
        "latent_scale": latent_scale,
        "hidden_scale": hidden_scale,
        "news_alpha": news_alpha,
        **metrics,
    })
    
    if metrics["ic_sharpe"] > best_ic:
        best_ic = metrics["ic_sharpe"]
        best_arch = (latent_scale, hidden_scale, news_alpha)
    
    print(f"[{i+1:2d}/{len(arch_configs)}] lat={latent_scale:.1f} hid={hidden_scale:.1f} alpha={news_alpha:.1f} | "
          f"IC={metrics['ic_sharpe']:5.2f} short={metrics['short_sharpe']:5.2f} | {elapsed:.0f}s")

print(f"\nBest architecture: latent={best_arch[0]}, hidden={best_arch[1]}, alpha={best_arch[2]} -> IC={best_ic:.2f}")

STAGE 1: ARCHITECTURE SEARCH
Selection metric: IC Sharpe (robust to outliers)
[ 1/27] lat=0.5 hid=0.5 alpha=0.6 | IC= 4.79 short=-0.48 | 133s
[ 2/27] lat=0.5 hid=0.5 alpha=0.8 | IC= 4.69 short=-0.40 | 134s
[ 3/27] lat=0.5 hid=0.5 alpha=1.0 | IC= 4.19 short=-0.25 | 134s
[ 4/27] lat=0.5 hid=1.0 alpha=0.6 | IC= 5.10 short=-0.94 | 146s
[ 5/27] lat=0.5 hid=1.0 alpha=0.8 | IC= 4.32 short=-0.97 | 146s
[ 6/27] lat=0.5 hid=1.0 alpha=1.0 | IC= 4.36 short=-0.42 | 146s
[ 7/27] lat=0.5 hid=1.5 alpha=0.6 | IC= 4.59 short=-0.04 | 155s
[ 8/27] lat=0.5 hid=1.5 alpha=0.8 | IC= 4.59 short= 0.06 | 156s
[ 9/27] lat=0.5 hid=1.5 alpha=1.0 | IC= 4.72 short=-0.36 | 156s
[10/27] lat=1.0 hid=0.5 alpha=0.6 | IC= 4.45 short=-0.07 | 136s
[11/27] lat=1.0 hid=0.5 alpha=0.8 | IC= 4.13 short=-0.79 | 136s
[12/27] lat=1.0 hid=0.5 alpha=1.0 | IC= 4.14 short=-0.40 | 138s
[13/27] lat=1.0 hid=1.0 alpha=0.6 | IC= 4.41 short=-0.50 | 149s


KeyboardInterrupt: 

In [None]:
arch_df = pd.DataFrame(arch_results).sort_values("ic_sharpe", ascending=False)
print("\nTop 10 by IC Sharpe:")
print(arch_df.head(10).to_string(index=False))

## 6. Stage 2: Training Hyperparameter Search

In [None]:
print("\nSTAGE 2: TRAINING HYPERPARAMETER SEARCH")
print("=" * 70)

best_latent, best_hidden, best_alpha = best_arch

train_configs = list(product(
    SEARCH_SPACE["learning_rate"],
    SEARCH_SPACE["weight_decay"],
    SEARCH_SPACE["label_smoothing"],
))

train_results = []
best_train_ic = -float('inf')
best_train = None

for i, (lr, wd, smoothing) in enumerate(train_configs):
    config = ModelConfig(
        n_fundamental_features=len(fund_feat_cols),
        n_price_features=len(price_feat_cols),
        n_embedding_dim=len(emb_cols),
        # Best architecture
        fund_hidden=int(64 * best_hidden),
        price_hidden=int(32 * best_hidden),
        news_hidden=int(128 * best_hidden),
        fundamental_latent=int(32 * best_latent),
        price_latent=int(16 * best_latent),
        news_latent=int(32 * best_latent),
        news_alpha=best_alpha,
        # Default dropout
        fundamental_dropout=0.5,
        price_dropout=0.3,
        news_dropout=0.2,
        # Tuned training
        learning_rate=lr,
        weight_decay=wd,
        label_smoothing=smoothing,
    )
    
    start = datetime.now()
    metrics, _ = train_and_evaluate(config, train_dataset, val_df_news, n_epochs=15)
    elapsed = (datetime.now() - start).total_seconds()
    
    train_results.append({
        "learning_rate": lr,
        "weight_decay": wd,
        "label_smoothing": smoothing,
        **metrics,
    })
    
    if metrics["ic_sharpe"] > best_train_ic:
        best_train_ic = metrics["ic_sharpe"]
        best_train = (lr, wd, smoothing)
    
    print(f"[{i+1:2d}/{len(train_configs)}] lr={lr:.0e} wd={wd:.0e} smooth={smoothing:.2f} | "
          f"IC={metrics['ic_sharpe']:5.2f} short={metrics['short_sharpe']:5.2f} | {elapsed:.0f}s")

print(f"\nBest training: lr={best_train[0]:.0e}, wd={best_train[1]:.0e}, smooth={best_train[2]} -> IC={best_train_ic:.2f}")

In [None]:
train_df_results = pd.DataFrame(train_results).sort_values("ic_sharpe", ascending=False)
print("\nTop 10 by IC Sharpe:")
print(train_df_results.head(10).to_string(index=False))

## 7. Stage 3: Dropout Search

In [None]:
print("\nSTAGE 3: DROPOUT SEARCH")
print("=" * 70)

best_lr, best_wd, best_smooth = best_train

dropout_configs = list(product(
    SEARCH_SPACE["fund_dropout"],
    SEARCH_SPACE["price_dropout"],
    SEARCH_SPACE["news_dropout"],
))

dropout_results = []
best_dropout_ic = -float('inf')
best_dropout = None
best_model = None

for i, (fund_do, price_do, news_do) in enumerate(dropout_configs):
    config = ModelConfig(
        n_fundamental_features=len(fund_feat_cols),
        n_price_features=len(price_feat_cols),
        n_embedding_dim=len(emb_cols),
        # Best architecture
        fund_hidden=int(64 * best_hidden),
        price_hidden=int(32 * best_hidden),
        news_hidden=int(128 * best_hidden),
        fundamental_latent=int(32 * best_latent),
        price_latent=int(16 * best_latent),
        news_latent=int(32 * best_latent),
        news_alpha=best_alpha,
        # Tuned dropout
        fundamental_dropout=fund_do,
        price_dropout=price_do,
        news_dropout=news_do,
        # Best training
        learning_rate=best_lr,
        weight_decay=best_wd,
        label_smoothing=best_smooth,
    )
    
    start = datetime.now()
    metrics, model = train_and_evaluate(config, train_dataset, val_df_news, n_epochs=15)
    elapsed = (datetime.now() - start).total_seconds()
    
    dropout_results.append({
        "fund_dropout": fund_do,
        "price_dropout": price_do,
        "news_dropout": news_do,
        **metrics,
    })
    
    if metrics["ic_sharpe"] > best_dropout_ic:
        best_dropout_ic = metrics["ic_sharpe"]
        best_dropout = (fund_do, price_do, news_do)
        best_model = model
    
    print(f"[{i+1:2d}/{len(dropout_configs)}] fund={fund_do:.1f} price={price_do:.1f} news={news_do:.1f} | "
          f"IC={metrics['ic_sharpe']:5.2f} short={metrics['short_sharpe']:5.2f} | {elapsed:.0f}s")

print(f"\nBest dropout: fund={best_dropout[0]}, price={best_dropout[1]}, news={best_dropout[2]} -> IC={best_dropout_ic:.2f}")

In [None]:
dropout_df = pd.DataFrame(dropout_results).sort_values("ic_sharpe", ascending=False)
print("\nTop 10 by IC Sharpe:")
print(dropout_df.head(10).to_string(index=False))

## 8. Train Final Model

In [None]:
print("\nTRAINING FINAL MODEL")
print("=" * 70)

best_fund_do, best_price_do, best_news_do = best_dropout

final_config = ModelConfig(
    n_fundamental_features=len(fund_feat_cols),
    n_price_features=len(price_feat_cols),
    n_embedding_dim=len(emb_cols),
    # Best architecture
    fund_hidden=int(64 * best_hidden),
    price_hidden=int(32 * best_hidden),
    news_hidden=int(128 * best_hidden),
    fundamental_latent=int(32 * best_latent),
    price_latent=int(16 * best_latent),
    news_latent=int(32 * best_latent),
    news_alpha=best_alpha,
    # Best dropout
    fundamental_dropout=best_fund_do,
    price_dropout=best_price_do,
    news_dropout=best_news_do,
    # Best training
    learning_rate=best_lr,
    weight_decay=best_wd,
    label_smoothing=best_smooth,
    n_epochs=25,
)

print("Final configuration:")
print(f"  Architecture: hidden=({final_config.fund_hidden}, {final_config.price_hidden}, {final_config.news_hidden})")
print(f"  Latent dims:  ({final_config.fundamental_latent}, {final_config.price_latent}, {final_config.news_latent})")
print(f"  News alpha:   {final_config.news_alpha}")
print(f"  Dropout:      ({final_config.fundamental_dropout}, {final_config.price_dropout}, {final_config.news_dropout})")
print(f"  Training:     lr={final_config.learning_rate:.0e}, wd={final_config.weight_decay:.0e}, smooth={final_config.label_smoothing}")
print()

final_metrics, final_model = train_and_evaluate(final_config, train_dataset, val_df_news, n_epochs=25, verbose=True)

In [None]:
# Evaluate on test set
test_metrics = evaluate_model(final_model, test_df_news, price_feat_cols, fund_feat_cols, emb_cols, device)

print("\n" + "=" * 60)
print("FINAL MODEL - TEST SET RESULTS")
print("=" * 60)
print(f"Mean IC:       {test_metrics['mean_ic']:.4f}")
print(f"IC Sharpe:     {test_metrics['ic_sharpe']:.2f}")
print(f"Short Sharpe:  {test_metrics['short_sharpe']:.2f}  (for reference only, sensitive to outliers)")

## 9. Visualization

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Architecture search
ax = axes[0]
arch_df_sorted = arch_df.sort_values("ic_sharpe", ascending=True).tail(10)
labels = [f"l={r['latent_scale']:.1f} h={r['hidden_scale']:.1f}" for _, r in arch_df_sorted.iterrows()]
ax.barh(labels, arch_df_sorted["ic_sharpe"], color='steelblue')
ax.set_xlabel('IC Sharpe')
ax.set_title('Stage 1: Architecture')

# Training search
ax = axes[1]
train_sorted = train_df_results.sort_values("ic_sharpe", ascending=True).tail(10)
labels = [f"lr={r['learning_rate']:.0e}" for _, r in train_sorted.iterrows()]
ax.barh(labels, train_sorted["ic_sharpe"], color='coral')
ax.set_xlabel('IC Sharpe')
ax.set_title('Stage 2: Training')

# Dropout search
ax = axes[2]
drop_sorted = dropout_df.sort_values("ic_sharpe", ascending=True).tail(10)
labels = [f"f={r['fund_dropout']:.1f} p={r['price_dropout']:.1f}" for _, r in drop_sorted.iterrows()]
ax.barh(labels, drop_sorted["ic_sharpe"], color='green')
ax.set_xlabel('IC Sharpe')
ax.set_title('Stage 3: Dropout')

plt.tight_layout()
plt.show()

In [None]:
# IC Sharpe vs Short Sharpe scatter
fig, ax = plt.subplots(figsize=(8, 6))

all_results = pd.concat([
    arch_df.assign(stage='Architecture'),
    train_df_results.assign(stage='Training'),
    dropout_df.assign(stage='Dropout'),
])

for stage, color in [('Architecture', 'steelblue'), ('Training', 'coral'), ('Dropout', 'green')]:
    data = all_results[all_results['stage'] == stage]
    ax.scatter(data['ic_sharpe'], data['short_sharpe'], label=stage, alpha=0.6, s=50, c=color)

ax.set_xlabel('IC Sharpe (selection metric)')
ax.set_ylabel('Short Sharpe (reporting only)')
ax.set_title('IC Sharpe vs Short Sharpe Across All Experiments')
ax.legend()
ax.grid(True, alpha=0.3)

# Add correlation
corr = all_results[['ic_sharpe', 'short_sharpe']].corr().iloc[0, 1]
ax.text(0.05, 0.95, f'Correlation: {corr:.2f}', transform=ax.transAxes, fontsize=10)

plt.tight_layout()
plt.show()

## 10. Save Results

In [None]:
# Save search results
pd.DataFrame(arch_results).to_parquet("data/hyperparam_arch_results.pqt")
pd.DataFrame(train_results).to_parquet("data/hyperparam_train_results.pqt")
pd.DataFrame(dropout_results).to_parquet("data/hyperparam_dropout_results.pqt")
print("Saved search results to data/hyperparam_*.pqt")

# Save best model
torch.save({
    "model_state_dict": final_model.state_dict(),
    "config": final_config,
    "price_cols": price_feat_cols,
    "fund_cols": fund_feat_cols,
    "emb_cols": emb_cols,
    "search_results": {
        "best_arch": {"latent_scale": best_latent, "hidden_scale": best_hidden, "news_alpha": best_alpha},
        "best_train": {"lr": best_lr, "wd": best_wd, "smoothing": best_smooth},
        "best_dropout": {"fund": best_fund_do, "price": best_price_do, "news": best_news_do},
    },
    "test_metrics": test_metrics,
}, "data/model_optimized.pt")

print("Saved model to data/model_optimized.pt")

In [None]:
print("\n" + "=" * 60)
print("OPTIMIZATION SUMMARY")
print("=" * 60)
print(f"\nSelection metric: IC Sharpe (robust to outliers)")
print(f"\nBest configuration:")
print(f"  Architecture: latent_scale={best_latent}, hidden_scale={best_hidden}, alpha={best_alpha}")
print(f"  Training:     lr={best_lr:.0e}, wd={best_wd:.0e}, smooth={best_smooth}")
print(f"  Dropout:      fund={best_fund_do}, price={best_price_do}, news={best_news_do}")
print(f"\nTest set performance:")
print(f"  IC Sharpe:    {test_metrics['ic_sharpe']:.2f}")
print(f"  Short Sharpe: {test_metrics['short_sharpe']:.2f} (sensitive to outliers)")
print(f"  Mean IC:      {test_metrics['mean_ic']:.4f}")