# 5.1 Liquid Universe Training

Retrain the model on liquid stocks only (>$50M daily volume).

**Hypothesis**: The original model learned patterns in small/illiquid stocks that:
1. Don't transfer to liquid stocks
2. Actually invert in liquid stocks

A model trained specifically on liquid stocks might find different (weaker but tradeable) patterns.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from itertools import product
from dataclasses import dataclass
from datetime import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from scipy.stats import spearmanr
import matplotlib.pyplot as plt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (14, 5)

Using device: cpu


In [2]:
@dataclass
class ModelConfig:
    n_fundamental_features: int = 19
    n_price_features: int = 9
    n_embedding_dim: int = 768
    fund_hidden: int = 64
    price_hidden: int = 32
    news_hidden: int = 128
    fundamental_latent: int = 32
    price_latent: int = 16
    news_latent: int = 32
    fundamental_dropout: float = 0.6
    price_dropout: float = 0.3
    news_dropout: float = 0.1
    news_alpha: float = 0.8
    batch_size: int = 512
    learning_rate: float = 1e-3
    weight_decay: float = 1e-3
    label_smoothing: float = 0.1
    n_epochs: int = 15

## 1. Load Data and Filter to Liquid Stocks

In [3]:
# Load ML dataset
df = pd.read_parquet("data/ml_dataset.pqt")
df["feature_date"] = pd.to_datetime(df["feature_date"])
df["simple_return"] = np.exp(df["target_return"]) - 1
df["simple_return_clipped"] = df["simple_return"].clip(-1.0, 1.0)

print(f"Full dataset: {len(df):,} rows")
print(f"Date range: {df['feature_date'].min().date()} to {df['feature_date'].max().date()}")

Full dataset: 2,092,929 rows
Date range: 2021-01-13 to 2025-12-18


In [4]:
# Load price data to compute liquidity
prices_df = pd.read_parquet('data/prices.pqt')
prices_df['date'] = pd.to_datetime(prices_df['date'])
prices_df['dollar_volume'] = prices_df['volume'] * prices_df['close']

# Compute average dollar volume per symbol (rolling would be better, but this is simpler)
symbol_liquidity = prices_df.groupby('symbol')['dollar_volume'].mean().reset_index()
symbol_liquidity.columns = ['symbol', 'avg_dollar_volume']

print(f"Liquidity data for {len(symbol_liquidity):,} symbols")

Liquidity data for 5,644 symbols


In [5]:
# Merge liquidity into main dataset
df = df.merge(symbol_liquidity, on='symbol', how='left')
print(f"Merged. Missing liquidity: {df['avg_dollar_volume'].isna().sum():,}")

# Drop rows without liquidity data
df = df[df['avg_dollar_volume'].notna()].copy()
print(f"After dropping missing: {len(df):,} rows")

Merged. Missing liquidity: 0
After dropping missing: 2,092,929 rows


In [6]:
# Filter to liquid stocks only
LIQUID_THRESHOLD = 50e6  # $50M daily volume

df_liquid = df[df['avg_dollar_volume'] >= LIQUID_THRESHOLD].copy()

print(f"\nLIQUID UNIVERSE (>= ${LIQUID_THRESHOLD/1e6:.0f}M daily volume)")
print("=" * 50)
print(f"Rows: {len(df_liquid):,} ({len(df_liquid)/len(df)*100:.1f}% of total)")
print(f"Unique symbols: {df_liquid['symbol'].nunique():,}")
print(f"Date range: {df_liquid['feature_date'].min().date()} to {df_liquid['feature_date'].max().date()}")


LIQUID UNIVERSE (>= $50M daily volume)
Rows: 1,100,207 (52.6% of total)
Unique symbols: 1,022
Date range: 2021-01-13 to 2025-12-18


In [7]:
# Feature columns
price_feat_cols = [
    "overnight_gap_z", "intraday_ret_z",
    "ret_1d_z", "ret_2d_z", "ret_3d_z", "ret_5d_z",
    "vol_5d_z", "dist_from_high_5d_z", "dist_from_low_5d_z"
]
fund_feat_cols = [c for c in df_liquid.columns if c.endswith("_z") and c not in price_feat_cols and c != "news_count_z"]
emb_cols = [c for c in df_liquid.columns if c.startswith("emb_")]

print(f"Price features: {len(price_feat_cols)}")
print(f"Fundamental features: {len(fund_feat_cols)}")
print(f"Embedding dims: {len(emb_cols)}")

Price features: 9
Fundamental features: 19
Embedding dims: 768


In [8]:
# Time-based split
dates = sorted(df_liquid["feature_date"].unique())
n_dates = len(dates)
train_end = int(n_dates * 0.7)
val_end = int(n_dates * 0.8)

train_dates = set(dates[:train_end])
val_dates = set(dates[train_end:val_end])
test_dates = set(dates[val_end:])

train_df = df_liquid[df_liquid["feature_date"].isin(train_dates)].copy()
val_df = df_liquid[df_liquid["feature_date"].isin(val_dates)].copy()
test_df = df_liquid[df_liquid["feature_date"].isin(test_dates)].copy()

def filter_news_only(df_in):
    has_news = (df_in[emb_cols].abs().sum(axis=1) > 0)
    return df_in[has_news].copy()

train_df_news = filter_news_only(train_df)
val_df_news = filter_news_only(val_df)
test_df_news = filter_news_only(test_df)

print(f"\nDATA SPLITS (Liquid + News-only):")
print(f"Train: {len(train_df_news):,} rows ({min(train_dates).date()} to {max(train_dates).date()})")
print(f"Val:   {len(val_df_news):,} rows ({min(val_dates).date()} to {max(val_dates).date()})")
print(f"Test:  {len(test_df_news):,} rows ({min(test_dates).date()} to {max(test_dates).date()})")


DATA SPLITS (Liquid + News-only):
Train: 241,382 rows (2021-01-13 to 2024-05-01)
Val:   40,236 rows (2024-05-02 to 2024-10-21)
Test:  83,936 rows (2024-10-22 to 2025-12-18)


## 2. Dataset and Model (Same as 2.8)

In [9]:
class SinglePairDataset(Dataset):
    def __init__(self, df, price_cols, fund_cols, emb_cols, verbose=True):
        # Already filtered to news-only before passing in
        self.df = df.reset_index(drop=True)
        self.price_cols = price_cols
        self.fund_cols = fund_cols
        self.emb_cols = emb_cols

        self.date_groups = {}
        for date, group in self.df.groupby("feature_date"):
            indices = group.index.tolist()
            if len(indices) >= 2:
                self.date_groups[date] = indices
        self.dates = list(self.date_groups.keys())

        self.price_arr = self.df[price_cols].values.astype(np.float32)
        self.fund_arr = self.df[fund_cols].values.astype(np.float32)
        self.emb_arr = self.df[emb_cols].values.astype(np.float32)
        self.target_arr = self.df["simple_return_clipped"].values.astype(np.float32)

        self.pairs = []
        self._generate_pairs(verbose=verbose)

    def _generate_pairs(self, verbose=False):
        pairs = []
        for date in self.dates:
            indices = list(self.date_groups[date])
            np.random.shuffle(indices)
            for i in range(0, len(indices) - 1, 2):
                pairs.append((indices[i], indices[i + 1]))
        self.pairs = pairs
        if verbose:
            print(f"Generated {len(self.pairs):,} pairs from {len(self.df):,} rows")

    def resample_pairs(self):
        self._generate_pairs()

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        i, j = self.pairs[idx]
        price_i, price_j = self.price_arr[i], self.price_arr[j]
        fund_i, fund_j = self.fund_arr[i], self.fund_arr[j]
        emb_i, emb_j = self.emb_arr[i], self.emb_arr[j]
        label = 1.0 if self.target_arr[i] > self.target_arr[j] else 0.0

        if np.random.random() < 0.5:
            price_i, price_j = price_j, price_i
            fund_i, fund_j = fund_j, fund_i
            emb_i, emb_j = emb_j, emb_i
            label = 1.0 - label

        return {
            "price_i": torch.tensor(price_i), "price_j": torch.tensor(price_j),
            "fund_i": torch.tensor(fund_i), "fund_j": torch.tensor(fund_j),
            "emb_i": torch.tensor(emb_i), "emb_j": torch.tensor(emb_j),
            "label": torch.tensor(label),
        }

In [10]:
class MultiBranchRanker(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        
        self.fund_encoder = nn.Sequential(
            nn.Linear(config.n_fundamental_features, config.fund_hidden),
            nn.ReLU(),
            nn.Dropout(config.fundamental_dropout),
            nn.Linear(config.fund_hidden, config.fundamental_latent),
            nn.ReLU(),
        )
        
        self.price_encoder = nn.Sequential(
            nn.Linear(config.n_price_features, config.price_hidden),
            nn.ReLU(),
            nn.Dropout(config.price_dropout),
            nn.Linear(config.price_hidden, config.price_latent),
            nn.ReLU(),
        )
        
        self.news_encoder = nn.Sequential(
            nn.Linear(config.n_embedding_dim, config.news_hidden),
            nn.ReLU(),
            nn.Dropout(config.news_dropout),
            nn.Linear(config.news_hidden, config.news_latent),
            nn.ReLU(),
        )
        
        fused_dim = config.fundamental_latent + config.price_latent + config.news_latent
        self.output_head = nn.Sequential(
            nn.Linear(fused_dim, 32),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(32, 1),
        )
    
    def forward(self, price, fund, emb):
        h_f = self.fund_encoder(fund)
        h_p = self.price_encoder(price)
        h_n = self.news_encoder(emb)
        h_n_scaled = self.config.news_alpha * h_n
        h = torch.cat([h_f, h_p, h_n_scaled], dim=-1)
        return self.output_head(h).squeeze(-1)
    
    def forward_pair(self, price_i, fund_i, emb_i, price_j, fund_j, emb_j):
        score_i = self.forward(price_i, fund_i, emb_i)
        score_j = self.forward(price_j, fund_j, emb_j)
        return torch.sigmoid(score_i - score_j)

## 3. Evaluation Metrics

In [11]:
@torch.no_grad()
def get_scores(model, df, price_cols, fund_cols, emb_cols, device, batch_size=1024):
    model.eval()
    price_arr = torch.tensor(df[price_cols].values.astype(np.float32))
    fund_arr = torch.tensor(df[fund_cols].values.astype(np.float32))
    emb_arr = torch.tensor(df[emb_cols].values.astype(np.float32))
    
    scores = []
    for i in range(0, len(df), batch_size):
        price = price_arr[i:i+batch_size].to(device)
        fund = fund_arr[i:i+batch_size].to(device)
        emb = emb_arr[i:i+batch_size].to(device)
        score = model(price, fund, emb)
        scores.append(score.cpu().numpy())
    
    return np.concatenate(scores)


def evaluate_model(model, df, price_cols, fund_cols, emb_cols, device, k=5):
    """Evaluate model on liquid stocks - long side, short side, and L/S."""
    df_eval = df.copy()
    df_eval["score"] = get_scores(model, df_eval, price_cols, fund_cols, emb_cols, device)
    
    # IC (rank correlation)
    ics = []
    for date, group in df_eval.groupby("feature_date"):
        if len(group) < 10:
            continue
        ic, _ = spearmanr(group["score"], group["simple_return"])
        if not np.isnan(ic):
            ics.append(ic)
    
    mean_ic = np.mean(ics) if ics else 0
    ic_std = np.std(ics) if ics else 1
    ic_sharpe = mean_ic / ic_std * np.sqrt(252) if ic_std > 0 else 0
    
    # Strategy returns
    long_returns = []
    short_returns = []
    
    for date, group in df_eval.groupby("feature_date"):
        if len(group) < 10:
            continue
        top_k = group.nlargest(k, "score")
        bottom_k = group.nsmallest(k, "score")
        
        long_returns.append(top_k["simple_return"].mean())
        short_returns.append(-bottom_k["simple_return"].mean())
    
    long_arr = np.array(long_returns)
    short_arr = np.array(short_returns)
    ls_arr = (long_arr + short_arr) / 2
    
    def compute_sharpe(returns):
        if len(returns) < 2 or np.std(returns) == 0:
            return 0
        return np.mean(returns) / np.std(returns) * np.sqrt(252)
    
    return {
        'ic_sharpe': ic_sharpe,
        'mean_ic': mean_ic,
        'long_return': np.mean(long_arr) * 252,  # Annualized
        'long_sharpe': compute_sharpe(long_arr),
        'short_return': np.mean(short_arr) * 252,
        'short_sharpe': compute_sharpe(short_arr),
        'ls_return': np.mean(ls_arr) * 252,
        'ls_sharpe': compute_sharpe(ls_arr),
    }

## 4. Training

In [12]:
def pairwise_ranking_loss(pred_prob, label, smoothing=0.1):
    smoothed_label = label * (1 - smoothing) + 0.5 * smoothing
    return F.binary_cross_entropy(pred_prob, smoothed_label)


def train_epoch(model, loader, optimizer, device, label_smoothing=0.1):
    model.train()
    total_loss = 0
    total_samples = 0
    
    for batch in loader:
        price_i = batch["price_i"].to(device)
        price_j = batch["price_j"].to(device)
        fund_i = batch["fund_i"].to(device)
        fund_j = batch["fund_j"].to(device)
        emb_i = batch["emb_i"].to(device)
        emb_j = batch["emb_j"].to(device)
        label = batch["label"].to(device)
        
        optimizer.zero_grad()
        pred_prob = model.forward_pair(price_i, fund_i, emb_i, price_j, fund_j, emb_j)
        loss = pairwise_ranking_loss(pred_prob, label, smoothing=label_smoothing)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item() * len(label)
        total_samples += len(label)
    
    return total_loss / total_samples


def train_and_evaluate(config, train_dataset, val_df, price_cols, fund_cols, emb_cols, 
                       selection_metric='ic_sharpe', n_epochs=15, verbose=False):
    """Train model, select best checkpoint by specified metric."""
    model = MultiBranchRanker(config).to(device)
    optimizer = torch.optim.AdamW(
        model.parameters(), 
        lr=config.learning_rate, 
        weight_decay=config.weight_decay
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epochs)
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    
    best_metric = -float('inf')
    best_state = None
    
    for epoch in range(n_epochs):
        train_dataset.resample_pairs()
        train_loss = train_epoch(model, train_loader, optimizer, device, config.label_smoothing)
        scheduler.step()
        
        if (epoch + 1) % 5 == 0 or epoch == n_epochs - 1:
            metrics = evaluate_model(model, val_df, price_cols, fund_cols, emb_cols, device)
            
            current = metrics[selection_metric]
            if current > best_metric:
                best_metric = current
                best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            
            if verbose:
                print(f"  Epoch {epoch+1}: IC={metrics['ic_sharpe']:.2f}, Long={metrics['long_sharpe']:.2f}, Short={metrics['short_sharpe']:.2f}")
    
    model.load_state_dict(best_state)
    final_metrics = evaluate_model(model, val_df, price_cols, fund_cols, emb_cols, device)
    
    return final_metrics, model

In [13]:
# Create training dataset from liquid stocks
train_dataset = SinglePairDataset(train_df_news, price_feat_cols, fund_feat_cols, emb_cols, verbose=True)

Generated 120,493 pairs from 241,382 rows


## 5. Baseline: Original Model on Liquid Stocks

In [14]:
# Load original model and evaluate on liquid test set
from trading.model import ModelInference

original_model_path = Path('data/model_robust_optimized.pt')
if original_model_path.exists():
    original_model = ModelInference(original_model_path)
    
    # Score liquid test set
    test_df_news['orig_score'] = original_model.score(test_df_news)
    
    # Evaluate
    orig_long = []
    orig_short = []
    orig_ics = []
    
    for date, group in test_df_news.groupby('feature_date'):
        if len(group) < 10:
            continue
        
        ic, _ = spearmanr(group['orig_score'], group['simple_return'])
        if not np.isnan(ic):
            orig_ics.append(ic)
        
        top_5 = group.nlargest(5, 'orig_score')
        bottom_5 = group.nsmallest(5, 'orig_score')
        
        orig_long.append(top_5['simple_return'].mean())
        orig_short.append(-bottom_5['simple_return'].mean())
    
    print("ORIGINAL MODEL ON LIQUID TEST SET")
    print("=" * 50)
    print(f"IC Sharpe: {np.mean(orig_ics)/np.std(orig_ics)*np.sqrt(252):.2f}")
    print(f"Long:  {np.mean(orig_long)*252*100:.1f}% ann, Sharpe {np.mean(orig_long)/np.std(orig_long)*np.sqrt(252):.2f}")
    print(f"Short: {np.mean(orig_short)*252*100:.1f}% ann, Sharpe {np.mean(orig_short)/np.std(orig_short)*np.sqrt(252):.2f}")
else:
    print("Original model not found")

ORIGINAL MODEL ON LIQUID TEST SET
IC Sharpe: 1.66
Long:  -5.8% ann, Sharpe -0.25
Short: -73.4% ann, Sharpe -0.72


## 6. Search Space

In [15]:
SEARCH_SPACE = {
    # Architecture
    "latent_scale": [0.5, 1.0, 2.0],
    "hidden_scale": [0.5, 1.0, 1.5],
    "news_alpha": [0.6, 0.8, 1.0],
    # Training
    "learning_rate": [5e-4, 1e-3, 2e-3],
    "weight_decay": [1e-4, 1e-3, 1e-2],
    "label_smoothing": [0.05, 0.1, 0.15],
    # Dropout
    "fund_dropout": [0.3, 0.5, 0.7],
    "price_dropout": [0.2, 0.3, 0.4],
    "news_dropout": [0.1, 0.2, 0.3],
}

# Selection metric - use IC Sharpe since we want to find patterns that generalize
SELECTION_METRIC = 'ic_sharpe'

n_arch = len(SEARCH_SPACE["latent_scale"]) * len(SEARCH_SPACE["hidden_scale"]) * len(SEARCH_SPACE["news_alpha"])
print(f"Architecture configs: {n_arch}")

Architecture configs: 27


## 7. Architecture Search

In [16]:
print("STAGE 1: ARCHITECTURE SEARCH")
print("=" * 70)

arch_configs = list(product(
    SEARCH_SPACE["latent_scale"],
    SEARCH_SPACE["hidden_scale"],
    SEARCH_SPACE["news_alpha"],
))

arch_results = []
best_arch_metric = -float('inf')
best_arch = None

for i, (latent_scale, hidden_scale, news_alpha) in enumerate(arch_configs):
    config = ModelConfig(
        n_fundamental_features=len(fund_feat_cols),
        n_price_features=len(price_feat_cols),
        n_embedding_dim=len(emb_cols),
        fund_hidden=int(64 * hidden_scale),
        price_hidden=int(32 * hidden_scale),
        news_hidden=int(128 * hidden_scale),
        fundamental_latent=int(32 * latent_scale),
        price_latent=int(16 * latent_scale),
        news_latent=int(32 * latent_scale),
        news_alpha=news_alpha,
    )
    
    start = datetime.now()
    metrics, _ = train_and_evaluate(
        config, train_dataset, val_df_news, 
        price_feat_cols, fund_feat_cols, emb_cols,
        selection_metric=SELECTION_METRIC, n_epochs=15
    )
    elapsed = (datetime.now() - start).total_seconds()
    
    arch_results.append({
        "latent_scale": latent_scale,
        "hidden_scale": hidden_scale,
        "news_alpha": news_alpha,
        **metrics,
    })
    
    if metrics[SELECTION_METRIC] > best_arch_metric:
        best_arch_metric = metrics[SELECTION_METRIC]
        best_arch = (latent_scale, hidden_scale, news_alpha)
    
    print(f"[{i+1:2d}/{len(arch_configs)}] l={latent_scale:.1f} h={hidden_scale:.1f} a={news_alpha:.1f} | "
          f"IC={metrics['ic_sharpe']:.2f} Long={metrics['long_sharpe']:.2f} Short={metrics['short_sharpe']:.2f} | {elapsed:.0f}s")

best_latent, best_hidden, best_alpha = best_arch
print(f"\nBest: latent={best_latent}, hidden={best_hidden}, alpha={best_alpha}")

STAGE 1: ARCHITECTURE SEARCH
[ 1/27] l=0.5 h=0.5 a=0.6 | IC=3.27 Long=3.31 Short=-1.15 | 83s
[ 2/27] l=0.5 h=0.5 a=0.8 | IC=3.69 Long=2.62 Short=-2.12 | 83s
[ 3/27] l=0.5 h=0.5 a=1.0 | IC=3.19 Long=4.37 Short=-2.68 | 83s
[ 4/27] l=0.5 h=1.0 a=0.6 | IC=3.62 Long=3.11 Short=-2.65 | 90s
[ 5/27] l=0.5 h=1.0 a=0.8 | IC=3.53 Long=2.46 Short=-1.67 | 90s
[ 6/27] l=0.5 h=1.0 a=1.0 | IC=3.24 Long=2.22 Short=-0.93 | 90s
[ 7/27] l=0.5 h=1.5 a=0.6 | IC=2.92 Long=2.76 Short=-1.29 | 96s
[ 8/27] l=0.5 h=1.5 a=0.8 | IC=2.88 Long=1.97 Short=-2.82 | 97s
[ 9/27] l=0.5 h=1.5 a=1.0 | IC=2.82 Long=2.81 Short=-1.68 | 97s
[10/27] l=1.0 h=0.5 a=0.6 | IC=4.15 Long=1.71 Short=-2.71 | 84s
[11/27] l=1.0 h=0.5 a=0.8 | IC=4.54 Long=1.76 Short=-2.61 | 84s
[12/27] l=1.0 h=0.5 a=1.0 | IC=3.59 Long=3.57 Short=-2.55 | 84s
[13/27] l=1.0 h=1.0 a=0.6 | IC=3.14 Long=2.45 Short=-2.69 | 91s
[14/27] l=1.0 h=1.0 a=0.8 | IC=3.83 Long=2.45 Short=-1.56 | 91s
[15/27] l=1.0 h=1.0 a=1.0 | IC=3.50 Long=2.14 Short=-2.37 | 91s
[16/27] l=1

In [17]:
# Show top results
arch_df = pd.DataFrame(arch_results).sort_values(SELECTION_METRIC, ascending=False)
print("\nTop 10 by IC Sharpe:")
print(arch_df.head(10)[['latent_scale', 'hidden_scale', 'news_alpha', 'ic_sharpe', 'long_sharpe', 'short_sharpe', 'ls_sharpe']].round(2).to_string(index=False))


Top 10 by IC Sharpe:
 latent_scale  hidden_scale  news_alpha  ic_sharpe  long_sharpe  short_sharpe  ls_sharpe
          1.0           0.5         0.8       4.54         1.76         -2.61      -2.29
          1.0           0.5         0.6       4.15         1.71         -2.71      -2.31
          1.0           1.0         0.8       3.83         2.45         -1.56      -0.94
          0.5           0.5         0.8       3.69         2.62         -2.12      -1.52
          2.0           1.0         0.6       3.67         2.16         -2.27      -1.78
          0.5           1.0         0.6       3.62         3.11         -2.65      -2.04
          1.0           1.5         1.0       3.59         2.44         -2.48      -1.60
          1.0           0.5         1.0       3.59         3.57         -2.55      -1.80
          0.5           1.0         0.8       3.53         2.46         -1.67      -1.20
          2.0           1.5         1.0       3.52         2.11         -1.28      -0.75

## 8. Training Hyperparameter Search

In [18]:
print("\nSTAGE 2: TRAINING HYPERPARAMETERS")
print("-" * 50)

train_configs = list(product(
    SEARCH_SPACE["learning_rate"],
    SEARCH_SPACE["weight_decay"],
    SEARCH_SPACE["label_smoothing"],
))

train_results = []
best_train_metric = -float('inf')
best_train = None

for i, (lr, wd, smoothing) in enumerate(train_configs):
    config = ModelConfig(
        n_fundamental_features=len(fund_feat_cols),
        n_price_features=len(price_feat_cols),
        n_embedding_dim=len(emb_cols),
        fund_hidden=int(64 * best_hidden),
        price_hidden=int(32 * best_hidden),
        news_hidden=int(128 * best_hidden),
        fundamental_latent=int(32 * best_latent),
        price_latent=int(16 * best_latent),
        news_latent=int(32 * best_latent),
        news_alpha=best_alpha,
        learning_rate=lr,
        weight_decay=wd,
        label_smoothing=smoothing,
    )
    
    metrics, _ = train_and_evaluate(
        config, train_dataset, val_df_news,
        price_feat_cols, fund_feat_cols, emb_cols,
        selection_metric=SELECTION_METRIC, n_epochs=15
    )
    
    train_results.append({"lr": lr, "wd": wd, "smooth": smoothing, **metrics})
    
    if metrics[SELECTION_METRIC] > best_train_metric:
        best_train_metric = metrics[SELECTION_METRIC]
        best_train = (lr, wd, smoothing)
    
    print(f"[{i+1:2d}/{len(train_configs)}] lr={lr:.0e} wd={wd:.0e} s={smoothing:.2f} | "
          f"IC={metrics['ic_sharpe']:.2f} Long={metrics['long_sharpe']:.2f}")

best_lr, best_wd, best_smooth = best_train
print(f"\nBest: lr={best_lr:.0e}, wd={best_wd:.0e}, smooth={best_smooth}")


STAGE 2: TRAINING HYPERPARAMETERS
--------------------------------------------------
[ 1/27] lr=5e-04 wd=1e-04 s=0.05 | IC=3.80 Long=2.51
[ 2/27] lr=5e-04 wd=1e-04 s=0.10 | IC=3.46 Long=2.44
[ 3/27] lr=5e-04 wd=1e-04 s=0.15 | IC=3.34 Long=3.06
[ 4/27] lr=5e-04 wd=1e-03 s=0.05 | IC=3.51 Long=3.33
[ 5/27] lr=5e-04 wd=1e-03 s=0.10 | IC=3.39 Long=3.71
[ 6/27] lr=5e-04 wd=1e-03 s=0.15 | IC=3.15 Long=2.41
[ 7/27] lr=5e-04 wd=1e-02 s=0.05 | IC=3.01 Long=2.48
[ 8/27] lr=5e-04 wd=1e-02 s=0.10 | IC=3.77 Long=1.67
[ 9/27] lr=5e-04 wd=1e-02 s=0.15 | IC=3.78 Long=3.67
[10/27] lr=1e-03 wd=1e-04 s=0.05 | IC=3.99 Long=2.58
[11/27] lr=1e-03 wd=1e-04 s=0.10 | IC=2.33 Long=0.28
[12/27] lr=1e-03 wd=1e-04 s=0.15 | IC=3.72 Long=2.02
[13/27] lr=1e-03 wd=1e-03 s=0.05 | IC=3.22 Long=1.54
[14/27] lr=1e-03 wd=1e-03 s=0.10 | IC=3.05 Long=3.36
[15/27] lr=1e-03 wd=1e-03 s=0.15 | IC=3.13 Long=2.22
[16/27] lr=1e-03 wd=1e-02 s=0.05 | IC=3.10 Long=2.32
[17/27] lr=1e-03 wd=1e-02 s=0.10 | IC=2.96 Long=3.47
[18/27] lr=1e

## 9. Dropout Search

In [19]:
print("\nSTAGE 3: DROPOUT")
print("-" * 50)

dropout_configs = list(product(
    SEARCH_SPACE["fund_dropout"],
    SEARCH_SPACE["price_dropout"],
    SEARCH_SPACE["news_dropout"],
))

dropout_results = []
best_dropout_metric = -float('inf')
best_dropout = None
best_model = None

for i, (fund_do, price_do, news_do) in enumerate(dropout_configs):
    config = ModelConfig(
        n_fundamental_features=len(fund_feat_cols),
        n_price_features=len(price_feat_cols),
        n_embedding_dim=len(emb_cols),
        fund_hidden=int(64 * best_hidden),
        price_hidden=int(32 * best_hidden),
        news_hidden=int(128 * best_hidden),
        fundamental_latent=int(32 * best_latent),
        price_latent=int(16 * best_latent),
        news_latent=int(32 * best_latent),
        news_alpha=best_alpha,
        fundamental_dropout=fund_do,
        price_dropout=price_do,
        news_dropout=news_do,
        learning_rate=best_lr,
        weight_decay=best_wd,
        label_smoothing=best_smooth,
    )
    
    metrics, model = train_and_evaluate(
        config, train_dataset, val_df_news,
        price_feat_cols, fund_feat_cols, emb_cols,
        selection_metric=SELECTION_METRIC, n_epochs=15
    )
    
    dropout_results.append({"fund": fund_do, "price": price_do, "news": news_do, **metrics})
    
    if metrics[SELECTION_METRIC] > best_dropout_metric:
        best_dropout_metric = metrics[SELECTION_METRIC]
        best_dropout = (fund_do, price_do, news_do)
        best_model = model
    
    print(f"[{i+1:2d}/{len(dropout_configs)}] f={fund_do:.1f} p={price_do:.1f} n={news_do:.1f} | "
          f"IC={metrics['ic_sharpe']:.2f} Long={metrics['long_sharpe']:.2f}")

best_fund_do, best_price_do, best_news_do = best_dropout
print(f"\nBest: fund={best_fund_do}, price={best_price_do}, news={best_news_do}")


STAGE 3: DROPOUT
--------------------------------------------------
[ 1/27] f=0.3 p=0.2 n=0.1 | IC=2.67 Long=1.21
[ 2/27] f=0.3 p=0.2 n=0.2 | IC=2.88 Long=1.69
[ 3/27] f=0.3 p=0.2 n=0.3 | IC=2.64 Long=1.46
[ 4/27] f=0.3 p=0.3 n=0.1 | IC=2.79 Long=2.25
[ 5/27] f=0.3 p=0.3 n=0.2 | IC=3.71 Long=2.17
[ 6/27] f=0.3 p=0.3 n=0.3 | IC=3.23 Long=0.42
[ 7/27] f=0.3 p=0.4 n=0.1 | IC=2.91 Long=2.04
[ 8/27] f=0.3 p=0.4 n=0.2 | IC=2.92 Long=1.97
[ 9/27] f=0.3 p=0.4 n=0.3 | IC=3.48 Long=2.19
[10/27] f=0.5 p=0.2 n=0.1 | IC=3.24 Long=2.34
[11/27] f=0.5 p=0.2 n=0.2 | IC=2.85 Long=3.24
[12/27] f=0.5 p=0.2 n=0.3 | IC=3.05 Long=1.11
[13/27] f=0.5 p=0.3 n=0.1 | IC=3.46 Long=2.28
[14/27] f=0.5 p=0.3 n=0.2 | IC=3.04 Long=3.16
[15/27] f=0.5 p=0.3 n=0.3 | IC=2.50 Long=2.00
[16/27] f=0.5 p=0.4 n=0.1 | IC=3.10 Long=2.87
[17/27] f=0.5 p=0.4 n=0.2 | IC=2.75 Long=1.17
[18/27] f=0.5 p=0.4 n=0.3 | IC=2.39 Long=1.27
[19/27] f=0.7 p=0.2 n=0.1 | IC=3.20 Long=2.99
[20/27] f=0.7 p=0.2 n=0.2 | IC=3.83 Long=1.72
[21/27] f=0

## 10. Train Final Model

In [20]:
print("\nTRAINING FINAL MODEL (25 epochs)")
print("=" * 70)

final_config = ModelConfig(
    n_fundamental_features=len(fund_feat_cols),
    n_price_features=len(price_feat_cols),
    n_embedding_dim=len(emb_cols),
    fund_hidden=int(64 * best_hidden),
    price_hidden=int(32 * best_hidden),
    news_hidden=int(128 * best_hidden),
    fundamental_latent=int(32 * best_latent),
    price_latent=int(16 * best_latent),
    news_latent=int(32 * best_latent),
    news_alpha=best_alpha,
    fundamental_dropout=best_fund_do,
    price_dropout=best_price_do,
    news_dropout=best_news_do,
    learning_rate=best_lr,
    weight_decay=best_wd,
    label_smoothing=best_smooth,
    n_epochs=25,
)

print(f"Config:")
print(f"  hidden=({final_config.fund_hidden},{final_config.price_hidden},{final_config.news_hidden})")
print(f"  latent=({final_config.fundamental_latent},{final_config.price_latent},{final_config.news_latent})")
print(f"  dropout=({final_config.fundamental_dropout},{final_config.price_dropout},{final_config.news_dropout})")
print(f"  lr={final_config.learning_rate:.0e}, wd={final_config.weight_decay:.0e}")

final_metrics, final_model = train_and_evaluate(
    final_config, train_dataset, val_df_news,
    price_feat_cols, fund_feat_cols, emb_cols,
    selection_metric=SELECTION_METRIC, n_epochs=25, verbose=True
)


TRAINING FINAL MODEL (25 epochs)
Config:
  hidden=(32,16,64)
  latent=(32,16,32)
  dropout=(0.7,0.2,0.2)
  lr=2e-03, wd=1e-03
  Epoch 5: IC=1.96, Long=2.65, Short=-1.97
  Epoch 10: IC=4.33, Long=1.03, Short=-1.93
  Epoch 15: IC=3.13, Long=3.28, Short=-1.36
  Epoch 20: IC=2.70, Long=3.49, Short=-1.27
  Epoch 25: IC=2.52, Long=3.58, Short=-1.29


## 11. Test Set Evaluation

In [21]:
# Evaluate on test set
test_metrics = evaluate_model(final_model, test_df_news, price_feat_cols, fund_feat_cols, emb_cols, device)

print("\n" + "=" * 60)
print("TEST SET RESULTS (LIQUID-TRAINED MODEL)")
print("=" * 60)
print(f"IC Sharpe:    {test_metrics['ic_sharpe']:.2f}")
print(f"Mean IC:      {test_metrics['mean_ic']:.4f}")
print(f"")
print(f"Long:  {test_metrics['long_return']*100:.1f}% ann, Sharpe {test_metrics['long_sharpe']:.2f}")
print(f"Short: {test_metrics['short_return']*100:.1f}% ann, Sharpe {test_metrics['short_sharpe']:.2f}")
print(f"L/S:   {test_metrics['ls_return']*100:.1f}% ann, Sharpe {test_metrics['ls_sharpe']:.2f}")


TEST SET RESULTS (LIQUID-TRAINED MODEL)
IC Sharpe:    1.64
Mean IC:      0.0106

Long:  -32.6% ann, Sharpe -1.37
Short: -147.2% ann, Sharpe -1.55
L/S:   -89.9% ann, Sharpe -2.00


In [22]:
# Compare to original model
print("\n" + "=" * 60)
print("COMPARISON: ORIGINAL vs LIQUID-TRAINED")
print("=" * 60)
print("\nOn liquid test set ($50M+ daily volume):")
print("")
print(f"{'Metric':<15} {'Original':>12} {'Liquid-Trained':>15}")
print("-" * 45)

if 'orig_score' in test_df_news.columns:
    orig_ic_sharpe = np.mean(orig_ics)/np.std(orig_ics)*np.sqrt(252)
    orig_long_sharpe = np.mean(orig_long)/np.std(orig_long)*np.sqrt(252)
    orig_short_sharpe = np.mean(orig_short)/np.std(orig_short)*np.sqrt(252)
    
    print(f"{'IC Sharpe':<15} {orig_ic_sharpe:>12.2f} {test_metrics['ic_sharpe']:>15.2f}")
    print(f"{'Long Sharpe':<15} {orig_long_sharpe:>12.2f} {test_metrics['long_sharpe']:>15.2f}")
    print(f"{'Short Sharpe':<15} {orig_short_sharpe:>12.2f} {test_metrics['short_sharpe']:>15.2f}")
    print(f"{'Long Return':<15} {np.mean(orig_long)*252*100:>11.1f}% {test_metrics['long_return']*100:>14.1f}%")
    print(f"{'Short Return':<15} {np.mean(orig_short)*252*100:>11.1f}% {test_metrics['short_return']*100:>14.1f}%")


COMPARISON: ORIGINAL vs LIQUID-TRAINED

On liquid test set ($50M+ daily volume):

Metric              Original  Liquid-Trained
---------------------------------------------
IC Sharpe               1.66            1.64
Long Sharpe            -0.25           -1.37
Short Sharpe           -0.72           -1.55
Long Return            -5.8%          -32.6%
Short Return          -73.4%         -147.2%


## 12. Cost-Adjusted Analysis

In [23]:
# Simulate returns with costs for liquid-trained model
SPREAD_COST = 0.001  # 10 bps round-trip
IMPACT_COST = 0.0005  # 5 bps each way
BORROW_COST_DAILY = 0.03 / 252  # 3% annual

test_df_eval = test_df_news.copy()
test_df_eval['score'] = get_scores(final_model, test_df_eval, price_feat_cols, fund_feat_cols, emb_cols, device)

long_returns_gross = []
short_returns_gross = []

for date, group in test_df_eval.groupby('feature_date'):
    if len(group) < 10:
        continue
    top_5 = group.nlargest(5, 'score')
    bottom_5 = group.nsmallest(5, 'score')
    
    long_returns_gross.append(top_5['simple_return'].mean())
    short_returns_gross.append(-bottom_5['simple_return'].mean())

long_arr = np.array(long_returns_gross)
short_arr = np.array(short_returns_gross)

# Apply costs
long_cost = SPREAD_COST + IMPACT_COST * 2  # Entry + exit
short_cost = SPREAD_COST + IMPACT_COST * 2 + BORROW_COST_DAILY

long_net = long_arr - long_cost
short_net = short_arr - short_cost
ls_net = (long_net + short_net) / 2

print("\n" + "=" * 60)
print("COST-ADJUSTED RETURNS (1-day holding)")
print("=" * 60)
print(f"Costs: Spread={SPREAD_COST*100:.2f}%, Impact={IMPACT_COST*200:.2f}% RT, Borrow={BORROW_COST_DAILY*252*100:.1f}% ann")
print("")
print(f"Long:  Gross {np.mean(long_arr)*252*100:.1f}% → Net {np.mean(long_net)*252*100:.1f}%")
print(f"Short: Gross {np.mean(short_arr)*252*100:.1f}% → Net {np.mean(short_net)*252*100:.1f}%")
print(f"L/S:   Net {np.mean(ls_net)*252*100:.1f}%")
print("")
print(f"Long Net Sharpe:  {np.mean(long_net)/np.std(long_net)*np.sqrt(252):.2f}")
print(f"Short Net Sharpe: {np.mean(short_net)/np.std(short_net)*np.sqrt(252):.2f}")
print(f"L/S Net Sharpe:   {np.mean(ls_net)/np.std(ls_net)*np.sqrt(252):.2f}")


COST-ADJUSTED RETURNS (1-day holding)
Costs: Spread=0.10%, Impact=0.10% RT, Borrow=3.0% ann

Long:  Gross -32.6% → Net -83.0%
Short: Gross -147.2% → Net -200.6%
L/S:   Net -141.8%

Long Net Sharpe:  -3.49
Short Net Sharpe: -2.11
L/S Net Sharpe:   -3.15


## 13. Save Model

In [24]:
# Save liquid-trained model
torch.save({
    "model_state_dict": final_model.state_dict(),
    "config": final_config,
    "price_cols": price_feat_cols,
    "fund_cols": fund_feat_cols,
    "emb_cols": emb_cols,
    "liquid_threshold": LIQUID_THRESHOLD,
    "test_metrics": test_metrics,
}, "data/model_liquid_trained.pt")

print("Saved to data/model_liquid_trained.pt")

Saved to data/model_liquid_trained.pt


In [25]:
# Summary
print("\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)
print(f"\nTraining universe: Liquid stocks (>= ${LIQUID_THRESHOLD/1e6:.0f}M daily volume)")
print(f"Training samples: {len(train_df_news):,}")
print(f"")
print(f"Best config:")
print(f"  Architecture: latent={best_latent}, hidden={best_hidden}, alpha={best_alpha}")
print(f"  Training: lr={best_lr:.0e}, wd={best_wd:.0e}, smooth={best_smooth}")
print(f"  Dropout: fund={best_fund_do}, price={best_price_do}, news={best_news_do}")
print(f"")
print(f"Test performance (liquid stocks):")
print(f"  IC Sharpe: {test_metrics['ic_sharpe']:.2f}")
print(f"  Long: {test_metrics['long_return']*100:.1f}% ann (Sharpe {test_metrics['long_sharpe']:.2f})")
print(f"  Short: {test_metrics['short_return']*100:.1f}% ann (Sharpe {test_metrics['short_sharpe']:.2f})")


SUMMARY

Training universe: Liquid stocks (>= $50M daily volume)
Training samples: 241,382

Best config:
  Architecture: latent=1.0, hidden=0.5, alpha=0.8
  Training: lr=2e-03, wd=1e-03, smooth=0.05
  Dropout: fund=0.7, price=0.2, news=0.2

Test performance (liquid stocks):
  IC Sharpe: 1.64
  Long: -32.6% ann (Sharpe -1.37)
  Short: -147.2% ann (Sharpe -1.55)
