# 2.4 Fundamental Regularization

Address the imbalance between fundamental and news feature exposure.

**Problem**: Fundamentals update quarterly but news is daily, so the model sees the same fundamental features ~60x more often than each news embedding. This may cause:
- Overfitting to specific fundamental values
- Underweighting news signal (each embedding is unique/"noisy")
- Gradient imbalance favoring fundamentals

**Solutions tested**:
1. **Noise augmentation**: Add small noise to fundamentals during training
2. **Higher dropout**: Increase fundamental encoder dropout

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from scipy.stats import spearmanr

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [2]:
from dataclasses import dataclass

@dataclass
class ModelConfig:
    # Feature dimensions
    n_fundamental_features: int = 19
    n_price_features: int = 9
    n_embedding_dim: int = 768
    
    # Encoder latent dimensions
    fundamental_latent: int = 32
    price_latent: int = 16
    news_latent: int = 32
    
    # Dropout - fundamental dropout is now a tunable parameter
    fundamental_dropout: float = 0.2
    price_dropout: float = 0.2
    news_dropout: float = 0.3
    
    # News influence cap
    news_alpha: float = 0.8
    
    # Training
    batch_size: int = 512
    learning_rate: float = 1e-3
    weight_decay: float = 1e-3
    n_epochs: int = 5
    
    # NEW: Fundamental noise augmentation
    fundamental_noise_std: float = 0.0  # 0 = no noise

## 1. Load Data

In [3]:
df = pd.read_parquet("data/ml_dataset.pqt")
df["feature_date"] = pd.to_datetime(df["feature_date"])

print(f"Dataset: {len(df):,} rows")
print(f"Date range: {df['feature_date'].min().date()} to {df['feature_date'].max().date()}")
print(f"Symbols: {df['symbol'].nunique():,}")

Dataset: 2,092,929 rows
Date range: 2021-01-13 to 2025-12-18
Symbols: 3,506


In [4]:
# Feature columns
price_feat_cols = [
    "overnight_gap_z", "intraday_ret_z",
    "ret_1d_z", "ret_2d_z", "ret_3d_z", "ret_5d_z",
    "vol_5d_z", "dist_from_high_5d_z", "dist_from_low_5d_z"
]
fund_feat_cols = [c for c in df.columns if c.endswith("_z") and c not in price_feat_cols and c != "news_count_z"]
emb_cols = [c for c in df.columns if c.startswith("emb_")]

print(f"Price features: {len(price_feat_cols)}")
print(f"Fundamental features: {len(fund_feat_cols)}")
print(f"Embedding dims: {len(emb_cols)}")

Price features: 9
Fundamental features: 19
Embedding dims: 768


In [5]:
# Time-based split
dates = sorted(df["feature_date"].unique())
n_dates = len(dates)
train_end_idx = int(n_dates * 0.7)
val_end_idx = int(n_dates * 0.8)

train_dates = set(dates[:train_end_idx])
val_dates = set(dates[train_end_idx:val_end_idx])
test_dates = set(dates[val_end_idx:])

train_df = df[df["feature_date"].isin(train_dates)].copy()
val_df = df[df["feature_date"].isin(val_dates)].copy()
test_df = df[df["feature_date"].isin(test_dates)].copy()

print(f"Train: {len(train_df):,} rows, {len(train_dates)} days")
print(f"Val: {len(val_df):,} rows, {len(val_dates)} days")
print(f"Test: {len(test_df):,} rows, {len(test_dates)} days")

Train: 1,418,494 rows, 830 days
Val: 210,247 rows, 118 days
Test: 464,188 rows, 238 days


## 2. Analyze Fundamental Repetition

In [6]:
# Filter to news-only rows (what we train on)
has_news = (train_df[emb_cols].abs().sum(axis=1) > 0)
train_news = train_df[has_news].copy()

print(f"Training rows with news: {len(train_news):,} ({len(train_news)/len(train_df)*100:.1f}%)")

# For each symbol, count unique fundamental "snapshots"
# (fundamentals change quarterly, so should have ~4 per year)
fund_cols_check = fund_feat_cols[:5]  # Just check a few columns
train_news['fund_hash'] = train_news[fund_feat_cols].apply(lambda x: hash(tuple(x.round(4))), axis=1)

fund_stats = train_news.groupby('symbol').agg({
    'fund_hash': 'nunique',
    'feature_date': 'count'
}).rename(columns={'fund_hash': 'unique_fundamentals', 'feature_date': 'news_days'})

fund_stats['repetition_factor'] = fund_stats['news_days'] / fund_stats['unique_fundamentals']

print(f"\nFundamental repetition stats:")
print(fund_stats['repetition_factor'].describe())

print(f"\nTop 10 symbols by repetition factor:")
print(fund_stats.nlargest(10, 'repetition_factor'))

Training rows with news: 339,872 (24.0%)

Fundamental repetition stats:
count    2477.0
mean        1.0
std         0.0
min         1.0
25%         1.0
50%         1.0
75%         1.0
max         1.0
Name: repetition_factor, dtype: float64

Top 10 symbols by repetition factor:
        unique_fundamentals  news_days  repetition_factor
symbol                                                   
A                       303        303                1.0
AA                      394        394                1.0
AACG                      2          2                1.0
AAL                     635        635                1.0
AAME                      2          2                1.0
AAOI                     37         37                1.0
AAON                     87         87                1.0
AAPL                    830        830                1.0
AAT                      16         16                1.0
AB                      182        182                1.0


## 3. Dataset with Noise Augmentation

In [7]:
class AugmentedPairwiseDataset(Dataset):
    """Pairwise dataset with fundamental noise augmentation.
    
    Adds noise to fundamental features during training to prevent
    overfitting to repeated quarterly values.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        price_cols: list[str],
        fund_cols: list[str],
        emb_cols: list[str],
        fundamental_noise_std: float = 0.0,
        training: bool = True,
    ):
        # Filter to rows with news only
        has_news = (df[emb_cols].abs().sum(axis=1) > 0)
        df_news = df[has_news].copy()
        print(f"Filtered to news-only: {len(df_news):,} / {len(df):,} rows ({len(df_news)/len(df)*100:.1f}%)")

        self.df = df_news.reset_index(drop=True)
        self.price_cols = price_cols
        self.fund_cols = fund_cols
        self.emb_cols = emb_cols
        self.fundamental_noise_std = fundamental_noise_std
        self.training = training

        # Group by date
        self.date_groups = {}
        for date, group in df_news.groupby("feature_date"):
            indices = group.index.tolist()
            if len(indices) < 2:
                continue
            self.date_groups[date] = np.array(indices)

        self.dates = list(self.date_groups.keys())
        print(f"Days with sufficient news coverage: {len(self.dates)}")

        # Precompute arrays
        self.price_arr = df_news[price_cols].values.astype(np.float32)
        self.fund_arr = df_news[fund_cols].values.astype(np.float32)
        self.emb_arr = df_news[emb_cols].values.astype(np.float32)
        self.target_arr = df_news["target_return"].values.astype(np.float32)

        # Map original index to position in filtered df
        self.idx_map = {old_idx: new_idx for new_idx, old_idx in enumerate(df_news.index)}

        # Generate pairs
        self.pairs = []
        self._generate_pairs()
        
        if self.fundamental_noise_std > 0:
            print(f"Fundamental noise augmentation: std={self.fundamental_noise_std}")

    def _generate_pairs(self):
        """Generate all pairs."""
        pairs = []
        for date in self.dates:
            indices = self.date_groups[date]
            n = len(indices)
            for i in range(n):
                for j in range(i + 1, n):
                    idx_i = self.idx_map[indices[i]]
                    idx_j = self.idx_map[indices[j]]
                    pairs.append((idx_i, idx_j))

        self.pairs = pairs
        print(f"Generated {len(self.pairs):,} pairs")

    def resample_pairs(self):
        """Reshuffle pairs."""
        np.random.shuffle(self.pairs)

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        i, j = self.pairs[idx]

        price_i = self.price_arr[i].copy()
        price_j = self.price_arr[j].copy()
        fund_i = self.fund_arr[i].copy()
        fund_j = self.fund_arr[j].copy()
        emb_i = self.emb_arr[i]
        emb_j = self.emb_arr[j]

        # Add noise to fundamentals during training
        if self.training and self.fundamental_noise_std > 0:
            fund_i = fund_i + np.random.normal(0, self.fundamental_noise_std, size=fund_i.shape).astype(np.float32)
            fund_j = fund_j + np.random.normal(0, self.fundamental_noise_std, size=fund_j.shape).astype(np.float32)

        actual_label = 1.0 if self.target_arr[i] > self.target_arr[j] else 0.0

        # Random swap for label balance
        if np.random.random() < 0.5:
            price_i, price_j = price_j, price_i
            fund_i, fund_j = fund_j, fund_i
            emb_i, emb_j = emb_j, emb_i
            label = 1.0 - actual_label
        else:
            label = actual_label

        return {
            "price_i": torch.tensor(price_i),
            "price_j": torch.tensor(price_j),
            "fund_i": torch.tensor(fund_i),
            "fund_j": torch.tensor(fund_j),
            "emb_i": torch.tensor(emb_i),
            "emb_j": torch.tensor(emb_j),
            "label": torch.tensor(label),
        }

## 4. Model Architecture

In [8]:
class MultiBranchRanker(nn.Module):
    """Multi-branch model with configurable fundamental dropout."""
    
    def __init__(self, config: ModelConfig):
        super().__init__()
        self.config = config
        
        self.fund_encoder = nn.Sequential(
            nn.Linear(config.n_fundamental_features, 64),
            nn.ReLU(),
            nn.Dropout(config.fundamental_dropout),  # Tunable!
            nn.Linear(64, config.fundamental_latent),
            nn.ReLU(),
        )
        
        self.price_encoder = nn.Sequential(
            nn.Linear(config.n_price_features, 32),
            nn.ReLU(),
            nn.Dropout(config.price_dropout),
            nn.Linear(32, config.price_latent),
            nn.ReLU(),
        )
        
        self.news_encoder = nn.Sequential(
            nn.Linear(config.n_embedding_dim, 128),
            nn.ReLU(),
            nn.Dropout(config.news_dropout),
            nn.Linear(128, config.news_latent),
            nn.ReLU(),
        )
        
        fused_dim = config.fundamental_latent + config.price_latent + config.news_latent
        self.output_head = nn.Sequential(
            nn.Linear(fused_dim, 32),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(32, 1),
        )
    
    def forward(self, price, fund, emb):
        h_f = self.fund_encoder(fund)
        h_p = self.price_encoder(price)
        h_n = self.news_encoder(emb)
        h_n_scaled = self.config.news_alpha * h_n
        h = torch.cat([h_f, h_p, h_n_scaled], dim=-1)
        return self.output_head(h).squeeze(-1)
    
    def forward_pair(self, price_i, fund_i, emb_i, price_j, fund_j, emb_j):
        score_i = self.forward(price_i, fund_i, emb_i)
        score_j = self.forward(price_j, fund_j, emb_j)
        return torch.sigmoid(score_i - score_j)

## 5. Training Functions

In [9]:
def pairwise_ranking_loss(pred_prob, label, smoothing=0.1):
    smoothed_label = label * (1 - smoothing) + 0.5 * smoothing
    return F.binary_cross_entropy(pred_prob, smoothed_label)


def train_epoch(model, loader, optimizer, device):
    model.train()
    total_loss = 0
    total_correct = 0
    total_samples = 0
    
    for batch in tqdm(loader, desc="Training", leave=False):
        price_i = batch["price_i"].to(device)
        price_j = batch["price_j"].to(device)
        fund_i = batch["fund_i"].to(device)
        fund_j = batch["fund_j"].to(device)
        emb_i = batch["emb_i"].to(device)
        emb_j = batch["emb_j"].to(device)
        label = batch["label"].to(device)
        
        optimizer.zero_grad()
        pred_prob = model.forward_pair(price_i, fund_i, emb_i, price_j, fund_j, emb_j)
        loss = pairwise_ranking_loss(pred_prob, label)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item() * len(label)
        total_correct += ((pred_prob > 0.5) == (label > 0.5)).sum().item()
        total_samples += len(label)
    
    return total_loss / total_samples, total_correct / total_samples


@torch.no_grad()
def eval_epoch(model, loader, device):
    model.eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0
    
    for batch in tqdm(loader, desc="Evaluating", leave=False):
        price_i = batch["price_i"].to(device)
        price_j = batch["price_j"].to(device)
        fund_i = batch["fund_i"].to(device)
        fund_j = batch["fund_j"].to(device)
        emb_i = batch["emb_i"].to(device)
        emb_j = batch["emb_j"].to(device)
        label = batch["label"].to(device)
        
        pred_prob = model.forward_pair(price_i, fund_i, emb_i, price_j, fund_j, emb_j)
        loss = pairwise_ranking_loss(pred_prob, label)
        
        total_loss += loss.item() * len(label)
        total_correct += ((pred_prob > 0.5) == (label > 0.5)).sum().item()
        total_samples += len(label)
    
    return total_loss / total_samples, total_correct / total_samples


@torch.no_grad()
def get_scores(model, df, price_cols, fund_cols, emb_cols, device, batch_size=1024):
    """Score all rows in dataframe."""
    model.eval()
    
    price_arr = torch.tensor(df[price_cols].values.astype(np.float32))
    fund_arr = torch.tensor(df[fund_cols].values.astype(np.float32))
    emb_arr = torch.tensor(df[emb_cols].values.astype(np.float32))
    
    scores = []
    for i in range(0, len(df), batch_size):
        price = price_arr[i:i+batch_size].to(device)
        fund = fund_arr[i:i+batch_size].to(device)
        emb = emb_arr[i:i+batch_size].to(device)
        score = model(price, fund, emb)
        scores.append(score.cpu().numpy())
    
    return np.concatenate(scores)


def compute_daily_ic(df):
    """Compute Spearman IC per day."""
    ics = []
    for date, group in df.groupby("feature_date"):
        if len(group) < 10:
            continue
        ic, _ = spearmanr(group["score"], group["target_return"])
        if not np.isnan(ic):
            ics.append({"date": date, "ic": ic})
    return pd.DataFrame(ics)


def compute_short_returns(df, k=5, clip_return=0.10):
    """Compute daily short returns for bottom-K with return clipping."""
    returns = []
    for date, group in df.groupby("feature_date"):
        if len(group) < k * 2:
            continue
        bottom = group.nsmallest(k, "score")
        clipped_returns = bottom["target_return"].clip(-clip_return, clip_return)
        short_ret = -clipped_returns.mean()
        returns.append({"date": date, "return": short_ret})
    return pd.DataFrame(returns)

## 6. Train and Compare Configurations

In [10]:
def train_model(train_df, val_df, config, fundamental_noise_std=0.0, n_epochs=10):
    """Train a model with given config and return results."""
    
    noise_str = f"noise={fundamental_noise_std}" if fundamental_noise_std > 0 else "no_noise"
    dropout_str = f"dropout={config.fundamental_dropout}"
    print(f"\n{'='*60}")
    print(f"Training: {noise_str}, {dropout_str}")
    print(f"{'='*60}")
    
    # Create datasets
    train_dataset = AugmentedPairwiseDataset(
        train_df, price_feat_cols, fund_feat_cols, emb_cols,
        fundamental_noise_std=fundamental_noise_std,
        training=True,
    )
    val_dataset = AugmentedPairwiseDataset(
        val_df, price_feat_cols, fund_feat_cols, emb_cols,
        fundamental_noise_std=0.0,  # No noise during validation
        training=False,
    )
    
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False)
    
    # Create model
    model = MultiBranchRanker(config).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epochs)
    
    # Train
    best_val_acc = 0
    history = []
    
    for epoch in range(n_epochs):
        train_dataset.resample_pairs()
        
        train_loss, train_acc = train_epoch(model, train_loader, optimizer, device)
        val_loss, val_acc = eval_epoch(model, val_loader, device)
        scheduler.step()
        
        history.append({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "val_loss": val_loss,
            "val_acc": val_acc,
        })
        
        print(f"Epoch {epoch+1}/{n_epochs}: "
              f"train_acc={train_acc:.4f}, val_acc={val_acc:.4f}")
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_state = model.state_dict().copy()
    
    # Load best model
    model.load_state_dict(best_state)
    
    return model, history

In [14]:
# Configurations to test
# Format: (fundamental_dropout, fundamental_noise_std)
configs_to_test = [
    (0.2, 0.0),   # Baseline: original dropout, no noise
    (0.2, 0.05),  # Low noise
    (0.2, 0.10),  # Medium noise
    (0.5, 0.0),   # High dropout only
    (0.5, 0.05),  # High dropout + low noise
    (0.5, 0.10),  # High dropout + medium noise
]
configs_to_test = list(reversed(configs_to_test))
configs_to_test

[(0.5, 0.1), (0.5, 0.05), (0.5, 0.0), (0.2, 0.1), (0.2, 0.05), (0.2, 0.0)]

In [15]:
results = {}

for fund_dropout, noise_std in configs_to_test:
    # Create config with specific dropout
    config = ModelConfig(
        n_fundamental_features=len(fund_feat_cols),
        n_price_features=len(price_feat_cols),
        n_embedding_dim=len(emb_cols),
        fundamental_dropout=fund_dropout,
        fundamental_noise_std=noise_std,
    )
    
    model, history = train_model(
        train_df, val_df, config, 
        fundamental_noise_std=noise_std,
        n_epochs=5
    )
    
    # Evaluate on test set
    test_df_eval = test_df.copy()
    test_df_eval["score"] = get_scores(model, test_df_eval, price_feat_cols, fund_feat_cols, emb_cols, device)
    
    # Compute metrics
    ic_df = compute_daily_ic(test_df_eval)
    short_df = compute_short_returns(test_df_eval, k=5, clip_return=0.10)
    
    mean_ic = ic_df['ic'].mean()
    ic_sharpe = mean_ic / ic_df['ic'].std() * np.sqrt(252)
    
    short_sharpe = short_df['return'].mean() / short_df['return'].std() * np.sqrt(252)
    short_cumret = (1 + short_df['return']).cumprod().iloc[-1] - 1
    
    config_name = f"d={fund_dropout}_n={noise_std}"
    results[config_name] = {
        'model': model,
        'config': config,
        'history': history,
        'mean_ic': mean_ic,
        'ic_sharpe': ic_sharpe,
        'short_sharpe': short_sharpe,
        'short_cumret': short_cumret,
        'fund_dropout': fund_dropout,
        'noise_std': noise_std,
    }
    
    print(f"\n{config_name} Test Results:")
    print(f"  IC: {mean_ic:.4f} (Sharpe: {ic_sharpe:.2f})")
    print(f"  Short K=5 Sharpe: {short_sharpe:.2f}")


Training: noise=0.1, dropout=0.5
Filtered to news-only: 339,872 / 1,418,494 rows (24.0%)
Days with sufficient news coverage: 830
Generated 71,008,149 pairs
Fundamental noise augmentation: std=0.1
Filtered to news-only: 58,882 / 210,247 rows (28.0%)
Days with sufficient news coverage: 118
Generated 14,886,015 pairs


Training:   0%|          | 0/138688 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/29075 [00:00<?, ?it/s]

Epoch 1/5: train_acc=0.5544, val_acc=0.5034


Training:   0%|          | 0/138688 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/29075 [00:00<?, ?it/s]

Epoch 2/5: train_acc=0.5759, val_acc=0.5028


Training:   0%|          | 0/138688 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/29075 [00:00<?, ?it/s]

Epoch 3/5: train_acc=0.5843, val_acc=0.5033


Training:   0%|          | 0/138688 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/29075 [00:00<?, ?it/s]

Epoch 4/5: train_acc=0.5911, val_acc=0.5028


Training:   0%|          | 0/138688 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/29075 [00:00<?, ?it/s]

Epoch 5/5: train_acc=0.5967, val_acc=0.5028

d=0.5_n=0.1 Test Results:
  IC: -0.0121 (Sharpe: -2.36)
  Short K=5 Sharpe: 3.12

Training: noise=0.05, dropout=0.5
Filtered to news-only: 339,872 / 1,418,494 rows (24.0%)
Days with sufficient news coverage: 830
Generated 71,008,149 pairs
Fundamental noise augmentation: std=0.05
Filtered to news-only: 58,882 / 210,247 rows (28.0%)
Days with sufficient news coverage: 118
Generated 14,886,015 pairs


Training:   0%|          | 0/138688 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/29075 [00:00<?, ?it/s]

Epoch 1/5: train_acc=0.5524, val_acc=0.5053


Training:   0%|          | 0/138688 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/29075 [00:00<?, ?it/s]

Epoch 2/5: train_acc=0.5721, val_acc=0.5033


Training:   0%|          | 0/138688 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/29075 [00:00<?, ?it/s]

Epoch 3/5: train_acc=0.5796, val_acc=0.5025


Training:   0%|          | 0/138688 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/29075 [00:00<?, ?it/s]

Epoch 4/5: train_acc=0.5855, val_acc=0.5029


Training:   0%|          | 0/138688 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/29075 [00:00<?, ?it/s]

Epoch 5/5: train_acc=0.5905, val_acc=0.5029

d=0.5_n=0.05 Test Results:
  IC: 0.0170 (Sharpe: 2.82)
  Short K=5 Sharpe: 2.89

Training: no_noise, dropout=0.5
Filtered to news-only: 339,872 / 1,418,494 rows (24.0%)
Days with sufficient news coverage: 830
Generated 71,008,149 pairs
Filtered to news-only: 58,882 / 210,247 rows (28.0%)
Days with sufficient news coverage: 118
Generated 14,886,015 pairs


Training:   0%|          | 0/138688 [00:00<?, ?it/s]

KeyboardInterrupt: 

## 7. Compare Results

In [None]:
# Summary table
summary = []
for config_name, res in results.items():
    summary.append({
        'config': config_name,
        'fund_dropout': res['fund_dropout'],
        'noise_std': res['noise_std'],
        'mean_ic': res['mean_ic'],
        'ic_sharpe': res['ic_sharpe'],
        'short_sharpe': res['short_sharpe'],
    })

summary_df = pd.DataFrame(summary)
print("\n" + "=" * 80)
print("COMPARISON SUMMARY (Test Set)")
print("=" * 80)
print(summary_df.to_string(index=False))

In [None]:
import matplotlib.pyplot as plt

# Plot comparison
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

configs = list(results.keys())
x = np.arange(len(configs))

# IC Sharpe
ax = axes[0]
ax.bar(x, [results[c]['ic_sharpe'] for c in configs])
ax.set_xticks(x)
ax.set_xticklabels(configs, rotation=45, ha='right')
ax.set_ylabel('IC Sharpe')
ax.set_title('Information Coefficient Sharpe')
ax.axhline(results['d=0.2_n=0.0']['ic_sharpe'], color='red', linestyle='--', alpha=0.5, label='Baseline')
ax.legend()

# Short Sharpe
ax = axes[1]
ax.bar(x, [results[c]['short_sharpe'] for c in configs])
ax.set_xticks(x)
ax.set_xticklabels(configs, rotation=45, ha='right')
ax.set_ylabel('Short Sharpe')
ax.set_title('Short Strategy Sharpe (K=5)')
ax.axhline(results['d=0.2_n=0.0']['short_sharpe'], color='red', linestyle='--', alpha=0.5, label='Baseline')
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Heatmap view: dropout vs noise
pivot_ic = summary_df.pivot(index='fund_dropout', columns='noise_std', values='ic_sharpe')

fig, ax = plt.subplots(figsize=(8, 5))
im = ax.imshow(pivot_ic.values, cmap='RdYlGn', aspect='auto')

ax.set_xticks(np.arange(len(pivot_ic.columns)))
ax.set_yticks(np.arange(len(pivot_ic.index)))
ax.set_xticklabels([f'{x:.2f}' for x in pivot_ic.columns])
ax.set_yticklabels([f'{x:.1f}' for x in pivot_ic.index])
ax.set_xlabel('Noise Std')
ax.set_ylabel('Fundamental Dropout')
ax.set_title('IC Sharpe by Dropout and Noise')

# Add text annotations
for i in range(len(pivot_ic.index)):
    for j in range(len(pivot_ic.columns)):
        text = ax.text(j, i, f'{pivot_ic.values[i, j]:.2f}',
                       ha='center', va='center', color='black', fontsize=12)

plt.colorbar(im, ax=ax, label='IC Sharpe')
plt.tight_layout()
plt.show()

In [None]:
# Training curves comparison
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for config_name, res in results.items():
    hist = pd.DataFrame(res['history'])
    axes[0].plot(hist['epoch'], hist['train_acc'], label=f"{config_name}")
    axes[1].plot(hist['epoch'], hist['val_acc'], label=f"{config_name}")

axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Accuracy')
axes[0].set_title('Training Accuracy')
axes[0].legend(fontsize=8)
axes[0].grid(True, alpha=0.3)

axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Accuracy')
axes[1].set_title('Validation Accuracy')
axes[1].legend(fontsize=8)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Save Best Model

In [None]:
# Find best config by IC Sharpe
best_config_name = max(results.keys(), key=lambda c: results[c]['ic_sharpe'])
baseline_name = 'd=0.2_n=0.0'

print(f"Best config: {best_config_name}")
print(f"  IC Sharpe: {results[best_config_name]['ic_sharpe']:.2f}")
print(f"  Mean IC: {results[best_config_name]['mean_ic']:.4f}")

print(f"\nBaseline ({baseline_name}):")
print(f"  IC Sharpe: {results[baseline_name]['ic_sharpe']:.2f}")
print(f"  Mean IC: {results[baseline_name]['mean_ic']:.4f}")

improvement = results[best_config_name]['ic_sharpe'] - results[baseline_name]['ic_sharpe']
print(f"\nImprovement: {improvement:+.2f}")

if improvement > 0:
    print("\n>>> Regularization IMPROVES performance!")
else:
    print("\n>>> Baseline is best")

In [None]:
# Save best model if it beats baseline
if results[best_config_name]['ic_sharpe'] > results[baseline_name]['ic_sharpe']:
    best_model = results[best_config_name]['model']
    best_config = results[best_config_name]['config']
    
    torch.save({
        "model_state_dict": best_model.state_dict(),
        "config": best_config,
        "price_cols": price_feat_cols,
        "fund_cols": fund_feat_cols,
        "emb_cols": emb_cols,
        "fundamental_dropout": best_config.fundamental_dropout,
        "fundamental_noise_std": best_config.fundamental_noise_std,
    }, "data/model_fund_reg.pt")
    print(f"Saved model ({best_config_name}) to data/model_fund_reg.pt")
else:
    print("NOT saving - baseline performs better")
    print("Use model_final.pt instead")