# News Embedding Signal Test

Test whether news embeddings contain non-linear predictive signal for returns.

Approach: Train a small MLP on embeddings only to predict above/below median return.

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [2]:
df = pd.read_parquet("data/ml_dataset.pqt")
df["feature_date"] = pd.to_datetime(df["feature_date"])

# Only rows with news
emb_cols = [c for c in df.columns if c.startswith("emb_")]
has_news = (df[emb_cols].abs().sum(axis=1) > 0)
news_df = df[has_news].copy()

print(f"Total rows: {len(df):,}")
print(f"Rows with news: {len(news_df):,} ({len(news_df)/len(df)*100:.1f}%)")

Total rows: 2,092,929
Rows with news: 194,220 (9.3%)


In [3]:
# Time-based split (70% train, 30% val)
dates = sorted(news_df["feature_date"].unique())
split_date = dates[int(len(dates) * 0.7)]

train_df = news_df[news_df["feature_date"] < split_date].copy()
val_df = news_df[news_df["feature_date"] >= split_date].copy()

print(f"Train: {len(train_df):,} rows, {train_df['feature_date'].nunique()} days")
print(f"Val: {len(val_df):,} rows, {val_df['feature_date'].nunique()} days")
print(f"\nTrain period: {train_df['feature_date'].min().date()} to {train_df['feature_date'].max().date()}")
print(f"Val period: {val_df['feature_date'].min().date()} to {val_df['feature_date'].max().date()}")

Train: 125,880 rows, 830 days
Val: 68,340 rows, 356 days

Train period: 2021-01-13 to 2024-05-01
Val period: 2024-05-02 to 2025-12-18


In [4]:
# Binary classification: above/below median return that day (cross-sectional)
train_df["label"] = (train_df.groupby("feature_date")["target_return"]
                     .transform(lambda x: (x > x.median()).astype(int)))
val_df["label"] = (val_df.groupby("feature_date")["target_return"]
                   .transform(lambda x: (x > x.median()).astype(int)))

print(f"Label balance (train): {train_df['label'].mean():.3f}")
print(f"Label balance (val): {val_df['label'].mean():.3f}")

Label balance (train): 0.498
Label balance (val): 0.499


In [5]:
# Prepare tensors
X_train = torch.tensor(train_df[emb_cols].values, dtype=torch.float32)
X_val = torch.tensor(val_df[emb_cols].values, dtype=torch.float32)
y_train = torch.tensor(train_df["label"].values, dtype=torch.float32)
y_val = torch.tensor(val_df["label"].values, dtype=torch.float32)

print(f"X_train: {X_train.shape}")
print(f"X_val: {X_val.shape}")

X_train: torch.Size([125880, 768])
X_val: torch.Size([68340, 768])


In [6]:
class NewsClassifier(nn.Module):
    """Simple MLP to test if embeddings have predictive signal."""
    
    def __init__(self, input_dim=768, hidden_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 1),
        )
    
    def forward(self, x):
        return self.net(x).squeeze(-1)

model = NewsClassifier(input_dim=len(emb_cols)).to(device)
print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")

Parameters: 102,593


In [7]:
# Training setup
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
criterion = nn.BCEWithLogitsLoss()

train_loader = DataLoader(
    TensorDataset(X_train, y_train), 
    batch_size=512, 
    shuffle=True
)
val_loader = DataLoader(
    TensorDataset(X_val, y_val), 
    batch_size=512
)

In [9]:
# Training loop
n_epochs = 20
history = []

for epoch in range(n_epochs):
    # Train
    model.train()
    train_loss, train_correct, train_total = 0, 0, 0
    for xb, yb in tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False):
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item() * len(yb)
        train_correct += ((pred > 0) == (yb > 0.5)).sum().item()
        train_total += len(yb)
    
    # Validate
    model.eval()
    val_correct, val_total = 0, 0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            val_correct += ((pred > 0) == (yb > 0.5)).sum().item()
            val_total += len(yb)
    
    train_acc = train_correct / train_total
    val_acc = val_correct / val_total
    
    history.append({
        "epoch": epoch + 1,
        "train_loss": train_loss / train_total,
        "train_acc": train_acc,
        "val_acc": val_acc,
    })
    
    print(f"Epoch {epoch+1}/{n_epochs}: train_acc={train_acc:.4f}, val_acc={val_acc:.4f}")

Epoch 1:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 1/20: train_acc=0.5031, val_acc=0.5012


Epoch 2:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 2/20: train_acc=0.5024, val_acc=0.5013


Epoch 3:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 3/20: train_acc=0.5015, val_acc=0.5013


Epoch 4:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 4/20: train_acc=0.5011, val_acc=0.5055


Epoch 5:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 5/20: train_acc=0.5011, val_acc=0.5047


Epoch 6:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 6/20: train_acc=0.5043, val_acc=0.5020


Epoch 7:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 7/20: train_acc=0.5014, val_acc=0.5013


Epoch 8:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 8/20: train_acc=0.5037, val_acc=0.5019


Epoch 9:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 9/20: train_acc=0.5046, val_acc=0.5012


Epoch 10:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 10/20: train_acc=0.5046, val_acc=0.5047


Epoch 11:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 11/20: train_acc=0.5051, val_acc=0.5010


Epoch 12:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 12/20: train_acc=0.5064, val_acc=0.5025


Epoch 13:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 13/20: train_acc=0.5070, val_acc=0.5065


Epoch 14:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 14/20: train_acc=0.5067, val_acc=0.5060


Epoch 15:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 15/20: train_acc=0.5056, val_acc=0.5067


Epoch 16:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 16/20: train_acc=0.5070, val_acc=0.5058


Epoch 17:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 17/20: train_acc=0.5090, val_acc=0.5052


Epoch 18:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 18/20: train_acc=0.5088, val_acc=0.5046


Epoch 19:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 19/20: train_acc=0.5087, val_acc=0.5012


Epoch 20:   0%|          | 0/246 [00:00<?, ?it/s]

Epoch 20/20: train_acc=0.5094, val_acc=0.5050


In [10]:
print("\n" + "=" * 50)
print("RESULTS")
print("=" * 50)
print(f"\nBaseline (random): 50.0%")
print(f"Best val accuracy: {max(h['val_acc'] for h in history)*100:.2f}%")
print(f"Final val accuracy: {history[-1]['val_acc']*100:.2f}%")

delta = (max(h['val_acc'] for h in history) - 0.5) * 100
print(f"\nSignal above random: {delta:+.2f}%")

if delta > 1:
    print("\n=> News embeddings have SOME predictive signal")
elif delta > 0.5:
    print("\n=> Weak signal, may not be reliable")
else:
    print("\n=> No detectable signal in news embeddings")


RESULTS

Baseline (random): 50.0%
Best val accuracy: 50.67%
Final val accuracy: 50.50%

Signal above random: +0.67%

=> Weak signal, may not be reliable


## Pairwise Test

Alternative: test if embeddings can predict which of two stocks will outperform (same as main model).

In [13]:
# Generate pairs from validation set (fresh, not used in training)
def generate_pairs(df, emb_cols, n_pairs_per_day=500):
    """Generate pairs for pairwise ranking test."""
    pairs = []
    
    for date, group in df.groupby("feature_date"):
        if len(group) < 10:
            continue
        
        # Sample random pairs
        n = min(n_pairs_per_day, len(group) * (len(group) - 1) // 2)
        indices = group.index.tolist()
        
        for _ in range(n):
            i, j = np.random.choice(indices, 2, replace=False)
            ret_i = df.loc[i, "target_return"]
            ret_j = df.loc[j, "target_return"]
            emb_i = df.loc[i, emb_cols].values.astype(np.float32)
            emb_j = df.loc[j, emb_cols].values.astype(np.float32)
            
            # Label: 1 if i > j, else 0
            label = 1.0 if ret_i > ret_j else 0.0
            
            pairs.append({
                "emb_i": emb_i,
                "emb_j": emb_j,
                "label": label,
            })
    
    return pairs

print("Generating validation pairs...")
val_pairs = generate_pairs(val_df, emb_cols, n_pairs_per_day=500)
print(f"Generated {len(val_pairs):,} pairs")

Generating validation pairs...
Generated 178,000 pairs


In [14]:
# Test the trained model on pairs
model.eval()

correct = 0
total = 0

with torch.no_grad():
    for pair in tqdm(val_pairs, desc="Evaluating pairs"):
        emb_i = torch.tensor(pair["emb_i"], dtype=torch.float32).unsqueeze(0).to(device)
        emb_j = torch.tensor(pair["emb_j"], dtype=torch.float32).unsqueeze(0).to(device)
        
        score_i = model(emb_i).item()
        score_j = model(emb_j).item()
        
        pred = 1 if score_i > score_j else 0
        label = int(pair["label"])
        
        if pred == label:
            correct += 1
        total += 1

pairwise_acc = correct / total
print(f"\nPairwise ranking accuracy: {pairwise_acc*100:.2f}%")
print(f"Baseline (random): 50.0%")
print(f"Signal above random: {(pairwise_acc - 0.5)*100:+.2f}%")

Evaluating pairs:   0%|          | 0/178000 [00:00<?, ?it/s]


Pairwise ranking accuracy: 50.78%
Baseline (random): 50.0%
Signal above random: +0.78%


## Top/Bottom-K Selection Test

What matters for trading: Can the model identify stocks that end up in the top or bottom K?

Test: Of stocks the model ranks in its top-K, what fraction are actually in the true top-K?

In [15]:
# Score all validation rows
model.eval()
val_df = val_df.copy()

with torch.no_grad():
    scores = model(X_val.to(device)).cpu().numpy()
val_df["score"] = scores

print(f"Scored {len(val_df):,} rows")

Scored 68,340 rows


In [16]:
def evaluate_topk_selection(df, k_values=[5, 10, 20]):
    """
    For each day, check if model's top-K picks overlap with actual top-K performers.
    Also check bottom-K (for shorting).
    
    Returns precision: what fraction of model's picks are in the true top/bottom K.
    """
    results = []
    
    for date, group in df.groupby("feature_date"):
        n = len(group)
        if n < 40:  # Need enough stocks for meaningful top/bottom K
            continue
        
        for k in k_values:
            if k > n // 4:
                continue
                
            # Model's top-K and bottom-K by score
            model_top_k = set(group.nlargest(k, "score").index)
            model_bottom_k = set(group.nsmallest(k, "score").index)
            
            # Actual top-K and bottom-K by return
            actual_top_k = set(group.nlargest(k, "target_return").index)
            actual_bottom_k = set(group.nsmallest(k, "target_return").index)
            
            # Precision: overlap / k
            top_precision = len(model_top_k & actual_top_k) / k
            bottom_precision = len(model_bottom_k & actual_bottom_k) / k
            
            # Random baseline: k/n (probability of randomly picking a true top-K stock)
            random_baseline = k / n
            
            results.append({
                "date": date,
                "k": k,
                "n_stocks": n,
                "top_precision": top_precision,
                "bottom_precision": bottom_precision,
                "random_baseline": random_baseline,
            })
    
    return pd.DataFrame(results)

results_df = evaluate_topk_selection(val_df, k_values=[5, 10, 20])
print(f"Evaluated {len(results_df):,} day-k combinations")

Evaluated 1,068 day-k combinations


In [17]:
# Summary by K
print("=" * 70)
print("TOP-K SELECTION ACCURACY (for going long)")
print("=" * 70)
print(f"{'K':<5} {'Precision':<12} {'Random':<12} {'Lift':<12} {'t-stat':<10}")
print("-" * 70)

for k in [5, 10, 20]:
    subset = results_df[results_df["k"] == k]
    if len(subset) == 0:
        continue
    precision = subset["top_precision"].mean()
    baseline = subset["random_baseline"].mean()
    lift = precision / baseline
    
    # t-test vs baseline
    from scipy.stats import ttest_1samp
    t_stat, p_val = ttest_1samp(subset["top_precision"], baseline)
    
    print(f"{k:<5} {precision*100:>5.2f}%      {baseline*100:>5.2f}%      {lift:>5.2f}x      {t_stat:>+5.2f}")

print("\n")
print("=" * 70)
print("BOTTOM-K SELECTION ACCURACY (for shorting)")
print("=" * 70)
print(f"{'K':<5} {'Precision':<12} {'Random':<12} {'Lift':<12} {'t-stat':<10}")
print("-" * 70)

for k in [5, 10, 20]:
    subset = results_df[results_df["k"] == k]
    if len(subset) == 0:
        continue
    precision = subset["bottom_precision"].mean()
    baseline = subset["random_baseline"].mean()
    lift = precision / baseline
    
    t_stat, p_val = ttest_1samp(subset["bottom_precision"], baseline)
    
    print(f"{k:<5} {precision*100:>5.2f}%      {baseline*100:>5.2f}%      {lift:>5.2f}x      {t_stat:>+5.2f}")

TOP-K SELECTION ACCURACY (for going long)
K     Precision    Random       Lift         t-stat    
----------------------------------------------------------------------
5      2.36%       2.69%       0.88x      -0.94
10     4.44%       5.38%       0.83x      -2.69
20     9.28%      10.76%       0.86x      -4.31


BOTTOM-K SELECTION ACCURACY (for shorting)
K     Precision    Random       Lift         t-stat    
----------------------------------------------------------------------
5      5.17%       2.69%       1.92x      +4.75
10     8.68%       5.38%       1.61x      +6.66
20    15.63%      10.76%       1.45x      +10.54


In [18]:
# More practical test: What's the average return of model's picks vs random?
print("\n")
print("=" * 70)
print("AVERAGE RETURN OF MODEL PICKS")
print("=" * 70)

for k in [5, 10, 20]:
    top_returns = []
    bottom_returns = []
    market_returns = []
    
    for date, group in val_df.groupby("feature_date"):
        if len(group) < 40:
            continue
        
        # Model picks
        model_top = group.nlargest(k, "score")
        model_bottom = group.nsmallest(k, "score")
        
        top_returns.append(model_top["target_return"].mean())
        bottom_returns.append(model_bottom["target_return"].mean())
        market_returns.append(group["target_return"].mean())
    
    top_ret = np.mean(top_returns) * 100
    bottom_ret = np.mean(bottom_returns) * 100
    mkt_ret = np.mean(market_returns) * 100
    
    # Short P&L is negative of bottom returns
    short_pnl = -bottom_ret
    
    print(f"\nK={k}:")
    print(f"  Model's Top-{k} avg return:    {top_ret:+.3f}% (market: {mkt_ret:+.3f}%)")
    print(f"  Model's Bottom-{k} avg return: {bottom_ret:+.3f}%")
    print(f"  Short P&L (if shorting bottom): {short_pnl:+.3f}%")
    print(f"  Long-Short spread:              {top_ret - bottom_ret:+.3f}%")



AVERAGE RETURN OF MODEL PICKS

K=5:
  Model's Top-5 avg return:    +0.106% (market: -0.070%)
  Model's Bottom-5 avg return: -0.263%
  Short P&L (if shorting bottom): +0.263%
  Long-Short spread:              +0.369%

K=10:
  Model's Top-10 avg return:    -0.011% (market: -0.070%)
  Model's Bottom-10 avg return: -0.224%
  Short P&L (if shorting bottom): +0.224%
  Long-Short spread:              +0.214%

K=20:
  Model's Top-20 avg return:    -0.026% (market: -0.070%)
  Model's Bottom-20 avg return: -0.184%
  Short P&L (if shorting bottom): +0.184%
  Long-Short spread:              +0.158%
