In [None]:
import sys
import torch
from pathlib import Path
from torch.utils.data import DataLoader
import torch.nn as nn
import math
import pandas as pd
import numpy as np

In [3]:
# Put the parent of the current working directory on sys.path so the
# project-local `src` package below can be imported.
# NOTE(review): this assumes the kernel is started one level below the
# project root — confirm when running from another location.
current_dir = Path.cwd()
project_root = current_dir.parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from src.utils.data_loader import load_and_merge_data
from src.models.dataset import FinancialDataset, get_annual_splits, prepare_scaled_fold

# Notebook-wide configuration.
DATA_DIR = project_root / "data"
BATCH_SIZE = 64
WINDOW_SIZE = 60

# Device preference order: CUDA first, then Apple MPS, then CPU.
# The MPS probe is only evaluated when CUDA is unavailable.
DEVICE = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

print(f"Running on: {DEVICE}")

Running on: mps


In [4]:
# Load the Point-in-Time merged dataframe.
# `load_and_merge_data` is a project helper; according to its log output it
# merges the Market, Ratios, Macro and Text tables and fills NaN values.
df_main = load_and_merge_data(DATA_DIR)

# Generate the Walk-Forward anchors: each split uses 5 training years,
# 1 validation year and 1 test year, starting from 2010.
# NOTE(review): the first split presumably is 2010-2014 Train, 2015 Val,
# 2016 Test (the coverage check below reports 2015 as the first validation
# year) — confirm against `get_annual_splits`.
splits = get_annual_splits(
    df_main, start_year=2010, train_years=5, val_years=1, test_years=1
)

print(f"\nGenerated {len(splits)} Walk-Forward splits.")

Reading data from: /Users/audricsicard/Documents/VSCode/AML Project/Modality-aware-transformer/data
Loading datasets...

--- Merging Data ---
Merged Market: (3254401, 12)
Merged Ratios: (3254401, 17)
Merged Macro: (3254401, 24)
Merged Text: (3254401, 31)
Filling NaN values...
Keeping records between 2010-01-01 and 2023-12-15...
Done! Final Data Shape: (2500247, 31)

Generated 8 Walk-Forward splits.


In [None]:
# --- 1. SELECT DEBUG UNIVERSE ---
# Pick a few random stocks to simulate a "Mini Universe".
# A dedicated seeded generator keeps the sample identical across
# Restart-and-Run-All executions, so the printed counts are reproducible.
DEBUG_SEED = 42
DEBUG_UNIVERSE_SIZE = 5
all_permnos = df_main["permno"].unique()
debug_rng = np.random.default_rng(DEBUG_SEED)
debug_permnos = debug_rng.choice(
    all_permnos, size=DEBUG_UNIVERSE_SIZE, replace=False
)
print(f"Debug Universe ({DEBUG_UNIVERSE_SIZE} Stocks): {debug_permnos}")

# Filter Main Data to just these stocks
mask_debug = df_main["permno"].isin(debug_permnos)
df_debug = df_main[mask_debug].copy()

# Numerical feature columns handed to the scaling helper.
num_cols = [
    "mkt_log_ret",
    "mkt_cap_rank",
    "mkt_mom_1m",
    "mkt_mom_3m",
    "mkt_volatility",
    "mkt_drawdown",
    "mkt_turnover",
    "mkt_rel_vol",
    "mkt_liq_risk",
    "ratio_pb",
    "ratio_ey",
    "ratio_roe",
    "ratio_de",
    "ratio_div_yield",
    "macro_unemp_rate",
    "macro_unemp_delta",
    "macro_cpi_yoy",
    "macro_ppi_yoy",
    "macro_yield_curve",
    "macro_risk_free",
    "macro_vix",
]


# --- 2. SETUP: Pick a specific split ---
# We will test the 'Validation' period of the first split
split = splits[0]
val_start_str, val_end_str = split["val"]
print(f"Verifying Coverage for Validation Period: {val_start_str} to {val_end_str}")

# --- 3. PREPARE DATA (Standard Workflow) ---
# Use the project helper to get the Warm Validation DataFrame
# (second element of the returned triple).
_, df_val_warm, _ = prepare_scaled_fold(df_debug, num_cols, split, buffer_days=90)

# Initialize Dataset with STRICT date limits, using the notebook-wide window
# length instead of a repeated literal.
val_dataset = FinancialDataset(
    df_val_warm,
    window_size=WINDOW_SIZE,
    min_date=val_start_str,
    max_date=val_end_str,
)

# IMPORTANT: Use batch_size=1 and drop_last=False to count every single item
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, drop_last=False)

# --- 4. CALCULATE EXPECTED COUNT ---
# Count how many rows of the raw 'df_val_warm' are valid targets, i.e. rows
# whose date lies inside the validation range AND that are preceded by a full
# window of history for the same stock.

# A. Validation range as timestamps
dates = pd.to_datetime(df_val_warm["date"])
val_start_ts = pd.Timestamp(val_start_str)
val_end_ts = pd.Timestamp(val_end_str)

# B. Locate the contiguous run of rows belonging to each stock
expected_count = 0
permnos = df_val_warm["permno"].values
change_points = np.where(permnos[:-1] != permnos[1:])[0] + 1
start_points = np.concatenate(([0], change_points))
end_points = np.concatenate((change_points, [len(permnos)]))

print("\n--- Theoretical Valid Samples ---")
for start, end in zip(start_points, end_points):
    stock_len = end - start
    # NOTE(review): a stock with exactly WINDOW_SIZE rows is skipped by this
    # strict comparison although it could form one window — confirm that
    # FinancialDataset applies the same rule.
    if stock_len > WINDOW_SIZE:
        # A window starting at row i covers [i : i + WINDOW_SIZE] and its
        # target is the last row of the window, i + WINDOW_SIZE - 1.
        # Over all valid starts the targets are therefore exactly the rows
        # [start + WINDOW_SIZE - 1 : end], which can be counted in one
        # vectorised comparison (both range ends inclusive).
        target_dates = dates.iloc[start + WINDOW_SIZE - 1 : end]
        in_range = target_dates.between(val_start_ts, val_end_ts)
        expected_count += int(in_range.sum())

print(f"Expected Samples (Manual Logic): {expected_count}")
print(f"Actual Samples (Dataset Len):    {len(val_dataset)}")

# --- 5. RUN LOADER TO VERIFY ---
actual_count = 0
for batch in val_loader:
    actual_count += batch["y"].shape[0]

print(f"Loader Yielded:                  {actual_count}")

# --- 6. VERDICT ---
if expected_count == actual_count == len(val_dataset):
    print("\n✅ SUCCESS: All valid test rows have a prediction.")
else:
    print("\n❌ FAILURE: Mismatch detected.")
    diff = expected_count - actual_count
    print(f"Missing Samples: {diff}")

Debug Universe (5 Stocks): [90373. 89508. 81677. 10890. 23712.]
Verifying Coverage for Validation Period: 2015-01-01 to 2015-12-31
Numerical Features (22): ['mkt_log_ret', 'mkt_cap_rank', 'mkt_mom_1m', 'mkt_mom_3m', 'mkt_volatility', 'mkt_drawdown', 'mkt_turnover', 'mkt_rel_vol', 'mkt_liq_risk', 'ratio_pb', 'ratio_ey', 'ratio_roe', 'ratio_de', 'ratio_div_yield', 'macro_unemp_rate', 'macro_unemp_delta', 'macro_cpi_yoy', 'macro_ppi_yoy', 'macro_yield_curve', 'macro_risk_free', 'macro_vix', 'has_news']
Text Features bar Embedding Vector (5): ['sent_score_mean', 'sent_pos_mean', 'sent_neg_mean', 'sent_score_std', 'log_n_news']
Converting to PyTorch Tensors...
Dataset Ready. Samples: 1260 (Filtered by 2015-01-01 to 2015-12-31)

--- Theoretical Valid Samples ---
Expected Samples (Manual Logic): 1260
Actual Samples (Dataset Len):    1260
Loader Yielded:                  1260

✅ SUCCESS: All valid test rows have a prediction.


In [None]:
# Pull a single batch from the validation loader and move its three tensors
# onto the configured device to sanity-check their shapes.
batch = next(iter(val_loader))

x_num, x_text, y = (
    batch[field].to(DEVICE) for field in ("x_num", "x_text", "y")
)

print("--- Batch Shapes ---")
print(f"Num Features:  {x_num.shape}  (Batch, Window, Feats)")
print(f"Text Features: {x_text.shape} (Batch, Window, Emb+Scalars)")
print(f"Target:        {y.shape}      (Batch)")

--- Batch Shapes ---
Num Features:  torch.Size([1, 60, 22])  (Batch, Window, Feats)
Text Features: torch.Size([1, 60, 773]) (Batch, Window, Emb+Scalars)
Target:        torch.Size([1])      (Batch)


In [None]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding for sequence-first tensors.

    A table of shape ``[max_len, 1, d_model]`` is precomputed once and stored
    as a non-trainable buffer. Even feature indices hold sines and odd
    feature indices hold cosines of the position scaled by a geometric
    frequency schedule.

    Args:
        d_model: Size of the feature dimension. Both even and odd values are
            supported.
        dropout: Dropout probability applied after the encoding is added.
        max_len: Longest sequence length the table can serve.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine slot than sine slots,
        # so the angle table is trimmed to the number of odd indices.
        # For an even d_model the slice is a no-op.
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Add the positional table to ``x`` and apply dropout.

        Args:
            x: Tensor whose first dimension is the sequence position; the
                buffer is sliced with ``x.size(0)`` and broadcast over the
                second dimension.

        Returns:
            Tensor with the same shape as ``x``.

        Raises:
            ValueError: If the sequence is longer than ``max_len``.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_len {self.pe.size(0)}"
            )
        x = x + self.pe[:seq_len]
        return self.dropout(x)

In [None]:
class CanonicalEncoder(nn.Module):
    """The Baseline: a standard Transformer encoder with Early Fusion.

    Each modality is projected to ``d_model // 2`` features, the two
    projections are concatenated into one ``d_model``-wide vector per time
    step, and the fused sequence is processed by a single
    ``nn.TransformerEncoder`` stack.
    """

    def __init__(
        self,
        num_input_dim: int,
        text_input_dim: int,
        d_model: int = 128,
        nhead: int = 4,
        num_layers: int = 2,
        dropout: float = 0.2,
        window_size: int = 60,
    ):
        """
        Args:
            num_input_dim: Number of numerical features per time step.
            text_input_dim: Number of text features per time step.
            d_model: Width of the fused representation; must be even so the
                two halves concatenate back to exactly ``d_model``.
            nhead: Number of attention heads in each encoder layer.
            num_layers: Number of stacked encoder layers.
            dropout: Dropout probability used throughout.
            window_size: Length of the input window. ``nn.BatchNorm1d`` is
                applied to ``[Batch, window_size, features]`` tensors, so its
                channel count has to equal the window length.

        Raises:
            ValueError: If ``d_model`` is odd.
        """
        super().__init__()

        if d_model % 2 != 0:
            raise ValueError(
                f"d_model must be even for balanced early fusion, got {d_model}"
            )

        # --- 1. Balanced Projection (Early Fusion) ---
        # Both inputs are projected to d_model/2 so they add up to d_model
        # when concatenated, giving equal bandwidth to both modalities.
        half_dim = d_model // 2

        # BatchNorm1d receives [Batch, window_size, half_dim]; it therefore
        # treats the time axis as its channel axis.
        self.num_proj = nn.Sequential(
            nn.Linear(num_input_dim, half_dim),
            nn.BatchNorm1d(window_size),
            nn.GELU(),
            nn.Dropout(dropout),
        )

        self.text_proj = nn.Sequential(
            nn.Linear(text_input_dim, half_dim),
            nn.BatchNorm1d(window_size),
            nn.GELU(),
            nn.Dropout(dropout),
        )

        # --- 2. Positional Encoding ---
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        # --- 3. The Transformer Backbone ---
        # Standard PyTorch Transformer implementation.
        # It treats the fused vector [Price_Info, Text_Info] as one single concept.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=d_model * 4,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    def forward(self, x_num, x_text):
        """
        Args:
            x_num: [Batch, window_size, num_input_dim]
            x_text: [Batch, window_size, text_input_dim]
        Returns:
            memory: [Batch, window_size, d_model] (Single stream output)
        """
        # 1. Project Modalities independently
        x_n = self.num_proj(x_num)  # [Batch, window_size, d_model / 2]
        x_t = self.text_proj(x_text)  # [Batch, window_size, d_model / 2]

        # 2. Early Concatenation (The key difference from MAT)
        # The modalities are fused immediately; the transformer only ever
        # sees one vector of size d_model per time step.
        x_combined = torch.cat([x_n, x_t], dim=2)  # [Batch, window_size, d_model]

        # 3. Add Positional Encoding
        # (the PE module indexes positions along dim 0, hence the transposes)
        x_combined = x_combined.transpose(0, 1)  # [window_size, Batch, d_model]
        x_combined = self.pos_encoder(x_combined)
        x_combined = x_combined.transpose(0, 1)  # [Batch, window_size, d_model]

        # 4. Process with Standard Transformer
        memory = self.transformer(x_combined)

        return memory

In [None]:
class FeatureAttention(nn.Module):
    """
    Section 3.2.1: Feature-Level Attention.

    Scores every input feature with a small two-layer network whose output
    is normalised by a softmax over the last dimension, then scales the
    input element-wise by those scores. Both the scaled input and the
    scores are returned so the scores can be reused by the caller.
    """

    def __init__(self, input_dim):
        super().__init__()
        scoring_layers = [
            nn.Linear(input_dim, input_dim),
            nn.Tanh(),
            nn.Linear(input_dim, input_dim),
            nn.Softmax(dim=-1),
        ]
        self.attn = nn.Sequential(*scoring_layers)

    def forward(self, x):
        # Importance scores have the same shape as the input and sum to 1
        # along the last (feature) dimension.
        importance = self.attn(x)
        return x * importance, importance


class MATEncoderLayer(nn.Module):
    """
    One MAT encoder layer working on two parallel streams (numerical, text).

    Each stream goes through three residual + LayerNorm stages:
    1. Intra-modal self-attention within the stream.
    2. Inter-modal cross-attention, where the stream queries the other one.
    3. A position-wise feed-forward network.
    """

    def __init__(self, d_model, nhead, dim_feedforward, dropout):
        super().__init__()

        def build_attention():
            return nn.MultiheadAttention(
                d_model, nhead, dropout=dropout, batch_first=True
            )

        def build_feed_forward():
            return nn.Sequential(
                nn.Linear(d_model, dim_feedforward),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Linear(dim_feedforward, d_model),
            )

        # Stage 1: intra-modal self-attention, one module per stream.
        self.self_attn_num = build_attention()
        self.self_attn_text = build_attention()
        self.norm1_num = nn.LayerNorm(d_model)
        self.norm1_text = nn.LayerNorm(d_model)

        # Stage 2: inter-modal cross-attention
        # (numerical queries text, text queries numerical).
        self.cross_attn_num = build_attention()
        self.cross_attn_text = build_attention()
        self.norm2_num = nn.LayerNorm(d_model)
        self.norm2_text = nn.LayerNorm(d_model)

        # Stage 3: per-stream feed-forward networks.
        self.ff_num = build_feed_forward()
        self.ff_text = build_feed_forward()
        self.norm3_num = nn.LayerNorm(d_model)
        self.norm3_text = nn.LayerNorm(d_model)

        # Shared dropout applied to every residual branch.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x_num, x_text):
        # Stage 1: each stream attends to itself.
        num_ctx = self.self_attn_num(x_num, x_num, x_num)[0]
        num_stream = self.norm1_num(x_num + self.dropout(num_ctx))

        text_ctx = self.self_attn_text(x_text, x_text, x_text)[0]
        text_stream = self.norm1_text(x_text + self.dropout(text_ctx))

        # Stage 2: the streams exchange information. Both cross-attentions
        # read the stage-1 outputs, so neither sees the other's fused result.
        num_from_text = self.cross_attn_num(
            query=num_stream, key=text_stream, value=text_stream
        )[0]
        num_fused = self.norm2_num(num_stream + self.dropout(num_from_text))

        text_from_num = self.cross_attn_text(
            query=text_stream, key=num_stream, value=num_stream
        )[0]
        text_fused = self.norm2_text(text_stream + self.dropout(text_from_num))

        # Stage 3: feed-forward refinement of each fused stream.
        num_hidden = self.ff_num(num_fused)
        num_result = self.norm3_num(num_fused + self.dropout(num_hidden))

        text_hidden = self.ff_text(text_fused)
        text_result = self.norm3_text(text_fused + self.dropout(text_hidden))

        return num_result, text_result


class MATEncoder(nn.Module):
    """Dual-stream MAT encoder.

    Pipeline: feature-level attention on each modality, projection to
    ``d_model``, positional encoding, a stack of ``MATEncoderLayer`` blocks,
    and a final gate derived from the feature-attention weights.
    """

    def __init__(
        self,
        num_input_dim: int,
        text_input_dim: int,
        d_model: int = 128,
        nhead: int = 4,
        num_layers: int = 2,
        dropout: float = 0.2,
        window_size: int = 60,
    ):
        """
        Args:
            num_input_dim: Number of numerical features per time step.
            text_input_dim: Number of text features per time step.
            d_model: Hidden width of each stream.
            nhead: Number of attention heads in every MAT layer.
            num_layers: Number of stacked MAT layers.
            dropout: Dropout probability used throughout.
            window_size: Length of the input window. ``nn.BatchNorm1d`` is
                applied to ``[Batch, window_size, d_model]`` tensors, so its
                channel count has to equal the window length.
        """
        super().__init__()

        # 1. Feature Attention (returns the weighted input and the weights)
        self.num_feat_attn = FeatureAttention(num_input_dim)
        self.text_feat_attn = FeatureAttention(text_input_dim)

        # 2. Weight Projectors
        # Map the feature weights (width = input dim of each modality) to
        # d_model so they can gate the encoder output.
        self.num_weight_proj = nn.Sequential(
            nn.Linear(num_input_dim, d_model),
            nn.Sigmoid(),  # Gating (0-1)
        )
        self.text_weight_proj = nn.Sequential(
            nn.Linear(text_input_dim, d_model), nn.Sigmoid()
        )

        # 3. Standard Projections (Input -> d_model)
        # BatchNorm1d receives [Batch, window_size, d_model]; it therefore
        # treats the time axis as its channel axis.
        self.num_proj = nn.Sequential(
            nn.Linear(num_input_dim, d_model),
            nn.BatchNorm1d(window_size),
            nn.GELU(),
            nn.Dropout(dropout),
        )

        self.text_proj = nn.Sequential(
            nn.Linear(text_input_dim, d_model),
            nn.BatchNorm1d(window_size),
            nn.GELU(),
            nn.Dropout(dropout),
        )

        # One positional encoder shared by both streams.
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        self.layers = nn.ModuleList(
            [
                MATEncoderLayer(d_model, nhead, d_model * 4, dropout)
                for _ in range(num_layers)
            ]
        )

    def forward(self, x_num, x_text):
        """
        Args:
            x_num: [Batch, window_size, num_input_dim]
            x_text: [Batch, window_size, text_input_dim]
        Returns:
            Tuple ``(h_num_final, h_text_final)``, each
            [Batch, window_size, d_model].
        """
        # A. Feature Attention (capture the weights)
        # w_num:  [Batch, window_size, num_input_dim]
        # w_text: [Batch, window_size, text_input_dim]
        x_num, w_num = self.num_feat_attn(x_num)
        x_text, w_text = self.text_feat_attn(x_text)

        # B. Project Input to d_model
        h_num = self.num_proj(x_num)
        h_text = self.text_proj(x_text)

        # C. Positional Encoding
        # (the PE module indexes positions along dim 0, hence the transposes)
        h_num = self.pos_encoder(h_num.transpose(0, 1)).transpose(0, 1)
        h_text = self.pos_encoder(h_text.transpose(0, 1)).transpose(0, 1)

        # D. MAT Processing (the context learning)
        # Mixes information over time and between modalities.
        for layer in self.layers:
            h_num, h_text = layer(h_num, h_text)

        # E. Re-weighting
        # Project the original feature weights to the hidden size and use
        # them to gate the final output of each stream.

        # [Batch, window_size, num_input_dim] -> [Batch, window_size, d_model]
        gate_num = self.num_weight_proj(w_num)
        # [Batch, window_size, text_input_dim] -> [Batch, window_size, d_model]
        gate_text = self.text_weight_proj(w_text)

        # Apply the Gate
        h_num_final = h_num * gate_num
        h_text_final = h_text * gate_text

        return h_num_final, h_text_final

In [None]:
############################## NOT IMPORTANT ##################################

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import time


# --- 1. Canonical Baseline (Single Stream) ---
class BaselineCanonical(nn.Module):
    """Regression wrapper around a single-stream encoder.

    The encoder is called with both modalities and must return one memory
    tensor indexed as [Batch, Seq, Dim]; the final sequence position is fed
    to a small MLP that produces one value per sample.
    """

    def __init__(self, encoder, d_model=128):
        super().__init__()
        self.encoder = encoder

        # Simple regressor: d_model -> 64 -> 1
        regressor_layers = [
            nn.Linear(d_model, 64),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(64, 1),
        ]
        self.head = nn.Sequential(*regressor_layers)

    def forward(self, x_num, x_text):
        encoded = self.encoder(x_num, x_text)
        # Pooling: keep only the last position along the sequence axis.
        return self.head(encoded[:, -1, :])


# --- 2. MAT Baseline (Dual Stream) ---
class BaselineMAT(nn.Module):
    """Regression wrapper around a dual-stream encoder.

    The encoder is called with both modalities and must return two memory
    tensors (numerical, text), each indexed as [Batch, Seq, Dim]. The last
    sequence position of both is concatenated, fused back to ``d_model``
    and passed to the same regressor head as the canonical baseline.
    """

    def __init__(self, encoder, d_model=128):
        super().__init__()
        self.encoder = encoder

        # Fusion layer: concatenated streams (2 * d_model) -> d_model
        fusion_layers = [
            nn.Linear(d_model * 2, d_model),
            nn.LayerNorm(d_model),
            nn.GELU(),
        ]
        self.fusion = nn.Sequential(*fusion_layers)

        # Same regressor head as the canonical baseline for a fair comparison
        regressor_layers = [
            nn.Linear(d_model, 64),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(64, 1),
        ]
        self.head = nn.Sequential(*regressor_layers)

    def forward(self, x_num, x_text):
        # The encoder yields one memory per modality.
        num_memory, text_memory = self.encoder(x_num, x_text)

        # Pooling: last sequence position of each stream, joined on the
        # feature axis -> [Batch, 2 * d_model]
        pooled = torch.cat(
            [num_memory[:, -1, :], text_memory[:, -1, :]], dim=-1
        )

        # Fuse down to d_model, then regress to a single value per sample.
        return self.head(self.fusion(pooled))