In [7]:
import torch
from torch import nn
from src.models import AffectModel, masked_mse_loss
from src.data import setup_dataloader

In [None]:
# Config
# Everything a reader might want to tune lives in this cell.

# Seed torch's global RNG so weight initialisation is repeatable across
# Restart & Run All. The shuffled batch order is also repeatable, assuming
# setup_dataloader relies on torch's default generator (TODO confirm).
SEED = 42
torch.manual_seed(SEED)

TRAIN_PATH = "dataset/TRAIN_RELEASE_3SEP2025/train_subtask1.csv"

# Single source of truth for the pretrained checkpoint: the tokenizer and the
# encoder must come from the same checkpoint, so the model config below reuses
# this constant instead of repeating the literal.
TOKENIZER_PATH = "bert-base-uncased"

# Keyword arguments forwarded verbatim to AffectModel(**MODEL_CONFIG).
MODEL_CONFIG = {
    # Encoder
    'model_path': TOKENIZER_PATH,
    # Set Attention
    'n_seeds': 4,
    'n_inducing': 32,
    'n_heads': 8,
    # LSTM
    'lstm_hidden': 256,
    'lstm_layers': 2,
    'bidirectional': True,
    # Head
    'constrain_output': True,
    # Shared
    'dropout': 0.3,
    # Debug
    'verbose': True,
}

# Keyword arguments forwarded verbatim to setup_dataloader(**DATA_CONFIG).
DATA_CONFIG = {
    'csv_path': TRAIN_PATH,
    'tokenizer_path': TOKENIZER_PATH,
    'max_text_length': 512,
    'batch_size': 4,
    'shuffle': True,
    'num_workers': 0,
}

In [9]:
# Setup: build the data pipeline and the model from the config dicts.
train_loader, train_dataset = setup_dataloader(**DATA_CONFIG)
model = AffectModel(**MODEL_CONFIG)

# Tally total and trainable parameter counts in a single pass.
n_params_total = 0
n_params_trainable = 0
for param in model.parameters():
    n_params_total += param.numel()
    if param.requires_grad:
        n_params_trainable += param.numel()

# Summary banner.
rule = "=" * 50
print("\n" + rule)
print(f"Dataset size: {len(train_dataset)} users")
print(f"Model parameters: {n_params_total:,}")
print(f"Trainable parameters: {n_params_trainable:,}")
print(rule)

[TransformerEncoder] Loaded: bert-base-uncased
[TransformerEncoder] Hidden size: 768
[TransformerEncoder] Backbone frozen: True

[ISAB] Initialized with 32 inducing points

[PMA] Initialized with 4 seed vectors

[LSTMEncoder] Initialized
  input_dim:     3072
  hidden_dim:    256
  num_layers:    2
  bidirectional: True
  output_dim:    512

[PredictionHead] Initialized
  input_dim:        512
  dropout:          0.3
  constrain_output: True

  valence range:    [-2, 2] (tanh * 2)
  arousal range:    [0, 2] (sigmoid * 2)


[AffectModel] Initialized
  Encoder: bert-base-uncased (frozen)
  ISAB: 32 inducing points
  PMA: 4 seeds
  LSTM: input=3072, hidden=256, layers=2, bidir=True
  Head: output=2, constrain_output=True


Dataset size: 137 users
Model parameters: 139,171,330
Trainable parameters: 29,689,090


In [10]:
# Test forward pass: run the model once on a single batch, without gradients,
# and compute the masked loss as a smoke test of the whole pipeline.
model.eval()

# Fetch exactly one batch. The explicit default turns an empty loader into a
# clear error here rather than a NameError at the print statements below.
batch = next(iter(train_loader), None)
if batch is None:
    raise RuntimeError("train_loader yielded no batches")

with torch.no_grad():
    predictions = model(
        input_ids=batch['input_ids'],
        attention_mask=batch['attention_mask'],
        seq_lengths=batch['seq_lengths'],
        seq_mask=batch['seq_attention_mask']
    )

    # Stack the two targets along a new last axis: index 0 is valence,
    # index 1 is arousal.
    targets = torch.stack([batch['valences'], batch['arousals']], dim=-1)
    # Boolean mask handed to the loss; presumably True marks real (non-padded)
    # documents -- verify against masked_mse_loss.
    mask = batch['seq_attention_mask'].bool()
    loss = masked_mse_loss(predictions, targets, mask)

print(f"\n{'='*50}")
print("RESULTS")
print(f"{'='*50}")
print(f"Input shape:     {batch['input_ids'].shape}")
print(f"Predictions:     {predictions.shape}")
print(f"Targets:         {targets.shape}")
print(f"Loss:            {loss.item():.4f}")

[TransformerEncoder] train(False) called, backbone forced to eval


[AffectModel] Forward pass
  Input shapes:
    input_ids:      torch.Size([4, 37, 512]) (B=4, S=37, T=512)
    attention_mask: torch.Size([4, 37, 512])
    seq_lengths:    torch.Size([4]) -> [7, 3, 37, 18]
    seq_mask:       torch.Size([4, 37]) -> 65 valid documents


  Step 1: Flatten valid documents
    input_ids_flat: torch.Size([65, 512])


[TransformerEncoder] Forward pass
  Input:
    input_ids:      torch.Size([65, 512])
    attention_mask: torch.Size([65, 512])
    real tokens:    2671 / 33280

  Output:
    tokens:       torch.Size([65, 512, 768])
    padding_mask: torch.Size([65, 512])
    positions to ignore: 30609


  Step 2: Transformer encoding
    tokens: torch.Size([65, 512, 768])
    padding_mask: torch.Size([65, 512])


  [ISAB] Forward pass
    Input X: torch.Size([65, 512, 768])

    Inducing points expanded: torch.Size([65, 32, 768])
    Step 1: Inducing points gather from tokens

    [MAB] Q: tor

In [11]:
# Inspect ranges
# Every statistic is restricted to the timesteps selected by `mask`. Reducing
# over the full padded tensors would mix in padded slots, whose targets are
# presumably filler values and whose predictions carry no meaning, and that
# can distort the reported min/max.
valid_valences = batch['valences'][mask]
valid_arousals = batch['arousals'][mask]
# Boolean indexing with the per-timestep mask keeps the last axis, where
# index 0 is valence and index 1 is arousal.
valid_predictions = predictions[mask]

print("Target Ranges:")
print(f"  Valence: [{valid_valences.min():.2f}, {valid_valences.max():.2f}]")
print(f"  Arousal: [{valid_arousals.min():.2f}, {valid_arousals.max():.2f}]")

print("\nPrediction Ranges:")
print(f"  Valence: [{valid_predictions[..., 0].min():.2f}, {valid_predictions[..., 0].max():.2f}]")
print(f"  Arousal: [{valid_predictions[..., 1].min():.2f}, {valid_predictions[..., 1].max():.2f}]")

# Convert the count to a plain int once instead of formatting 0-d tensors.
n_valid = int(mask.sum())
n_total = mask.numel()

print("\nMask Stats:")
print(f"  Valid timesteps: {n_valid} / {n_total} ({100 * n_valid / n_total:.1f}%)")
print(f"  Seq lengths: {batch['seq_lengths'].tolist()}")

Target Ranges:
  Valence: [-2.00, 2.00]
  Arousal: [0.00, 2.00]

Prediction Ranges:
  Valence: [-0.25, 0.00]
  Arousal: [0.99, 1.05]

Mask Stats:
  Valid timesteps: 65 / 148 (43.9%)
  Seq lengths: [7, 3, 37, 18]


In [12]:
# Inspect one sample: print predictions next to targets for the first few
# documents of a single sequence in the batch.
sample_idx = 0
seq_len = batch['seq_lengths'][sample_idx].item()

print(f"Sample {sample_idx} (user: {batch['user_ids'][sample_idx]})")
print(f"  Sequence length: {seq_len} documents")
print("\n  Predictions vs Targets (first 5 docs):")
print(f"  {'Doc':<5} {'Pred V':>8} {'True V':>8} {'Pred A':>8} {'True A':>8}")
print(f"  {'-'*41}")

# Slice the leading documents once and convert them to plain Python numbers;
# shorter sequences simply produce fewer rows.
n_shown = min(5, seq_len)
pred_rows = predictions[sample_idx, :n_shown].tolist()
true_rows = targets[sample_idx, :n_shown].tolist()

for doc_idx, (pred_row, true_row) in enumerate(zip(pred_rows, true_rows)):
    # Last-axis layout: index 0 is valence, index 1 is arousal.
    pred_v, pred_a = pred_row[0], pred_row[1]
    true_v, true_a = true_row[0], true_row[1]
    print(f"  {doc_idx:<5} {pred_v:>8.3f} {true_v:>8.3f} {pred_a:>8.3f} {true_a:>8.3f}")

Sample 0 (user: 145)
  Sequence length: 7 documents

  Predictions vs Targets (first 5 docs):
  Doc     Pred V   True V   Pred A   True A
  -----------------------------------------
  0       -0.112   -1.000    1.003    1.000
  1       -0.169    1.000    1.015    1.000
  2       -0.195   -1.000    1.025    1.000
  3       -0.196   -1.000    1.030    2.000
  4       -0.191    1.000    1.034    0.000
