In [None]:
import os

def setup_storage():
    """Detect the runtime platform and choose the base directory for project files.

    Detection order:
      1. Google Colab - the ``COLAB_GPU`` environment variable is present;
         Google Drive is mounted at ``/content/drive``.
      2. Kaggle - the ``/kaggle`` path exists.
      3. Otherwise - treated as a local run rooted at the current working
         directory.

    The detected platform and base directory are printed.

    Returns:
        tuple: ``(base_dir, env)`` where ``env`` is ``"colab"``, ``"kaggle"``
        or ``"local"``.
    """
    running_on_colab = "COLAB_GPU" in os.environ

    if running_on_colab:
        # Imported only on this branch so other platforms never need the Colab package.
        from google.colab import drive

        drive.mount("/content/drive")
        detected = ("/content/drive/MyDrive", "colab")
    elif os.path.exists("/kaggle"):
        detected = ("/kaggle/working", "kaggle")
    else:
        detected = (os.getcwd(), "local")

    base_dir, env = detected
    for message in (f"✔ Running on: {env}", f"✔ Base directory: {base_dir}"):
        print(message)

    return detected


# Resolve the storage location once; BASE_DIR is read by later cells to build project paths.
BASE_DIR, ENV = setup_storage()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✔ Running on: colab
✔ Base directory: /content/drive/MyDrive


In [None]:
# Project folder layout underneath the platform-specific base directory.
PROJECT_ROOT = BASE_DIR + "/SEMEVAL2026_EMOVA"
CKPT_DIR = PROJECT_ROOT + "/model_checkpoints"

# Create the checkpoint folder (including missing parents); nothing happens if it already exists.
os.makedirs(CKPT_DIR, exist_ok=True)

In [None]:
!git clone https://github.com/AndreaLolli2912/SemEval2026-EmoVA.git
%cd SemEval2026-EmoVA

fatal: destination path 'SemEval2026-EmoVA' already exists and is not an empty directory.
/content/SemEval2026-EmoVA


In [None]:
import torch
import random
from dataclasses import dataclass
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, WeightedRandomSampler, Subset, random_split
import numpy as np

# Make sure these imports match your actual file structure
from src.data.dataset import EmoVADataset2a
from src.data.collate import create_collate_fn_2a
from src.models.affect_model import AffectModel2a
from src.models.tokenizer_wrapper import TokenizerWrapper
from src.training.trainer_2a import train
from src.training import GradientClipper

In [None]:
def set_seed(seed=42):
    """Seed every random number generator in use and request deterministic kernels.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    pins cuDNN to deterministic mode with autotuning disabled, sets
    ``CUBLAS_WORKSPACE_CONFIG`` and enables PyTorch's deterministic-algorithm
    mode.

    Args:
        seed: Value passed to each seeding function. Defaults to 42.
    """
    import os

    # Same seed for every generator, applied in a fixed order.
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    # Full determinism (may slow down training).
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # For PyTorch 1.8+: workspace setting exported before deterministic mode is enabled.
    os.environ.update(CUBLAS_WORKSPACE_CONFIG=':4096:8')
    torch.use_deterministic_algorithms(True)

# Seed all RNGs up front; 128 matches the default of Config.seed defined further down.
set_seed(128)

In [None]:
from typing import Optional
@dataclass
class Config:
    """Hyperparameters and paths for the Subtask 2a training run.

    Every setting is an annotated dataclass field, so each one can be
    overridden per run (e.g. ``Config(dropout=0.3)``) and appears in
    ``repr`` / ``dataclasses.asdict``.
    """
    # Data
    # Kaggle reads the attached input dataset; every other platform looks in
    # the project folder under BASE_DIR (on Colab this resolves to the same
    # Drive path that was previously hard-coded).
    data_path: str = (
        "/kaggle/input/semeval-2026-emova/train_subtask2a.csv"
        if BASE_DIR == '/kaggle/working'
        else f"{BASE_DIR}/SEMEVAL2026_EMOVA/dataset/train_subtask2a.csv"
    )
    val_split: float = 0.15  # fraction of the dataset held out for validation

    # Model
    model_name: str = 'bert-base-uncased' # bert-base-uncased, distilbert/distilroberta-base, microsoft/deberta-v3-base, google/electra-base-discriminator
    encoder_bitfit: bool = True
    encoder_lora: bool = False
    max_text_length: int = 128
    max_posts_per_batch: int = 16
    isab_inducing_points: Optional[int] = 32
    n_heads: int = 8  # a count, so int rather than float
    pma_num_seeds: Optional[int] = 8  # a count, so int rather than float
    lstm_hidden_dim: int = 256
    lstm_num_layers: int = 1
    # Annotated so it is a real dataclass field; an un-annotated assignment is
    # only a class attribute and cannot be passed to Config(...).
    lstm_bidirectional: bool = True
    dropout: float = 0.2
    # Annotated for the same reason as lstm_bidirectional.
    constrain_output: bool = False # for normalization and sigmoid, tanh activation

    # Training
    epochs: int = 150
    batch_size: int = 4
    accumulation_steps: int = 4
    lr: float = 1e-4
    scheduler_factor: float = 0.5
    scheduler_patience: int = 5  # epochs, so int rather than float
    weight_decay: float = 0.01
    max_grad_norm: float = 1.0
    patience: int = 15  # presumably the early-stopping patience used by the trainer - confirm
    loss: str = 'mse'
    # 20% Valence / 80% Arousal importance
    # NOTE(review): presumably only used by a combined loss, not by loss='mse' - confirm in the trainer.
    valence_share: float = 0.2
    # 15% MSE (Stability) / 85% CCC (Ranking)
    mse_share: float = 0.15

    # System
    seed: int = 128
    num_workers: int = 0

config = Config()

# Device: use the GPU when PyTorch can see one, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

# Seed: go through set_seed so that config.seed drives the Python, NumPy and
# PyTorch generators together (torch.manual_seed alone would leave the other
# two on whatever seed was set earlier). set_seed returns None, so the cell no
# longer echoes a torch Generator repr as its output.
set_seed(config.seed)

Device: cuda


<torch._C.Generator at 0x7d544c2451b0>

In [None]:
# Tokenizer built from the configured encoder name and maximum text length.
tokenizer = TokenizerWrapper(
    config.model_name,
    config.max_text_length,
)

# Complete dataset, before it is divided into training and validation parts.
# NOTE(review): max_history and step are fixed here rather than read from
# Config; their exact meaning is defined by EmoVADataset2a.
dataset_kwargs = dict(
    path=config.data_path,
    dtype=torch.float32,
    constrain_output=config.constrain_output,
    max_history=5,
    step=2,
)
full_dataset = EmoVADataset2a(**dataset_kwargs)

# Calculate split sizes from the configured validation fraction
# (config.val_split) instead of a second hard-coded 0.15.
n_total = len(full_dataset)
n_val = int(n_total * config.val_split)
n_train = n_total - n_val  # remainder, so the two sizes always sum to n_total

# Standard Random Split (No filtering); the explicitly seeded generator makes
# the train/validation membership depend only on config.seed.
train_dataset, val_dataset = random_split(
    full_dataset,
    [n_train, n_val],
    generator=torch.Generator().manual_seed(config.seed)
)


# Batch-assembly function bound to the tokenizer.
collate_fn = create_collate_fn_2a(tokenizer)

# --- LOADERS ---
# Settings common to both loaders; only the dataset and the shuffling differ.
shared_loader_kwargs = {
    'batch_size': config.batch_size,
    'collate_fn': collate_fn,
    'num_workers': config.num_workers,
}

# Training samples are reshuffled (standard shuffle).
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_kwargs)

# Validation samples keep their order.
val_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_kwargs)

In [None]:
# Model: architecture settings come from `config`; the LoRA hyperparameters
# (rank, alpha, dropout) are fixed in this cell.
model_kwargs = dict(
    model_path=config.model_name,
    encoder_bitfit=config.encoder_bitfit,
    encoder_use_lora=config.encoder_lora,
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    n_heads=config.n_heads,
    isab_inducing_points=config.isab_inducing_points,
    pma_num_seeds=config.pma_num_seeds,
    lstm_hidden_dim=config.lstm_hidden_dim,
    lstm_num_layers=config.lstm_num_layers,
    lstm_bidirectional=config.lstm_bidirectional,
    dropout=config.dropout,
    constrain_output=config.constrain_output,
)
model = AffectModel2a(**model_kwargs)

# Gradient checkpointing is enabled on the encoder backbone whenever the
# encoder is being tuned (BitFit or LoRA).
encoder_is_tuned = config.encoder_bitfit or config.encoder_lora
if encoder_is_tuned:
    model.encoder.backbone.gradient_checkpointing_enable()

model = model.to(device)

# Walk the parameters once, recording each tensor's size and whether it is trained.
param_counts = [(p.numel(), p.requires_grad) for p in model.parameters()]
total = sum(count for count, _ in param_counts)
trainable = sum(count for count, needs_grad in param_counts if needs_grad)
print(f"Parameters: {total:,} total, {trainable:,} trainable")

Parameters: 144,024,839 total, 34,645,511 trainable


In [None]:
# Optimizer with separate LRs: the trainable encoder parameters get a small
# fixed rate, every other component uses config.lr.
encoder_params = [p for _, p in model.encoder.named_parameters() if p.requires_grad]

# (label, parameters, learning rate) in the order the groups are handed to the optimizer.
candidate_groups = [('encoder_bias', encoder_params, 5e-6)]
if model.isab:
    # Optional component: skipped when the model has no ISAB block.
    candidate_groups.append(('isab', list(model.isab.parameters()), config.lr))
if model.pma:
    # Optional component: skipped when the model has no PMA block.
    candidate_groups.append(('pma', list(model.pma.parameters()), config.lr))
candidate_groups.append(('lstm', list(model.lstm.parameters()), config.lr))
candidate_groups.append(('head', list(model.head.parameters()), config.lr))

# Groups without any parameters are dropped.
param_groups = [
    {'params': group_params, 'lr': group_lr, 'name': group_name}
    for group_name, group_params, group_lr in candidate_groups
    if len(group_params) > 0
]

# AdamW over the per-component groups; weight decay is shared by all groups.
optimizer = AdamW(param_groups, weight_decay=config.weight_decay)

# The LR scheduler uses its own patience (config.scheduler_patience).
# Passing config.patience here gave the scheduler the same 15-epoch patience
# as early stopping, so training stopped before the learning rate could ever
# be reduced.
# mode='max': presumably stepped on the validation score in the trainer - confirm.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=config.scheduler_factor,
    patience=config.scheduler_patience,
)

# Gradient clipping threshold taken from the config.
clipper = GradientClipper(max_norm=config.max_grad_norm)

# Summarise each optimizer group: label, number of parameters, learning rate.
for group in optimizer.param_groups:
    group_label = group.get('name', 'unnamed')
    group_size = sum(tensor.numel() for tensor in group['params'])
    print(f"{group_label}: {group_size:,} params, lr={group['lr']:.1e}")

encoder_bias: 102,912 params, lr=5.0e-06
isab: 14,200,320 params, lr=1.0e-04
pma: 7,094,016 params, lr=1.0e-04
lstm: 13,115,392 params, lr=1.0e-04
head: 132,871 params, lr=1.0e-04


In [None]:
# Train
# Checkpoint folder for this task, built from the shared PROJECT_ROOT instead
# of re-deriving the project folder from BASE_DIR (the resulting path is the same).
# NOTE(review): this is a different folder from CKPT_DIR ("model_checkpoints")
# created earlier, which is not used here.
save_dir = f'{PROJECT_ROOT}/model_checkpoints_task2'

# Run the training loop; returns the training history and the run's output directory.
history, run_dir = train(
    model, train_loader, val_loader,
    config.loss, optimizer, scheduler, device, config,
    clipper=clipper, save_dir=save_dir
)

  scaler = torch.cuda.amp.GradScaler(enabled=(device == 'cuda'))


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 1/150
------------------------------
  Train Loss:   1.4889
  Val Loss:     1.2862
  Val Score:    0.4510 (Avg Pearson r)
  > Valence:    r=0.4450 | MAE=0.8156
  > Arousal:    r=0.4570 | MAE=0.7589
  ✓ New best model found (Score: 0.4510)


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 2/150
------------------------------
  Train Loss:   1.3912
  Val Loss:     1.2062
  Val Score:    0.3009 (Avg Pearson r)
  > Valence:    r=0.4054 | MAE=0.8053
  > Arousal:    r=0.1963 | MAE=0.7893


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 3/150
------------------------------
  Train Loss:   1.3070
  Val Loss:     1.3099
  Val Score:    0.2955 (Avg Pearson r)
  > Valence:    r=0.3623 | MAE=0.9206
  > Arousal:    r=0.2288 | MAE=0.8126


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 4/150
------------------------------
  Train Loss:   1.2643
  Val Loss:     1.1480
  Val Score:    0.4458 (Avg Pearson r)
  > Valence:    r=0.3649 | MAE=0.8145
  > Arousal:    r=0.5268 | MAE=0.7518


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 5/150
------------------------------
  Train Loss:   1.2124
  Val Loss:     1.1500
  Val Score:    0.5222 (Avg Pearson r)
  > Valence:    r=0.3780 | MAE=0.8669
  > Arousal:    r=0.6664 | MAE=0.7325
  ✓ New best model found (Score: 0.5222)


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 6/150
------------------------------
  Train Loss:   1.1545
  Val Loss:     1.1340
  Val Score:    0.5595 (Avg Pearson r)
  > Valence:    r=0.4866 | MAE=0.8431
  > Arousal:    r=0.6325 | MAE=0.7208
  ✓ New best model found (Score: 0.5595)


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 7/150
------------------------------
  Train Loss:   1.1394
  Val Loss:     1.1309
  Val Score:    0.5573 (Avg Pearson r)
  > Valence:    r=0.4797 | MAE=0.8268
  > Arousal:    r=0.6349 | MAE=0.7156


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 8/150
------------------------------
  Train Loss:   1.1217
  Val Loss:     1.2052
  Val Score:    0.4779 (Avg Pearson r)
  > Valence:    r=0.3035 | MAE=0.9234
  > Arousal:    r=0.6522 | MAE=0.7349


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 9/150
------------------------------
  Train Loss:   1.0916
  Val Loss:     1.1197
  Val Score:    0.5522 (Avg Pearson r)
  > Valence:    r=0.4898 | MAE=0.8353
  > Arousal:    r=0.6147 | MAE=0.7325


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 10/150
------------------------------
  Train Loss:   1.0886
  Val Loss:     1.1074
  Val Score:    0.5654 (Avg Pearson r)
  > Valence:    r=0.4199 | MAE=0.8313
  > Arousal:    r=0.7109 | MAE=0.7096
  ✓ New best model found (Score: 0.5654)


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 11/150
------------------------------
  Train Loss:   1.0306
  Val Loss:     1.1921
  Val Score:    0.4894 (Avg Pearson r)
  > Valence:    r=0.3299 | MAE=0.8988
  > Arousal:    r=0.6490 | MAE=0.7217


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 12/150
------------------------------
  Train Loss:   1.0179
  Val Loss:     1.1204
  Val Score:    0.5448 (Avg Pearson r)
  > Valence:    r=0.3545 | MAE=0.8661
  > Arousal:    r=0.7351 | MAE=0.7035


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 13/150
------------------------------
  Train Loss:   0.9489
  Val Loss:     1.1494
  Val Score:    0.4899 (Avg Pearson r)
  > Valence:    r=0.3309 | MAE=0.8538
  > Arousal:    r=0.6489 | MAE=0.7210


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 14/150
------------------------------
  Train Loss:   0.9410
  Val Loss:     1.2103
  Val Score:    0.5343 (Avg Pearson r)
  > Valence:    r=0.3305 | MAE=0.9257
  > Arousal:    r=0.7381 | MAE=0.7093


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 15/150
------------------------------
  Train Loss:   0.9104
  Val Loss:     1.1819
  Val Score:    0.5082 (Avg Pearson r)
  > Valence:    r=0.3686 | MAE=0.8934
  > Arousal:    r=0.6479 | MAE=0.6869


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 16/150
------------------------------
  Train Loss:   0.9096
  Val Loss:     1.2069
  Val Score:    0.5045 (Avg Pearson r)
  > Valence:    r=0.3493 | MAE=0.9452
  > Arousal:    r=0.6597 | MAE=0.6818


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 17/150
------------------------------
  Train Loss:   0.8510
  Val Loss:     1.3318
  Val Score:    0.5325 (Avg Pearson r)
  > Valence:    r=0.3530 | MAE=0.9930
  > Arousal:    r=0.7120 | MAE=0.6730


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 18/150
------------------------------
  Train Loss:   0.8209
  Val Loss:     1.2496
  Val Score:    0.5105 (Avg Pearson r)
  > Valence:    r=0.4213 | MAE=0.9383
  > Arousal:    r=0.5997 | MAE=0.7065


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 19/150
------------------------------
  Train Loss:   0.8219
  Val Loss:     1.2824
  Val Score:    0.4863 (Avg Pearson r)
  > Valence:    r=0.3473 | MAE=0.9328
  > Arousal:    r=0.6252 | MAE=0.7364


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 20/150
------------------------------
  Train Loss:   0.7940
  Val Loss:     1.4539
  Val Score:    0.5327 (Avg Pearson r)
  > Valence:    r=0.3692 | MAE=1.0749
  > Arousal:    r=0.6963 | MAE=0.7177


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 21/150
------------------------------
  Train Loss:   0.7875
  Val Loss:     1.3816
  Val Score:    0.5218 (Avg Pearson r)
  > Valence:    r=0.3636 | MAE=1.0800
  > Arousal:    r=0.6801 | MAE=0.7130


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 22/150
------------------------------
  Train Loss:   0.7683
  Val Loss:     1.3522
  Val Score:    0.4765 (Avg Pearson r)
  > Valence:    r=0.3877 | MAE=0.9790
  > Arousal:    r=0.5653 | MAE=0.7160


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 23/150
------------------------------
  Train Loss:   0.7507
  Val Loss:     1.3439
  Val Score:    0.5350 (Avg Pearson r)
  > Valence:    r=0.4392 | MAE=1.0060
  > Arousal:    r=0.6307 | MAE=0.7238


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 24/150
------------------------------
  Train Loss:   0.6950
  Val Loss:     1.3799
  Val Score:    0.5409 (Avg Pearson r)
  > Valence:    r=0.3939 | MAE=1.0971
  > Arousal:    r=0.6879 | MAE=0.6909


Training Task 2a:   0%|          | 0/287 [00:00<?, ?it/s]

Evaluating Task 2a:   0%|          | 0/51 [00:00<?, ?it/s]




Epoch 25/150
------------------------------
  Train Loss:   0.6795
  Val Loss:     1.6808
  Val Score:    0.4804 (Avg Pearson r)
  > Valence:    r=0.3719 | MAE=1.1832
  > Arousal:    r=0.5890 | MAE=0.7509

Early stopping at epoch 25

Training complete!
  Best Val Score: 0.5654 (Epoch 10)
  Saved to: /content/drive/MyDrive/SEMEVAL2026_EMOVA/model_checkpoints_task2/20260131_175415_score0.5654
