In [None]:
import os

def setup_storage():
    """Detect the runtime (Colab, Kaggle or local) and choose a base directory.

    On Colab, Google Drive is mounted at ``/content/drive`` as a side effect.
    The detected environment and directory are printed.

    Returns:
        tuple: ``(base_dir, env)`` where ``env`` is ``"colab"``, ``"kaggle"``
        or ``"local"``.
    """
    if "COLAB_GPU" in os.environ:
        # Imported only on this branch, so the other environments never need
        # the google.colab package.
        from google.colab import drive
        drive.mount("/content/drive")
        location = ("/content/drive/MyDrive", "colab")
    elif os.path.exists("/kaggle"):
        location = ("/kaggle/working", "kaggle")
    else:
        # Neither hosted platform was detected: use the current directory.
        location = (os.getcwd(), "local")

    base_dir, env = location
    print(f"✔ Running on: {env}")
    print(f"✔ Base directory: {base_dir}")

    return location


# Resolve the storage location once; BASE_DIR is used by later cells to build
# the project, dataset and checkpoint paths.
BASE_DIR, ENV = setup_storage()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✔ Running on: colab
✔ Base directory: /content/drive/MyDrive


In [None]:
# Project layout under the detected base directory.
PROJECT_ROOT = "{}/SEMEVAL2026_EMOVA".format(BASE_DIR)
CKPT_DIR = "{}/model_checkpoints".format(PROJECT_ROOT)

# Create the checkpoint folder (with any missing parents); a pre-existing
# folder is not an error.
os.makedirs(CKPT_DIR, exist_ok=True)

In [None]:
!git clone https://github.com/AndreaLolli2912/SemEval2026-EmoVA.git
%cd SemEval2026-EmoVA

Cloning into 'SemEval2026-EmoVA'...
remote: Enumerating objects: 948, done.[K
remote: Counting objects: 100% (134/134), done.[K
remote: Compressing objects: 100% (129/129), done.[K
remote: Total 948 (delta 96), reused 5 (delta 5), pack-reused 814 (from 2)[K
Receiving objects: 100% (948/948), 1.51 MiB | 26.71 MiB/s, done.
Resolving deltas: 100% (595/595), done.
/content/SemEval2026-EmoVA/SemEval2026-EmoVA


In [None]:
import torch
import random
from dataclasses import dataclass
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, WeightedRandomSampler, Subset, random_split
import numpy as np


# Make sure these imports match your actual file structure
from src.data.dataset import EmoVADataset
from src.data.collate import create_collate_fn
from src.models.affect_model import AffectModel
from src.models.tokenizer_wrapper import TokenizerWrapper
from src.training import train, GradientClipper

In [None]:
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and enforce deterministic mode.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    import os

    # Seed every generator in turn: stdlib, NumPy, torch CPU, torch CUDA.
    for seed_fn in (random.seed, np.random.seed,
                    torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # cuDNN: deterministic kernels and no autotuning (may slow down training).
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # PyTorch 1.8+: export the cuBLAS workspace setting, then turn on
    # deterministic algorithms globally.
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
    torch.use_deterministic_algorithms(True)

# Seed all RNGs before any data splitting or model construction.
# NOTE(review): 128 duplicates the default of Config.seed defined in a later
# cell — keep the two values in sync.
set_seed(128)

In [None]:
from typing import Optional
@dataclass
class Config:
    """Hyper-parameters and paths for one training run.

    Every attribute is an annotated dataclass field, so each of them can be
    overridden through the constructor (e.g. ``Config(lstm_bidirectional=True)``)
    and appears in ``repr`` / ``dataclasses.asdict``.
    """
    # Data
    # Kaggle reads the attached input dataset; every other environment reads
    # the CSV from the project folder under BASE_DIR (on Colab this resolves
    # to /content/drive/MyDrive/SEMEVAL2026_EMOVA/dataset/train_subtask1.csv).
    data_path: str = (
        "/kaggle/input/semeval-2026-emova/train_subtask1.csv"
        if BASE_DIR == '/kaggle/working'
        else f"{BASE_DIR}/SEMEVAL2026_EMOVA/dataset/train_subtask1.csv"
    )
    val_split: float = 0.15

    # Model
    model_name: str = 'bert-base-uncased' # bert-base-uncased, distilbert/distilroberta-base, microsoft/deberta-v3-base, google/electra-base-discriminator
    encoder_bitfit: bool = False
    encoder_lora: bool = True
    max_text_length: int = 128
    isab_inducing_points: Optional[int] = 32
    n_heads: int = 8                      # a count, hence int rather than float
    pma_num_seeds: Optional[int] = 8      # a count, hence int rather than float
    lstm_hidden_dim: int = 256
    lstm_num_layers: int = 1
    dropout: float = 0.2
    # Annotated so that these two are real dataclass fields; without an
    # annotation they are plain class attributes the constructor cannot set.
    lstm_bidirectional: bool = False
    constrain_output: bool = False # for normalization and sigmoid, tanh activation

    # Training
    epochs: int = 150
    batch_size: int = 1
    accumulation_steps: int = 16
    lr: float = 1e-4
    scheduler_factor: float = 0.5
    scheduler_patience: int = 5           # epochs counted by the LR scheduler
    weight_decay: float = 0.01
    max_grad_norm: float = 1.0
    patience: int = 15                    # presumably early-stopping patience used by train() — confirm
    loss: str = 'combined_loss' # "masked_mse_loss",  "ccc_loss", "combined_loss"
    # 20% Valence / 80% Arousal importance
    valence_share: float = 0.2
    # 15% MSE (Stability) / 85% CCC (Ranking)
    mse_share: float = 0.15

    # System
    num_workers: int = 0
    seed: int = 128

# Instantiate the run configuration with its default values.
config = Config()

# Device: use the GPU whenever CUDA is available, otherwise fall back to CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Device: {device}")

Device: cuda


In [None]:
# Tokenizer for the configured encoder and the full labelled dataset.
tokenizer = TokenizerWrapper(config.model_name, config.max_text_length)
full_dataset = EmoVADataset(
    path=config.data_path,
    dtype=torch.float32,
    constrain_output=config.constrain_output,
)

# Split sizes follow config.val_split, so the validation fraction is tuned in
# exactly one place; the training set takes whatever remains after rounding.
n_total = len(full_dataset)
n_val = int(n_total * config.val_split)
n_train = n_total - n_val

# Standard random split (no filtering), seeded so the partition is reproducible.
train_dataset, val_dataset = random_split(
    full_dataset,
    [n_train, n_val],
    generator=torch.Generator().manual_seed(config.seed)
)

collate_fn = create_collate_fn(tokenizer)

# --- LOADERS ---
# Options shared by the training and validation loaders.
_loader_kwargs = dict(
    batch_size=config.batch_size,
    collate_fn=collate_fn,
    num_workers=config.num_workers,
)

# Training samples are shuffled; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)

In [None]:
# Model: architecture options come from `config`; the LoRA hyper-parameters
# (rank 16, alpha 32, dropout 0.1) are fixed in this cell.
model = AffectModel(
    # encoder
    model_path=config.model_name,
    encoder_bitfit=config.encoder_bitfit,
    encoder_use_lora=config.encoder_lora,
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    # attention / pooling
    n_heads=config.n_heads,
    isab_inducing_points=config.isab_inducing_points,
    pma_num_seeds=config.pma_num_seeds,
    # recurrent part
    lstm_hidden_dim=config.lstm_hidden_dim,
    lstm_num_layers=config.lstm_num_layers,
    lstm_bidirectional=config.lstm_bidirectional,
    # regularisation / output
    dropout=config.dropout,
    constrain_output=config.constrain_output,
)

# Enable gradient checkpointing on the backbone whenever the encoder is being
# fine-tuned (BitFit or LoRA).
if any((config.encoder_bitfit, config.encoder_lora)):
    model.encoder.backbone.gradient_checkpointing_enable()

model = model.to(device)

# Parameter census: all parameters vs. those that require gradients.
total = sum(map(torch.Tensor.numel, model.parameters()))
trainable = sum(
    p.numel() for p in filter(lambda p: p.requires_grad, model.parameters())
)
print(f"Parameters: {total:,} total, {trainable:,} trainable")

Parameters: 140,095,234 total, 30,612,994 trainable


In [None]:
# Optimizer with separate LRs: a small fixed LR for the trainable encoder
# parameters, config.lr for every module on top of the encoder.
param_groups = [
    {'params': [p for p in model.encoder.parameters() if p.requires_grad],
     'lr': 5e-6, 'name': 'encoder_bias'},
    # isab / pma are optional; compare against None explicitly instead of
    # relying on the truthiness of a module object.
    {'params': list(model.isab.parameters()), 'lr': config.lr, 'name': 'isab'} if model.isab is not None else None,
    {'params': list(model.pma.parameters()), 'lr': config.lr, 'name': 'pma'} if model.pma is not None else None,
    {'params': list(model.lstm.parameters()), 'lr': config.lr, 'name': 'lstm'},
    {'params': list(model.head.parameters()), 'lr': config.lr, 'name': 'head'},
]

# Drop disabled modules and groups that ended up without parameters.
param_groups = [
    pg for pg in param_groups
    if pg is not None and len(pg['params']) > 0
]

optimizer = AdamW(param_groups, weight_decay=config.weight_decay)
# The scheduler uses its own patience setting. config.patience is a separate,
# longer value (presumably early stopping inside train() — confirm); passing it
# here would postpone every LR reduction until that many stagnant epochs.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=config.scheduler_factor,
    patience=config.scheduler_patience,
)
clipper = GradientClipper(max_norm=config.max_grad_norm)

# Summary of what each group optimises and at which learning rate.
for pg in optimizer.param_groups:
    n_params = sum(p.numel() for p in pg['params'])
    print(f"{pg.get('name', 'unnamed')}: {n_params:,} params, lr={pg['lr']:.1e}")

encoder_bias: 2,762,496 params, lr=5.0e-06
isab: 14,200,320 params, lr=1.0e-04
pma: 7,094,016 params, lr=1.0e-04
lstm: 6,555,648 params, lr=1.0e-04
head: 514 params, lr=1.0e-04


In [None]:
# Train
# Reuse the checkpoint directory defined (and created) with the project paths
# instead of rebuilding the same string by hand, so the location is defined in
# a single place.
save_dir = CKPT_DIR

# train() returns the training history and the directory of this run.
history, run_dir = train(
    model, train_loader, val_loader,
    config.loss, optimizer, scheduler, device, config,
    clipper=clipper, save_dir=save_dir
)

Training:   0%|          | 0/117 [00:00<?, ?it/s]



KeyboardInterrupt: 