# In [1]:
# A single, complete script: Multimodal MELD + score-function baselines
# Trains per-view models once per simulation, then loops
# over score functions (hinge/margin/cross_entropy/raps) to match your first
# procedure and save comparable tables.
#
# NOTE: MVCP REMOVED (functions, config fields, calibration/eval/logging).

import os
import gc
import warnings
from dataclasses import dataclass
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torchvision import transforms, models as tv_models
from transformers import (
    RobertaTokenizer,
    RobertaModel,
    get_linear_schedule_with_warmup,
)

from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from scipy.stats import chi2

warnings.filterwarnings("ignore")

# ==================== Models ====================
class ImprovedAudioModel(nn.Module):
    """CNN classifier over single-channel mel-spectrogram inputs.

    Four double-convolution stages (BatchNorm + ReLU after every conv) end in
    global average pooling, producing a 512-d vector that is fed to a
    three-layer MLP head with BatchNorm and dropout. A residual connection
    around the first two head layers is applied only when ``hidden_dim``
    equals the pooled feature width.
    """

    @staticmethod
    def _double_conv(in_ch, out_ch, pool, drop_p):
        """Build two 3x3 conv/BN/ReLU layers followed by ``pool`` and channel dropout."""
        return nn.Sequential(
            nn.Conv2d(in_ch, out_ch, 3, padding=1), nn.BatchNorm2d(out_ch), nn.ReLU(True),
            nn.Conv2d(out_ch, out_ch, 3, padding=1), nn.BatchNorm2d(out_ch), nn.ReLU(True),
            pool, nn.Dropout2d(drop_p),
        )

    def __init__(self, num_classes=7, hidden_dim=512, dropout=0.4):
        super().__init__()
        # Convolutional trunk: three stages halve the spatial size, the last
        # one collapses it to 1x1 by global average pooling.
        self.conv1 = self._double_conv(1, 64, nn.MaxPool2d(2, 2), 0.2)
        self.conv2 = self._double_conv(64, 128, nn.MaxPool2d(2, 2), 0.2)
        self.conv3 = self._double_conv(128, 256, nn.MaxPool2d(2, 2), 0.2)
        self.conv4 = self._double_conv(256, 512, nn.AdaptiveAvgPool2d((1, 1)), 0.3)

        # MLP head.
        half_dim = hidden_dim // 2
        self.fc1 = nn.Linear(512, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, half_dim)
        self.bn3 = nn.BatchNorm1d(half_dim)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(half_dim, num_classes)

    def forward(self, audio_mel, **kwargs):
        """Return class logits for ``audio_mel``; extra keyword inputs are ignored."""
        feats = audio_mel
        for stage in (self.conv1, self.conv2, self.conv3, self.conv4):
            feats = stage(feats)
        pooled = feats.view(feats.size(0), -1)

        hidden = self.dropout(F.relu(self.bn1(self.fc1(pooled))))
        hidden = self.dropout(F.relu(self.bn2(self.fc2(hidden))))
        # The skip connection is only possible when the widths match.
        if pooled.size(1) == hidden.size(1):
            hidden = hidden + pooled
        hidden = self.dropout(F.relu(self.bn3(self.fc3(hidden))))
        return self.classifier(hidden)


class ImprovedVideoModel(nn.Module):
    """Face-image classifier on top of an ImageNet-pretrained ResNet34.

    The stem and ``layer1`` are frozen and always executed in eval mode
    without gradients; ``layer2``-``layer4`` remain trainable. The pooled
    512-d feature passes through a three-layer MLP head (BatchNorm + dropout)
    with a residual connection when ``hidden_dim`` equals the feature width.
    """

    def __init__(self, num_classes=7, hidden_dim=512, dropout=0.4):
        super().__init__()
        backbone = tv_models.resnet34(weights=tv_models.ResNet34_Weights.IMAGENET1K_V1)

        # Freeze the whole backbone, then re-enable the three deepest stages.
        for param in backbone.parameters():
            param.requires_grad = False
        for stage in (backbone.layer2, backbone.layer3, backbone.layer4):
            for param in stage.parameters():
                param.requires_grad = True

        self.frozen_layers = nn.Sequential(
            backbone.conv1,
            backbone.bn1,
            backbone.relu,
            backbone.maxpool,
            backbone.layer1,
        )
        self.trainable_layers = nn.Sequential(
            backbone.layer2,
            backbone.layer3,
            backbone.layer4,
            backbone.avgpool,
        )

        half_dim = hidden_dim // 2
        self.fc1 = nn.Linear(512, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, half_dim)
        self.bn3 = nn.BatchNorm1d(half_dim)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(half_dim, num_classes)

    def forward(self, face, **kwargs):
        """Return class logits for ``face``; extra keyword inputs are ignored."""
        # Keep the frozen part in eval mode even while the model is training,
        # so its BatchNorm statistics are not updated.
        self.frozen_layers.eval()
        with torch.no_grad():
            stem_out = self.frozen_layers(face)
        pooled = torch.flatten(self.trainable_layers(stem_out), 1)

        hidden = self.dropout(F.relu(self.bn1(self.fc1(pooled))))
        hidden = self.dropout(F.relu(self.bn2(self.fc2(hidden))))
        if pooled.size(1) == hidden.size(1):
            hidden = hidden + pooled
        hidden = self.dropout(F.relu(self.bn3(self.fc3(hidden))))
        return self.classifier(hidden)


class ImprovedTextModel(nn.Module):
    """Utterance classifier: RoBERTa-base encoder, self-attention pooling, MLP head.

    Only the last eight encoder layers and the pooler have gradients enabled.
    Token states pass through a multi-head self-attention layer, are averaged
    over non-padding positions, and go through a three-layer LayerNorm/GELU
    head with a skip connection from the first to the third layer.
    """

    def __init__(self, num_classes=7, hidden_dim=768, dropout=0.3):
        super().__init__()
        self.text_encoder = RobertaModel.from_pretrained('roberta-base')

        # Freeze everything, then unfreeze the top of the encoder and the pooler.
        for param in self.text_encoder.parameters():
            param.requires_grad = False
        for module in (self.text_encoder.encoder.layer[-8:], self.text_encoder.pooler):
            for param in module.parameters():
                param.requires_grad = True

        text_dim = 768
        half_dim = hidden_dim // 2
        self.multihead_attn = nn.MultiheadAttention(
            embed_dim=text_dim, num_heads=4, dropout=dropout, batch_first=True
        )
        self.fc1 = nn.Linear(text_dim, hidden_dim)
        self.ln1 = nn.LayerNorm(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.ln2 = nn.LayerNorm(hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, half_dim)
        self.ln3 = nn.LayerNorm(half_dim)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(half_dim, num_classes)

    def forward(self, input_ids, attention_mask, **kwargs):
        """Return class logits for a tokenized batch; extra keyword inputs are ignored."""
        encoded = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        tokens = encoded.last_hidden_state

        # Self-attention over the sequence; padding positions are excluded as keys.
        padding = attention_mask == 0
        attended, _ = self.multihead_attn(tokens, tokens, tokens, key_padding_mask=padding)

        # Masked mean pooling over the real (non-padding) tokens.
        keep = attention_mask.unsqueeze(-1).expand(attended.size()).float()
        token_sum = torch.sum(attended * keep, 1)
        token_count = torch.clamp(keep.sum(1), min=1e-9)
        pooled = token_sum / token_count

        first = self.dropout(F.gelu(self.ln1(self.fc1(pooled))))
        second = self.dropout(F.gelu(self.ln2(self.fc2(first))))
        third = self.dropout(F.gelu(self.ln3(self.fc3(second + first))))
        return self.classifier(third)

# ==================== Dataset ====================
class PreprocessedMELDDataset(Dataset):
    """Dataset over preprocessed MELD samples stored one per ``.pt`` file.

    Every file is loaded with ``torch.load`` and must provide the keys read in
    ``__getitem__``: ``'utterance'``, ``'audio_mel'``, ``'face'`` and
    ``'emotion'``.

    Args:
        data_dir: Directory scanned for ``*.pt`` files when ``files`` is None.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')``; its result
            must provide ``'input_ids'`` and ``'attention_mask'``.
        max_length: Token length passed to the tokenizer.
        files: Optional explicit list of sample paths (overrides ``data_dir``).
        augment: Enable the random audio masking / face jitter augmentations.
        max_audio_len: Number of mel frames each sample is padded or cropped to
            along the last axis.
    """

    def __init__(self, data_dir, tokenizer, max_length=128, files=None, augment=False,
                 max_audio_len=300):
        self.data_dir = data_dir
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augment = augment
        self.max_audio_len = max_audio_len
        self.files = files if files is not None else sorted(
            [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.pt')]
        )
        self.emotion_map = {
            'neutral': 0, 'joy': 1, 'surprise': 2, 'anger': 3,
            'sadness': 4, 'disgust': 5, 'fear': 6
        }

    def __len__(self):
        return len(self.files)

    @staticmethod
    def _load_sample(path):
        """Load one sample dict from ``path`` onto the CPU."""
        # The samples hold non-tensor payloads (strings, numpy arrays), so they
        # need full unpickling; PyTorch >= 2.6 defaults to weights_only=True and
        # would refuse them. Full unpickling executes arbitrary code, so only
        # load files that were produced by a trusted preprocessing run.
        try:
            return torch.load(path, map_location='cpu', weights_only=False)
        except TypeError:
            # Older PyTorch releases do not know the ``weights_only`` keyword.
            return torch.load(path, map_location='cpu')

    def _prepare_audio(self, audio_mel):
        """Pad, log-compress, standardize, optionally mask and crop the mel tensor."""
        if audio_mel.dim() == 2:
            audio_mel = audio_mel.unsqueeze(0)
        pad_len = self.max_audio_len - audio_mel.shape[-1]
        if pad_len > 0:
            # F.pad documents ``value`` as a Python float, so convert explicitly.
            audio_mel = F.pad(audio_mel, (0, pad_len), value=float(audio_mel.min()))
        audio_mel = torch.log1p(audio_mel.clamp(min=0))
        audio_mel = (audio_mel - audio_mel.mean()) / (audio_mel.std() + 1e-8)
        audio_mel = torch.clamp(audio_mel, -3, 3)

        # Light augmentation: mask a 10% band along the last axis and/or along
        # the second-to-last axis with the current minimum value.
        if self.augment and np.random.rand() < 0.3:
            if np.random.rand() < 0.5:
                t_mask_width = int(audio_mel.shape[-1] * 0.1)
                t_start = np.random.randint(0, max(1, audio_mel.shape[-1] - t_mask_width))
                audio_mel[..., t_start:t_start + t_mask_width] = audio_mel.min()
            if np.random.rand() < 0.5 and audio_mel.shape[-2] > 1:
                f_mask_width = int(audio_mel.shape[-2] * 0.1)
                f_start = np.random.randint(0, max(1, audio_mel.shape[-2] - f_mask_width))
                audio_mel[..., f_start:f_start + f_mask_width, :] = audio_mel.min()

        # Cropping happens last, exactly as before, so the normalization
        # statistics are computed over the uncropped spectrogram.
        return audio_mel[:, :, :self.max_audio_len]

    def _prepare_face(self, face_array):
        """Convert the face array to a normalized channel-first float tensor."""
        # presumably an HWC image with values in 0..255 — TODO confirm against
        # the preprocessing script.
        face = torch.from_numpy(face_array).float() / 255.0
        face = face.permute(2, 0, 1)
        if self.augment and np.random.rand() < 0.3:
            if np.random.rand() < 0.2:
                face = torch.flip(face, [-1])
            if np.random.rand() < 0.5:
                face = torch.clamp(face * (0.8 + np.random.rand() * 0.4), 0, 1)
            if np.random.rand() < 0.5:
                mean = face.mean(dim=[1, 2], keepdim=True)
                face = torch.clamp((face - mean) * (0.9 + np.random.rand() * 0.2) + mean, 0, 1)
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        return normalize(face)

    def _encode_label(self, emotion):
        """Map an emotion name (case-insensitive) or numeric id to an int label."""
        if isinstance(emotion, str):
            # Unknown names fall back to class 0 ('neutral'), as before.
            return self.emotion_map.get(emotion.lower(), 0)
        return int(emotion)

    def __getitem__(self, idx):
        """Return the model inputs and label for sample ``idx`` as a dict."""
        sample = self._load_sample(self.files[idx])

        encoded = self.tokenizer(
            sample['utterance'], padding='max_length', truncation=True,
            max_length=self.max_length, return_tensors='pt'
        )
        # Audio is processed before the face so the sequence of numpy random
        # draws used for augmentation stays unchanged.
        audio_mel = self._prepare_audio(sample['audio_mel'])
        face = self._prepare_face(sample['face'])
        label = self._encode_label(sample['emotion'])

        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'audio_mel': audio_mel,
            'face': face,
            'label': label
        }

def collate_fn(batch, max_audio_length=300):
    """Stack a list of per-sample dicts into one batch dict.

    Tensor fields are stacked along a new leading dimension and the labels
    are gathered into a single tensor. ``max_audio_length`` is accepted for
    interface compatibility and is not used here.
    """
    tensor_keys = ('input_ids', 'attention_mask', 'audio_mel', 'face')
    collated = {
        key: torch.stack([item[key] for item in batch])
        for key in tensor_keys
    }
    collated['label'] = torch.tensor([item['label'] for item in batch])
    return collated

# ==================== Training/Eval ====================
def train_epoch(model, dataloader, optimizer, criterion, device, scheduler=None, grad_clip=5.0):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask``,
            ``audio_mel`` and ``face`` keyword arguments; returns logits.
        dataloader: Iterable of batch dicts with those keys plus ``'label'``.
        optimizer: Optimizer stepped once per processed batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to.
        scheduler: Optional LR scheduler stepped after every optimizer step.
        grad_clip: Max gradient norm passed to ``clip_grad_norm_``.

    Returns:
        ``(avg_loss, accuracy, weighted_f1)`` over the batches that were
        actually used for an update.
    """
    model.train()
    total_loss = 0.0
    num_steps = 0  # batches that contributed to total_loss
    all_preds, all_labels = [], []
    pbar = tqdm(dataloader, desc='Training', leave=False)
    for batch_idx, batch in enumerate(pbar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        audio_mel = batch['audio_mel'].to(device)
        face = batch['face'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad(set_to_none=True)
        logits = model(input_ids=input_ids, attention_mask=attention_mask,
                       audio_mel=audio_mel, face=face)
        loss = criterion(logits, labels)
        # Skip the update for NaN *and* infinite losses: back-propagating
        # either would poison the weights with non-finite gradients.
        if not torch.isfinite(loss):
            continue
        loss.backward()
        total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip)
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        total_loss += loss.item()
        num_steps += 1
        preds = logits.argmax(dim=1).detach().cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.detach().cpu().numpy())

        if batch_idx % 50 == 0:
            pbar.set_postfix({'loss': f'{loss.item():.3f}', 'grad_norm': f'{float(total_norm):.2f}'})

        del input_ids, attention_mask, audio_mel, face, labels, logits
        torch.cuda.empty_cache()

    # Average over the batches that were actually accumulated; dividing by
    # len(dataloader) would under-report the loss whenever batches were skipped.
    avg_loss = total_loss / max(num_steps, 1)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)
    return avg_loss, accuracy, f1

@torch.no_grad()
def evaluate(model, dataloader, device):
    """Score ``model`` on ``dataloader`` in eval mode.

    Returns:
        ``(accuracy, weighted_f1)`` of the argmax predictions against the
        ``'label'`` entries of the batches.
    """
    model.eval()
    predictions, targets = [], []
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        audio_mel = batch['audio_mel'].to(device)
        face = batch['face'].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask,
                       audio_mel=audio_mel, face=face)
        predictions.extend(logits.argmax(dim=1).cpu().tolist())
        targets.extend(batch['label'].cpu().tolist())

        # Drop the device tensors before the next batch is moved over.
        del input_ids, attention_mask, audio_mel, face, logits
        torch.cuda.empty_cache()

    acc = accuracy_score(targets, predictions)
    weighted_f1 = f1_score(targets, predictions, average='weighted', zero_division=0)
    return acc, weighted_f1

# ==================== Score functions (as in your first script) ====================
def compute_s(probs: np.ndarray, y: int, score_type: str, params=None) -> float:
    """Nonconformity score of candidate label ``y`` for one probability vector.

    Supported ``score_type`` values:
        ``'hinge'``: ``1 - p_y``.
        ``'cross_entropy'``: ``-log(p_y + 1e-12)``.
        ``'margin'``: largest other-class probability minus ``p_y``.
        ``'raps'``: probability mass ranked above ``y`` plus ``u * p_y`` plus
            ``lam * max(rank_y - k_reg, 0)``; ``u``, ``lam`` and ``k_reg`` come
            from ``params`` (defaults 0.1, 0.01 and 5).

    Raises:
        ValueError: If ``score_type`` is not one of the names above.
    """
    options = {} if params is None else params

    if score_type == 'hinge':
        return 1 - probs[y]

    if score_type == 'cross_entropy':
        return -np.log(probs[y] + 1e-12)

    if score_type == 'margin':
        best_rival = np.max(np.delete(probs, y))
        return best_rival - probs[y]

    if score_type == 'raps':
        u = options.get('u', 0.1)
        lam = options.get('lam', 0.01)
        k_reg = options.get('k_reg', 5)
        # 1-based rank of every class when sorted by descending probability.
        order = np.argsort(-probs)
        ranks = np.empty(len(probs), int)
        ranks[order] = np.arange(1, len(probs) + 1)
        rank_y = ranks[y]
        if rank_y > 1:
            mass_above = np.sum(probs[order[:rank_y - 1]])
        else:
            mass_above = 0.0
        return mass_above + u * probs[y] + lam * max(rank_y - k_reg, 0)

    raise ValueError(f"Unknown score_type: {score_type}")

@torch.no_grad()
def compute_nonconformity_scores(model: nn.Module, loader, device, score_type='hinge', params=None) -> Tuple[np.ndarray, np.ndarray]:
    """Compute the true-label nonconformity score of every sample in ``loader``.

    The model is run in eval mode, its logits are turned into softmax
    probabilities, and ``compute_s`` is evaluated at each sample's own label.

    Returns:
        ``(scores, labels)`` as two aligned arrays.
    """
    model.eval()
    collected_scores, collected_labels = [], []
    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        audio_mel = batch['audio_mel'].to(device)
        face = batch['face'].to(device)
        batch_labels = batch['label'].cpu().numpy()

        logits = model(input_ids=input_ids, attention_mask=attention_mask,
                       audio_mel=audio_mel, face=face)
        batch_probs = F.softmax(logits, dim=1).cpu().numpy()

        for row, target in zip(batch_probs, batch_labels):
            collected_scores.append(compute_s(row, int(target), score_type, params))
        collected_labels.extend(batch_labels)

        del input_ids, attention_mask, audio_mel, face, logits
        torch.cuda.empty_cache()

    return np.array(collected_scores), np.array(collected_labels)

def classwise_scores(scores: np.ndarray, labels: np.ndarray, L: int) -> Dict[int, np.ndarray]:
    """Group ``scores`` by their label.

    Returns a dict with one float array per class ``0..L-1`` (empty when a
    class has no samples), preserving the input order within each class.
    """
    buckets = {}
    for cls in range(L):
        buckets[cls] = []
    for value, label in zip(scores, labels):
        buckets[int(label)].append(float(value))
    return {cls: np.asarray(values, float) for cls, values in buckets.items()}

@torch.no_grad()
def per_view_pvalues_and_probs(model: nn.Module, class_scores: Dict[int, np.ndarray], loader, L: int, device, score_type='hinge', params=None) -> Tuple[np.ndarray, np.ndarray]:
    """Compute class-conditional conformal p-values and softmax probabilities.

    For each sample and each candidate class ``y`` the score
    ``compute_s(probs, y, ...)`` is compared with the calibration scores of
    class ``y``: ``p = (1 + #{cal >= s}) / (n_cal + 1)``. Classes without
    calibration scores get ``p = 1``.

    Returns:
        ``(pvals, probs)``, both of shape ``(n_samples, L)``.
    """
    model.eval()
    prob_chunks = []
    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        audio_mel = batch['audio_mel'].to(device)
        face = batch['face'].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask,
                       audio_mel=audio_mel, face=face)
        prob_chunks.append(F.softmax(logits, dim=1).cpu().numpy())

        del input_ids, attention_mask, audio_mel, face, logits
        torch.cuda.empty_cache()
    probs_all = np.vstack(prob_chunks)  # (n, L)

    num_samples = probs_all.shape[0]
    pvals = np.zeros((num_samples, L))
    no_scores = np.array([])
    for row_idx, row in enumerate(probs_all):
        for cls in range(L):
            candidate_score = compute_s(row, cls, score_type, params)
            cal = class_scores.get(cls, no_scores)
            if cal.size == 0:
                # Nothing to compare against: never exclude this class.
                pvals[row_idx, cls] = 1.0
                continue
            at_least_as_extreme = np.sum(cal >= candidate_score)
            pvals[row_idx, cls] = (1 + at_least_as_extreme) / (len(cal) + 1)
    return pvals, probs_all

# ==================== Fusion utilities ====================
def build_fusion_features(pvals_list: List[np.ndarray], probs_list: List[np.ndarray]) -> np.ndarray:
    """Concatenate, view by view, each view's p-values followed by its probabilities.

    The result places the blocks side by side in the order
    ``[pvals_0, probs_0, pvals_1, probs_1, ...]``.
    """
    pieces = []
    for view_idx, view_pvals in enumerate(pvals_list):
        pieces.append(view_pvals)
        pieces.append(probs_list[view_idx])
    return np.hstack(pieces)

def min_p_value_fusion(P_all: np.ndarray) -> np.ndarray:
    """Bonferroni-corrected minimum p-value across views.

    Args:
        P_all: Per-view p-values stacked along axis 0 (``K`` views).

    Returns:
        ``min(1, K * min_k P_all[k])`` with the view axis removed. The cap at
        1 keeps the result a valid p-value; it does not change any prediction
        set obtained by thresholding at ``alpha < 1``.
    """
    K = P_all.shape[0]
    return np.minimum(K * np.min(P_all, axis=0), 1.0)

def fisher_fusion(P_all: np.ndarray) -> np.ndarray:
    """Combine per-view p-values with Fisher's method.

    Args:
        P_all: Per-view p-values stacked along axis 0 (``K`` views).

    Returns:
        Upper-tail probability of ``T = -2 * sum_k log p_k`` under a
        chi-square distribution with ``2K`` degrees of freedom, with the view
        axis removed. Inputs are clipped to ``[1e-12, 1]`` before the log.
    """
    eps = 1e-12
    p = np.clip(P_all, eps, 1.0)
    T = -2 * np.sum(np.log(p), axis=0)
    df = 2 * P_all.shape[0]
    # Use the survival function directly: 1 - cdf cancels to exactly 0 for
    # large T, which loses all resolution among very small fused p-values.
    return chi2.sf(T, df=df)

def adjusted_fisher_fusion(P_train: np.ndarray, y_train: np.ndarray, P_test: np.ndarray, L: int) -> np.ndarray:
    """Fisher fusion with a per-class, covariance-adjusted chi-square reference.

    For each class ``y`` the covariance of ``-2 log p`` across views is
    estimated from the training samples whose label is ``y``. The Fisher
    statistic is then compared with a scaled chi-square whose scale
    ``c = var_T / (4K)`` and degrees of freedom ``f = 8K^2 / var_T`` match the
    estimated variance ``var_T`` of the statistic.

    Args:
        P_train: Training p-values, shape ``(K, n_train, L)``.
        y_train: Training labels, length ``n_train``.
        P_test: Test p-values, shape ``(K, n_test, L)``.
        L: Number of classes.

    Returns:
        Fused p-values of shape ``(n_test, L)``. Classes with fewer than five
        training samples fall back to plain ``fisher_fusion``.
    """
    K, _, _ = P_train.shape
    n_test = P_test.shape[1]
    eps = 1e-12
    out = np.zeros((n_test, L))
    fisher_fallback = None  # computed at most once, only if some class needs it
    for y in range(L):
        idx = np.where(y_train == y)[0]
        if idx.size < 5:
            if fisher_fallback is None:
                fisher_fallback = fisher_fusion(P_test)
            out[:, y] = fisher_fallback[:, y]
            continue
        P_cls = np.clip(P_train[:, idx, y], eps, 1.0)  # (K, n_y)
        W = -2 * np.log(P_cls)                          # (K, n_y)
        Wc = W - W.mean(axis=1, keepdims=True)
        Sigma = (Wc @ Wc.T) / max(W.shape[1] - 1, 1)    # (K, K)
        var_T = np.sum(Sigma)
        if not np.isfinite(var_T) or var_T <= 0:
            # Degenerate estimate: use the variance of independent views,
            # which reduces the adjustment to standard Fisher (c=1, f=2K).
            var_T = 4 * K
        f_y = (8.0 * K * K) / var_T
        c_y = var_T / (4 * K)
        P_t = np.clip(P_test[:, :, y], eps, 1.0)
        T_t = -2 * np.sum(np.log(P_t), axis=0)
        # Survival function instead of 1 - cdf to keep upper-tail precision.
        out[:, y] = chi2.sf(T_t / c_y, df=f_y)
    return out

def weighted_average_fusion(P_all: np.ndarray, weights: np.ndarray) -> np.ndarray:
    """Weighted sum of the per-view p-values.

    ``weights`` is contracted against the leading (view) axis of ``P_all``;
    the remaining axes are returned unchanged.
    """
    view_axis_of_weights = [0]
    view_axis_of_pvals = [0]
    fused = np.tensordot(weights, P_all, axes=(view_axis_of_weights, view_axis_of_pvals))
    return fused

def learn_view_weights_from_pvals(pv_train_concat: np.ndarray, y_train: np.ndarray, K: int, L: int, max_iter: int, seed: int) -> np.ndarray:
    """Derive one importance weight per view from a logistic regression on p-values.

    Args:
        pv_train_concat: Training p-values of all views concatenated column-wise,
            view ``k`` occupying columns ``k*L .. (k+1)*L - 1``.
        y_train: Training labels.
        K: Number of views.
        L: Number of classes (columns per view).
        max_iter: Iteration cap for the LBFGS solver.
        seed: ``random_state`` of the classifier.

    Returns:
        Array of ``K`` non-negative weights summing to 1; each weight is the
        Frobenius norm of the coefficient block of that view, normalized.
    """
    # ``multi_class`` is deprecated in recent scikit-learn releases; with the
    # lbfgs solver the default already fits a multinomial model for more than
    # two classes, so the argument is simply dropped.
    lr = LogisticRegression(solver="lbfgs", max_iter=max_iter, random_state=seed)
    lr.fit(pv_train_concat, y_train)
    B = lr.coef_  # (L, K*L)
    imps = [np.linalg.norm(B[:, k * L:(k + 1) * L], ord="fro") for k in range(K)]
    # Floor the norms so an all-zero coefficient matrix cannot divide by zero.
    w = np.maximum(np.array(imps, float), 1e-12)
    return w / w.sum()

def fused_class_cal_scores(y_cal: np.ndarray, fused_probs_cal: np.ndarray, L: int) -> Dict[int, np.ndarray]:
    """Group the fused calibration scores ``1 - p_true`` by true class.

    Returns a dict with one float array per class ``0..L-1``; classes without
    calibration samples map to an empty array.
    """
    row_index = np.arange(len(y_cal))
    true_class_scores = 1 - fused_probs_cal[row_index, y_cal]

    per_class = {}
    for cls in range(L):
        per_class[cls] = []
    for value, label in zip(true_class_scores, y_cal):
        per_class[int(label)].append(float(value))
    return {cls: np.asarray(values, float) for cls, values in per_class.items()}

def fused_p_values_from_cal(fused_probs: np.ndarray, cal_class_scores: Dict[int, np.ndarray]) -> np.ndarray:
    """Class-conditional conformal p-values from fused probabilities.

    For class ``y`` the test score ``1 - fused_probs[:, y]`` is compared with
    that class's calibration scores: ``p = (1 + #{cal >= s}) / (n_cal + 1)``.
    Classes without calibration scores get ``p = 1`` for every sample.
    """
    num_samples, num_classes = fused_probs.shape
    pvals = np.zeros((num_samples, num_classes))
    missing = np.array([])
    for cls in range(num_classes):
        cal = cal_class_scores.get(cls, missing)
        if cal.size == 0:
            pvals[:, cls] = 1.0
            continue
        test_scores = 1 - fused_probs[:, cls]
        # (n_cal, n_test) comparison matrix, reduced over the calibration axis.
        not_smaller = cal[:, None] >= test_scores[None, :]
        pvals[:, cls] = (1 + np.sum(not_smaller, axis=0)) / (len(cal) + 1)
    return pvals

def evaluate_sets(P: np.ndarray, y_true: np.ndarray, alpha: float) -> Tuple[float, float]:
    """Coverage and mean size of the prediction sets ``{y : P[i, y] > alpha}``.

    Returns:
        ``(coverage, avg_set_size)`` where coverage is the fraction of samples
        whose true label is in its set.
    """
    in_set = P > alpha
    rows = np.arange(len(y_true))
    true_label_included = in_set[rows, y_true]
    set_sizes = in_set.sum(axis=1)
    return float(np.mean(true_label_included)), float(np.mean(set_sizes))

# ==================== Utility: recompute probs on a loader ====================
@torch.no_grad()
def compute_probs_mm(model: nn.Module, loader, device) -> np.ndarray:
    """Run ``model`` in eval mode over ``loader`` and return softmax probabilities.

    Returns:
        One row of class probabilities per sample, in loader order.
    """
    model.eval()
    chunks = []
    for batch in loader:
        inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
            'audio_mel': batch['audio_mel'].to(device),
            'face': batch['face'].to(device),
        }
        logits = model(**inputs)
        chunks.append(F.softmax(logits, dim=1).cpu().numpy())
        # Release the device tensors before loading the next batch.
        del inputs, logits
        torch.cuda.empty_cache()
    return np.vstack(chunks)

# ==================== Summaries ====================
def summarize_table(df: pd.DataFrame, methods: List[str], metric_name: str) -> pd.DataFrame:
    """Summarize per-simulation results as ``"mean (std)"`` strings.

    Rows are grouped by ``Score`` and ``K``; every column in ``methods`` is
    reduced to its mean and standard deviation, formatted with two decimals.

    Returns:
        DataFrame with columns ``Score``, ``K``, ``Metric`` (set to
        ``metric_name``) followed by one formatted column per method.
    """
    stats = df.groupby(["Score", "K"]).agg({m: ["mean", "std"] for m in methods})

    table = stats.index.to_frame(index=False)
    table["Metric"] = metric_name
    for m in methods:
        means = stats[(m, "mean")]
        stds = stats[(m, "std")]
        table[m] = [f"{mu:.2f} ({sd:.2f})" for mu, sd in zip(means, stds)]
    return table[["Score", "K", "Metric"] + methods]

# ==================== Config ====================
@dataclass
class MELDConfig:
    """Hyper-parameters and split fractions read by ``run_experiments``."""

    # Miscoverage level: prediction sets keep labels whose p-value exceeds alpha.
    alpha: float = 0.1
    # NOTE(review): not referenced by the visible code, which hard-codes three views.
    Ks: Tuple[int, ...] = (3,)
    # Number of output classes of every per-view model.
    num_classes: int = 7
    # Number of repeated simulations (one seed / data split each).
    num_simulations: int = 10

    # Maximum training epochs per view (early stopping may end training sooner).
    epochs_text: int = 10
    epochs_audio: int = 20
    epochs_video: int = 20

    # Base learning rate per view.
    lr_text: float = 5e-5
    lr_audio: float = 1e-3
    lr_video: float = 1e-3

    # Batch size of the training loader.
    batch_size: int = 64
    # Batch size of the calibration / fusion / test loaders.
    inference_batch_size: int = 8
    # Fraction of the total training steps used for linear LR warm-up.
    warmup_ratio: float = 0.15
    # max_iter of the logistic-regression models.
    max_iter_lr: int = 1000
    # Simulation ``sim`` uses seed ``train_seed_base + sim``.
    train_seed_base: int = 40
    # Weight decay passed to AdamW.
    weight_decay: float = 0.01
    # NOTE(review): not referenced by the visible training code — confirm intent.
    label_smoothing: float = 0.0
    # Max gradient norm used for clipping during training.
    grad_clip: float = 5.0
    # Epochs without validation-F1 improvement before early stopping.
    patience: int = 3

    # Fraction of the training files used to train the per-view models.
    train_frac: float = 0.6
    # Fraction of the remaining files used for per-view conformal calibration.
    cal_frac_of_temp: float = 0.2
    # Fraction of what is left after that used to train the fusion model;
    # the rest calibrates the fused scores.
    fuse_train_frac_of_rest: float = 0.5

# ==================== Main experiment ====================
def run_experiments(cfg: MELDConfig):
    """Run the multimodal conformal-fusion experiment for every simulation.

    Per simulation the training files are split (stratified) into per-view
    training, per-view calibration, fusion-training and fusion-calibration
    subsets. The text, audio and video models are trained once; then, for each
    nonconformity score function, per-view class-conditional p-values are
    computed, a logistic-regression fusion model is fitted on p-values and
    probabilities, and the conformal fusion plus the min-p, Fisher, adjusted
    Fisher and learned weighted-average baselines are evaluated on the test set.

    Reads the module-level names ``device``, ``tokenizer``,
    ``full_train_files``, ``test_files``, ``Y_full`` and ``Y_test``, which must
    be defined before this function is called.

    Args:
        cfg: Experiment configuration.

    Returns:
        Three DataFrames (coverage in %, average set size, accuracies in %)
        with one row per (score function, simulation).
    """
    results_cov, results_size, results_acc = [], [], []
    global device, tokenizer, full_train_files, test_files, Y_full, Y_test

    # Score functions to match your first code
    score_types = {
        'hinge': {},
        'margin': {},
        'cross_entropy': {},
        'raps': {'u': 0.1, 'lam': 0.01, 'k_reg': 5}
    }

    def make_loader(files, batch_size, training=False):
        """Build a DataLoader over ``files``; only the training loader shuffles and drops the last batch."""
        dataset = PreprocessedMELDDataset(None, tokenizer, files=files, augment=False)
        # collate_fn is passed directly (its default max_audio_length is 300):
        # a lambda wrapper cannot be pickled for spawn-based worker processes.
        return DataLoader(dataset, batch_size=batch_size, shuffle=training,
                          collate_fn=collate_fn, num_workers=4, pin_memory=True,
                          drop_last=training)

    for sim in range(cfg.num_simulations):
        print(f"\n{'='*60}\nSimulation {sim+1}/{cfg.num_simulations}\n{'='*60}")
        seed = cfg.train_seed_base + sim
        torch.manual_seed(seed); np.random.seed(seed)

        # Stratified splits on file indices
        indices = np.arange(len(full_train_files))
        trP_idx, tmp_idx = train_test_split(indices, test_size=1 - cfg.train_frac, stratify=Y_full, random_state=seed)
        cal_idx, rest_idx = train_test_split(tmp_idx, test_size=1 - cfg.cal_frac_of_temp, stratify=Y_full[tmp_idx], random_state=seed)
        ftr_idx, fcal_idx = train_test_split(rest_idx, test_size=1 - cfg.fuse_train_frac_of_rest, stratify=Y_full[rest_idx], random_state=seed)

        X_trP_files = [full_train_files[i] for i in trP_idx]
        X_cal_files = [full_train_files[i] for i in cal_idx]
        X_fuse_tr_files = [full_train_files[i] for i in ftr_idx]
        X_fuse_cal_files = [full_train_files[i] for i in fcal_idx]
        X_te_files = test_files

        y_fuse_tr = Y_full[ftr_idx]
        y_fuse_cal = Y_full[fcal_idx]
        y_te = Y_test

        # Loaders
        train_loader = make_loader(X_trP_files, cfg.batch_size, training=True)
        cal_loader = make_loader(X_cal_files, cfg.inference_batch_size)
        ftr_loader = make_loader(X_fuse_tr_files, cfg.inference_batch_size)
        fcal_loader = make_loader(X_fuse_cal_files, cfg.inference_batch_size)
        te_loader = make_loader(X_te_files, cfg.inference_batch_size)

        # Train per-view models once
        view_model_classes = [ImprovedTextModel, ImprovedAudioModel, ImprovedVideoModel]
        learning_rates = [cfg.lr_text, cfg.lr_audio, cfg.lr_video]
        model_names = ["Text (RoBERTa)", "Audio (CNN)", "Video (ResNet34)"]
        epochs_per_model = [cfg.epochs_text, cfg.epochs_audio, cfg.epochs_video]
        num_views = len(view_model_classes)

        per_view_models = []
        pr_te_allviews = []  # test probs (for per-score fusion/accuracy)
        for v in range(num_views):
            print(f"\n [{model_names[v]}] Training...")
            m = view_model_classes[v](num_classes=cfg.num_classes).to(device)
            # Honor the configured label smoothing (0.0 by default, i.e. plain CE).
            criterion = nn.CrossEntropyLoss(label_smoothing=cfg.label_smoothing)

            if v == 0:
                # Encoder layers use the base LR, everything else 10x of it.
                base_params = [p for n, p in m.named_parameters() if 'text_encoder.encoder.layer' in n]
                other_params = [p for n, p in m.named_parameters() if 'text_encoder.encoder.layer' not in n]
                optimizer = torch.optim.AdamW([
                    {'params': base_params, 'lr': learning_rates[v]},
                    {'params': other_params, 'lr': learning_rates[v] * 10}
                ], weight_decay=cfg.weight_decay)
            elif v == 1:
                optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rates[v], weight_decay=cfg.weight_decay)
            else:
                trainable_params = [p for p in m.parameters() if p.requires_grad]
                optimizer = torch.optim.AdamW(trainable_params, lr=learning_rates[v], weight_decay=cfg.weight_decay)

            total_steps = epochs_per_model[v] * len(train_loader)
            warmup_steps = int(total_steps * cfg.warmup_ratio)
            scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

            best_f1, best_acc, best_state, patience_counter = 0.0, 0.0, None, 0
            for epoch in range(epochs_per_model[v]):
                train_loss, train_acc, train_f1 = train_epoch(
                    m, train_loader, optimizer, criterion, device,
                    scheduler=scheduler, grad_clip=cfg.grad_clip
                )
                # NOTE(review): the same split drives early stopping here and
                # conformal calibration below; model selection on calibration
                # data can bias the calibration scores. A separate validation
                # split would avoid this — confirm whether it is intended.
                val_acc, val_f1 = evaluate(m, cal_loader, device)
                if val_f1 > best_f1:
                    best_f1, best_acc = val_f1, val_acc
                    patience_counter = 0
                    # clone(): on a CPU device .cpu() returns the live tensor,
                    # so without a copy the snapshot would follow later updates.
                    best_state = {k: vv.detach().cpu().clone() for k, vv in m.state_dict().items()}
                else:
                    patience_counter += 1

                print(f" Epoch {epoch+1:2d}/{epochs_per_model[v]}: "
                      f"Loss={train_loss:.3f}, Train Acc={train_acc*100:.1f}%, F1={train_f1:.3f} | "
                      f"Val Acc={val_acc*100:.1f}%, F1={val_f1:.3f} | Best F1={best_f1:.3f}")

                if patience_counter >= cfg.patience:
                    print(" Early stopping")
                    break

            if best_state is not None:
                m.load_state_dict(best_state)
            print(f" [{model_names[v]}] Best Val: Acc={best_acc*100:.2f}%, F1={best_f1:.4f}")
            m.eval()

            # Compute test probs once and cache
            pr_te_allviews.append(compute_probs_mm(m, te_loader, device))

            # Park the finished model on the CPU so it does not occupy device
            # memory while the next view trains; it is moved back when needed.
            m.to('cpu')
            per_view_models.append(m)

            del optimizer, scheduler, criterion, best_state
            torch.cuda.empty_cache(); gc.collect()

        # ===== Loop over score functions (like your first script) =====
        for score_name, score_params in score_types.items():
            print(f"\n{'-'*60}\nScore Function: {score_name}\n{'-'*60}")

            pv_tr, pr_tr = [], []
            pv_cal, pr_cal = [], []
            pv_te_list = []
            cal_classwise = []

            # Per-view conformal calibration per score
            for v in range(num_views):
                m = per_view_models[v]
                m.to(device)
                m.eval()

                # Calibration nonconformity scores
                sc, lab = compute_nonconformity_scores(m, cal_loader, device, score_type=score_name, params=score_params)
                cal_classwise_v = classwise_scores(sc, lab, cfg.num_classes)
                cal_classwise.append(cal_classwise_v)

                # p/probs on fuse-train / fuse-cal / test
                p_tr, pr_tr_v = per_view_pvalues_and_probs(m, cal_classwise_v, ftr_loader, cfg.num_classes, device, score_type=score_name, params=score_params)
                pv_tr.append(p_tr); pr_tr.append(pr_tr_v)

                p_cal, pr_cal_v = per_view_pvalues_and_probs(m, cal_classwise_v, fcal_loader, cfg.num_classes, device, score_type=score_name, params=score_params)
                pv_cal.append(p_cal); pr_cal.append(pr_cal_v)

                p_te, _ = per_view_pvalues_and_probs(m, cal_classwise_v, te_loader, cfg.num_classes, device, score_type=score_name, params=score_params)
                pv_te_list.append(p_te)

                # keep model on CPU after
                m.to('cpu')

                del sc, lab, p_tr, pr_tr_v, p_cal, pr_cal_v, p_te, cal_classwise_v
                torch.cuda.empty_cache(); gc.collect()

            # ===== Conformal Fusion and baselines =====
            print(" Training fusion model...")
            X_ftr = build_fusion_features(pv_tr, pr_tr)
            # ``multi_class`` is deprecated in recent scikit-learn; lbfgs already
            # fits a multinomial model by default for more than two classes.
            fusion_lr = LogisticRegression(max_iter=cfg.max_iter_lr,
                                           solver="lbfgs", random_state=seed)
            fusion_lr.fit(X_ftr, y_fuse_tr)

            X_fcal = build_fusion_features(pv_cal, pr_cal)
            fused_probs_cal = fusion_lr.predict_proba(X_fcal)
            fused_cal_scores = fused_class_cal_scores(y_fuse_cal, fused_probs_cal, cfg.num_classes)

            X_ftest = build_fusion_features(pv_te_list, pr_te_allviews)
            fused_probs_test = fusion_lr.predict_proba(X_ftest)
            P_cf = fused_p_values_from_cal(fused_probs_test, fused_cal_scores)

            # Baselines on stacked per-view pvals
            P_train = np.stack(pv_tr, axis=0)
            P_test = np.stack(pv_te_list, axis=0)
            P_min = min_p_value_fusion(P_test)
            P_fisher = fisher_fusion(P_test)
            P_adjF = adjusted_fisher_fusion(P_train, y_fuse_tr, P_test, cfg.num_classes)

            pv_tr_concat = np.concatenate(pv_tr, axis=1)
            w_learned = learn_view_weights_from_pvals(pv_tr_concat, y_fuse_tr, num_views, cfg.num_classes, cfg.max_iter_lr, seed)
            P_wavgL = weighted_average_fusion(P_test, w_learned)

            # Evaluate all methods
            cov_cf, set_cf = evaluate_sets(P_cf, y_te, cfg.alpha)
            cov_min, set_min = evaluate_sets(P_min, y_te, cfg.alpha)
            cov_fi, set_fi = evaluate_sets(P_fisher, y_te, cfg.alpha)
            cov_afi, set_afi = evaluate_sets(P_adjF, y_te, cfg.alpha)
            cov_wl, set_wl = evaluate_sets(P_wavgL, y_te, cfg.alpha)

            # Accuracies
            avg_probs = np.mean(np.stack(pr_te_allviews, axis=0), axis=0)
            fused_acc = accuracy_score(y_te, np.argmax(fused_probs_test, axis=1)) * 100
            avg_acc = accuracy_score(y_te, np.argmax(avg_probs, axis=1)) * 100
            view_accs = [accuracy_score(y_te, np.argmax(pr_te_allviews[v], axis=1)) * 100 for v in range(num_views)]

            print(f"\n Results Summary ({score_name}):")
            print(f" {'='*50}")
            print(f" Individual Accuracies:")
            print(f" Text:  {view_accs[0]:5.2f}% | Audio: {view_accs[1]:5.2f}% | Video: {view_accs[2]:5.2f}%")
            print(f" Fusion Accuracies: Avg Ensemble {avg_acc:5.2f}% | Learned Fusion {fused_acc:5.2f}%")
            print(f" Conformal Metrics (target {100*(1-cfg.alpha):.0f}%):")
            print(f" CF:     cov={cov_cf*100:5.2f}% | set={set_cf:5.2f}")
            print(f" MinPV:  cov={cov_min*100:5.2f}% | set={set_min:5.2f}")
            print(f" Fisher: cov={cov_fi*100:5.2f}% | set={set_fi:5.2f}")
            print(f" AdjFis: cov={cov_afi*100:5.2f}% | set={set_afi:5.2f}")
            print(f" WAvgL:  cov={cov_wl*100:5.2f}% | set={set_wl:5.2f}")
            print(f" {'='*50}")

            # Append results
            results_cov.append({
                "Score": score_name, "Sim": sim, "K": num_views,
                "Conformal Fusion": cov_cf * 100,
                "Min p-Value": cov_min * 100,
                "Fisher": cov_fi * 100,
                "Adjusted Fisher": cov_afi * 100,
                "Weighted Avg (learned)": cov_wl * 100,
            })
            results_size.append({
                "Score": score_name, "Sim": sim, "K": num_views,
                "Conformal Fusion": set_cf,
                "Min p-Value": set_min,
                "Fisher": set_fi,
                "Adjusted Fisher": set_afi,
                "Weighted Avg (learned)": set_wl,
            })
            results_acc.append({
                "Score": score_name, "Sim": sim, "K": num_views,
                "Fused Acc": fused_acc,
                "Average Acc": avg_acc,
                "Text Acc": view_accs[0],
                "Audio Acc": view_accs[1],
                "Video Acc": view_accs[2]
            })

            # cleanup per-score
            del X_ftr, fusion_lr, X_fcal, fused_probs_cal, fused_cal_scores, X_ftest, fused_probs_test, P_cf
            del P_train, P_test, P_min, P_fisher, P_adjF, pv_tr_concat, w_learned, P_wavgL
            torch.cuda.empty_cache(); gc.collect()

        # cleanup per simulation (the models are already on the CPU)
        del m, per_view_models, pr_te_allviews
        torch.cuda.empty_cache(); gc.collect()

    return pd.DataFrame(results_cov), pd.DataFrame(results_size), pd.DataFrame(results_acc)

# ==================== Save + print ====================
def save_tables(df_cov: pd.DataFrame, df_size: pd.DataFrame, df_acc: pd.DataFrame):
    """Summarize the raw experiment rows and write the summary tables to disk.

    Coverage and set-size rows are condensed with ``summarize_table``; each
    summary is written as CSV and LaTeX, then the two are joined on
    ``("Score", "K")`` into one combined table.  Accuracy rows are reduced to
    the mean/std of every accuracy column per ``(Score, K)`` group.  All files
    are written relative to the current working directory, and the list of
    written files is printed at the end.

    Args:
        df_cov: Raw coverage rows (one column per fusion method).
        df_size: Raw set-size rows with the same method columns.
        df_acc: Raw accuracy rows holding the ``* Acc`` columns.
    """
    # Full method column name -> short prefix used in the combined table.
    short_names = {
        "Conformal Fusion": "CF",
        "Min p-Value": "MinPV",
        "Fisher": "Fisher",
        "Adjusted Fisher": "AdjF",
        "Weighted Avg (learned)": "WAvgL",
    }
    methods = list(short_names)

    def write_latex(table, path, **latex_options):
        # The file is opened before the table is rendered.
        with open(path, "w") as handle:
            handle.write(table.to_latex(index=False, **latex_options))

    def compact(summary, suffix):
        # Drop the metric label and tag each method column with its metric.
        renamed = {full: f"{short} {suffix}" for full, short in short_names.items()}
        return summary.drop(columns=["Metric"]).rename(columns=renamed)

    coverage = summarize_table(df_cov, methods, "Coverage (%)")
    set_size = summarize_table(df_size, methods, "Average Set Size")

    coverage.to_csv("meld_summary_coverage.csv", index=False)
    set_size.to_csv("meld_summary_setsize.csv", index=False)
    write_latex(coverage, "meld_summary_coverage.tex", escape=False)
    write_latex(set_size, "meld_summary_setsize.tex", escape=False)

    # Side-by-side coverage / set-size table, one row per (Score, K).
    combined = compact(coverage, "Cov").merge(compact(set_size, "Set"), on=["Score", "K"])
    combined = combined.sort_values(["Score", "K"])
    combined.to_csv("meld_summary_final.csv", index=False)
    write_latex(combined, "meld_summary_final.tex", escape=False)

    accuracy_columns = ("Fused Acc", "Average Acc", "Text Acc", "Audio Acc", "Video Acc")
    accuracy = df_acc.groupby(["Score", "K"]).agg(
        {column: ["mean", "std"] for column in accuracy_columns}
    ).reset_index()
    # Flatten the (column, statistic) header into "<column>_<statistic>".
    flat_names = ['Score', 'K']
    for top, stat in accuracy.columns[2:]:
        flat_names.append(f"{top}_{stat}" if stat else top)
    accuracy.columns = flat_names
    accuracy.to_csv("meld_accuracy_summary.csv", index=False)
    write_latex(accuracy, "meld_accuracy_summary.tex", float_format="%.2f")

    print("\nSaved:")
    for listing in (
        " - meld_summary_coverage.csv/tex",
        " - meld_summary_setsize.csv/tex",
        " - meld_summary_final.csv/tex",
        " - meld_accuracy_summary.csv/tex",
    ):
        print(listing)

# ==================== Setup / Dataset loading ====================
def setup_environment():
    """Select the compute device and load the RoBERTa tokenizer.

    Preference order is CUDA, then MPS, then CPU.  When CUDA is selected the
    name of device 0 and the GPU count are printed, and
    ``PYTORCH_CUDA_ALLOC_CONF`` is set to ``expandable_segments:True`` in this
    process's environment.

    Returns:
        A ``(device, tokenizer)`` tuple: the selected ``torch.device`` and the
        ``roberta-base`` ``RobertaTokenizer``.
    """
    if torch.cuda.is_available():
        backend = "cuda"
    elif torch.backends.mps.is_available():
        backend = "mps"
    else:
        backend = "cpu"
    device = torch.device(backend)

    if backend == "cuda":
        gpu_name = torch.cuda.get_device_name(0)
        print(f"Using CUDA device: {gpu_name}")
        gpu_count = torch.cuda.device_count()
        print(f"Available GPUs: {gpu_count}")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
    else:
        banners = {"mps": "Using MPS device", "cpu": "Using CPU device"}
        print(banners[backend])

    print("Loading RoBERTa tokenizer...")
    return device, RobertaTokenizer.from_pretrained('roberta-base')

def load_dataset_labels(data_dir: str):
    """Collect the sample files of every split and read their emotion labels.

    ``train`` and ``dev`` are merged into one training pool (sorted by path);
    ``test`` is kept separate.  Each ``.pt`` file is loaded with ``torch.load``
    and its ``'emotion'`` entry is mapped to a class index via ``emotion_map``.

    A label that cannot be mapped still falls back to class 0 (``neutral``),
    but the number of such fallbacks is reported per split instead of being
    silently folded into the majority class.

    Args:
        data_dir: Root directory containing ``train``, ``dev`` and ``test``
            sub-directories of ``.pt`` sample files.

    Returns:
        ``(full_train_files, test_files, Y_full, Y_test)``: the two sorted
        path lists and the label-index arrays aligned with them.

    Raises:
        FileNotFoundError: If a split directory is missing (from ``os.listdir``).
    """
    emotion_map = {'neutral': 0, 'joy': 1, 'surprise': 2, 'anger': 3, 'sadness': 4, 'disgust': 5, 'fear': 6}

    def _list_samples(split):
        # Paths of all .pt files directly inside <data_dir>/<split>.
        split_dir = os.path.join(data_dir, split)
        return [os.path.join(split_dir, name) for name in os.listdir(split_dir) if name.endswith('.pt')]

    def _read_labels(files, desc):
        # Load every sample on CPU and map its emotion to a class index.
        labels = []
        unmapped = []
        for path in tqdm(files, desc=desc):
            sample = torch.load(path, map_location='cpu')
            raw = sample['emotion']
            # Names are matched case-insensitively and ignoring surrounding
            # whitespace; non-string values are looked up by their str() form.
            emotion = raw.strip().lower() if isinstance(raw, str) else str(raw)
            if emotion not in emotion_map:
                unmapped.append(emotion)
            labels.append(emotion_map.get(emotion, 0))
        if unmapped:
            # Surface the fallback so mislabeled data cannot pass as 'neutral' unnoticed.
            examples = sorted(set(unmapped))[:5]
            print(f" WARNING: {len(unmapped)} unrecognized label(s) mapped to 'neutral', e.g. {examples}")
        return np.array(labels)

    print(f"\nLoading data from: {data_dir}")
    full_train_files = sorted(_list_samples('train') + _list_samples('dev'))
    test_files = sorted(_list_samples('test'))

    print("Loading labels for stratification...")
    Y_full = _read_labels(full_train_files, "Loading train labels")
    Y_test = _read_labels(test_files, "Loading test labels")

    print("\nDataset Statistics:")
    print(f" Train samples: {len(Y_full)}")
    print(f" Test samples:  {len(Y_test)}")
    total = len(Y_full)
    # Class names in index order, taken from the map so the two cannot drift apart.
    for emotion in sorted(emotion_map, key=emotion_map.get):
        count = int(np.sum(Y_full == emotion_map[emotion]))
        share = count / total * 100 if total else 0.0
        print(f" {emotion:10s}: {count:4d} ({share:.1f}%)")
    return full_train_files, test_files, Y_full, Y_test

# ==================== Entry ====================
# Module-level slots that main() fills in; each one is None until then.
device = tokenizer = None
full_train_files = test_files = None
Y_full = Y_test = None

def main(data_dir: str = '/depot/gupta869/data/farbod/preprocessed_data'):
    """Run the full MELD pipeline: setup, label loading, experiments, saving.

    Args:
        data_dir: Preprocessed MELD root containing ``train``/``dev``/``test``.
            The default is the path that used to be hard-coded in this
            function, so ``main()`` behaves as before; pass a different path
            to run on another machine without editing the source.
    """
    # Results of setup/loading are published as module globals.
    # NOTE(review): presumably read by run_experiments, which only receives
    # cfg here -- confirm against its definition.
    global device, tokenizer, full_train_files, test_files, Y_full, Y_test
    device, tokenizer = setup_environment()

    full_train_files, test_files, Y_full, Y_test = load_dataset_labels(data_dir)

    cfg = MELDConfig()
    print("\nExperiment Configuration:")
    print(f" Device: {device}")
    print(f" Epochs: Text={cfg.epochs_text}, Audio={cfg.epochs_audio}, Video={cfg.epochs_video}")
    print(f" LRs:    Text={cfg.lr_text}, Audio={cfg.lr_audio}, Video={cfg.lr_video}")
    print(f" Batch size: {cfg.batch_size} (train), {cfg.inference_batch_size} (inf)")
    print(f" Weight decay: {cfg.weight_decay} | Label smoothing: {cfg.label_smoothing}")
    print(f" Sims: {cfg.num_simulations}")

    df_cov, df_size, df_acc = run_experiments(cfg)

    # Show the first raw rows of each table before they are summarized.
    print("\n=== Coverage (raw rows) ===")
    print(df_cov.head())
    print("\n=== Set Size (raw rows) ===")
    print(df_size.head())
    print("\n=== Accuracy (raw rows) ===")
    print(df_acc.head())

    save_tables(df_cov, df_size, df_acc)

# Start the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Using CUDA device: NVIDIA A30
Available GPUs: 1
Loading RoBERTa tokenizer...

Loading data from: /depot/gupta869/data/farbod/preprocessed_data
Loading labels for stratification...


Loading train labels: 100%|██████████| 11096/11096 [03:34<00:00, 51.73it/s]
Loading test labels: 100%|██████████| 2610/2610 [00:49<00:00, 52.89it/s]



Dataset Statistics:
 Train samples: 11096
 Test samples:  2610
 neutral   : 5178 (46.7%)
 joy       : 1906 (17.2%)
 surprise  : 1355 (12.2%)
 anger     : 1262 (11.4%)
 sadness   :  794 (7.2%)
 disgust   :  293 (2.6%)
 fear      :  308 (2.8%)

Experiment Configuration:
 Device: cuda
 Epochs: Text=10, Audio=20, Video=20
 LRs:    Text=5e-05, Audio=0.001, Video=0.001
 Batch size: 64 (train), 8 (inf)
 Weight decay: 0.01 | Label smoothing: 0.0
 Sims: 10

Simulation 1/10

 [Text (RoBERTa)] Training...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                                                       

 Epoch  1/10: Loss=1.400, Train Acc=52.5%, F1=0.466 | Val Acc=59.6%, F1=0.577 | Best F1=0.577


                                                                                       

 Epoch  2/10: Loss=1.172, Train Acc=62.1%, F1=0.590 | Val Acc=62.0%, F1=0.596 | Best F1=0.596


                                                                                        

 Epoch  5/10: Loss=0.720, Train Acc=77.1%, F1=0.761 | Val Acc=58.7%, F1=0.586 | Best F1=0.596
 Early stopping
 [Text (RoBERTa)] Best Val: Acc=62.01%, F1=0.5958

 [Audio (CNN)] Training...


                                                                                       

 Epoch  1/20: Loss=1.819, Train Acc=30.7%, F1=0.289 | Val Acc=40.8%, F1=0.312 | Best F1=0.312


                                                                                       

 Epoch  2/20: Loss=1.594, Train Acc=45.3%, F1=0.309 | Val Acc=46.2%, F1=0.308 | Best F1=0.312


                                                                                       

 Epoch  3/20: Loss=1.586, Train Acc=45.7%, F1=0.305 | Val Acc=46.7%, F1=0.297 | Best F1=0.312


                                                                                       

 Epoch  4/20: Loss=1.572, Train Acc=46.0%, F1=0.303 | Val Acc=46.7%, F1=0.297 | Best F1=0.312
 Early stopping
 [Audio (CNN)] Best Val: Acc=40.81%, F1=0.3119

 [Video (ResNet34)] Training...


                                                                                       

 Epoch  1/20: Loss=1.882, Train Acc=25.9%, F1=0.272 | Val Acc=36.2%, F1=0.310 | Best F1=0.310


                                                                                       

 Epoch  2/20: Loss=1.606, Train Acc=44.9%, F1=0.308 | Val Acc=46.7%, F1=0.297 | Best F1=0.310


                                                                                       

 Epoch  3/20: Loss=1.579, Train Acc=45.9%, F1=0.304 | Val Acc=46.7%, F1=0.297 | Best F1=0.310


                                                                                       

 Epoch  4/20: Loss=1.577, Train Acc=46.1%, F1=0.301 | Val Acc=46.6%, F1=0.297 | Best F1=0.310
 Early stopping
 [Video (ResNet34)] Best Val: Acc=36.19%, F1=0.3102

------------------------------------------------------------
Score Function: hinge
------------------------------------------------------------
 Training fusion model...

 Results Summary (hinge):
 Individual Accuracies:
 Text:  61.72% | Audio: 41.30% | Video: 37.59%
 Fusion Accuracies: Avg Ensemble 61.03% | Learned Fusion 63.30%
 Conformal Metrics (target 90%):
 CF:     cov=90.23% | set= 4.35
 MinPV:  cov=91.26% | set= 5.57
 Fisher: cov=90.84% | set= 5.35
 AdjFis: cov=90.15% | set= 5.07
 WAvgL:  cov=99.31% | set= 6.78

------------------------------------------------------------
Score Function: margin
------------------------------------------------------------
 Training fusion model...

 Results Summary (margin):
 Individual Accuracies:
 Text:  61.72% | Audio: 41.30% | Video: 37.59%
 Fusion Accuracies: Avg Ensemble 61.03% |

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                                                       

 Epoch  1/10: Loss=1.408, Train Acc=52.0%, F1=0.462 | Val Acc=63.2%, F1=0.601 | Best F1=0.601


                                                                                       

 Epoch  2/10: Loss=1.154, Train Acc=62.6%, F1=0.593 | Val Acc=63.5%, F1=0.608 | Best F1=0.608


                                                                                       

 Epoch  3/10: Loss=1.021, Train Acc=66.6%, F1=0.640 | Val Acc=60.9%, F1=0.597 | Best F1=0.608


                                                                                        

 Epoch  4/10: Loss=0.857, Train Acc=72.5%, F1=0.711 | Val Acc=63.7%, F1=0.632 | Best F1=0.632


                                                                                        

 Epoch  5/10: Loss=0.684, Train Acc=78.4%, F1=0.777 | Val Acc=62.2%, F1=0.611 | Best F1=0.632


                                                                                        

 Epoch  6/10: Loss=0.537, Train Acc=83.4%, F1=0.830 | Val Acc=59.6%, F1=0.602 | Best F1=0.632


                                                                                        

 Epoch  7/10: Loss=0.407, Train Acc=87.3%, F1=0.871 | Val Acc=61.3%, F1=0.611 | Best F1=0.632
 Early stopping
 [Text (RoBERTa)] Best Val: Acc=63.70%, F1=0.6316

 [Audio (CNN)] Training...


                                                                                       

 Epoch  1/20: Loss=1.885, Train Acc=25.6%, F1=0.269 | Val Acc=46.0%, F1=0.306 | Best F1=0.306


                                                                                       

 Epoch  2/20: Loss=1.604, Train Acc=45.0%, F1=0.311 | Val Acc=45.0%, F1=0.326 | Best F1=0.326


                                                                                       

 Epoch  3/20: Loss=1.585, Train Acc=45.8%, F1=0.310 | Val Acc=46.7%, F1=0.297 | Best F1=0.326


                                                                                       

 Epoch  4/20: Loss=1.569, Train Acc=46.3%, F1=0.314 | Val Acc=46.8%, F1=0.300 | Best F1=0.326


                                                                                       

 Epoch  5/20: Loss=1.561, Train Acc=46.1%, F1=0.312 | Val Acc=46.7%, F1=0.297 | Best F1=0.326
 Early stopping
 [Audio (CNN)] Best Val: Acc=44.98%, F1=0.3261

 [Video (ResNet34)] Training...


                                                                                       

 Epoch  1/20: Loss=1.788, Train Acc=31.4%, F1=0.299 | Val Acc=45.0%, F1=0.302 | Best F1=0.302


                                                                                       

 Epoch  2/20: Loss=1.603, Train Acc=45.3%, F1=0.306 | Val Acc=44.3%, F1=0.313 | Best F1=0.313


                                                                                       

 Epoch  3/20: Loss=1.592, Train Acc=46.0%, F1=0.301 | Val Acc=43.3%, F1=0.293 | Best F1=0.313


                                                                                       

 Epoch  4/20: Loss=1.575, Train Acc=46.2%, F1=0.300 | Val Acc=46.7%, F1=0.297 | Best F1=0.313


                                                                                       

 Epoch  5/20: Loss=1.568, Train Acc=46.5%, F1=0.301 | Val Acc=46.7%, F1=0.297 | Best F1=0.313
 Early stopping
 [Video (ResNet34)] Best Val: Acc=44.31%, F1=0.3131

------------------------------------------------------------
Score Function: hinge
------------------------------------------------------------
 Training fusion model...

 Results Summary (hinge):
 Individual Accuracies:
 Text:  61.23% | Audio: 46.36% | Video: 45.13%
 Fusion Accuracies: Avg Ensemble 60.69% | Learned Fusion 63.03%
 Conformal Metrics (target 90%):
 CF:     cov=90.50% | set= 4.42
 MinPV:  cov=91.72% | set= 5.88
 Fisher: cov=90.38% | set= 5.45
 AdjFis: cov=89.12% | set= 5.22
 WAvgL:  cov=99.46% | set= 6.75

------------------------------------------------------------
Score Function: margin
------------------------------------------------------------
 Training fusion model...

 Results Summary (margin):
 Individual Accuracies:
 Text:  61.23% | Audio: 46.36% | Video: 45.13%
 Fusion Accuracies: Avg Ensemble 60.69% |

                                                                                        

 Epoch  1/10: Loss=1.403, Train Acc=51.5%, F1=0.466 | Val Acc=62.7%, F1=0.592 | Best F1=0.592


                                                                                       

 Epoch  2/10: Loss=1.147, Train Acc=62.8%, F1=0.598 | Val Acc=61.1%, F1=0.588 | Best F1=0.592


                                                                                       

 Epoch  3/10: Loss=1.011, Train Acc=67.1%, F1=0.648 | Val Acc=62.8%, F1=0.587 | Best F1=0.592


                                                                                       

 Epoch  4/10: Loss=0.843, Train Acc=73.0%, F1=0.714 | Val Acc=61.9%, F1=0.610 | Best F1=0.610


                                                                                        

 Epoch  5/10: Loss=0.680, Train Acc=78.5%, F1=0.778 | Val Acc=60.3%, F1=0.598 | Best F1=0.610


                                                                                        

 Epoch  6/10: Loss=0.544, Train Acc=83.0%, F1=0.827 | Val Acc=60.8%, F1=0.602 | Best F1=0.610


                                                                                        

 Epoch  7/10: Loss=0.425, Train Acc=86.4%, F1=0.862 | Val Acc=59.9%, F1=0.593 | Best F1=0.610
 Early stopping
 [Text (RoBERTa)] Best Val: Acc=61.89%, F1=0.6101

 [Audio (CNN)] Training...


                                                                                       

 Epoch  1/20: Loss=1.816, Train Acc=29.7%, F1=0.293 | Val Acc=46.7%, F1=0.297 | Best F1=0.297


                                                                                       

 Epoch  2/20: Loss=1.609, Train Acc=45.2%, F1=0.311 | Val Acc=46.3%, F1=0.312 | Best F1=0.312


                                                                                       

 Epoch  3/20: Loss=1.588, Train Acc=46.0%, F1=0.311 | Val Acc=47.4%, F1=0.324 | Best F1=0.324


                                                                                       

 Epoch  4/20: Loss=1.568, Train Acc=46.3%, F1=0.311 | Val Acc=46.7%, F1=0.297 | Best F1=0.324


                                                                                       

 Epoch  5/20: Loss=1.557, Train Acc=46.1%, F1=0.307 | Val Acc=43.7%, F1=0.301 | Best F1=0.324


                                                                                       

 Epoch  6/20: Loss=1.553, Train Acc=46.3%, F1=0.310 | Val Acc=46.9%, F1=0.326 | Best F1=0.326


                                                                                       

 Epoch  7/20: Loss=1.550, Train Acc=46.3%, F1=0.309 | Val Acc=46.7%, F1=0.297 | Best F1=0.326


                                                                                       

 Epoch  8/20: Loss=1.548, Train Acc=46.3%, F1=0.305 | Val Acc=46.3%, F1=0.297 | Best F1=0.326


                                                                                       

 Epoch  9/20: Loss=1.546, Train Acc=46.4%, F1=0.313 | Val Acc=47.7%, F1=0.346 | Best F1=0.346


                                                                                       

 Epoch 10/20: Loss=1.537, Train Acc=46.6%, F1=0.308 | Val Acc=46.7%, F1=0.297 | Best F1=0.346


                                                                                       

 Epoch 11/20: Loss=1.537, Train Acc=46.5%, F1=0.304 | Val Acc=46.7%, F1=0.298 | Best F1=0.346


                                                                                       

 Epoch 12/20: Loss=1.533, Train Acc=46.6%, F1=0.312 | Val Acc=46.8%, F1=0.306 | Best F1=0.346
 Early stopping
 [Audio (CNN)] Best Val: Acc=47.69%, F1=0.3465

 [Video (ResNet34)] Training...


                                                                                       

 Epoch  1/20: Loss=1.926, Train Acc=25.0%, F1=0.259 | Val Acc=46.7%, F1=0.300 | Best F1=0.300


                                                                                       

 Epoch  2/20: Loss=1.610, Train Acc=44.7%, F1=0.308 | Val Acc=46.7%, F1=0.297 | Best F1=0.300


                                                                                       

 Epoch  3/20: Loss=1.589, Train Acc=45.9%, F1=0.301 | Val Acc=46.7%, F1=0.297 | Best F1=0.300


                                                                                       

 Epoch  4/20: Loss=1.571, Train Acc=46.4%, F1=0.302 | Val Acc=46.7%, F1=0.297 | Best F1=0.300
 Early stopping
 [Video (ResNet34)] Best Val: Acc=46.67%, F1=0.3004

------------------------------------------------------------
Score Function: hinge
------------------------------------------------------------
 Training fusion model...

 Results Summary (hinge):
 Individual Accuracies:
 Text:  62.68% | Audio: 47.28% | Video: 47.85%
 Fusion Accuracies: Avg Ensemble 61.49% | Learned Fusion 63.41%
 Conformal Metrics (target 90%):
 CF:     cov=91.46% | set= 4.66
 MinPV:  cov=90.80% | set= 5.71
 Fisher: cov=89.58% | set= 4.98
 AdjFis: cov=89.50% | set= 4.74
 WAvgL:  cov=98.89% | set= 6.64

------------------------------------------------------------
Score Function: margin
------------------------------------------------------------
 Training fusion model...

 Results Summary (margin):
 Individual Accuracies:
 Text:  62.68% | Audio: 47.28% | Video: 47.85%
 Fusion Accuracies: Avg Ensemble 61.49% |

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                                                       

 Epoch  1/10: Loss=1.408, Train Acc=52.7%, F1=0.474 | Val Acc=58.2%, F1=0.554 | Best F1=0.554


                                                                                       

 Epoch  3/10: Loss=1.029, Train Acc=67.1%, F1=0.646 | Val Acc=58.9%, F1=0.573 | Best F1=0.581


                                                                                        

 Epoch  4/10: Loss=0.891, Train Acc=71.5%, F1=0.698 | Val Acc=59.5%, F1=0.576 | Best F1=0.581


                                                                                       

 Epoch  5/10: Loss=0.723, Train Acc=77.5%, F1=0.765 | Val Acc=57.3%, F1=0.569 | Best F1=0.581
 Early stopping
 [Text (RoBERTa)] Best Val: Acc=60.54%, F1=0.5814

 [Audio (CNN)] Training...


                                                                                       

 Epoch  1/20: Loss=1.763, Train Acc=34.0%, F1=0.304 | Val Acc=46.7%, F1=0.297 | Best F1=0.297


                                                                                       

 Epoch  2/20: Loss=1.603, Train Acc=45.2%, F1=0.310 | Val Acc=46.7%, F1=0.297 | Best F1=0.297


                                                                                       

 Epoch  2/10: Loss=1.165, Train Acc=62.1%, F1=0.589 | Val Acc=61.9%, F1=0.587 | Best F1=0.587


Training:  81%|████████  | 84/104 [00:39<00:09,  2.18it/s, loss=0.898, grad_norm=7.70] 

 Epoch  3/10: Loss=1.037, Train Acc=66.4%, F1=0.640 | Val Acc=62.1%, F1=0.587 | Best F1=0.587


                                                                                        

 Epoch  4/10: Loss=0.880, Train Acc=72.4%, F1=0.704 | Val Acc=56.7%, F1=0.565 | Best F1=0.587


                                                                                        

 Epoch  5/10: Loss=0.721, Train Acc=77.2%, F1=0.762 | Val Acc=58.2%, F1=0.566 | Best F1=0.587
 Early stopping
 [Text (RoBERTa)] Best Val: Acc=61.89%, F1=0.5870

 [Audio (CNN)] Training...


                                                                                       

 Epoch  1/20: Loss=1.856, Train Acc=27.6%, F1=0.274 | Val Acc=40.1%, F1=0.324 | Best F1=0.324


                                                                                       

 Epoch  6/20: Loss=1.554, Train Acc=46.3%, F1=0.300 | Val Acc=46.7%, F1=0.297 | Best F1=0.304
 Early stopping
 [Video (ResNet34)] Best Val: Acc=46.45%, F1=0.3037

------------------------------------------------------------
Score Function: hinge
------------------------------------------------------------
 Training fusion model...

 Results Summary (hinge):
 Individual Accuracies:
 Text:  63.22% | Audio: 38.70% | Video: 47.97%
 Fusion Accuracies: Avg Ensemble 59.35% | Learned Fusion 63.83%
 Conformal Metrics (target 90%):
 CF:     cov=89.85% | set= 4.34
 MinPV:  cov=93.37% | set= 5.94
 Fisher: cov=91.53% | set= 5.49
 AdjFis: cov=90.46% | set= 5.30
 WAvgL:  cov=99.35% | set= 6.76

------------------------------------------------------------
Score Function: margin
------------------------------------------------------------
 Training fusion model...

 Results Summary (margin):
 Individual Accuracies:
 Text:  63.22% | Audio: 38.70% | Video: 47.97%
 Fusion Accuracies: Avg Ensemble 59.35% |

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                                                       

 Epoch  1/10: Loss=1.404, Train Acc=52.7%, F1=0.464 | Val Acc=62.8%, F1=0.606 | Best F1=0.606


                                                                                       

 Epoch  2/10: Loss=1.150, Train Acc=62.4%, F1=0.593 | Val Acc=60.3%, F1=0.553 | Best F1=0.606


                                                                                        

 Epoch  3/10: Loss=1.020, Train Acc=66.8%, F1=0.644 | Val Acc=62.5%, F1=0.598 | Best F1=0.606


                                                                                        

 Epoch  4/10: Loss=0.858, Train Acc=71.9%, F1=0.705 | Val Acc=59.2%, F1=0.578 | Best F1=0.606
 Early stopping
 [Text (RoBERTa)] Best Val: Acc=62.80%, F1=0.6057

 [Audio (CNN)] Training...


                                                                                       

 Epoch  1/20: Loss=1.754, Train Acc=34.0%, F1=0.315 | Val Acc=45.8%, F1=0.323 | Best F1=0.323


                                                                                       

 Epoch  2/20: Loss=1.606, Train Acc=44.9%, F1=0.317 | Val Acc=46.7%, F1=0.297 | Best F1=0.323


                                                                                       

 Epoch  3/20: Loss=1.583, Train Acc=45.4%, F1=0.308 | Val Acc=46.7%, F1=0.297 | Best F1=0.323


                                                                                       

 Epoch  4/20: Loss=1.567, Train Acc=46.1%, F1=0.306 | Val Acc=46.7%, F1=0.297 | Best F1=0.323
 Early stopping
 [Audio (CNN)] Best Val: Acc=45.77%, F1=0.3228

 [Video (ResNet34)] Training...


                                                                                       

 Epoch  1/20: Loss=1.848, Train Acc=28.9%, F1=0.281 | Val Acc=45.7%, F1=0.304 | Best F1=0.304


                                                                                       

 Epoch  2/20: Loss=1.594, Train Acc=45.6%, F1=0.307 | Val Acc=46.7%, F1=0.297 | Best F1=0.304


                                                                                       

 Epoch  3/20: Loss=1.585, Train Acc=46.1%, F1=0.302 | Val Acc=46.7%, F1=0.297 | Best F1=0.304


                                                                                       

 Epoch  4/20: Loss=1.573, Train Acc=46.3%, F1=0.301 | Val Acc=45.4%, F1=0.296 | Best F1=0.304
 Early stopping
 [Video (ResNet34)] Best Val: Acc=45.66%, F1=0.3043

------------------------------------------------------------
Score Function: hinge
------------------------------------------------------------
 Training fusion model...

 Results Summary (hinge):
 Individual Accuracies:
 Text:  63.30% | Audio: 47.16% | Video: 46.21%
 Fusion Accuracies: Avg Ensemble 58.58% | Learned Fusion 63.98%
 Conformal Metrics (target 90%):
 CF:     cov=89.23% | set= 4.74
 MinPV:  cov=91.61% | set= 5.78
 Fisher: cov=90.23% | set= 5.40
 AdjFis: cov=89.77% | set= 5.15
 WAvgL:  cov=99.23% | set= 6.68

------------------------------------------------------------
Score Function: margin
------------------------------------------------------------
 Training fusion model...

 Results Summary (margin):
 Individual Accuracies:
 Text:  63.30% | Audio: 47.16% | Video: 46.21%
 Fusion Accuracies: Avg Ensemble 58.58% |

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                                                        

 Epoch  1/10: Loss=1.415, Train Acc=51.1%, F1=0.449 | Val Acc=60.2%, F1=0.544 | Best F1=0.544


                                                                                       

 Epoch  2/10: Loss=1.175, Train Acc=61.7%, F1=0.583 | Val Acc=61.2%, F1=0.581 | Best F1=0.581


                                                                                        

 Epoch  3/10: Loss=1.042, Train Acc=66.6%, F1=0.640 | Val Acc=64.3%, F1=0.608 | Best F1=0.608


                                                                                        

 Epoch  4/10: Loss=0.876, Train Acc=71.7%, F1=0.701 | Val Acc=62.8%, F1=0.609 | Best F1=0.609


                                                                                        


 [Audio (CNN)] Training...


                                                                                       

 Epoch  1/20: Loss=1.814, Train Acc=30.9%, F1=0.303 | Val Acc=46.7%, F1=0.298 | Best F1=0.298


                                                                                       

 Epoch  2/20: Loss=1.599, Train Acc=45.5%, F1=0.313 | Val Acc=46.7%, F1=0.297 | Best F1=0.298


                                                                                       

 Epoch  3/20: Loss=1.572, Train Acc=46.1%, F1=0.308 | Val Acc=44.8%, F1=0.304 | Best F1=0.304


                                                                                       

 Epoch  4/20: Loss=1.560, Train Acc=46.2%, F1=0.317 | Val Acc=46.7%, F1=0.297 | Best F1=0.304


                                                                                       

 Epoch  5/20: Loss=1.550, Train Acc=46.2%, F1=0.310 | Val Acc=46.4%, F1=0.302 | Best F1=0.304


                                                                                       

 Epoch  6/20: Loss=1.544, Train Acc=46.3%, F1=0.314 | Val Acc=46.4%, F1=0.303 | Best F1=0.304
 Early stopping
 [Audio (CNN)] Best Val: Acc=44.76%, F1=0.3040

 [Video (ResNet34)] Training...


                                                                                       

 Epoch  1/20: Loss=1.878, Train Acc=25.5%, F1=0.267 | Val Acc=45.3%, F1=0.298 | Best F1=0.298


                                                                                       

 Epoch  2/20: Loss=1.600, Train Acc=45.6%, F1=0.307 | Val Acc=45.9%, F1=0.295 | Best F1=0.298


                                                                                       

 Epoch  3/20: Loss=1.582, Train Acc=46.0%, F1=0.299 | Val Acc=46.7%, F1=0.297 | Best F1=0.298


                                                                                       

 Epoch  4/20: Loss=1.576, Train Acc=46.2%, F1=0.298 | Val Acc=46.7%, F1=0.297 | Best F1=0.298
 Early stopping
 [Video (ResNet34)] Best Val: Acc=45.32%, F1=0.2984

------------------------------------------------------------
Score Function: hinge
------------------------------------------------------------
 Training fusion model...

 Results Summary (hinge):
 Individual Accuracies:
 Text:  63.37% | Audio: 47.97% | Video: 46.44%
 Fusion Accuracies: Avg Ensemble 60.46% | Learned Fusion 63.60%
 Conformal Metrics (target 90%):
 CF:     cov=90.31% | set= 4.50
 MinPV:  cov=92.30% | set= 5.72
 Fisher: cov=91.72% | set= 5.43
 AdjFis: cov=91.07% | set= 5.20
 WAvgL:  cov=98.97% | set= 6.76

------------------------------------------------------------
Score Function: margin
------------------------------------------------------------
 Training fusion model...

 Results Summary (margin):
 Individual Accuracies:
 Text:  63.37% | Audio: 47.97% | Video: 46.44%
 Fusion Accuracies: Avg Ensemble 60.46% |

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                                                        

 Epoch  1/10: Loss=1.390, Train Acc=52.2%, F1=0.459 | Val Acc=59.6%, F1=0.586 | Best F1=0.586


                                                                                       

 Epoch  2/10: Loss=1.180, Train Acc=61.5%, F1=0.583 | Val Acc=63.4%, F1=0.593 | Best F1=0.593


                                                                                       

 Epoch  3/10: Loss=1.038, Train Acc=66.7%, F1=0.639 | Val Acc=64.6%, F1=0.621 | Best F1=0.621


                                                                                        

 Epoch  4/10: Loss=0.856, Train Acc=72.8%, F1=0.710 | Val Acc=60.1%, F1=0.589 | Best F1=0.621


                                                                                        

 Epoch  5/10: Loss=0.708, Train Acc=77.4%, F1=0.762 | Val Acc=55.8%, F1=0.567 | Best F1=0.621


                                                                                        

 Epoch  6/10: Loss=0.556, Train Acc=82.2%, F1=0.819 | Val Acc=60.1%, F1=0.593 | Best F1=0.621
 Early stopping
 [Text (RoBERTa)] Best Val: Acc=64.60%, F1=0.6212

 [Audio (CNN)] Training...


                                                                                       

 Epoch  1/20: Loss=1.785, Train Acc=32.7%, F1=0.303 | Val Acc=45.7%, F1=0.322 | Best F1=0.322


                                                                                       

 Epoch  2/20: Loss=1.597, Train Acc=45.1%, F1=0.316 | Val Acc=41.5%, F1=0.324 | Best F1=0.324


                                                                                       

 Epoch  3/20: Loss=1.584, Train Acc=45.8%, F1=0.309 | Val Acc=46.7%, F1=0.297 | Best F1=0.324


                                                                                       

 Epoch  4/20: Loss=1.574, Train Acc=46.2%, F1=0.300 | Val Acc=46.7%, F1=0.297 | Best F1=0.324


                                                                                       

 Epoch  5/20: Loss=1.560, Train Acc=46.5%, F1=0.298 | Val Acc=46.7%, F1=0.297 | Best F1=0.324
 Early stopping
 [Audio (CNN)] Best Val: Acc=41.49%, F1=0.3237

 [Video (ResNet34)] Training...


                                                                                       

 Epoch  1/20: Loss=1.818, Train Acc=30.1%, F1=0.287 | Val Acc=44.5%, F1=0.316 | Best F1=0.316


                                                                                       

 Epoch  2/20: Loss=1.613, Train Acc=45.2%, F1=0.311 | Val Acc=39.8%, F1=0.306 | Best F1=0.316


                                                                                       

 Epoch  3/20: Loss=1.590, Train Acc=45.8%, F1=0.302 | Val Acc=46.7%, F1=0.297 | Best F1=0.316


                                                                                       

 Epoch  4/20: Loss=1.579, Train Acc=46.3%, F1=0.302 | Val Acc=46.7%, F1=0.297 | Best F1=0.316
 Early stopping
 [Video (ResNet34)] Best Val: Acc=44.53%, F1=0.3163

------------------------------------------------------------
Score Function: hinge
------------------------------------------------------------
 Training fusion model...

 Results Summary (hinge):
 Individual Accuracies:
 Text:  64.21% | Audio: 40.54% | Video: 44.75%
 Fusion Accuracies: Avg Ensemble 60.31% | Learned Fusion 63.45%
 Conformal Metrics (target 90%):
 CF:     cov=90.00% | set= 4.15
 MinPV:  cov=91.80% | set= 5.83
 Fisher: cov=90.15% | set= 5.56
 AdjFis: cov=88.89% | set= 5.20
 WAvgL:  cov=99.35% | set= 6.79

------------------------------------------------------------
Score Function: margin
------------------------------------------------------------
 Training fusion model...

 Results Summary (margin):
 Individual Accuracies:
 Text:  64.21% | Audio: 40.54% | Video: 44.75%
 Fusion Accuracies: Avg Ensemble 60.31% |

                                                                                        

 Epoch  2/10: Loss=1.146, Train Acc=63.0%, F1=0.601 | Val Acc=60.2%, F1=0.580 | Best F1=0.580


                                                                                       

 Epoch  3/10: Loss=1.014, Train Acc=67.5%, F1=0.650 | Val Acc=61.4%, F1=0.597 | Best F1=0.597


                                                                                        

 Epoch  4/10: Loss=0.854, Train Acc=72.7%, F1=0.709 | Val Acc=59.5%, F1=0.579 | Best F1=0.597


                                                                                        

 Epoch  5/10: Loss=0.696, Train Acc=77.6%, F1=0.765 | Val Acc=61.4%, F1=0.603 | Best F1=0.603


                                                                                        

 Epoch  6/10: Loss=0.549, Train Acc=82.6%, F1=0.822 | Val Acc=59.9%, F1=0.580 | Best F1=0.603


                                                                                        

 Epoch  7/10: Loss=0.430, Train Acc=86.7%, F1=0.865 | Val Acc=58.9%, F1=0.586 | Best F1=0.603


                                                                                        

 Epoch  8/10: Loss=0.353, Train Acc=89.4%, F1=0.893 | Val Acc=58.6%, F1=0.585 | Best F1=0.603
 Early stopping
 [Text (RoBERTa)] Best Val: Acc=61.44%, F1=0.6030

 [Audio (CNN)] Training...


                                                                                       

 Epoch  1/20: Loss=1.822, Train Acc=29.7%, F1=0.289 | Val Acc=46.7%, F1=0.297 | Best F1=0.297


                                                                                       

 Epoch  2/20: Loss=1.599, Train Acc=44.8%, F1=0.306 | Val Acc=46.7%, F1=0.297 | Best F1=0.297


Training:  96%|█████████▌| 100/104 [00:14<00:00,  7.06it/s, loss=1.525, grad_norm=2.13]

 Training fusion model...

 Results Summary (raps):
 Individual Accuracies:
 Text:  63.72% | Audio: 41.61% | Video: 43.68%
 Fusion Accuracies: Avg Ensemble 60.61% | Learned Fusion 64.25%
 Conformal Metrics (target 90%):
 CF:     cov=90.38% | set= 4.36
 MinPV:  cov=91.38% | set= 5.93
 Fisher: cov=91.42% | set= 5.43
 AdjFis: cov=91.38% | set= 5.35
 WAvgL:  cov=99.46% | set= 6.77

=== Coverage (raw rows) ===
           Score  Sim  K  Conformal Fusion  Min p-Value     Fisher  \
0          hinge    0  3         90.229885    91.264368  90.842912   
1         margin    0  3         89.770115    90.000000  90.919540   
2  cross_entropy    0  3         90.229885    91.264368  90.842912   
3           raps    0  3         89.425287    92.452107  92.030651   
4          hinge    1  3         90.498084    91.724138  90.383142   

   Adjusted Fisher  Weighted Avg (learned)  
0        90.153257               99.310345  
1        90.076628               99.501916  
2        90.153257               99