In [None]:
# =============================================================================
# This code was prepared by Aaliyah Chang (20231595) for the ELEC 872 Final Project,
# with the support of Claude Sonnet 4.5 (https://claude.ai/) for troubleshooting.
# =========================
# The SOFTMENT architecture used was developed based on the following paper:

# [1]Amrita Singh, Preethu Rose Anish, and Smita Ghaisas. 2024. SOFTMENT: Detecting
# Mental Health and Wellbeing of Women in the Software Sector. In Companion of the
# 2024 on ACM International Joint Conference on Pervasive and Ubiquitous Computing (UbiComp '24).
# Association for Computing Machinery, New York, NY, USA, 405–411. https://doi.org/10.1145/3675094.3678493
# =========================

# The DAIC-WOZ dataset was used for this project and can be accessed/learned about here:

# [2]David DeVault, Ron Artstein, Grace Benn, Teresa Dey, Ed Fast, Alesia Gainer,
# Kallirroi Georgila, Jon Gratch, Arno Hartholt, Margaux Lhommet, Gale Lucas, Stacy Marsella,
# Fabrizio Morbini, Angela Nazarian, Stefan Scherer, Giota Stratou, Apar Suri, David Traum,
# Rachel Wood, Yuyu Xu, Albert Rizzo, and Louis-Philippe Morency. 2014.
# SimSensei kiosk: a virtual human interviewer for healthcare decision support.
# In Proceedings of the 2014 International Conference on Autonomous Agents and Multi-Agent Systems
# (AAMAS '14), International Foundation for Autonomous Agents, Paris, France, 1061–1068.

# [3]Jonathan Gratch, Ron Artstein, Gale M Lucas, Giota Stratou, Stefan Scherer,
# Angela Nazarian, Rachel Wood, Jill Boberg, David DeVault, Stacy Marsella, and others.
# 2014. The distress analysis interview corpus of human and computer interviews.
# In Lrec, Reykjavik, 3123–3128.
# =============================================================================

# =============================================================================
# Setup
# Installs dependencies, imports libraries, and defines the paths,
# hyperparameters and random seeds used by the cells below.
# =============================================================================
print("="*80)
print("ELEC 872 Project Code")
print("="*80)

# NOTE(review): versions are not pinned, so a fresh environment may resolve to
# different library releases than the ones the results were produced with.
!pip install transformers torch pandas numpy matplotlib seaborn tqdm scipy scikit-learn

#from google.colab import drive
import os, re, torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer, PegasusForConditionalGeneration, PegasusTokenizer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from scipy.stats import pearsonr, spearmanr
from tqdm import tqdm
import warnings
# Silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Google Colab variant of the path configuration (kept for reference):
# # Mount drive
# drive.mount('/content/drive')
# os.chdir('/content/drive/MyDrive/GradSchool/ELEC872')

# # Configuration
# DATA_ROOT = "/content/drive/MyDrive/GradSchool/ELEC872/daicWoz"
# LABELS_DIR = "/content/drive/MyDrive/GradSchool/ELEC872/labels"

# Local paths, relative to the notebook's working directory.
# DATA_ROOT is expected to contain one "<pid>_P" folder per participant.
DATA_ROOT = "data"
LABELS_DIR = "labels"

# Label CSVs for each split; the cells below read the
# "Participant_ID" and "PHQ8_Score" columns from them.
TRAIN_LABEL_FILE = os.path.join(LABELS_DIR, "train_split_Depression_AVEC2017.csv")
DEV_LABEL_FILE = os.path.join(LABELS_DIR, "dev_split_Depression_AVEC2017.csv")
TEST_LABEL_FILE = os.path.join(LABELS_DIR, "full_test_split.csv")

# Output locations (created if missing); model checkpoints go to OUTPUT_DIR.
OUTPUT_DIR = "outputs"
FIGURES_DIR = os.path.join(OUTPUT_DIR, "figures")
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(FIGURES_DIR, exist_ok=True)

# Hyperparameters
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 16
LEARNING_RATE = 2e-5        # AdamW learning rate used by train_model
NUM_EPOCHS = 10             # number of training epochs in train_model
MAX_LENGTH = 128            # token length utterances are padded/truncated to
CORR_THRESHOLD = 0.85       # |correlation| above which one AU of a pair is dropped
MIN_UTTERANCE_WORDS = 5     # utterances with fewer cleaned words are discarded

# Set seeds (the value 42 is arbitrary; it is fixed so runs are repeatable)
torch.manual_seed(42)
np.random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

print(f"âœ“ Configuration loaded")
print(f"  Device: {DEVICE}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Max length: {MAX_LENGTH} tokens (utterance-level)")
print(f"  Min utterance: {MIN_UTTERANCE_WORDS} words")

In [None]:
# =============================================================================
# Function Definitions
# =============================================================================
print("\n" + "="*80)
print("Function Definitions Loaded")
print("="*80)

def clean_text(text):
    """Normalize one transcript string.

    The text is lower-cased, bracketed annotations (``[...]``) are removed,
    every character other than ``a``-``z`` and whitespace is dropped, and
    surrounding whitespace is trimmed. Interior whitespace is left as-is.
    """
    lowered = text.lower()
    # Strip bracketed annotations first so their contents never reach the output.
    without_annotations = re.sub(r"\[.*?\]", "", lowered)
    # Digits and punctuation are removed outright (not replaced by a space).
    letters_only = re.sub(r"[^a-z\s]", "", without_annotations)
    return letters_only.strip()

def load_transcript_utterances(pid, data_root, min_words=5):
    """
    Load a participant's transcript and return their cleaned utterances.

    Args:
        pid: Participant identifier used to build the path
            ``<data_root>/<pid>_P/<pid>_TRANSCRIPT.csv``.
        data_root: Directory containing the per-participant folders.
        min_words: Minimum number of words (after cleaning) an utterance
            must have to be kept.

    Returns:
        List of cleaned utterance strings in transcript order. An empty list
        is returned when the file is missing or cannot be parsed (best-effort:
        a warning is printed instead of raising).
    """
    path = os.path.join(data_root, f"{pid}_P", f"{pid}_TRANSCRIPT.csv")
    if not os.path.exists(path):
        return []

    try:
        # sep=None lets pandas sniff the delimiter of the transcript file.
        df = pd.read_csv(path, sep=None, engine='python')

        # Keep participant speech only.
        is_participant = df["speaker"].str.contains("Participant", case=False, na=False)

        # Drop missing values BEFORE casting to str, otherwise NaN cells turn
        # into the literal word "nan" and survive when min_words <= 1.
        raw_values = df.loc[is_participant, "value"].dropna().astype(str)

        cleaned_values = (clean_text(value) for value in raw_values)
        return [utt for utt in cleaned_values if len(utt.split()) >= min_words]
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts a run,
        # and report why the participant yields no utterances.
        print(f"  Warning: could not load transcript for {pid}: {exc}")
        return []

def select_aus_globally(pids, data_root, corr_threshold=0.85):
    """Select AU intensity columns using variance and correlation pruning.

    AU intensity columns (names starting with ``AU`` and ending in ``_r``)
    are pooled over all participants' ``<pid>_CLNF_AUs.txt`` files, keeping
    only frames where ``success == 1`` when that column exists. Columns with
    variance <= 1e-6 are removed; then, for every pair whose absolute
    correlation exceeds ``corr_threshold``, the column with the smaller
    variance is dropped (ties drop the later column).

    Args:
        pids: Iterable of participant identifiers.
        data_root: Directory containing the per-participant folders.
        corr_threshold: Absolute correlation above which a pair is pruned.

    Returns:
        Sorted list of the retained AU column names.

    Raises:
        ValueError: If no AU data could be loaded for any participant.
    """
    print("\nðŸ”¬ Global AU Selection")
    all_au_data = []

    for pid in tqdm(pids, desc="  Loading AU files"):
        path = os.path.join(data_root, f"{pid}_P", f"{pid}_CLNF_AUs.txt")
        if not os.path.exists(path):
            continue

        try:
            df = pd.read_csv(path, sep=None, engine='python')
            # Header names may carry padding spaces; normalise them.
            df.columns = df.columns.str.strip().str.replace(' ', '')

            # Get AU intensity columns
            au_cols = [c for c in df.columns if c.startswith("AU") and c.endswith("_r")]
            if not au_cols:
                continue

            # Filter successful frames
            if 'success' in df.columns:
                df = df[df['success'] == 1]

            if not df.empty:
                all_au_data.append(df[au_cols])
        except Exception as exc:
            # Best-effort: skip unreadable files, but say so and let Ctrl-C through.
            print(f"  Warning: skipping AU file for {pid}: {exc}")
            continue

    if not all_au_data:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            f"No usable AU files found under '{data_root}'; cannot select AUs"
        )

    # Combine and prune
    combined_df = pd.concat(all_au_data, ignore_index=True)
    df_au = combined_df.loc[:, combined_df.var() > 1e-6]

    print(f"  Initial AUs: {len(combined_df.columns)} â†’ After variance: {len(df_au.columns)}")

    # Correlation pruning over each unordered pair (upper triangle only).
    corr_matrix = df_au.corr().abs()
    variances = df_au.var()  # hoisted: computed once instead of per pair
    columns = list(df_au.columns)

    to_drop = set()
    for i, earlier in enumerate(columns):
        for later in columns[i + 1:]:
            if corr_matrix.loc[earlier, later] > corr_threshold:
                if variances[earlier] < variances[later]:
                    to_drop.add(earlier)
                else:
                    to_drop.add(later)

    selected_aus = sorted([au for au in columns if au not in to_drop])
    print(f"  After pruning (threshold={corr_threshold}): {len(selected_aus)} AUs")

    return selected_aus

def load_pooled_au_features(pid, data_root, selected_aus):
    """Load a participant's AU file and mean-pool the selected columns.

    Args:
        pid: Participant identifier used to build the path
            ``<data_root>/<pid>_P/<pid>_CLNF_AUs.txt``.
        data_root: Directory containing the per-participant folders.
        selected_aus: AU column names to pool, in output order.

    Returns:
        1-D array with the per-column mean over frames (only frames with
        ``success == 1`` when that column exists), or ``None`` if the file is
        missing, has no usable frames, or cannot be read (a warning is
        printed in the last case).
    """
    path = os.path.join(data_root, f"{pid}_P", f"{pid}_CLNF_AUs.txt")
    if not os.path.exists(path):
        return None

    try:
        df = pd.read_csv(path, sep=None, engine='python')
        # Header names may carry padding spaces; normalise them.
        df.columns = df.columns.str.strip().str.replace(' ', '')

        # Filter successful frames
        if 'success' in df.columns:
            df = df[df['success'] == 1]

        if df.empty:
            return None

        # Mean pool across frames
        return df[selected_aus].mean().values
    except Exception as exc:
        # Best-effort: the participant is skipped, but report the reason
        # (e.g. a selected AU column missing from this file).
        print(f"  Warning: could not load AU features for {pid}: {exc}")
        return None

def build_utterance_dataset(labels_df, data_root, selected_aus, min_words=5):
    """
    Assemble the utterance-level table for one split.

    For every row of ``labels_df`` the participant's utterances and pooled AU
    features are loaded; participants missing either are skipped. Each
    remaining utterance becomes one output row carrying the participant id
    (as a string), the utterance text, the participant's ``PHQ8_Score`` and
    one column per entry of ``selected_aus``.

    Returns:
        ``pandas.DataFrame`` with one row per utterance.
    """
    rows = []
    progress = tqdm(labels_df.iterrows(), total=len(labels_df), desc="  Building dataset")

    for _, label_row in progress:
        pid = str(int(label_row["Participant_ID"]))
        phq8_score = label_row["PHQ8_Score"]

        # Text modality: skip participants without usable utterances.
        utterances = load_transcript_utterances(pid, data_root, min_words)
        if not utterances:
            continue

        # Video modality: one pooled AU vector shared by all of the
        # participant's utterances.
        au_features = load_pooled_au_features(pid, data_root, selected_aus)
        if au_features is None:
            continue

        au_by_name = {au_name: au_features[i] for i, au_name in enumerate(selected_aus)}

        rows.extend(
            {
                "Participant_ID": pid,
                "utterance": utterance,
                "PHQ8_Score": phq8_score,
                **au_by_name,
            }
            for utterance in utterances
        )

    return pd.DataFrame(rows)

def create_emotion_labels(df):
    """Derive 3-class labels from ``df['PHQ8_Score']`` with a tertile split.

    The 33rd and 67th percentiles of the scores in ``df`` are used as cut
    points: scores up to the lower cut map to 0 (Positive / low depression),
    scores up to the upper cut map to 1 (Neutral / medium), everything else
    maps to 2 (Negative / high depression).

    Returns:
        Tuple ``(labels, low_thresh, high_thresh)`` where ``labels`` is a
        NumPy array aligned with the rows of ``df``.
    """
    scores = df['PHQ8_Score'].values
    low_thresh, high_thresh = (np.percentile(scores, q) for q in (33, 67))

    def _to_label(score):
        if score <= low_thresh:
            return 0  # Positive (low depression)
        if score <= high_thresh:
            return 1  # Neutral (medium)
        return 2      # Negative (high depression)

    labels = np.array([_to_label(score) for score in scores])
    return labels, low_thresh, high_thresh



In [None]:

# =============================================================================
# PHASE 3: Pre-Processing
# Loads the split labels, selects AU features, builds the utterance-level
# tables and assigns the 3-class emotion labels.
# =============================================================================
print("\n" + "="*80)
print("Starting Preprocessing")
print("="*80)

# Load the participant-level PHQ-8 labels of each split
train_labels = pd.read_csv(TRAIN_LABEL_FILE)[["Participant_ID", "PHQ8_Score"]]
dev_labels = pd.read_csv(DEV_LABEL_FILE)[["Participant_ID", "PHQ8_Score"]]
test_labels = pd.read_csv(TEST_LABEL_FILE)[["Participant_ID", "PHQ8_Score"]]

print(f"  Train: {len(train_labels)} participants")
print(f"  Dev: {len(dev_labels)} participants")
print(f"  Test: {len(test_labels)} participants")

# Global AU selection
# NOTE(review): the selection statistics are computed over train + dev + test
# participants; restrict to train if dev/test must stay fully unseen.
all_pids = (train_labels["Participant_ID"].tolist() +
            dev_labels["Participant_ID"].tolist() +
            test_labels["Participant_ID"].tolist())

au_columns = select_aus_globally(all_pids, DATA_ROOT, CORR_THRESHOLD)

# Build utterance-level datasets
print("\n Building test, train and dev datasets")
train_df_utterances = build_utterance_dataset(train_labels, DATA_ROOT, au_columns, MIN_UTTERANCE_WORDS)
dev_df_utterances = build_utterance_dataset(dev_labels, DATA_ROOT, au_columns, MIN_UTTERANCE_WORDS)
test_df_utterances = build_utterance_dataset(test_labels, DATA_ROOT, au_columns, MIN_UTTERANCE_WORDS)

print(f"\n  Train: {len(train_df_utterances)} utterances from {train_df_utterances['Participant_ID'].nunique()} participants")
print(f"  Dev: {len(dev_df_utterances)} utterances from {dev_df_utterances['Participant_ID'].nunique()} participants")
print(f"  Test: {len(test_df_utterances)} utterances from {test_df_utterances['Participant_ID'].nunique()} participants")

# Create emotion labels
print("\nCreating emotion labels...")
# The tertile cut points are estimated on the training split only.
train_emotion_labels, thresh_low, thresh_high = create_emotion_labels(train_df_utterances)

# Dev/test reuse the TRAIN cut points. Re-estimating tertiles per split would
# map the same PHQ-8 score to different classes in different splits and would
# disagree with DAICBaselineDataset, which labels with thresh_low/thresh_high.
# np.digitize(..., right=True) gives: x <= low -> 0, low < x <= high -> 1, else 2,
# i.e. the same rule create_emotion_labels applies.
label_bins = [thresh_low, thresh_high]
dev_emotion_labels = np.digitize(dev_df_utterances['PHQ8_Score'].values, label_bins, right=True)
test_emotion_labels = np.digitize(test_df_utterances['PHQ8_Score'].values, label_bins, right=True)

train_df_utterances['emotion_label'] = train_emotion_labels
dev_df_utterances['emotion_label'] = dev_emotion_labels
test_df_utterances['emotion_label'] = test_emotion_labels

print(f"  Thresholds: Pos â‰¤ {thresh_low:.1f} | {thresh_low:.1f} < Neu â‰¤ {thresh_high:.1f} | Neg > {thresh_high:.1f}")

# Class balance per split (minlength=3 keeps empty classes visible as 0)
for split_name, labels in [('Train', train_emotion_labels),
                           ('Dev', dev_emotion_labels),
                           ('Test', test_emotion_labels)]:
    counts = np.bincount(labels, minlength=3)
    print(f"  {split_name}: Pos={counts[0]}, Neu={counts[1]}, Neg={counts[2]}")

# Data Augmentation
# Every training utterance gets one Pegasus paraphrase; the paraphrased rows
# keep the original label/AU columns and get "_aug" appended to the
# participant id.
print("\nData Augmentation Pegasus Utterance:")
pegasus_model = PegasusForConditionalGeneration.from_pretrained('tuner007/pegasus_paraphrase').to(DEVICE)
pegasus_tokenizer = PegasusTokenizer.from_pretrained('tuner007/pegasus_paraphrase')


def _paraphrase_once(text):
    """Return a single beam-search paraphrase of ``text`` from the Pegasus model."""
    encoded = pegasus_tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=60,
        return_tensors="pt"
    ).to(DEVICE)
    generated = pegasus_model.generate(
        **encoded,
        num_beams=5,
        num_return_sequences=1,
        max_length=60
    )
    return pegasus_tokenizer.decode(generated[0], skip_special_tokens=True)


augmented_records = []
print(f"  Paraphrasing {len(train_df_utterances)} training utterances...")

augment_progress = tqdm(train_df_utterances.iterrows(), total=len(train_df_utterances), desc="  Augmenting")
for _, source_row in augment_progress:
    # Copy the source row so labels and AU features carry over unchanged.
    paraphrased_row = source_row.copy()
    paraphrased_row['utterance'] = _paraphrase_once(source_row['utterance'])
    paraphrased_row['Participant_ID'] = f"{source_row['Participant_ID']}_aug"
    augmented_records.append(paraphrased_row)

# Combine: originals first, paraphrases appended after them
train_df_augmented = pd.concat(
    [train_df_utterances, pd.DataFrame(augmented_records)],
    ignore_index=True
)
print(f"  Training utterances: {len(train_df_utterances)} â†’ {len(train_df_augmented)}")

# Clean up GPU: drop the paraphrase model before the classifiers are trained
del pegasus_model, pegasus_tokenizer
torch.cuda.empty_cache()

print("\nâœ“ Preprocessing complete")
print(f"  Final train utterances: {len(train_df_augmented)}")
print(f"  AU features: {len(au_columns)}")


In [None]:
# =============================================================================
# PHASE 4: Dataset classes
# =============================================================================
print("\n" + "="*80)
print("Defining dataset classes:")
print("="*80)


class DAICUtteranceDataset(Dataset):
    """Utterance-level dataset pairing tokenized text with AU features.

    Rows lacking an utterance or an emotion label are dropped. When both
    ``au_mean`` and ``au_std`` are supplied, the AU columns are standardized
    with them (so statistics computed on one split can be applied to another);
    otherwise the AU columns are used as given.
    """

    def __init__(self, df, tokenizer, au_cols, au_mean=None, au_std=None):
        usable_rows = df.dropna(subset=['utterance', 'emotion_label'])
        self.df = usable_rows.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.au_cols = au_cols

        stats_provided = au_mean is not None and au_std is not None
        if stats_provided:
            # The epsilon guards against division by a zero standard deviation.
            self.df[au_cols] = (self.df[au_cols] - au_mean) / (au_std + 1e-8)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Tokenize utterance (fixed-length: padded/truncated to MAX_LENGTH)
        tokens = self.tokenizer(
            str(row["utterance"]),
            truncation=True,
            padding="max_length",
            max_length=MAX_LENGTH,
            return_tensors="pt"
        )

        au_values = row[self.au_cols].values.astype(np.float32)
        # Augmented rows carry an "_aug" suffix; keep only the part before "_".
        base_pid = str(row["Participant_ID"]).split('_')[0]

        item = {
            "input_ids": tokens["input_ids"].squeeze(0),
            "attention_mask": tokens["attention_mask"].squeeze(0),
            "au_features": torch.tensor(au_values, dtype=torch.float),
            "labels": torch.tensor(int(row["emotion_label"]), dtype=torch.long),
            "participant_id": base_pid,
            "phq8_score": torch.tensor(float(row["PHQ8_Score"]), dtype=torch.float)
        }
        return item

class DAICBaselineDataset(Dataset):
    """Dataset for baseline: pooled transcript per participant.

    Each participant's utterances are joined into one text and labelled
    0/1/2 by comparing the PHQ-8 score against a low and a high threshold.

    Args:
        labels_df: Frame with ``Participant_ID`` and ``PHQ8_Score`` columns.
        data_root: Directory containing the per-participant folders.
        tokenizer: Tokenizer applied in ``__getitem__``.
        au_cols: Stored for interface parity; not used by this dataset.
        low_thresh: Scores <= this value get label 0. Defaults to the
            notebook-level ``thresh_low`` when omitted.
        high_thresh: Scores <= this value (and above ``low_thresh``) get
            label 1, higher scores label 2. Defaults to the notebook-level
            ``thresh_high`` when omitted.
    """

    def __init__(self, labels_df, data_root, tokenizer, au_cols,
                 low_thresh=None, high_thresh=None):
        # Backward compatible fallback: earlier callers relied on the
        # thresholds being notebook globals set during preprocessing.
        if low_thresh is None:
            low_thresh = thresh_low
        if high_thresh is None:
            high_thresh = thresh_high

        self.low_thresh = low_thresh
        self.high_thresh = high_thresh
        self.records = []
        self.tokenizer = tokenizer
        self.au_cols = au_cols

        for _, row in labels_df.iterrows():
            pid = str(int(row["Participant_ID"]))

            # Load and pool all utterances; min_words=1 keeps every
            # non-empty cleaned utterance.
            utterances = load_transcript_utterances(pid, data_root, min_words=1)
            if not utterances:
                continue

            pooled_text = " ".join(utterances)
            # Very short pooled transcripts are discarded.
            if len(pooled_text) <= 20:
                continue

            phq8 = row["PHQ8_Score"]
            self.records.append({
                'text': pooled_text,
                'label': self._label_for(phq8, low_thresh, high_thresh),
                'phq8': phq8,
                'pid': pid
            })

    @staticmethod
    def _label_for(phq8, low_thresh, high_thresh):
        """Map a PHQ-8 score to 0 (low), 1 (medium) or 2 (high)."""
        if phq8 <= low_thresh:
            return 0
        if phq8 <= high_thresh:
            return 1
        return 2

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        record = self.records[idx]

        encoding = self.tokenizer(
            record['text'],
            truncation=True,
            padding="max_length",
            max_length=512,  # pooled transcripts longer than 512 tokens are truncated
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(record['label'], dtype=torch.long),
            "participant_id": record['pid'],
            # Same key DAICUtteranceDataset provides, so participant-level
            # scoring code can read the PHQ-8 value from baseline batches too.
            "phq8_score": torch.tensor(float(record['phq8']), dtype=torch.float)
        }

print("Dataset classes defined")

In [None]:
# =============================================================================
# PHASE 5: MODEL ARCHITECTURES
# =============================================================================
print("\n" + "="*80)
print("Model Architectures Defined")
print("="*80)

class EarlyStopping:
    """Signal when a monitored score has stopped improving.

    Call the instance once per epoch with the latest score. It returns
    ``True`` once ``patience`` consecutive calls brought no improvement of
    more than ``min_delta`` (lower is better for ``mode='min'``, higher
    otherwise). The flag is sticky: after it is set, later calls keep
    returning ``True``.
    """

    def __init__(self, patience=7, min_delta=0.0, mode='min'):
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.counter = 0          # consecutive non-improving calls
        self.best_score = None    # best score seen so far
        self.early_stop = False

    def _beats_best(self, score):
        """True if ``score`` improves on ``best_score`` by more than ``min_delta``."""
        if self.mode == 'min':
            return score < (self.best_score - self.min_delta)
        return score > (self.best_score + self.min_delta)

    def __call__(self, score):
        # The first observation only initialises the reference score.
        if self.best_score is None:
            self.best_score = score
            return False

        if self._beats_best(score):
            self.best_score = score
            self.counter = 0
            return self.early_stop

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True
        return self.early_stop

class ModelCheckpoint:
    """Save model checkpoints, tracking the best validation score.

    Args:
        filepath: Destination passed to ``torch.save``.
        mode: ``'min'`` if lower scores are better, anything else means
            higher is better.
        save_best_only: When ``True`` a checkpoint is written only if the
            score improves; when ``False`` one is written on every call.

    Calling the instance returns ``True`` if a checkpoint was written.
    """

    def __init__(self, filepath, mode='min', save_best_only=True):
        self.filepath = filepath
        self.mode = mode
        self.save_best_only = save_best_only
        self.best_score = None

    def _is_improvement(self, score):
        """True for the first score or one strictly better than the best so far."""
        if self.best_score is None:
            return True
        if self.mode == 'min':
            return score < self.best_score
        return score > self.best_score

    def __call__(self, model, score, epoch):
        improved = self._is_improvement(score)
        if improved:
            # Track the best score in BOTH modes; previously it stayed None
            # with save_best_only=False, so every checkpoint stored
            # best_score=None.
            self.best_score = score

        if improved or not self.save_best_only:
            self._save(model, epoch)
            return True

        return False

    def _save(self, model, epoch):
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'best_score': self.best_score
        }, self.filepath)
        print(f"  â†’ Model saved to {self.filepath}")

def get_scheduler(optimizer, scheduler_type='cosine', num_epochs=10, num_steps_per_epoch=100):
    """Create a per-epoch learning-rate scheduler.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        scheduler_type: One of ``'cosine'``, ``'step'``,
            ``'reduce_on_plateau'`` or ``'warmup_cosine'``.
        num_epochs: Length of the schedule in epochs (used by ``'cosine'``
            and ``'warmup_cosine'``).
        num_steps_per_epoch: Accepted for interface compatibility; unused.

    Returns:
        The scheduler, or ``None`` for an unrecognised ``scheduler_type``.
    """
    if scheduler_type == 'cosine':
        return torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
    elif scheduler_type == 'step':
        return torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)
    elif scheduler_type == 'reduce_on_plateau':
        return torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)
    elif scheduler_type == 'warmup_cosine':
        warmup_epochs = 2
        # At least one decay epoch, so num_epochs <= warmup_epochs cannot
        # trigger a division by zero.
        decay_epochs = max(1, num_epochs - warmup_epochs)

        def lr_lambda(epoch):
            # Linear warm-up: 1/warmup_epochs, 2/warmup_epochs, ..., 1.
            if epoch < warmup_epochs:
                return (epoch + 1) / warmup_epochs
            # Cosine decay from 1 to 0; progress is clamped so the factor
            # stays at 0 (instead of rising again) past num_epochs.
            progress = min(1.0, (epoch - warmup_epochs) / decay_epochs)
            return 0.5 * (1 + np.cos(np.pi * progress))

        return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    else:
        return None

print("âœ“ Early stopping and checkpoint utilities defined")
def mean_pooling(token_embeddings, attention_mask):
    """Average token embeddings over the positions selected by the mask.

    ``attention_mask`` is broadcast across the embedding dimension, masked
    positions contribute zero to the sum, and the divisor is clamped to
    1e-9 so a fully masked sequence yields zeros rather than NaN.
    """
    weights = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_sum = (token_embeddings * weights).sum(dim=1)
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts

# Baseline: Direct PHQ-8 Classification
class BaselineClassifier(nn.Module):
    """Baseline: pooled transcript -> direct 3-way PHQ-8 threshold classification.

    A ``roberta-base`` encoder followed by mean pooling and a single linear
    layer producing three logits.
    """

    def __init__(self):
        super().__init__()
        self.bert = RobertaModel.from_pretrained("roberta-base")
        self.classifier = nn.Linear(in_features=768, out_features=3)  # 3 PHQ-8 thresholds

    def forward(self, input_ids, attention_mask, au_features=None):
        # au_features is accepted for a uniform call signature and ignored.
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sentence_embedding = mean_pooling(encoder_out.last_hidden_state, attention_mask)
        logits = self.classifier(sentence_embedding)
        return logits

# SOFTMENT: Text-Only Emotion Classifier
class TextEmotionClassifier(nn.Module):
    """Utterance -> emotion classification (Pos/Neu/Neg) from text alone.

    A ``roberta-base`` encoder, mean pooling over tokens, and one linear
    layer that outputs three logits.
    """

    def __init__(self):
        super().__init__()
        self.bert = RobertaModel.from_pretrained("roberta-base")
        self.classifier = nn.Linear(in_features=768, out_features=3)

    def forward(self, input_ids, attention_mask, au_features=None):
        # au_features is part of the shared model interface but unused here.
        hidden_states = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        ).last_hidden_state
        utterance_embedding = mean_pooling(hidden_states, attention_mask)
        return self.classifier(utterance_embedding)

# Video-Only for Late Fusion
class VideoEmotionClassifier(nn.Module):
    """Emotion classifier over AU features only (video branch of late fusion).

    A three-layer MLP (``n_au`` -> 128 -> 64 -> 3) with ReLU activations and
    dropout after the first two layers.
    """

    def __init__(self, n_au):
        super().__init__()
        layers = [
            nn.Linear(n_au, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 3),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, input_ids, attention_mask, au_features):
        # The text inputs are ignored; they exist so every model can be
        # called with the same three arguments.
        logits = self.model(au_features)
        return logits

# Early Fusion: Concatenation
class EarlyFusionConcat(nn.Module):
    """Early fusion by concatenation.

    The mean-pooled ``roberta-base`` text embedding (768-d) is concatenated
    with a 128-d ReLU projection of the AU features, and a small MLP maps
    the joint vector to three logits.
    """

    def __init__(self, n_au):
        super().__init__()
        self.bert = RobertaModel.from_pretrained("roberta-base")
        self.au_proj = nn.Linear(n_au, 128)
        head_layers = [
            nn.Linear(768 + 128, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 3),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, au_features):
        # Index 0 of the encoder output is the last hidden state.
        token_states = self.bert(input_ids, attention_mask)[0]
        text_emb = mean_pooling(token_states, attention_mask)

        au_emb = torch.relu(self.au_proj(au_features))

        fused = torch.cat([text_emb, au_emb], dim=1)
        return self.classifier(fused)

# Early Fusion: Gated
class EarlyFusionGated(nn.Module):
    """Early fusion with a learned scalar gate.

    AU features are projected to the text embedding size (768). A sigmoid
    gate computed from both embeddings yields a weight ``g`` per sample, and
    the fused representation is ``g * text + (1 - g) * au``, which a linear
    layer maps to three logits.
    """

    def __init__(self, n_au):
        super().__init__()
        self.bert = RobertaModel.from_pretrained("roberta-base")
        self.au_proj = nn.Linear(n_au, 768)
        self.gate = nn.Sequential(nn.Linear(768 * 2, 1), nn.Sigmoid())
        self.classifier = nn.Linear(768, 3)

    def forward(self, input_ids, attention_mask, au_features):
        # Index 0 of the encoder output is the last hidden state.
        token_states = self.bert(input_ids, attention_mask)[0]
        text_emb = mean_pooling(token_states, attention_mask)
        au_emb = torch.relu(self.au_proj(au_features))

        # One gate value per sample, broadcast over the embedding dimension.
        both = torch.cat([text_emb, au_emb], dim=1)
        gate_weight = self.gate(both)

        fused = gate_weight * text_emb + (1 - gate_weight) * au_emb
        return self.classifier(fused)


In [None]:
# =============================================================================
# Training and Eval Functions
# =============================================================================
print("\n" + "="*80)
print("Training functions defined")
print("="*80)



def train_model(model, train_loader, dev_loader, model_name, is_baseline=False):
    """Train ``model`` and restore the weights with the best validation F1.

    Args:
        model: Module called as ``model(input_ids, attention_mask, au_features)``.
        train_loader: Batches with ``input_ids``, ``attention_mask``,
            ``labels`` and (unless ``is_baseline``) ``au_features``.
        dev_loader: Validation batches with the same keys.
        model_name: Used to name the checkpoint
            ``<OUTPUT_DIR>/<model_name>_best.pt``.
        is_baseline: When ``True`` the model is called with
            ``au_features=None`` and the batch key is not read.

    Returns:
        The model, loaded with the state of the epoch that achieved the
        highest weighted validation F1 (ties keep the earlier epoch).
    """
    model.to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()
    checkpoint_path = os.path.join(OUTPUT_DIR, f"{model_name}_best.pt")

    def _forward(batch):
        """Run the model on one batch, honouring the baseline call convention."""
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        au_features = None if is_baseline else batch['au_features'].to(DEVICE)
        return model(input_ids, attention_mask, au_features)

    # Start below any achievable F1 so the first epoch always writes a
    # checkpoint. Starting at 0 with a strict ">" meant a run whose F1 stayed
    # at 0 wrote nothing, and the final load then failed or silently picked
    # up a stale file from an earlier run.
    best_f1 = float('-inf')
    checkpoint_written = False

    for epoch in range(NUM_EPOCHS):
        # Training
        model.train()
        train_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}"):
            optimizer.zero_grad()

            labels = batch['labels'].to(DEVICE)
            logits = _forward(batch)

            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        val_preds = []
        val_labels = []

        with torch.no_grad():
            for batch in dev_loader:
                logits = _forward(batch)
                preds = torch.argmax(logits, dim=1).cpu().numpy()
                val_preds.extend(preds)
                val_labels.extend(batch['labels'].numpy())

        val_f1 = f1_score(val_labels, val_preds, average='weighted')
        val_acc = accuracy_score(val_labels, val_preds)

        # max(..., 1) avoids a ZeroDivisionError on an empty train loader.
        mean_loss = train_loss / max(len(train_loader), 1)
        print(f"  Loss: {mean_loss:.4f} | Val Acc: {val_acc:.4f} | Val F1: {val_f1:.4f}")

        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_written = True

    if checkpoint_written:
        # Load best; map_location keeps this working if the checkpoint was
        # written on a different device type than the current DEVICE.
        model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))
        print(f"âœ“ Best Val F1: {best_f1:.4f}")
    else:
        # Only reachable when no epoch ran; never load a checkpoint that
        # this call did not write.
        print("  No epochs were run; returning the model unchanged")
    return model

def calculate_participant_mh_scores(model, test_loader, test_df, is_baseline=False):
    """Aggregate utterance-level predictions into one MH-Score per participant.

    For every utterance the softmax probability of class 2 (Negative) is
    collected; a participant's MH-Score is ``sqrt(mean(negative
    probabilities))`` following the SOFTMENT formula [1]. The PHQ-8 value
    kept for a participant is the one from their first utterance seen.

    Args:
        model: Module called as ``model(input_ids, attention_mask, au_features)``.
        test_loader: Batches with ``input_ids``, ``attention_mask``,
            ``participant_id``, ``phq8_score`` and (unless ``is_baseline``)
            ``au_features``.
        test_df: Unused; kept for interface compatibility.
        is_baseline: When ``True`` the model is called with ``au_features=None``.

    Returns:
        ``(mh_scores, phq8_scores)`` as NumPy arrays, both ordered by sorted
        participant id.
    """
    model.eval()
    neg_confs_by_pid = {}
    phq8_by_pid = {}

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            batch_pids = batch['participant_id']
            batch_phq8 = batch['phq8_score']

            au_features = None if is_baseline else batch['au_features'].to(DEVICE)
            logits = model(input_ids, attention_mask, au_features)

            probs = torch.softmax(logits, dim=1).cpu().numpy()

            for i, pid in enumerate(batch_pids):
                phq8_value = batch_phq8[i].item()
                if pid not in neg_confs_by_pid:
                    neg_confs_by_pid[pid] = []
                    phq8_by_pid[pid] = phq8_value
                # Column 2 is the confidence of the Negative class.
                neg_confs_by_pid[pid].append(probs[i, 2])

    ordered_pids = sorted(neg_confs_by_pid.keys())
    mh_scores = [np.sqrt(np.mean(neg_confs_by_pid[pid])) for pid in ordered_pids]
    phq8_scores = [phq8_by_pid[pid] for pid in ordered_pids]

    return np.array(mh_scores), np.array(phq8_scores)

def evaluate_baseline(model, test_loader):
    """Evaluate the baseline with direct classification metrics.

    The model is run (with ``au_features=None``) over ``test_loader`` and the
    arg-max predictions are compared with the batch ``labels``.

    Returns:
        Dict with ``'f1'`` (weighted F1), ``'accuracy'``, ``'predictions'``
        and ``'labels'`` (the last two as flat lists).
    """
    model.eval()
    predicted = []
    expected = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            batch_labels = batch['labels']

            logits = model(input_ids, attention_mask, None)
            batch_preds = torch.argmax(logits, dim=1).cpu().numpy()

            predicted.extend(batch_preds)
            expected.extend(batch_labels.numpy())

    metrics = {
        'f1': f1_score(expected, predicted, average='weighted'),
        'accuracy': accuracy_score(expected, predicted),
        'predictions': predicted,
        'labels': expected
    }
    return metrics

def evaluate_softment(model, test_loader, test_df, model_name, is_baseline=False):
    """Evaluate SOFTMENT: how well participant MH-Scores track PHQ-8.

    Participant-level MH-Scores are computed, correlated with the PHQ-8
    scores (Pearson and Spearman), and compared in absolute terms after
    multiplying the MH-Score by 20 (MAE and RMSE). A summary is printed.

    Returns:
        Dict with the model name, correlations, MAE/RMSE, the raw MH-Score
        and PHQ-8 arrays, and the number of participants.
    """
    mh_scores, phq8_scores = calculate_participant_mh_scores(
        model, test_loader, test_df, is_baseline
    )
    n_participants = len(mh_scores)

    pearson_r, pearson_p = pearsonr(mh_scores, phq8_scores)
    spearman_r, spearman_p = spearmanr(mh_scores, phq8_scores)

    # NOTE(review): the factor 20 rescales MH-Scores for comparison with
    # PHQ-8; confirm it matches the intended PHQ-8 range.
    errors = phq8_scores - mh_scores * 20
    mae = np.mean(np.abs(errors))
    rmse = np.sqrt(np.mean(errors**2))

    results = {
        'model': model_name,
        'mh_pearson': pearson_r,
        'mh_spearman': spearman_r,
        'mh_mae': mae,
        'mh_rmse': rmse,
        'mh_scores': mh_scores,
        'phq8_scores': phq8_scores,
        'n_participants': n_participants
    }

    summary_lines = [
        f"\n {model_name}",
        f"  Participants: {n_participants}",
        f"  MH-Score Pearson: {pearson_r:.4f} (p={pearson_p:.4f})",
        f"  MH-Score Spearman: {spearman_r:.4f}",
        f"  MH-Score MAE: {mae:.4f}",
        f"  MH-Score RMSE: {rmse:.4f}",
    ]
    for line in summary_lines:
        print(line)

    return results

def evaluate_late_fusion(text_model, video_model, test_loader, test_df, fusion_weight=0.5):
    """Late fusion: combine separately computed text and video MH-Scores.

    Both models are run over ``test_loader`` in eval mode without gradients.
    For every utterance, column 2 of each model's softmax output is stored as
    that modality's "negative" confidence. Per participant, each modality's
    MH-Score is ``sqrt(mean(negative confidences))`` and the fused score is
    ``fusion_weight * text + (1 - fusion_weight) * video``. The fused scores
    are then compared with PHQ-8 (Pearson, Spearman, and MAE/RMSE after
    multiplying the MH-Score by 20).

    Args:
        text_model: Callable as ``text_model(input_ids, attention_mask, None)``.
        video_model: Callable as ``video_model(None, None, au_features)``.
        test_loader: Iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'``, ``'au_features'``, ``'participant_id'`` and
            ``'phq8_score'``.
        test_df: Not referenced by this function; kept so the signature
            matches existing callers.
        fusion_weight: Weight given to the text MH-Score; the video MH-Score
            receives ``1 - fusion_weight``.

    Returns:
        dict with the model label, Pearson/Spearman coefficients, MAE, RMSE,
        the fused MH-Score and PHQ-8 arrays (ordered by sorted participant
        id), and the number of participants.
    """
    text_model.eval()
    video_model.eval()

    # participant id -> accumulated per-utterance confidences and PHQ-8 score
    participant_data = {}

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            au_features = batch['au_features'].to(DEVICE)
            participant_ids = batch['participant_id']
            phq8_scores = batch['phq8_score']

            text_logits = text_model(input_ids, attention_mask, None)
            text_probs = torch.softmax(text_logits, dim=1).cpu().numpy()

            video_logits = video_model(None, None, au_features)
            video_probs = torch.softmax(video_logits, dim=1).cpu().numpy()

            for i in range(len(participant_ids)):
                pid = participant_ids[i]
                # Numeric ids may arrive collated into a tensor; its elements are
                # 0-dim tensors that hash by object identity, so using them as
                # dict keys would put every utterance into its own group.
                # Convert to a plain Python scalar; strings pass through as-is.
                if hasattr(pid, 'item'):
                    pid = pid.item()

                # The PHQ-8 score of the participant's first utterance is kept
                entry = participant_data.setdefault(pid, {
                    'text_neg_confs': [],
                    'video_neg_confs': [],
                    'phq8': phq8_scores[i].item()
                })
                entry['text_neg_confs'].append(text_probs[i, 2])
                entry['video_neg_confs'].append(video_probs[i, 2])

    mh_scores = []
    phq8_scores = []

    # Sorted ids give a deterministic participant order in the output arrays
    for pid in sorted(participant_data.keys()):
        data = participant_data[pid]
        text_mh = np.sqrt(np.mean(data['text_neg_confs']))
        video_mh = np.sqrt(np.mean(data['video_neg_confs']))
        combined_mh = fusion_weight * text_mh + (1 - fusion_weight) * video_mh

        mh_scores.append(combined_mh)
        phq8_scores.append(data['phq8'])

    mh_scores = np.array(mh_scores)
    phq8_scores = np.array(phq8_scores)

    pearson_r, _ = pearsonr(mh_scores, phq8_scores)
    spearman_r, _ = spearmanr(mh_scores, phq8_scores)
    # Error metrics are taken against the MH-Score multiplied by 20
    mh_scaled = mh_scores * 20
    mae = np.mean(np.abs(phq8_scores - mh_scaled))
    rmse = np.sqrt(np.mean((phq8_scores - mh_scaled)**2))

    results = {
        'model': 'Late Fusion (Text+Video)',
        'mh_pearson': pearson_r,
        'mh_spearman': spearman_r,
        'mh_mae': mae,
        'mh_rmse': rmse,
        'mh_scores': mh_scores,
        'phq8_scores': phq8_scores,
        'n_participants': len(mh_scores)
    }

    print(f"\n Late Fusion")
    print(f"  Fusion weight: {fusion_weight} text, {1-fusion_weight} video")
    print(f"  MH-Score Pearson: {pearson_r:.4f}")
    print(f"  MH-Score MAE: {mae:.4f}")

    return results






In [None]:
# =============================================================================
# Experimentation
# =============================================================================
print("\n" + "=" * 80)
print("Training Time! (Finally)")
print("=" * 80)

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# AU normalisation statistics are computed on the augmented training frame only
# and handed unchanged to the dev and test datasets.
train_au_mean = train_df_augmented[au_columns].mean()
train_au_std = train_df_augmented[au_columns].std()

# Utterance-level datasets, built in train -> dev -> test order
train_dataset, dev_dataset, test_dataset = (
    DAICUtteranceDataset(split_df, tokenizer, au_columns,
                         au_mean=train_au_mean, au_std=train_au_std)
    for split_df in (train_df_augmented, dev_df_utterances, test_df_utterances)
)

# Only the training loader shuffles
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=BATCH_SIZE)
    for eval_dataset in (dev_dataset, test_dataset)
)

# Baseline datasets are built from the label frames and DATA_ROOT
baseline_train, baseline_dev, baseline_test = (
    DAICBaselineDataset(split_labels, DATA_ROOT, tokenizer, au_columns)
    for split_labels in (train_labels, dev_labels, test_labels)
)

# The baseline loaders use a fixed batch size of 8 rather than BATCH_SIZE
baseline_train_loader = DataLoader(baseline_train, batch_size=8, shuffle=True)
baseline_dev_loader, baseline_test_loader = (
    DataLoader(eval_dataset, batch_size=8)
    for eval_dataset in (baseline_dev, baseline_test)
)

# Result dicts of the SOFTMENT variants, in the order they are evaluated
all_results = []

# BASELINE
print("\n" + "="*80)
print("BASELINE: Direct PHQ-8 Threshold Classification")
print("="*80)
baseline_model = train_model(BaselineClassifier(), baseline_train_loader, baseline_dev_loader, "baseline", is_baseline=True)
baseline_results = evaluate_baseline(baseline_model, baseline_test_loader)
# evaluate_baseline() returns its metrics without printing them and the summary
# table carries no baseline metrics, so report them here.
print(f"  Baseline weighted F1: {baseline_results['f1']:.4f}")
print(f"  Baseline accuracy: {baseline_results['accuracy']:.4f}")

# RQ1: TEXT-ONLY
print("\n" + "="*80)
print("RQ1: TEXT-ONLY SOFTMENT")
print("="*80)
text_model = train_model(TextEmotionClassifier(), train_loader, dev_loader, "text_softment")
rq1_results = evaluate_softment(text_model, test_loader, test_df_utterances, "RQ1: Text-Only SOFTMENT")
all_results.append(rq1_results)

# RQ2: MULTIMODAL
print("\n" + "="*80)
print("RQ2: MULTIMODAL SOFTMENT (Early Fusion - Concatenation)")
print("="*80)
early_concat_model = train_model(EarlyFusionConcat(len(au_columns)), train_loader, dev_loader, "early_concat")
rq2_results = evaluate_softment(early_concat_model, test_loader, test_df_utterances, "RQ2: Early Fusion (Concat)")
all_results.append(rq2_results)

# RQ3: FUSION COMPARISON
print("\n" + "="*80)
print("RQ3: FUSION METHOD COMPARISON")
print("="*80)

print("\n Training Early Fusion (Gated)...")
early_gated_model = train_model(EarlyFusionGated(len(au_columns)), train_loader, dev_loader, "early_gated")
rq3_gated_results = evaluate_softment(early_gated_model, test_loader, test_df_utterances, "RQ3: Early Fusion (Gated)")
all_results.append(rq3_gated_results)

# Late fusion reuses the text model trained for RQ1 and pairs it with a
# separately trained video model; both modalities are weighted equally.
print("\n Training Video Model for Late Fusion...")
video_model = train_model(VideoEmotionClassifier(len(au_columns)), train_loader, dev_loader, "video_softment")
rq3_late_results = evaluate_late_fusion(text_model, video_model, test_loader, test_df_utterances, fusion_weight=0.5)
all_results.append(rq3_late_results)


print("\n" + "=" * 80)
print("Results Summary")
print("=" * 80)

# Result-dict key -> column header of the summary table (order = column order)
summary_columns = {
    'model': 'Model',
    'n_participants': 'N_Participants',
    'mh_pearson': 'MH Pearson',
    'mh_spearman': 'MH Spearman',
    'mh_mae': 'MH MAE',
    'mh_rmse': 'MH RMSE',
}
results_df = pd.DataFrame([
    {column: result[key] for key, column in summary_columns.items()}
    for result in all_results
])

# The baseline has no MH-Score metrics, so those cells hold a '-' placeholder;
# its participant count is the length of the baseline test dataset.
baseline_row = pd.DataFrame([{
    'Model': 'Baseline (Direct Classification)',
    'N_Participants': len(baseline_test),
    **{column: '-' for column in ('MH Pearson', 'MH Spearman', 'MH MAE', 'MH RMSE')},
}])

# Baseline goes first, followed by the SOFTMENT variants
results_df = pd.concat([baseline_row, results_df], ignore_index=True)
print("\n" + results_df.to_string(index=False))

summary_csv_path = os.path.join(OUTPUT_DIR, "final_results.csv")
results_df.to_csv(summary_csv_path, index=False)

# Visualization
# 2x2 figure: Pearson bars, MAE bars, scatter for the best model, and a
# baseline-vs-SOFTMENT comparison.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
softment_models = [r['model'] for r in all_results]
x_pos = np.arange(len(softment_models))

# Pearson
pearson_vals = [r['mh_pearson'] for r in all_results]
axes[0, 0].bar(x_pos, pearson_vals, color='steelblue', alpha=0.8, edgecolor='black')
axes[0, 0].set_xticks(x_pos)
axes[0, 0].set_xticklabels(softment_models, rotation=45, ha='right', fontsize=9)
axes[0, 0].set_ylabel('Pearson r')
axes[0, 0].set_title('MH-Score Correlation with PHQ-8')

# MAE
mae_vals = [r['mh_mae'] for r in all_results]
axes[0, 1].bar(x_pos, mae_vals, color='coral', alpha=0.8, edgecolor='black')
axes[0, 1].set_xticks(x_pos)
axes[0, 1].set_xticklabels(softment_models, rotation=45, ha='right', fontsize=9)
axes[0, 1].set_ylabel('MAE')
axes[0, 1].set_title('MH-Score Mean Absolute Error')

# Scatter
# A Pearson coefficient can be NaN (e.g. constant MH-Scores). np.argmax would
# then select the NaN entry as "best" and max() becomes order-dependent, so the
# best model is chosen with NaNs ignored; if every value is NaN, fall back to
# the first model.
pearson_arr = np.asarray(pearson_vals, dtype=float)
if np.isnan(pearson_arr).all():
    best_idx = 0
else:
    best_idx = int(np.nanargmax(pearson_arr))
best_pearson = pearson_vals[best_idx]
best_mh = all_results[best_idx]['mh_scores']
best_phq8 = all_results[best_idx]['phq8_scores']
axes[1, 0].scatter(best_phq8, best_mh, alpha=0.6, edgecolors='k')
# Drawn in axes coordinates: a corner-to-corner visual guide, not a y = x
# identity line or a fitted regression line.
axes[1, 0].plot([0, 1], [0, 1], 'r--', transform=axes[1, 0].transAxes)
axes[1, 0].set_xlabel('PHQ-8 Score')
axes[1, 0].set_ylabel('MH-Score')
axes[1, 0].set_title(f'Best Model: {softment_models[best_idx]}')

# Comparison
# Uses the Pearson value of the model shown in the scatter panel so both
# panels always refer to the same "best" model.
axes[1, 1].bar(['Baseline F1', 'Best SOFTMENT Pearson'], [baseline_results['f1'], best_pearson], color=['gray', 'green'])
axes[1, 1].set_title('Baseline vs SOFTMENT')

plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, 'final_results.png'), dpi=300)
plt.show()

