# Ensemble model

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import random
import torch
from torch.nn.functional import softmax
from torch.utils.data import DataLoader
from transformers import set_seed, Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, WavLMForSequenceClassification

In [None]:
# Put the repository root (two directories above the notebook's working
# directory) at the front of sys.path so project packages can be imported.
notebook_path = os.getcwd()
project_root = os.path.abspath(
    os.path.join(notebook_path, os.pardir, os.pardir)
)
sys.path.insert(0, project_root)

In [None]:
# Report the installed PyTorch version and whether CUDA is usable,
# one value per line.
print(torch.__version__, torch.cuda.is_available(), sep="\n")

In [None]:
# Fix every random number generator used by the notebook to the same seed
# so that repeated runs are reproducible.
seed = 42

# Python, NumPy and PyTorch (CPU) generators
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# All visible CUDA devices, when there are any
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# transformers' own helper
set_seed(seed)

Load Dataset

In [None]:
# Read the validation split and keep only the audio path and emotion columns
df_val = pd.read_csv('../../data/val_dataset.csv')[['Filepath', 'Emotion']]

In [None]:
# Notebook display: show the validation dataframe (string emotion labels)
df_val

In [None]:
# Encode the emotion labels as integer class ids: the distinct labels are
# sorted and numbered 0..K-1 in that order.
unique_labels = sorted(df_val['Emotion'].unique())
label_map = dict(zip(unique_labels, range(len(unique_labels))))
print(label_map)

# Replace the label column with the integer ids
df_val['Emotion'] = df_val['Emotion'].map(label_map)

In [None]:
# Notebook display: show the dataframe again after the labels were mapped to integers
df_val

Loading pretrained models

In [None]:
# --- Base model 1: facebook/wav2vec2-base --------------------------------
# Fine-tuned weights come from a local checkpoint (point this at the best
# wav2vec2 checkpoint); the processor comes from the base hub model.
model1_checkpoint_path = '../models/wav2vec2-base_standardpad/checkpoint-16584'
processor1 = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model1 = Wav2Vec2ForSequenceClassification.from_pretrained(
    model1_checkpoint_path,
    num_labels=len(label_map),
)

# --- Base model 2: microsoft/wavlm-base ----------------------------------
# Same scheme (point this at the best WavLM checkpoint); WavLM is paired
# with a feature extractor rather than a full processor.
model2_checkpoint_path = '../models/wavlm-base_standardpad/checkpoint-13820'
processor2 = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/wavlm-base")
model2 = WavLMForSequenceClassification.from_pretrained(
    model2_checkpoint_path,
    num_labels=len(label_map),
)

In [None]:
from transformer_models.emotion_datasets.SpeechEmotionDatasetStandardPad import SpeechEmotionDatasetStandardPad

# Wrap the same validation dataframe twice, once per base model's
# processor, so each model receives inputs prepared by its own front end.
val_dataset1, val_dataset2 = (
    SpeechEmotionDatasetStandardPad(df_val, processor)
    for processor in (processor1, processor2)
)

In [None]:
# Notebook display: inspect the first sample as prepared for model 1
val_dataset1[0]

In [None]:
# Notebook display: inspect the first sample as prepared for model 2
val_dataset2[0]

Get predictions from each model

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Helper function to obtain model probabilities
def get_model_probs(model, dataset, batch_size=128, run_device=None):
    """Run ``model`` over ``dataset`` and return softmax class probabilities.

    Side effects: ``model`` is switched to eval mode and moved to the
    inference device, and it is left that way on return.

    Args:
        model: Module whose forward pass takes a batch of ``input_values``
            and returns an object exposing ``.logits``.
        dataset: Map-style dataset whose items are dicts containing an
            ``"input_values"`` tensor.
        batch_size: Number of samples per forward pass.
        run_device: Device to run inference on. When omitted, the
            module-level ``device`` is used, so existing call sites keep
            working unchanged.

    Returns:
        NumPy array with one row of probabilities per sample. Rows follow
        dataset order because the loader does not shuffle.

    Raises:
        ValueError: If the dataset yields no batches (``np.vstack`` cannot
            stack an empty list).
    """
    if run_device is None:
        # Fall back to the notebook-wide device chosen in an earlier cell.
        run_device = device

    model.eval()
    model.to(run_device)
    dataloader = DataLoader(dataset, batch_size=batch_size)
    all_probs = []

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for batch in dataloader:
            # The dataset returns a dict with 'input_values'
            input_values = batch["input_values"].to(run_device)
            logits = model(input_values).logits  # expected shape: (B, num_labels)
            # Normalise over the class axis, then bring the result back to
            # host memory so it can be stacked with NumPy.
            all_probs.append(softmax(logits, dim=1).cpu().numpy())

    return np.vstack(all_probs)  # shape: (N, num_labels)

In [None]:
# Score the validation set with both base models, each on its own dataset
probs1 = get_model_probs(model1, val_dataset1, batch_size=128)
probs2 = get_model_probs(model2, val_dataset2, batch_size=128)

# Meta-features: the two probability matrices placed side by side,
# giving one row per sample with num_labels * 2 columns
X_meta = np.concatenate((probs1, probs2), axis=1)

# Targets are read back from the first dataset (both wrap the same dataframe)
true_labels = [val_sample['labels'].item() for val_sample in val_dataset1]
y_meta = np.array(true_labels)

print("Shape of X_meta (stacked predictions):", X_meta.shape)
print("Shape of y_meta (labels):", y_meta.shape)

Define a simple feed-forward neural network for the meta-classifier

In [None]:
import torch.nn as nn

class MetaFFNN(nn.Module):
    """Feed-forward meta-classifier over stacked base-model probabilities.

    The network is three hidden blocks of ``Linear -> ReLU -> Dropout(0.3)``,
    all ``hidden_dim`` wide, followed by a linear layer that produces
    ``output_dim`` raw logits (no softmax is applied).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()

        layers = []
        in_features = input_dim
        # Three identical hidden blocks; only the first one sees input_dim.
        for _ in range(3):
            layers.extend(
                [nn.Linear(in_features, hidden_dim), nn.ReLU(), nn.Dropout(0.3)]
            )
            in_features = hidden_dim

        # Output layer
        layers.append(nn.Linear(in_features, output_dim))

        # Layers are registered in construction order, so the positions
        # inside the Sequential (and the state_dict keys) stay 0..9.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits for a batch of meta-features ``x``."""
        return self.model(x)


Define the meta-dataset class

In [None]:
from torch.utils.data import Dataset

class MetaDataset(Dataset):
    """Dataset pairing meta-feature rows with their integer class labels."""

    def __init__(self, X, y):
        # Features are stored as float32 and labels as int64 (torch.long),
        # the dtypes the cross-entropy training loop feeds to the model.
        self.y = torch.tensor(y, dtype=torch.long)
        self.X = torch.tensor(X, dtype=torch.float32)

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        features = self.X[idx]
        label = self.y[idx]
        return features, label


Train meta classifier

In [None]:
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train the meta-classifier on the stacked base-model probabilities with
# stratified 5-fold cross-validation. One fresh MetaFFNN is trained per fold;
# the weights of the fold with the highest weighted F1 on its held-out split
# are written to 'best_meta_ffnn_model.pt'.

# Resolve the device first: the class weights below are loaded onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

input_dim = X_meta.shape[1]  # 2 * num_classes
hidden_dim = 128
output_dim = len(set(y_meta))  # number of emotion classes

class_weights_path = '../../data/class_weights.pt'
# map_location lets weights saved during a GPU session load on a CPU-only machine.
class_weights = torch.load(class_weights_path, map_location=device)

dataset = MetaDataset(X_meta, y_meta)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Best weighted F1 seen across folds and the matching model weights.
# Starting below any reachable score guarantees that the first fold is
# recorded, so best_model is never None when it is saved.
best_f1 = float('-inf')
best_model = None

# Early stopping: stop a fold after `patience` consecutive epochs without a
# new lowest training loss.
# NOTE: the monitored quantity is the training loss summed over batches, not
# a loss measured on the held-out fold.
patience = 10
max_epochs = 1000

# StratifiedKFold.split requires the labels in order to stratify; it only
# uses X for its length, so the raw arrays are passed instead of the Dataset.
for fold, (train_idx, val_idx) in enumerate(kf.split(X_meta, y_meta)):
    print(f"\n Fold {fold + 1}")

    train_subset = torch.utils.data.Subset(dataset, train_idx)
    val_subset = torch.utils.data.Subset(dataset, val_idx)

    train_loader = DataLoader(train_subset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=32)

    model = MetaFFNN(input_dim, hidden_dim, output_dim).to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)  # Use weighted cross-entropy loss
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Early-stopping state is per fold: every fold trains a fresh model.
    best_loss = float('inf')
    patience_counter = 0

    for epoch in range(max_epochs):
        model.train()
        total_loss = 0

        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            out = model(xb)
            loss = criterion(out, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

        # Check if the loss has improved
        if total_loss < best_loss:
            best_loss = total_loss
            patience_counter = 0  # Reset the counter since loss improved
        else:
            patience_counter += 1

        # If patience is reached, stop training early
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    # Evaluate on the held-out part of this fold
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            out = model(xb)
            preds = torch.argmax(out, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(yb.cpu().numpy())

    # Calculate metrics (support-weighted averages over the classes)
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    # Print out the metrics for the current fold
    print(f"Fold {fold + 1} Metrics: Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    # Keep the weights of the fold with the best F1 score. Each fold builds
    # a new model, so this state dict is not touched by later folds.
    if f1 > best_f1:
        best_f1 = f1
        best_model = model.state_dict()

# After all folds are done, save the best model
torch.save(best_model, 'best_meta_ffnn_model.pt')
print("Best model saved")
