# Ensemble model

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import random
import torch
from torch.nn.functional import softmax
from torch.utils.data import DataLoader
from transformers import set_seed, Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, WavLMForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Put the directory two levels above the working directory at the front of
# sys.path so imports are resolved against it first.
notebook_path = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_path, '../..'))
# Drop any earlier copy before prepending: re-running this cell then keeps a
# single entry instead of stacking duplicates, and the root still has priority.
sys.path[:] = [entry for entry in sys.path if entry != project_root]
sys.path.insert(0, project_root)

In [3]:
# Report the installed PyTorch build and whether CUDA is usable, one per line.
print(torch.__version__, torch.cuda.is_available(), sep="\n")

2.5.1+cu121
True


In [4]:
# Seed every random number generator used in this notebook with one value
seed = 42
for seed_fn in (set_seed, random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
# CUDA generators are only seeded when a GPU is actually present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

Load Dataset

In [5]:
# Read the validation split and keep only the audio path and its emotion label.
df_val = pd.read_csv('../../data/val_dataset.csv').loc[:, ['Filepath', 'Emotion']]

In [6]:
df_val

Unnamed: 0,Filepath,Emotion
0,./dataset/esd\0020\Sad\0020_001395.wav,Sad
1,./dataset/meld\train\dia930_utt5.mp4,Neutral
2,./dataset/mlend\MLEndSND_Public\24481.wav,Bored
3,./dataset/crema-d\AudioWAV\1002_IEO_SAD_HI.wav,Sad
4,./dataset/esd\0011\Angry\0011_000373.wav,Anger
...,...,...
9471,./dataset/tess\YAF_disgust\YAF_take_disgust.wav,Disgust
9472,./dataset/mlend\MLEndSND_Public\43418.wav,Bored
9473,./dataset/mlend\MLEndSND_Public\02459.wav,Bored
9474,./dataset/mlend\MLEndSND_Public\10609.wav,Question


In [7]:
# Encode emotion names as integer class ids (ids follow alphabetical order).
# NOTE(review): the ids are derived from the labels present in this CSV;
# presumably the base models were trained with the same ordering - confirm.
unique_labels = sorted(df_val['Emotion'].unique())
label_map = dict(zip(unique_labels, range(len(unique_labels))))
print(label_map)

df_val = df_val.assign(Emotion=df_val['Emotion'].map(label_map))

{'Anger': 0, 'Bored': 1, 'Disgust': 2, 'Fear': 3, 'Happy': 4, 'Neutral': 5, 'Question': 6, 'Sad': 7, 'Surprise': 8}


In [8]:
df_val

Unnamed: 0,Filepath,Emotion
0,./dataset/esd\0020\Sad\0020_001395.wav,7
1,./dataset/meld\train\dia930_utt5.mp4,5
2,./dataset/mlend\MLEndSND_Public\24481.wav,1
3,./dataset/crema-d\AudioWAV\1002_IEO_SAD_HI.wav,7
4,./dataset/esd\0011\Angry\0011_000373.wav,0
...,...,...
9471,./dataset/tess\YAF_disgust\YAF_take_disgust.wav,2
9472,./dataset/mlend\MLEndSND_Public\43418.wav,1
9473,./dataset/mlend\MLEndSND_Public\02459.wav,1
9474,./dataset/mlend\MLEndSND_Public\10609.wav,6


Loading pretrained models

In [9]:
# Fine-tuned checkpoints of the two base learners
model1_checkpoint_path = '../models/wav2vec2-base_standardpad_augmentation/checkpoint-33168'  # change to wav2vec2 best model
model2_checkpoint_path = '../models/wavlm-base_standardpad_augmentation/checkpoint-55280'  # change to wavlm best model

# Model 1: facebook/wav2vec2-base
# The processor comes from the hub name, the classifier weights from the checkpoint.
processor1 = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model1 = Wav2Vec2ForSequenceClassification.from_pretrained(
    model1_checkpoint_path,
    num_labels=len(label_map),
)

# Model 2: microsoft/wavlm-base
processor2 = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/wavlm-base")
model2 = WavLMForSequenceClassification.from_pretrained(
    model2_checkpoint_path,
    num_labels=len(label_map),
)



In [10]:
from transformer_models.emotion_datasets.SpeechEmotionDatasetStandardPad import SpeechEmotionDatasetStandardPad

# One validation dataset per base model: same rows, different processor.
val_dataset1, val_dataset2 = (
    SpeechEmotionDatasetStandardPad(df_val, processor)
    for processor in (processor1, processor2)
)

In [11]:
val_dataset1[0]

Keyword argument `truncate` is not a valid argument for this processor and will be ignored.


{'input_values': tensor([ 1.0359e-02,  1.0359e-02,  1.0359e-02,  ..., -7.3200e-05,
         -7.3200e-05, -7.3200e-05]),
 'labels': tensor(7)}

In [12]:
val_dataset2[0]

{'input_values': tensor([0.0005, 0.0005, 0.0005,  ..., 0.0000, 0.0000, 0.0000]),
 'labels': tensor(7)}

Get predictions from each model

In [13]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [14]:
# Helper function to obtain raw model logits (not softmax probabilities)
def get_model_logits(model, dataset, batch_size=128, device=None):
    """Run ``model`` over ``dataset`` and return the stacked raw logits.

    Args:
        model: Module that is called with a batch of inputs and whose output
            exposes a ``.logits`` tensor.
        dataset: Dataset whose collated batches are dicts containing an
            ``"input_values"`` tensor.
        batch_size: Number of samples per forward pass.
        device: Device to run inference on. When omitted, CUDA is used if
            available and the CPU otherwise, so the function no longer
            depends on a ``device`` variable defined in another cell.

    Returns:
        np.ndarray with one row of logits per sample, in dataset order
        (the loader does not shuffle).

    Raises:
        ValueError: If ``dataset`` yields no batches.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.eval()  # switch the module to evaluation mode
    model.to(device)
    dataloader = DataLoader(dataset, batch_size=batch_size)
    all_logits = []

    with torch.no_grad():
        for batch in dataloader:
            # The dataset returns a dict with 'input_values'
            input_values = batch["input_values"].to(device)
            logits = model(input_values).logits  # shape: (B, num_labels)
            all_logits.append(logits.cpu().numpy())

    if not all_logits:
        # np.vstack would fail here anyway; raise with a clearer message.
        raise ValueError("dataset is empty: there are no logits to stack")
    return np.vstack(all_logits)  # shape: (N, num_labels)

In [15]:
# Raw logits of each base model, computed on its own preprocessed dataset
logits1 = get_model_logits(model1, val_dataset1, batch_size=128)
logits2 = get_model_logits(model2, val_dataset2, batch_size=128)

# Join the two logit matrices along the feature axis
X_meta = np.concatenate((logits1, logits2), axis=1)  # shape: (N, num_labels * 2)

# Ground-truth labels, read sample by sample from the first dataset
# (both datasets are built from the same frame, so either would do).
# NOTE(review): iterating the dataset presumably re-loads every audio file
# just to read its label - confirm whether the labels could be taken from
# df_val directly.
y_meta = np.array([item['labels'].item() for item in val_dataset1])

print(f"Shape of X_meta (stacked predictions): {X_meta.shape}")
print(f"Shape of y_meta (labels): {y_meta.shape}")

  speech, sr = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Shape of X_meta (stacked predictions): (9476, 18)
Shape of y_meta (labels): (9476,)


Define a simple feed-forward neural network for the meta classifier

In [16]:
import torch.nn as nn

class MetaFFNN(nn.Module):
    """Feed-forward classifier used as the meta learner.

    The network is a stack of ``Linear -> LeakyReLU -> Dropout`` hidden
    blocks followed by a linear output layer that produces one logit per class.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of every hidden layer.
        output_dim: Number of output logits (classes).
        dropout: Dropout probability applied after each hidden activation.
            Defaults to 0.3.
        num_hidden_layers: Number of hidden blocks. Defaults to 3.

    Raises:
        ValueError: If ``num_hidden_layers`` is negative.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.3, num_hidden_layers=3):
        super().__init__()
        if num_hidden_layers < 0:
            raise ValueError("num_hidden_layers must be non-negative")

        # Layers are appended in the same order as a hand-written Sequential,
        # so with the defaults the state_dict keys stay model.0, model.3,
        # model.6 and model.9.
        layers = []
        in_features = input_dim
        for _ in range(num_hidden_layers):
            layers.append(nn.Linear(in_features, hidden_dim))
            layers.append(nn.LeakyReLU())
            layers.append(nn.Dropout(dropout))
            in_features = hidden_dim

        # Output layer
        layers.append(nn.Linear(in_features, output_dim))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits for a batch of feature vectors ``x``."""
        return self.model(x)


Define meta dataset class

In [17]:
from torch.utils.data import Dataset

class MetaDataset(Dataset):
    """In-memory dataset pairing feature rows with integer class labels.

    Args:
        X: Array-like or tensor of features; stored as a float32 tensor.
        y: Array-like or tensor of labels; stored as an int64 tensor.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        self.X = self._to_tensor(X, torch.float32)
        self.y = self._to_tensor(y, torch.long)
        # Catch misaligned inputs here rather than at some later index.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(self.X)} and {len(self.y)}"
            )

    @staticmethod
    def _to_tensor(values, dtype):
        """Return an independent tensor copy of ``values`` with the given dtype."""
        if isinstance(values, torch.Tensor):
            # torch.tensor(tensor) emits a UserWarning; copy explicitly instead.
            return values.detach().clone().to(dtype)
        return torch.tensor(values, dtype=dtype)

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]


Train meta classifier

In [18]:
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train the meta classifier with 5-fold stratified cross-validation on the
# stacked base-model logits and keep the weights of the fold with the best
# weighted F1 score.

input_dim = X_meta.shape[1]  # 2 * num_classes
hidden_dim = 128
output_dim = len(set(y_meta))  # number of emotion classes

# Resolve the device before anything in this cell is moved onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class_weights_path = '../../data/class_weights.pt'
class_weights = torch.load(class_weights_path, weights_only=True).to(device)

dataset = MetaDataset(X_meta, y_meta)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Best-fold tracker. Starting below any reachable score guarantees that a
# model is always kept, even if every fold scores an F1 of exactly 0.
best_f1 = float('-inf')
best_model = None

# Early stopping: number of consecutive non-improving epochs tolerated
patience = 10

for fold, (train_idx, val_idx) in enumerate(kf.split(dataset.X, dataset.y)):
    print(f"\n Fold {fold + 1}")

    train_subset = torch.utils.data.Subset(dataset, train_idx)
    val_subset = torch.utils.data.Subset(dataset, val_idx)

    train_loader = DataLoader(train_subset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=32)
    num_batches = len(train_loader)

    # Every fold starts from a freshly initialised network and optimizer.
    model = MetaFFNN(input_dim, hidden_dim, output_dim).to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)  # Use weighted cross-entropy loss
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Early-stopping state is per fold, so it is reset here rather than
    # carried over from the previous fold.
    best_loss = float('inf')
    patience_counter = 0

    for epoch in range(1000):
        model.train()
        total_loss = 0

        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            out = model(xb)
            loss = criterion(out, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        average_loss = total_loss / num_batches  # Calculate average loss
        print(f"Epoch {epoch+1}, Average Loss: {average_loss:.4f}")

        # NOTE(review): early stopping watches the *training* loss, not a
        # held-out loss - confirm this is intended.
        if average_loss < best_loss:
            best_loss = average_loss
            patience_counter = 0  # Reset the counter since loss improved
        else:
            patience_counter += 1

        # If patience is reached, stop training early
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    # Evaluate on the held-out part of this fold
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            out = model(xb)
            preds = torch.argmax(out, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(yb.cpu().numpy())

    # Calculate metrics. zero_division=0 yields the same values as the
    # default but without a warning when a class is never predicted.
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

    # Print out the metrics for the current fold
    print(f"Fold {fold + 1} Metrics: Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    # Keep the weights of the fold with the best F1 score. state_dict()
    # returns references to the live parameters, so store detached CPU copies:
    # the snapshot is then independent of the model and of the training device.
    if f1 > best_f1:
        best_f1 = f1
        best_model = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

# After all folds are done, save the best model
best_model_path = '../meta_learner/best_meta_ffnn_model.pt'
# Create the target directory first so that a missing folder does not
# discard the whole training run at the very last step.
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
torch.save(best_model, best_model_path)
print("Best model saved")



 Fold 1
Epoch 1, Average Loss: 0.6643
Epoch 2, Average Loss: 0.4845
Epoch 3, Average Loss: 0.4580
Epoch 4, Average Loss: 0.4402
Epoch 5, Average Loss: 0.4397
Epoch 6, Average Loss: 0.4216
Epoch 7, Average Loss: 0.4205
Epoch 8, Average Loss: 0.4182
Epoch 9, Average Loss: 0.4126
Epoch 10, Average Loss: 0.4159
Epoch 11, Average Loss: 0.4024
Epoch 12, Average Loss: 0.3976
Epoch 13, Average Loss: 0.4043
Epoch 14, Average Loss: 0.3947
Epoch 15, Average Loss: 0.3979
Epoch 16, Average Loss: 0.3826
Epoch 17, Average Loss: 0.3850
Epoch 18, Average Loss: 0.3776
Epoch 19, Average Loss: 0.3797
Epoch 20, Average Loss: 0.3764
Epoch 21, Average Loss: 0.3760
Epoch 22, Average Loss: 0.3763
Epoch 23, Average Loss: 0.3601
Epoch 24, Average Loss: 0.3622
Epoch 25, Average Loss: 0.3676
Epoch 26, Average Loss: 0.3620
Epoch 27, Average Loss: 0.3621
Epoch 28, Average Loss: 0.3563
Epoch 29, Average Loss: 0.3449
Epoch 30, Average Loss: 0.3496
Epoch 31, Average Loss: 0.3648
Epoch 32, Average Loss: 0.3460
Epoch 33