# Ensemble model: stacking wav2vec2-base and WavLM-base emotion classifiers

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from torch.nn.functional import softmax
from torch.utils.data import DataLoader
from transformers import set_seed, Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, WavLMForSequenceClassification, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Put the directory two levels above the notebook's working directory at the
# front of the import path so that project packages can be imported.
notebook_path = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_path, os.pardir, os.pardir))
sys.path[:0] = [project_root]

In [3]:
# Report the installed PyTorch version and whether CUDA is available.
print(torch.__version__, torch.cuda.is_available(), sep="\n")

2.5.1+cu121
True


In [4]:
# Seed every random number generator used in this notebook, in one pass,
# so that results are reproducible.
seed = 42
for _seed_fn in (set_seed, random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed)
# Also seed all CUDA devices when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

Load validation dataset

In [5]:
# Read the validation split and keep only the audio file path and its label.
df_val = pd.read_csv('../../data/val_dataset.csv').loc[:, ['Filepath', 'Emotion']]

In [6]:
# Preview the validation frame (only the Filepath and Emotion columns remain).
df_val

Unnamed: 0,Filepath,Emotion
0,./dataset/esd\0020\Sad\0020_001395.wav,Sad
1,./dataset/meld\train\dia930_utt5.mp4,Neutral
2,./dataset/mlend\MLEndSND_Public\24481.wav,Bored
3,./dataset/crema-d\AudioWAV\1002_IEO_SAD_HI.wav,Sad
4,./dataset/esd\0011\Angry\0011_000373.wav,Anger
...,...,...
9471,./dataset/tess\YAF_disgust\YAF_take_disgust.wav,Disgust
9472,./dataset/mlend\MLEndSND_Public\43418.wav,Bored
9473,./dataset/mlend\MLEndSND_Public\02459.wav,Bored
9474,./dataset/mlend\MLEndSND_Public\10609.wav,Question


In [7]:
# Encode the emotion names as integer class ids, numbered in sorted order.
# NOTE(review): the ids are derived from the labels present in this validation
# split — confirm they match the label ids the checkpoints were trained with.
unique_labels = sorted(df_val['Emotion'].unique())
label_map = dict(zip(unique_labels, range(len(unique_labels))))
print(label_map)

# Replace the Emotion column with its integer encoding.
df_val = df_val.assign(Emotion=df_val['Emotion'].map(label_map))

{'Anger': 0, 'Bored': 1, 'Disgust': 2, 'Fear': 3, 'Happy': 4, 'Neutral': 5, 'Question': 6, 'Sad': 7, 'Surprise': 8}


In [8]:
# Re-display the frame: Emotion now holds the integer ids from label_map.
df_val

Unnamed: 0,Filepath,Emotion
0,./dataset/esd\0020\Sad\0020_001395.wav,7
1,./dataset/meld\train\dia930_utt5.mp4,5
2,./dataset/mlend\MLEndSND_Public\24481.wav,1
3,./dataset/crema-d\AudioWAV\1002_IEO_SAD_HI.wav,7
4,./dataset/esd\0011\Angry\0011_000373.wav,0
...,...,...
9471,./dataset/tess\YAF_disgust\YAF_take_disgust.wav,2
9472,./dataset/mlend\MLEndSND_Public\43418.wav,1
9473,./dataset/mlend\MLEndSND_Public\02459.wav,1
9474,./dataset/mlend\MLEndSND_Public\10609.wav,6


Loading pretrained models

In [9]:
# Local checkpoint directories of the two ensemble members.
model1_checkpoint_path = '../models/wav2vec2-base_standardpad/checkpoint-16584'
model2_checkpoint_path = '../models/wavlm-base_standardpad/checkpoint-13820'

# Model 1: wav2vec2 sequence classifier, paired with the processor published
# for facebook/wav2vec2-base. The head is sized to the number of emotion labels.
processor1 = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model1 = Wav2Vec2ForSequenceClassification.from_pretrained(
    model1_checkpoint_path,
    num_labels=len(label_map),
)

# Model 2: WavLM sequence classifier, paired with the feature extractor
# published for microsoft/wavlm-base. Same number of output labels.
processor2 = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/wavlm-base")
model2 = WavLMForSequenceClassification.from_pretrained(
    model2_checkpoint_path,
    num_labels=len(label_map),
)



In [10]:
from transformer_models.emotion_datasets.SpeechEmotionDatasetStandardPad import SpeechEmotionDatasetStandardPad

# Build one validation dataset per model: the same dataframe, each paired with
# that model's own processor / feature extractor.
val_dataset1, val_dataset2 = (
    SpeechEmotionDatasetStandardPad(df_val, proc)
    for proc in (processor1, processor2)
)

In [11]:
# Inspect the first sample of the dataset built with processor1 (model 1).
val_dataset1[0]

Keyword argument `truncate` is not a valid argument for this processor and will be ignored.


{'input_values': tensor([ 1.0359e-02,  1.0359e-02,  1.0359e-02,  ..., -7.3200e-05,
         -7.3200e-05, -7.3200e-05]),
 'labels': tensor(7)}

In [12]:
# Inspect the first sample of the dataset built with processor2 (model 2).
val_dataset2[0]

{'input_values': tensor([0.0005, 0.0005, 0.0005,  ..., 0.0000, 0.0000, 0.0000]),
 'labels': tensor(7)}

Get predictions from each model

In [13]:
# Run inference on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [14]:
# Helper function to obtain model probabilities
def get_model_probs(model, dataset, batch_size=128, device=None):
    """Run ``model`` over ``dataset`` and return softmax class probabilities.

    Args:
        model: Module whose call on a batch of ``input_values`` returns an
            object exposing a ``logits`` tensor of shape (B, num_labels).
        dataset: Dataset whose items are dicts containing an
            ``"input_values"`` tensor. Items must be collatable into a single
            batch tensor by the default ``DataLoader`` collate function.
        batch_size: Number of samples per forward pass.
        device: Device to run inference on. When ``None`` (the default), CUDA
            is used if available, otherwise the CPU.

    Returns:
        ``np.ndarray`` of shape (N, num_labels) with one row of probabilities
        per sample, in dataset order. An empty dataset yields an array with
        zero rows instead of raising.
    """
    # Resolve the device here so the function does not depend on a notebook
    # global having been defined by an earlier cell.
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.eval()
    model.to(device)
    # shuffle=False is stated explicitly: callers align the returned rows with
    # labels by position, so the iteration order must follow the dataset.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    all_probs = []

    with torch.no_grad():
        for batch in dataloader:
            # The dataset returns a dict with 'input_values'
            input_values = batch["input_values"].to(device)
            outputs = model(input_values).logits  # shape: (B, num_labels)
            all_probs.append(softmax(outputs, dim=1).cpu().numpy())

    if not all_probs:
        # np.vstack([]) raises ValueError; return a well-formed empty result.
        num_labels = getattr(getattr(model, "config", None), "num_labels", 0)
        return np.empty((0, num_labels), dtype=np.float32)
    return np.vstack(all_probs)  # shape: (N, num_labels)

In [15]:
# Get probabilities for each model from their respective datasets
probs1 = get_model_probs(model1, val_dataset1, batch_size=128)
probs2 = get_model_probs(model2, val_dataset2, batch_size=128)

# Stack predictions horizontally: one row per sample, model 1's class
# probabilities followed by model 2's.
X_meta = np.hstack([probs1, probs2])  # shape: (N, num_labels * 2)

# Take the ground-truth labels straight from the dataframe both datasets were
# built from. Iterating val_dataset1 just to read 'labels' would run every
# sample through the dataset again, since each item also carries
# 'input_values'.
# NOTE(review): assumes the dataset yields rows in df_val order without
# dropping any — the length check below guards the row count; confirm order.
y_meta = df_val['Emotion'].to_numpy()
assert len(y_meta) == X_meta.shape[0], (
    f"label count {len(y_meta)} does not match prediction rows {X_meta.shape[0]}"
)

print("Shape of X_meta (stacked predictions):", X_meta.shape)
print("Shape of y_meta (labels):", y_meta.shape)

  speech, sr = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Shape of X_meta (stacked predictions): (9476, 18)
Shape of y_meta (labels): (9476,)
