# Transfer learning on facebook/wav2vec2-base


Raw audio files are all padded to 4 s; the training split uses the augmented dataset.

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from transformers import set_seed, Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix

In [None]:
# Put the directory two levels above the working directory at the front of
# sys.path (presumably so the project-local `transformer_models` imports
# further down resolve).
notebook_path = os.getcwd()
two_levels_up = os.path.join(notebook_path, os.pardir, os.pardir)
project_root = os.path.abspath(two_levels_up)
sys.path.insert(0, project_root)

In [None]:
# Report the installed PyTorch version, then whether CUDA is available.
for runtime_fact in (torch.__version__, torch.cuda.is_available()):
    print(runtime_fact)

In [None]:
# Seed every random number generator in use so runs are repeatable.
seed = 42
set_seed(seed)
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
# CUDA generators are only seeded when a GPU is actually present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

In [None]:
# Directory passed as the third argument to the augmented training dataset
# below. NOTE(review): presumably the base folder that the training CSV's
# file paths are resolved against — confirm in the dataset class.
root_dir = '../../data/augmentation/'

## Load Dataset


In [None]:
# Read the three splits and restrict each one to the file path and emotion
# label columns.
selected_columns = ['Filepath', 'Emotion']
df_train = pd.read_csv('../../data/augmentation/augmented_combined_train_dataset.csv')[selected_columns]
df_val = pd.read_csv('../../data/val_dataset.csv')[selected_columns]
df_test = pd.read_csv('../../data/test_dataset.csv')[selected_columns]

In [None]:
# Display the training split as loaded (labels are still the raw values).
df_train

In [None]:
# Display the validation split as loaded (labels are still the raw values).
df_val

In [None]:
# Display the test split as loaded (labels are still the raw values).
df_test

In [None]:
# Convert labels to integers
# The mapping is built from the training split only; sorting the labels keeps
# the label -> id assignment the same from run to run.
unique_labels = sorted(df_train['Emotion'].unique())
label_map = {label: idx for idx, label in enumerate(unique_labels)}
print(label_map)

# Series.map() replaces every value that is missing from the mapping with NaN
# without complaining. Fail loudly here instead of carrying NaN labels into
# evaluation.
known_labels = list(label_map)
for split_name, split_df in (('val', df_val), ('test', df_test)):
    unseen = split_df.loc[
        ~split_df['Emotion'].isin(known_labels), 'Emotion'
    ].unique().tolist()
    if unseen:
        raise ValueError(
            f"{split_name} split contains emotions that are missing from the "
            f"training labels: {unseen}"
        )

df_train['Emotion'] = df_train['Emotion'].map(label_map)
df_val['Emotion'] = df_val['Emotion'].map(label_map)
df_test['Emotion'] = df_test['Emotion'].map(label_map)

In [None]:
# Display the training split again, now with integer-encoded emotions.
df_train

In [None]:
# Display the validation split again, now with integer-encoded emotions.
df_val

In [None]:
# Display the test split again, now with integer-encoded emotions.
df_test

In [None]:
# Load the pretrained processor and the sequence-classification model from
# the same checkpoint, with one output label per entry in label_map.
model_name = 'facebook/wav2vec2-base'
num_emotion_classes = len(label_map)
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_emotion_classes,
)

In [None]:
from transformer_models.emotion_datasets.SpeechEmotionDatasetStandardPadAugment import SpeechEmotionDatasetStandardPadAugment
from transformer_models.emotion_datasets.SpeechEmotionDatasetStandardPad import SpeechEmotionDatasetStandardPad

In [None]:
# Wrap each split in its dataset class. Only the training split uses the
# augmentation-aware variant, which additionally receives root_dir.
train_dataset = SpeechEmotionDatasetStandardPadAugment(df_train, processor, root_dir)
val_dataset, test_dataset = (
    SpeechEmotionDatasetStandardPad(split_df, processor)
    for split_df in (df_val, df_test)
)

In [None]:
# Fetch the first training example to sanity-check what the dataset yields.
train_dataset[0]

## Set Training Arguments


In [None]:
# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best "f1" (the key returned by compute_metrics) when
# training ends.
train_args = TrainingArguments(
    output_dir='../models/wav2vec2-base_standardpad_augmentation',
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy='epoch',
    save_strategy='epoch',  # must match the evaluation cadence for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    # fp16 mixed precision needs a CUDA device; requesting it on a CPU-only
    # machine makes TrainingArguments raise, so enable it only when available.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=8,
    dataloader_pin_memory=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to=[]  # no external experiment trackers
)

In [None]:
# Create function for computing metrics
def compute_metrics(pred):
    """Compute accuracy and support-weighted precision, recall and F1.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the predicted
            class is the argmax over axis 1) and ``label_ids`` (reference
            class ids).

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. The ``"f1"`` entry is the one selected through
        ``metric_for_best_model`` in the training arguments.
    """
    labels = pred.label_ids  # original labels
    preds = np.argmax(pred.predictions, axis=1)  # model predicted labels
    accuracy = accuracy_score(labels, preds)
    # zero_division=0 makes the fallback explicit for classes that are never
    # predicted (their precision is undefined); the scores are unchanged but
    # the UndefinedMetricWarning emitted at every evaluation goes away.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0)
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

Import CustomTrainer and EarlyStoppingCallback

In [None]:
from transformer_models.trainer.CustomTrainer import CustomTrainer
from transformer_models.early_stopping.EarlyStopping import EarlyStoppingCallback

In [None]:
# Build the trainer from the model, training arguments, train/validation
# datasets, metric function, class-weight file and early-stopping callback.
early_stopping = EarlyStoppingCallback(patience=3, min_delta=0.001)
trainer = CustomTrainer(
    model=model,
    args=train_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    class_weights_path="../../data/class_weights.pt",
    callbacks=[early_stopping],
)

In [None]:
# Run training with the arguments, datasets and callbacks given to the trainer.
trainer.train()

In [None]:
# Evaluate the trained model and print the returned metrics.
# NOTE(review): presumably scored on val_dataset (passed as eval_dataset) —
# CustomTrainer is project code, confirm there.
results = trainer.evaluate()
print(results)

## Test Predictions


In [None]:
# Get model predictions on the test dataset
predictions = trainer.predict(test_dataset)
# Convert logits to predicted class labels
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids  # Ground truth labels

# Fix the class order explicitly. Without `labels=`, scikit-learn only reports
# the classes that occur in the test labels or predictions, so a class missing
# from both would make the names/tick labels below disagree with the report
# rows and the confusion-matrix axes.
class_names = list(label_map.keys())
class_ids = list(label_map.values())

# Compute metrics
accuracy = accuracy_score(true_labels, pred_labels)
# zero_division=0 keeps the scores the same but avoids UndefinedMetricWarning
# for classes that are never predicted.
precision, recall, f1, _ = precision_recall_fscore_support(
    true_labels, pred_labels, average='weighted', zero_division=0)

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1-score: {f1:.4f}\n")

# Print detailed classification report
print(classification_report(
    true_labels, pred_labels,
    labels=class_ids, target_names=class_names, zero_division=0))

# Compute the confusion matrix (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(true_labels, pred_labels, labels=class_ids)

# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names, yticklabels=class_names)

plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix')

plt.xticks(rotation=45)
plt.yticks(rotation=45)

plt.tight_layout()
plt.show()