# Wav2Vec2 transfer learning on wav2vec2-base


In [None]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt
import librosa
import torch
import random
from torch.utils.data import Dataset
from transformers import Wav2Vec2Processor, Trainer, TrainingArguments, Wav2Vec2ForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

In [None]:
# Fix the seed of every random number generator used in this notebook
# (PyTorch, NumPy and Python's own `random`) so repeated runs are comparable.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
# Seed all GPUs explicitly, but only when a CUDA device is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

In [None]:
# Environment check: installed PyTorch version and whether CUDA is usable.
print(torch.__version__)
print(torch.cuda.is_available())

## Load Dataset


In [None]:
# Read the train/test splits and keep only the audio path and emotion label.
df_train = pd.read_csv('../data/train_dataset.csv')[['Filepath', 'Emotion']]
df_test = pd.read_csv('../data/test_dataset.csv')[['Filepath', 'Emotion']]

In [None]:
# Inspect both splits before the labels are encoded.
print(df_train)
print(df_test)

In [None]:
# Encode each emotion as an integer class id. Ids follow the sorted order of
# the labels found in the training split.
unique_labels = sorted(df_train['Emotion'].unique())
label_map = dict(zip(unique_labels, range(len(unique_labels))))
print(label_map)

# Apply the same mapping to both splits; a test label that never occurs in
# the training split has no entry in the map and becomes NaN.
df_test['Emotion'] = df_test['Emotion'].map(label_map)
df_train['Emotion'] = df_train['Emotion'].map(label_map)

In [None]:
# Inspect both splits again, now with integer-encoded emotion labels.
print(df_train)
print(df_test)

In [None]:
# Custom dataset class
class SpeechEmotionDataset(Dataset):
    """Map-style dataset yielding fixed-length waveforms with emotion labels.

    Every row of ``df`` must provide a ``Filepath`` column (audio path
    relative to ``../data/``) and an ``Emotion`` column holding the integer
    class id.

    Args:
        df: DataFrame with ``Filepath`` and ``Emotion`` columns.
        processor: Callable applied to the raw waveform; its result must
            expose an ``input_values`` tensor.
        max_length: Number of samples each clip is cut or zero-padded to.
            The default of 64000 corresponds to 4 s at the 16 kHz rate the
            audio is loaded with.
    """

    def __init__(self, df, processor, max_length=64000):
        self.df = df
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        """Return the number of clips (rows of the DataFrame)."""
        return len(self.df)

    def _fit_length(self, speech):
        """Return ``speech`` cut or right-padded with zeros to ``max_length``.

        Clips longer than ``max_length`` are truncated first; padding alone
        would hand ``np.pad`` a negative width and raise ``ValueError``.
        """
        speech = speech[:self.max_length]
        missing = self.max_length - len(speech)
        if missing > 0:
            speech = np.pad(speech, (0, missing), mode='constant')
        return speech

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``{'input_values', 'labels'}`` tensors."""
        row = self.df.iloc[idx]  # single positional lookup for both fields
        audio_path = '../data/' + row['Filepath']
        label = row['Emotion']

        # Load audio file, resampled to 16 kHz
        speech, _ = librosa.load(audio_path, sr=16000)

        # Bring the clip to exactly max_length samples
        speech = self._fit_length(speech)

        # Preprocess audio. The keyword is `truncation`, not `truncate`.
        inputs = self.processor(speech, sampling_rate=16000, return_tensors='pt',
                                padding=True, truncation=True,
                                max_length=self.max_length)

        # Drop only the batch dimension added by return_tensors='pt'
        input_values = inputs.input_values.squeeze(0)
        return {'input_values': input_values, 'labels': torch.tensor(label, dtype=torch.long)}

In [None]:
# Pretrained checkpoint shared by the classifier and its processor.
model_name = 'facebook/wav2vec2-base'
# Sequence-classification model with one output per emotion class.
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_map),
)
# Matching processor that turns raw waveforms into model inputs.
processor = Wav2Vec2Processor.from_pretrained(model_name)

In [None]:
# Wrap both splits in the dataset class, sharing the same processor.
train_dataset, test_dataset = (
    SpeechEmotionDataset(split, processor) for split in (df_train, df_test)
)

In [None]:
# Sanity check: load and display the first training sample.
train_dataset[0]

## Set Training Arguments


In [None]:
# Hyperparameters and bookkeeping for the Trainer.
train_args = TrainingArguments(
    output_dir='./models/wav2vec2-base',
    # Evaluate and checkpoint once per epoch; both strategies must match
    # for load_best_model_at_end to work.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU-only
    # machines instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    # Restore the checkpoint with the best "f1" as returned by compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to=[]  # empty list: no reporting integrations are enabled
)

In [None]:
# Create function for computing metrics
def compute_metrics(pred):
    """Score one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference class ids) and
            ``predictions`` (per-class scores, one row per sample).

    Returns:
        Dict with ``accuracy`` and the weighted-average ``precision``,
        ``recall`` and ``f1``.
    """
    y_true = pred.label_ids
    # The highest-scoring class of each row is the predicted label.
    y_pred = np.argmax(pred.predictions, axis=1)

    scores = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": scores[0],
        "recall": scores[1],
        "f1": scores[2],
    }

In [None]:
# Initialize the trainer
# The test split doubles as the evaluation set, so the per-epoch metrics
# (and the "best model" selection) are computed on the test data.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

In [None]:
# Evaluate on the trainer's eval_dataset and show the resulting metrics.
results = trainer.evaluate()
print(results)

## Test Predictions


In [None]:
# Get model predictions on the test dataset
predictions = trainer.predict(test_dataset)
# Convert logits to predicted class labels
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids  # Ground truth labels

# Pin the class order so the matrix always has one row/column per known
# emotion. Without `labels=`, a class missing from both the true and the
# predicted labels is dropped and the tick labels no longer line up.
class_names = list(label_map.keys())
class_ids = list(label_map.values())

# Compute the confusion matrix (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(true_labels, pred_labels, labels=class_ids)

# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names, yticklabels=class_names)

plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix Heatmap")
plt.show()

In [None]:
# Reverse lookup: integer class id -> emotion name.
inv_label_map = dict(zip(label_map.values(), label_map.keys()))
print(inv_label_map)

In [None]:
# Pick a random test clip and compare its true label with the prediction.
idx = random.randrange(0, len(test_dataset))
sample = test_dataset[idx]  # load and preprocess the audio only once
print("Original Label:", inv_label_map[int(sample['labels'])])
# Move the input to the model's own device instead of assuming CUDA, and
# add the batch dimension the model expects.
input_values = sample['input_values'].unsqueeze(0).to(model.device)

model.eval()  # inference mode, independent of what ran before this cell
with torch.no_grad():
    outputs = model(input_values)
logits = outputs.logits

predicted_class = logits.argmax(dim=-1).item()
print('Predicted Label:', inv_label_map[predicted_class])

In [None]:
# Pick a random test clip and compare its true label with the prediction.
idx = random.randrange(0, len(test_dataset))
sample = test_dataset[idx]  # load and preprocess the audio only once
print("Original Label:", inv_label_map[int(sample['labels'])])
# Move the input to the model's own device instead of assuming CUDA, and
# add the batch dimension the model expects.
input_values = sample['input_values'].unsqueeze(0).to(model.device)

model.eval()  # inference mode, independent of what ran before this cell
with torch.no_grad():
    outputs = model(input_values)
logits = outputs.logits

predicted_class = logits.argmax(dim=-1).item()
print('Predicted Label:', inv_label_map[predicted_class])

In [None]:
# Pick a random test clip and compare its true label with the prediction.
idx = random.randrange(0, len(test_dataset))
sample = test_dataset[idx]  # load and preprocess the audio only once
print("Original Label:", inv_label_map[int(sample['labels'])])
# Move the input to the model's own device instead of assuming CUDA, and
# add the batch dimension the model expects.
input_values = sample['input_values'].unsqueeze(0).to(model.device)

model.eval()  # inference mode, independent of what ran before this cell
with torch.no_grad():
    outputs = model(input_values)
logits = outputs.logits

predicted_class = logits.argmax(dim=-1).item()
print('Predicted Label:', inv_label_map[predicted_class])