Import Modules

In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import librosa
import librosa.display
from IPython.display import Audio

import torchaudio
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2Model, Wav2Vec2Processor, Trainer, TrainingArguments, Wav2Vec2ForSequenceClassification

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

cpu


Load the Dataset

In [2]:
paths = []
labels = []


for dirname, _ , filenames in os.walk('./dataset'):
    for filename in filenames:
        paths.append(os.path.join(dirname, filename))
        label = filename.split('_')[-1]
        label = label.split('.')[0]
        labels.append(label.lower())
        
    if len(paths) == 2800:
        break


Create DataFrame

In [3]:
df = pd.DataFrame()
df['audio_path'] = paths
df['label'] = labels 

Create custom dataset class


In [None]:
# Convert labels to integers
labels_map = {label: idx for idx, label in enumerate(df['label'].unique())}
inverse_label_map = {idx:label for label, idx in labels_map.items()}
df['label'] = df['label'].map(labels_map)


In [None]:
import torch
from torch.utils.data import Dataset
import numpy as np
import librosa

class SpeechEmotionDataset(Dataset):
    def __init__(self, df, processor, max_length=32000):
        self.df = df
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        return len(self.df) 

    def __getitem__(self, idx):
        audio_path = self.df.iloc[idx]['audio_path']
        label = self.df.iloc[idx]['label']

        # Load the audio file
        speech, sr = librosa.load(audio_path, sr=16000)

        # Pad or truncate
        if len(speech) > self.max_length:
            speech = speech[:self.max_length]
        else:
            speech = np.pad(speech, (0, self.max_length - len(speech)), 'constant')

        # Preprocess with the processor
        inputs = self.processor(
            speech,
            sampling_rate=16000,
            return_tensors='pt'
        )

        input_values = inputs.input_values.squeeze()  
        return {
            'input_values': input_values,
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [8]:
# Split data for train and test
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [9]:
# Initial processor and model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2ForSequenceClassification.from_pretrained('facebook/wav2vec2-base', num_labels = 7)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
train_dataset = SpeechEmotionDataset(train_df, processor)
test_dataset = SpeechEmotionDataset(test_df, processor)


In [11]:
# Create dataloader
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)



Set Training Arguments

In [12]:
training_args = TrainingArguments(
    output_dir = './results',
    eval_strategy = 'epoch',
    save_strategy = 'epoch',
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 3,
    weight_decay = 0.01,
    report_to = [],
)

In [48]:
# Create function for computing the metrics

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

def compute_metrics(pred):
    labels = pred.label_ids                    
    preds  = np.argmax(pred.predictions, axis=1)  

    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted"  
    )

    return {
        "accuracy":  accuracy,
        "precision": precision,
        "recall":    recall,
        "f1":        f1
    }


In [15]:
# Initial the trainer
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    compute_metrics = compute_metrics
)

trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.493792,0.996429,0.996509,0.996429,0.996426
2,No log,0.217652,1.0,1.0,1.0,1.0
3,No log,0.169286,1.0,1.0,1.0,1.0


TrainOutput(global_step=420, training_loss=0.5904856727236794, metrics={'train_runtime': 6539.8616, 'train_samples_per_second': 1.028, 'train_steps_per_second': 0.064, 'total_flos': 1.2201848064e+17, 'train_loss': 0.5904856727236794, 'epoch': 3.0})

In [31]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.16928617656230927, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 68.6735, 'eval_samples_per_second': 8.155, 'eval_steps_per_second': 0.51, 'epoch': 3.0}


Test predictions

In [None]:
import random

idx = random.randrange(0, len(test_dataset))
input_values = test_dataset[idx]['input_values'].unsqueeze(0)

with torch.no_grad():
    outputs = model(input_values)

logits = outputs.logits

predicted_class = logits.argmax(dim = -1).item()
print('predicted_class: ', predicted_class)