In [3]:
# Libraries
import torchaudio
from transformers import AutoProcessor, AutoModelForAudioClassification
import torchaudio.transforms as T
import torch
import glob
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Define path to data: every .wav file directly inside ./data (relative to
# the current working directory).
path = os.path.join(os.getcwd(), "data")
# glob does not guarantee any particular ordering (it depends on the file
# system), so sort the paths. This keeps the row order of the extracted
# features -- and therefore the seeded train/test split -- reproducible.
wav_files = sorted(glob.glob(os.path.join(path, "*.wav")))

In [5]:
# Load the pre-trained emotion-recognition checkpoint: the classification
# model itself and the processor that prepares raw audio for it.
name_model = "Dpngtm/wav2vec2-emotion-recognition"
model = AutoModelForAudioClassification.from_pretrained(name_model)
processor = AutoProcessor.from_pretrained(name_model)
# Predict the emotion label of a single audio file
def predict_emotion(file_path):
    """Return the emotion label the pre-trained model assigns to one audio file.

    The file is loaded with ``torchaudio.load``, downmixed to mono, resampled
    to 16 kHz when needed, passed through the module-level ``processor`` and
    ``model``, and the highest-scoring class is mapped to its label.

    Args:
        file_path: Path to an audio file that ``torchaudio.load`` can read.

    Returns:
        The entry of ``model.config.id2label`` for the predicted class id.
    """
    target_sr = 16000  # sampling rate handed to the processor

    waveform, sr = torchaudio.load(file_path)

    # Downmix multi-channel audio to a single channel (same convention as the
    # feature extraction code, which treats dim 0 as channels). Without this,
    # a stereo file stays 2-D after squeezing and the class lookup below can
    # receive an index that is not a valid class id.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample if necessary
    if sr != target_sr:
        waveform = torchaudio.transforms.Resample(sr, target_sr)(waveform)

    # squeeze(0) drops only the channel axis, leaving a 1-D signal.
    inputs = processor(waveform.squeeze(0), sampling_rate=target_sr, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits

    # Take the argmax over the class axis (not over the flattened tensor) and
    # read the single item of the batch.
    predicted_class_id = torch.argmax(logits, dim=-1)[0].item()
    predicted_label = model.config.id2label[predicted_class_id]
    return predicted_label

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
# Feature extraction function
def get_feautres_from_file(wav_files):
    """Build one feature row and one emotion label per audio file.

    Each file is loaded, averaged down to a single channel when it has more
    than one, and summarised by five numbers taken from its log-Mel
    spectrogram and length. The label comes from ``predict_emotion``.

    Args:
        wav_files: Iterable of audio file paths.

    Returns:
        A ``(features, labels)`` pair of parallel lists. Each entry of
        ``features`` is ``[mean, std, max, min, duration]``; each entry of
        ``labels`` is the value returned by ``predict_emotion`` for that file.
    """

    def _summarize(signal, sample_rate):
        """Return [mean, std, max, min, duration] for one loaded waveform."""
        # Log-Mel spectrogram of the signal
        log_mel = T.AmplitudeToDB()(T.MelSpectrogram(sample_rate=sample_rate)(signal))
        # Global statistics over the whole spectrogram
        row = [
            stat.item()
            for stat in (log_mel.mean(), log_mel.std(), log_mel.max(), log_mel.min())
        ]
        # Duration (in seconds): number of samples divided by the sample rate
        row.append(signal.shape[1] / sample_rate)
        return row

    features = []
    labels = []
    for wav_path in wav_files:
        signal, sample_rate = torchaudio.load(wav_path)

        # Ensure mono by averaging across channels
        if signal.shape[0] > 1:
            signal = signal.mean(dim=0, keepdim=True)

        features.append(_summarize(signal, sample_rate))
        # Emotion label predicted for this file
        labels.append(predict_emotion(wav_path))

    return features, labels

In [None]:
# Extract the feature rows and the predicted emotion labels for every .wav
# file found above. This runs the emotion model once per file.
features, labels = get_feautres_from_file(wav_files)



In [None]:
# Create the feature table (one row per file) and the matching label series
feature_columns = ["mean_db", "std_db", "max_db", "min_db", "duration"]
X = pd.DataFrame(data=features, columns=feature_columns)
y = pd.Series(data=labels, name="emotion")

In [None]:
# Train-test split: hold out 20% of the rows, stratified on the emotion
# label, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Normalize: fit the scaler on the training rows only, then apply the same
# transformation to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train model: a 100-tree random forest with a fixed seed, fitted on the
# scaled training features.
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X=X_train_scaled, y=y_train)

In [None]:
# Evaluation: predict on the held-out rows and print the per-class report
y_pred = clf.predict(X_test_scaled)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)