## Speech Emotion Recognition: Audio Classification

Dataset Source: https://www.kaggle.com/datasets/dmitrybabko/speech-emotion-recognition-en

### Prepare

#### Install Missing Libraries

#### Import Necessary Libraries

In [1]:
import glob
import os
import random
import sys
import warnings
# os.environ['TOKENIZERS_PARALLELISM']='false'

import numpy as np
import pandas as pd
import torch

import datasets
from datasets import Audio, ClassLabel, DatasetDict, Features, load_dataset

import transformers
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
from transformers import TrainingArguments, Trainer
from transformers import pipeline

import evaluate

from IPython.display import display

warnings.filterwarnings("ignore")

# Use the MPS backend when it is available; otherwise print which of the two
# prerequisites (PyTorch build support vs. OS/hardware support) is missing.
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
elif torch.backends.mps.is_built():
    print("MPS not available because the current MacOS version is not 12.3+ and/or you do not have an MPS-enabled device on this machine.")
else:
    print("MPS not available because the current PyTorch install was not built with MPS enabled.")

#### Display Library Versions

In [2]:
# Report the interpreter and library versions used for this run.
# `sys.version.split()[0]` returns the complete version number whatever its
# length; a fixed-width slice would truncate e.g. "3.11.10" to "3.11.1".
library_versions = {
    "Python": sys.version.split()[0],
    "NumPy": np.__version__,
    "Pandas": pd.__version__,
    "Datasets": datasets.__version__,
    "Torch": torch.__version__,
    "Transformers": transformers.__version__,
    "Evaluate": evaluate.__version__,
}
for library_name, library_version in library_versions.items():
    # Right-align the captions so the version numbers line up in one column.
    print(f"{library_name}:".rjust(15), library_version)

        Python: 3.11.8
         NumPy: 1.26.4
        Pandas: 2.2.1
      Datasets: 2.18.0
         Torch: 2.2.1
  Transformers: 4.39.0
      Evaluate: 0.4.1


### Load Data

### Load our test set

links: 
- https://discuss.huggingface.co/t/loading-train-and-test-splits-with-audiofolder/22447/4
- https://huggingface.co/docs/datasets/en/loading

#### Prepare Metadata File

In [3]:
# Canonical emotion name -> integer class id used for the evaluation labels.
label2id = {"angry": 0, "disgust": 1, "fearful": 2, "happy": 3, "neutral": 4, "sad": 5, "surprised": 6}
# Derive the inverse mapping so the two dictionaries cannot drift apart.
id2label = {class_id: label for label, class_id in label2id.items()}

# Raw category names found in the metadata CSV -> canonical emotion names.
# "Calmness" and "Neutrality" are both folded into "neutral".
initial_label_update = {"Sadness": "sad",
                        "Anger": "angry",
                        "Disgust": "disgust",
                        "Fear": "fearful",
                        "Happiness": "happy",
                        "Surprise": "surprised",
                        "Calmness": "neutral",
                        "Neutrality": "neutral"}

metadata_all = pd.read_csv('./dataset_info_combined_v4.csv')

# Normalise the raw category names first, then fail loudly if anything is left
# that has no class id; otherwise a string or NaN would end up in the label column.
canonical_label = metadata_all['emotional_category'].replace(initial_label_update)
unmapped_labels = canonical_label[~canonical_label.isin(list(label2id))].unique().tolist()
if unmapped_labels:
    raise ValueError(f"emotional_category values without a class id: {unmapped_labels}")

# Build a fresh frame instead of assigning columns onto a slice of `metadata_all`
# (which is chained assignment). Column order: split, file_name, label.
metadata = pd.DataFrame({
    'split': metadata_all['split'],
    'file_name': metadata_all['renamed_file_path'].str.split("/").str[-1],
    'label': canonical_label.map(label2id),
})

# Write the test-split metadata next to the audio files; the `split` column is
# dropped so the file only holds `file_name` and `label`.
parent_dir = "./dataset/splitted/"
metadata_test = metadata[metadata['split'] == 'test']
metadata_file_location = os.path.join(parent_dir, "test/metadata.csv")
metadata_test.drop(columns=['split']).to_csv(metadata_file_location, index=False)

metadata.head()

Unnamed: 0,split,file_name,label
0,train,CREMA-D_1022_ITS_ANG_XX_Anger_-1.wav,0
1,train,CREMA-D_1037_ITS_ANG_XX_Anger_-1.wav,0
2,train,CREMA-D_1060_ITS_NEU_XX_Neutrality_0.wav,4
3,train,CREMA-D_1075_ITS_NEU_XX_Neutrality_0.wav,4
4,train,CREMA-D_1073_IOM_DIS_XX_Disgust_-1.wav,1


#### Ingest & Preprocess Dataset

In [4]:
sampling_rate = 16000

# Load the audio files from the test folder. The loaded split is exposed under
# the 'train' key (that is how the following cells access it) even though these
# files are the held-out test set.
audio_data = load_dataset("./dataset/splitted/test")

# Cast Audio Feature to Data Type of Audio, reusing the single `sampling_rate`
# constant instead of repeating the literal.
audio_data = audio_data.cast_column("audio", Audio(sampling_rate=sampling_rate))

Resolving data files:   0%|          | 0/1181 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
# Summarise the evaluation split: schema/size, first and last sample, label ids present.
evaluation_split = audio_data['train']
summary_rows = [
    ("Evaluation Dataset Info: ", evaluation_split),
    ("First Sample in Evaluation Dataset", evaluation_split[0]),
    ("Last Sample in Evaluation Dataset", evaluation_split[-1]),
    ("Unique Values in Label/Class: ", sorted(evaluation_split.unique("label"))),
]

print("\n\nEvaluation Dataset")
for caption, value in summary_rows:
    print(caption, value)



Evaluation Dataset
Evaluation Dataset Info:  Dataset({
    features: ['audio', 'label'],
    num_rows: 1180
})
First Sample in Evaluation Dataset {'audio': {'path': '/Users/bianca/Library/CloudStorage/OneDrive-SharedLibraries-NationalUniversityofSingapore/Capstone Scoping - 2. data/dataset/splitted/test/CREMA-D_1004_DFA_ANG_XX_Anger_-1.wav', 'array': array([-0.00033569, -0.00128174, -0.00161743, ...,  0.        ,
        0.        ,  0.        ]), 'sampling_rate': 16000}, 'label': 0}
Last Sample in Evaluation Dataset {'audio': {'path': '/Users/bianca/Library/CloudStorage/OneDrive-SharedLibraries-NationalUniversityofSingapore/Capstone Scoping - 2. data/dataset/splitted/test/TESS_YAF_sad_YAF_wife_sad_Sadness_-1.wav', 'array': array([-2.25831755e-05, -2.09043574e-04, -1.47747167e-04, ...,
       -1.03936618e-04, -7.94933148e-05,  0.00000000e+00]), 'sampling_rate': 16000}, 'label': 5}
Unique Values in Label/Class:  [0, 1, 2, 3, 4, 5, 6]


#### Inference

## "Rajaram1996/Hubert_emotion": 7 emotion classes × 2 genders (14 labels)
{"female_angry": 0,"female_disgust": 1,"female_fear": 2,"female_happy": 3,"female_neutral": 4,"female_sad": 5,"female_surprise": 6,  
 "male_angry": 7,"male_disgust": 8,"male_fear": 9,"male_happy": 10,"male_neutral": 11,"male_sad": 12,"male_surprise": 13}

In [8]:
# Run the pretrained emotion classifier over every clip of the evaluation split.
# `pipeline` is imported with the other transformers names in the import cell.
#
# NOTE(review): the recorded output of this cell reports that 'classifier.*' and
# 'projector.*' weights were "newly initialized" rather than loaded from the
# checkpoint, so the predictions may come from an untrained classification
# head. Confirm the checkpoint is loaded with the architecture it was trained
# with before trusting the metrics computed from these predictions.
clf_hubert = pipeline("audio-classification", model="Rajaram1996/Hubert_emotion")

# One list of {'label', 'score'} candidates per clip (consumed that way below).
pred_prob_hubert = clf_hubert(audio_data['train']['audio'])

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at Rajaram1996/Hubert_emotion and are newly initialized: ['classifier.bias', 'classifier.weight', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Collapse the model's gender-specific labels ("<gender>_<emotion>") onto the
# seven dataset class ids; both genders of one emotion share the same id.
# Target ids: {"angry": 0,"disgust": 1,"fearful": 2, "happy": 3, "neutral": 4, "sad": 5, "surprised": 6}
model_hubert_label2id = {
    f"{gender}_{emotion}": class_id
    for gender in ("female", "male")
    for class_id, emotion in enumerate(
        ("angry", "disgust", "fear", "happy", "neutral", "sad", "surprise")
    )
}

In [11]:
# Keep the highest-scoring label for each clip.
pred_hubert = [
    max(clip_candidates, key=lambda candidate: candidate['score'])['label']
    for clip_candidates in pred_prob_hubert
]

# Translate the model's label names into dataset class ids.
pred_hubert_ids = [model_hubert_label2id[label_name] for label_name in pred_hubert]

# NOTE: `compute_metrics` is defined in the "Define Metrics Evaluation Function"
# cell, which appears later in this notebook; that cell must be run first.
compute_metrics(pred_hubert_ids, audio_data['train']['label'])

{'accuracy': 0.22372881355932203,
 'Weighted F1': 0.2164919626503917,
 'Micro F1': 0.22372881355932203,
 'Macro F1': 0.19736953281826414,
 'Weighted Recall': 0.22372881355932203,
 'Micro Recall': 0.22372881355932203,
 'Macro Recall': 0.20406851265096204,
 'Weighted Precision': 0.262093400652312,
 'Micro Precision': 0.22372881355932203,
 'Macro Precision': 0.23832568829432801}

#### Define Metrics Evaluation Function

In [9]:
def compute_metrics(pred_ids, ref_ids):
    '''
    Score predicted class ids against reference class ids.

    Parameters
    ----------
    pred_ids : predicted class ids.
    ref_ids : reference class ids, passed to the metrics alongside `pred_ids`.

    Returns
    -------
    dict
        Metric name -> value, in this order: "accuracy", then the
        "Weighted" / "Micro" / "Macro" variants of "F1", "Recall" and
        "Precision" (e.g. "Weighted F1", "Macro Precision").
    '''
    # `evaluate` is imported once at the top of the notebook; no per-call import.
    scores = {
        "accuracy": evaluate.load("accuracy").compute(predictions=pred_ids,
                                                      references=ref_ids)["accuracy"]
    }

    # (metric id understood by `evaluate.load`, caption used in the result keys)
    averaged_metrics = (("f1", "F1"), ("recall", "Recall"), ("precision", "Precision"))
    averaging_modes = ("weighted", "micro", "macro")

    for metric_id, metric_caption in averaged_metrics:
        # Load each metric once and reuse it for all three averaging modes.
        metric = evaluate.load(metric_id)
        for mode in averaging_modes:
            scores[f"{mode.capitalize()} {metric_caption}"] = metric.compute(
                predictions=pred_ids,
                references=ref_ids,
                average=mode)[metric_id]

    return scores