## Install required libraries

In [1]:
%%capture
!pip install gradio
!pip install evaluate

## Load the dataset

Create dictionaries to map labels to ids and vice versa

In [3]:
label2id = {
    'neutral': 0,
    'calm': 1,
    'happy': 2,
    'sad': 3,
    'angry': 4,
    'fearful': 5,
    'disgust': 6,
    'surprised': 7
}

id2label = {}
for key, value in label2id.items():
    id2label.update({value: key})

id2label

{0: 'neutral',
 1: 'calm',
 2: 'happy',
 3: 'sad',
 4: 'angry',
 5: 'fearful',
 6: 'disgust',
 7: 'surprised'}

In [4]:
import os
import pandas as pd

data = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        id = int(filename[7])
        emotion = id2label[id-1]
        data.append({
            'path': file_path,
            'emotion': emotion,
            'id': id-1
        })

df = pd.DataFrame(data)

In [5]:
df.head()

Unnamed: 0,path,emotion,id
0,/kaggle/input/ravdess-emotional-speech-audio/A...,surprised,7
1,/kaggle/input/ravdess-emotional-speech-audio/A...,neutral,0
2,/kaggle/input/ravdess-emotional-speech-audio/A...,disgust,6
3,/kaggle/input/ravdess-emotional-speech-audio/A...,disgust,6
4,/kaggle/input/ravdess-emotional-speech-audio/A...,neutral,0


In [6]:
df['emotion'].value_counts()

emotion
surprised    384
disgust      384
fearful      384
sad          384
happy        384
calm         384
angry        384
neutral      192
Name: count, dtype: int64

#### Split the dataset

In [7]:
from datasets import Audio, Dataset
#Create hugging face audio dataset
dataset = Dataset.from_dict({"audio": df['path'], "label": df['id']}).cast_column("audio", Audio())
dataset = dataset.train_test_split(seed=42, shuffle=True, test_size=0.2)
dataset['train'][0]

{'audio': {'path': '/kaggle/input/ravdess-emotional-speech-audio/Actor_03/03-01-08-01-01-02-03.wav',
  'array': array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 3.05175781e-05]),
  'sampling_rate': 48000},
 'label': 7}

#### Listen to few examples using gradio interface

In [8]:
import gradio as gr

def generate_audio():
    example = dataset["train"].shuffle()[0]
    audio = example["audio"]
    return (
        audio["sampling_rate"],
        audio["array"],
    ), id2label[example["label"]]

with gr.Blocks() as demo:
    with gr.Column():
        for _ in range(4):
            audio, label = generate_audio()
            output = gr.Audio(audio, label=label)

demo.launch()



* Running on local URL:  http://127.0.0.1:7860
Kaggle notebooks require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://0c892292e917aba7a0.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




#### Preprocess the data

I will use DistilHuBERT model for fine tuning

In [9]:
from transformers import AutoFeatureExtractor

model_id = "ntu-spml/distilhubert"

feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, do_normalize=True, return_attention_mask=True
)

sampling_rate = feature_extractor.sampling_rate
print("Model's sample rate: ", sampling_rate)

preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

Model's sample rate:  16000


Since sampling rates of dataset and model are different, we will need to convert the sample rate of our samples to 16000 Hz.

In [10]:
dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
dataset['train'][0]

{'audio': {'path': '/kaggle/input/ravdess-emotional-speech-audio/Actor_03/03-01-08-01-01-02-03.wav',
  'array': array([-3.04453351e-10,  3.02634362e-10, -2.81943358e-10, ...,
         -1.75207053e-04, -6.04685229e-05,  2.10753860e-05]),
  'sampling_rate': 16000},
 'label': 7}

In [11]:
def preprocess_function(examples):
    """Prepare the dataset for training"""
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate = feature_extractor.sampling_rate,
        return_attention_mask = True
    )
    return inputs

dataset_enc = dataset.map(
    preprocess_function,
    remove_columns=['audio'],
    batched=True,
    batch_size=100,
    num_proc=1
)

dataset_enc

Map:   0%|          | 0/2304 [00:00<?, ? examples/s]

Map:   0%|          | 0/576 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 2304
    })
    test: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 576
    })
})

#### Fine-tuning the model

In [12]:
from transformers import AutoModelForAudioClassification

num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label
)

config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/94.0M [00:00<?, ?B/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
from transformers import TrainingArguments

model_name = model_id.split("/")[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 7

training_args = TrainingArguments(
    f"{model_name}-finetuned-ravdess",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    push_to_hub=False,
    report_to='none'
)



In [18]:
import evaluate
import numpy as np

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

In [19]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=dataset_enc["train"],
    eval_dataset=dataset_enc["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy
1,1.5516,1.462943,0.519097
2,0.8858,0.94037,0.713542
3,0.4844,0.518282,0.857639
4,0.1999,0.28187,0.932292
5,0.162,0.183065,0.953125
6,0.0837,0.130776,0.963542
7,0.0574,0.118378,0.972222


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=1008, training_loss=0.6410924392560172, metrics={'train_runtime': 1365.6103, 'train_samples_per_second': 11.81, 'train_steps_per_second': 0.738, 'total_flos': 1.6137221209918848e+17, 'train_loss': 0.6410924392560172, 'epoch': 7.0})