In [1]:
from dataset import EmotionDataset, create_dataloader, create_dataloaders
from pathlib import Path
import os
from dotenv import load_dotenv
# from model import EmotionDetectionModel

import torch

In [2]:
load_dotenv()

# The dataset root comes from the PATH_DATASETS environment variable (optionally provided
# through the .env file loaded above). Check it explicitly: Path(None) would otherwise
# fail with an unhelpful TypeError.
datasets_root = os.getenv("PATH_DATASETS")
if not datasets_root:
    raise RuntimeError("PATH_DATASETS is not set; define it in the environment or in a .env file")

dataset = EmotionDataset(root_dir=Path(datasets_root), resample_rate=16_000)

Loading CREMA-D...
Loading ESD...
Loading JL-Corpus...
Loading RAVDESS Actors...
Loading RAVDESS Speech...
Loading SAVE-E...
Loading TESS...
Loaded all datasets


In [3]:
# Fetch the first sample to sanity-check what the dataset yields.
mfccs, label = dataset[0]

# Per the captured output below: a 2-D feature tensor of shape (20, 561) and an integer label.
print(mfccs.shape)
print(label)


torch.Size([20, 561])
1


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


In [4]:
# Build the train and test loaders with a batch size of 32.
# NOTE(review): the captured output below prints the full "Loading ..." sequence twice, which
# suggests create_dataloaders indexes the corpora twice — confirm in the local dataset module.
train_loader, test_loader = create_dataloaders(batch_size=32)

Loading CREMA-D...
Loading ESD...
Loading JL-Corpus...
Loading RAVDESS Actors...
Loading RAVDESS Speech...
Loading SAVE-E...
Loading TESS...
Loaded all datasets
Loading CREMA-D...
Loading ESD...
Loading JL-Corpus...
Loading RAVDESS Actors...
Loading RAVDESS Speech...
Loading SAVE-E...
Loading TESS...
Loaded all datasets


In [5]:
import torch
import torch.nn as nn


class Detector(nn.Module):
    """1-D convolutional emotion classifier over MFCC sequences.

    Three Conv1d layers slide along the time axis, the resulting feature map is
    averaged over time (so clips of different lengths map to a fixed 64-value
    vector), and a linear layer turns that vector into one score per class.

    Args:
        num_mfccs_features: Number of input channels, i.e. MFCC coefficients per frame.
        num_classes: Number of output scores (logits).
    """

    def __init__(self, num_mfccs_features=20, num_classes=8):
        super().__init__()

        # No padding is used, so with these kernel sizes/strides the input needs
        # at least 13 frames for the last convolution to produce an output.
        self.feature_extractor = nn.Sequential(
            nn.Conv1d(in_channels=num_mfccs_features, out_channels=32, kernel_size=5, stride=2),
            nn.BatchNorm1d(32),
            nn.ELU(),
            nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, stride=1),
            nn.ELU(),
            nn.Conv1d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
        )

        self.classifier = nn.Linear(in_features=64, out_features=num_classes)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: Tensor of shape (batch, num_mfccs_features, frames), or a single
                un-batched sample of shape (num_mfccs_features, frames).

        Returns:
            Logits of shape (batch, num_classes); (num_classes,) for an
            un-batched input.
        """
        # A single sample (e.g. straight from dataset[i]) has no batch axis.
        # BatchNorm1d would misread its first axis as the batch, so add one here
        # and strip it again before returning.
        unbatched = x.dim() == 2
        if unbatched:
            x = x.unsqueeze(0)

        x = self.feature_extractor(x)
        # Global average pooling over the time axis.
        x = x.mean(dim=-1)
        x = self.classifier(x)

        return x.squeeze(0) if unbatched else x

In [6]:
# Untrained Detector with the default sizes; a later cell reads its parameter count.
model = Detector()

In [7]:
# Shape of the single sample fetched earlier: 2-D, no batch axis (see output below).
mfccs.shape

torch.Size([20, 561])

In [8]:
# Another untrained Detector; this is the instance used for the forward pass further down.
detector = Detector()
# Bare expression: displays the module repr (layer stack) as the cell output.
detector

Detector(
  (feature_extractor): Sequential(
    (0): Conv1d(20, 32, kernel_size=(5,), stride=(2,))
    (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ELU(alpha=1.0)
    (3): Conv1d(32, 64, kernel_size=(3,), stride=(1,))
    (4): ELU(alpha=1.0)
    (5): Conv1d(64, 64, kernel_size=(3,), stride=(1,))
  )
  (classifier): Linear(in_features=64, out_features=8, bias=True)
)

In [9]:
# Take one mini-batch from the training loader.
X, y = next(iter(train_loader))
# Per the output below: features are (32, 20, 1793) and labels are (32,).
X.shape, y.shape

(torch.Size([32, 20, 1793]), torch.Size([32]))

In [10]:
# Forward pass of the untrained network on the batch; the output shows one row of 8 scores per sample.
detector(X)

tensor([[ 2.5725e-03, -8.5675e-02, -1.1811e-01,  5.0820e-02, -2.0206e-02,
         -5.4146e-02,  4.4480e-02, -9.1047e-02],
        [ 1.7811e-02, -2.7120e-02, -5.3571e-02, -1.7111e-02, -7.5288e-02,
         -1.2967e-01,  8.1670e-02, -1.2324e-01],
        [-2.9055e-02, -9.6368e-02, -1.4586e-01,  6.0536e-02, -2.0108e-02,
         -4.6663e-02,  2.6467e-02, -5.5843e-02],
        [-5.6705e-02, -1.5930e-01, -2.1151e-01,  1.3603e-01,  2.7248e-02,
          5.7586e-02,  2.3162e-02,  3.5275e-03],
        [ 2.2097e-02,  2.1194e-03,  7.1377e-03, -8.0477e-02, -1.1444e-01,
         -1.9384e-01,  1.0790e-01, -1.6161e-01],
        [-5.5685e-02, -1.4074e-01, -1.8807e-01,  1.0928e-01,  1.0624e-02,
          1.4930e-02,  3.0819e-02, -1.6660e-02],
        [-3.3237e-02, -8.9647e-02, -1.3206e-01,  4.3108e-02, -3.6633e-02,
         -4.7378e-02,  3.4611e-02, -4.2589e-02],
        [-9.6291e-02, -1.8764e-01, -2.5750e-01,  1.7463e-01,  5.2887e-02,
          6.0823e-02, -9.8112e-03,  3.7692e-02],
        [ 3.6649

In [11]:
# Total number of scalar values held in the parameter tensors of `model`.
param_count = 0
for param in model.parameters():
    param_count += param.numel()
print(f'Number of parameters of the model: {param_count:,}')

Number of parameters of the model: 22,376


In [12]:
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torchaudio
from pathlib import Path
import os
import re
import soundfile as sf
import librosa
import random


class EmotionDataset2(Dataset):
    """Speech-emotion dataset that indexes several corpora and yields spectral features.

    Construction only walks the directory tree under ``root_dir`` and records, for
    every ``.wav`` file, its path, an emotion label and the id of the corpus it
    came from. Audio is decoded and turned into features lazily in ``__getitem__``.

    Emotion labels (``self.labels``):
        1 anger, 2 disgust, 3 fear, 4 happy, 5 neutral, 6 sad, 7 surprise

    Corpus ids (``self.dataset_source``):
        1 CREMA-D, 2 ESD, 3 JL-Corpus, 4 RAVDESS actors, 5 RAVDESS speech,
        6 SAVE-E, 7 TESS

    Args:
        root_dir: Directory containing the corpus folders (``str`` or ``Path``).
        resample_rate: Sampling rate in Hz that audio is resampled to on load.
        n_mfcc: Number of MFCC coefficients per frame.
        n_fft: FFT window length in samples; the hop length is ``n_fft // 2``.
    """

    def __init__(self, root_dir, resample_rate=16_000, n_mfcc=20, n_fft=64):

        # Accept plain strings as well: the loaders below rely on the `/` operator.
        self.root_dir = Path(root_dir)
        self.resample_rate = resample_rate
        self.n_mfcc = n_mfcc
        self.n_fft = n_fft

        # Three parallel lists, one entry per indexed audio file.
        self.audio_files = []
        self.labels = []
        self.dataset_source = []

        # Optional subset selection, see set_indices(); None means "use everything".
        self.indices = None

        self.setup_datasets()

    def _add_sample(self, file, label, source):
        """Record one audio file together with its emotion label and corpus id."""
        self.audio_files.append(file)
        self.labels.append(label)
        self.dataset_source.append(source)

    def _load_ravdess(self, path, source):
        """Index a RAVDESS tree laid out as ``<path>/<actor>/*.wav`` under corpus id ``source``."""
        ravdess_conversion_dict = {
            "01": 5,
            "02": 5,  # mapped to neutral as well
            "03": 4,
            "04": 6,
            "05": 1,
            "06": 3,
            "07": 2,
            "08": 7
        }

        for actor in sorted(path.iterdir()):
            if not actor.is_dir():
                continue
            for file in sorted(actor.glob("*.wav")):
                # the emotion code is the third '-'-separated field of the file name
                label = ravdess_conversion_dict[file.stem.split("-")[2]]
                self._add_sample(file, label, source)

    def setup_datasets(self):
        """Walk every corpus directory and fill the file/label/source lists.

        Directory listings are sorted so that the sample order (and therefore any
        index-based split passed to ``set_indices``) is the same on every run;
        ``glob``/``iterdir`` alone give no ordering guarantee.

        Raises:
            KeyError: If a file or folder name carries an emotion tag that the
                corresponding conversion table does not know.
            FileNotFoundError: If one of the corpus folders that is listed with
                ``iterdir`` (ESD, RAVDESS, TESS) does not exist.
        """

        # ? CREMA-D
        print('Loading CREMA-D...')
        crema_d_path = self.root_dir / "CREMA-D" / "AudioWAV"
        crema_d_conversion_dict = {
            "ANG": 1,
            "DIS": 2,
            "FEA": 3,
            "HAP": 4,
            "NEU": 5,
            "SAD": 6,
            # no code is mapped to surprise (7) for this corpus
        }

        for file in sorted(crema_d_path.glob("*.wav")):
            # the emotion code is the second-to-last '_'-separated token
            label = crema_d_conversion_dict[file.stem.split("_")[-2]]
            self._add_sample(file, label, 1)

        # ? ESD
        print('Loading ESD...')
        esd_path = self.root_dir / "ESD"
        esd_conversion_dict = {
            "Angry": 1,
            # no folders are mapped to disgust (2) or fear (3) for this corpus
            "Happy": 4,
            "Neutral": 5,
            "Sad": 6,
            "Surprise": 7
        }

        # layout: <speaker>/<emotion>/*.wav
        for speaker in sorted(esd_path.iterdir()):
            if not speaker.is_dir():
                continue
            for emotion in sorted(speaker.iterdir()):
                if not emotion.is_dir():
                    continue
                for file in sorted(emotion.glob("*.wav")):
                    label = esd_conversion_dict[emotion.name]
                    self._add_sample(file, label, 2)

        # ? JL-Corpus
        print('Loading JL-Corpus...')
        jl_corpus_path = self.root_dir / "JL-Corpus" / 'Raw JL corpus (unchecked and unannotated)' / 'JL(wav+txt)'
        jl_corpus_conversion_dict = {
            "angry": 1,
            # no tag is mapped to disgust (2) for this corpus
            "happy": 4,
            "neutral": 5,
            "sad": 6,
            "surprise": 7,
            "anxious": 3, # we use it as fear here
            "apologetic": None,
            "assertive": None,
            "concerned": 3, # we use it as fear here
            "encouraging": None,
            "excited": 4,
        }

        for file in sorted(jl_corpus_path.glob("*.wav")):
            # the emotion tag is the second '_'-separated token
            label = jl_corpus_conversion_dict[file.stem.split("_")[1]]
            if label is None:
                continue # emotions without a counterpart in the label set are skipped

            self._add_sample(file, label, 3)

        # ? RAVDESS Actors
        print('Loading RAVDESS Actors...')
        self._load_ravdess(self.root_dir / "RAVDESS" / "actors", 4)

        # ? RAVDESS Speech
        print('Loading RAVDESS Speech...')
        # Gets its own corpus id; it used to be tagged 4 like the actors folder.
        self._load_ravdess(self.root_dir / "RAVDESS" / "speech", 5)

        # ? SAVE-E
        print('Loading SAVE-E...')
        savee_path = self.root_dir / "SAVE-E" / "ALL"
        savee_conversion_dict = {
            "a": 1,
            "d": 2,
            "f": 3,
            "h": 4,
            "n": 5,
            "sa": 6,
            "su": 7
        }

        for file in sorted(savee_path.glob("*.wav")):
            # the emotion code is the first run of lowercase letters directly followed by digits
            label = re.findall(r'([a-z]+)\d+', file.stem)[0]
            label = savee_conversion_dict[label]
            self._add_sample(file, label, 6)

        # ? TESS
        print('Loading TESS...')
        tess_path = self.root_dir / "TESS" / "TESS Toronto emotional speech set data"
        tess_conversion_dict = {
            "angry": 1,
            "disgust": 2,
            "fear": 3,
            "happy": 4,
            "neutral": 5,
            "sad": 6,
            "pleasant_surprise": 7,
            "surprise": 7,
            "surprised": 7
        }

        for emotion in sorted(tess_path.iterdir()):
            if not emotion.is_dir():
                continue
            # skip a nested folder carrying the same name as its parent
            if emotion.name == "TESS Toronto emotional speech set data":
                continue

            # the emotion is the last '_'-separated token of the folder name (case-insensitive)
            label = emotion.name.split("_")[-1]
            label = tess_conversion_dict[label.lower()]

            for file in sorted(emotion.glob("*.wav")):
                self._add_sample(file, label, 7)

        print('Loaded all datasets')

    def __len__(self):
        """Number of selectable samples: the active subset if one is set, else all files."""
        if self.indices is None:
            return len(self.audio_files)
        return len(self.indices)

    def __getitem__(self, idx):
        """Load one clip and compute its features.

        Args:
            idx: Position in the active subset (or in the full file list when no
                subset is set).

        Returns:
            Tuple ``(mfccs, mel_spec, chroma, label)``: three float32 tensors
            computed with the same ``n_fft`` and hop length, plus the integer
            emotion label.
        """
        if self.indices is not None:
            idx = self.indices[idx]
        audio_file = self.audio_files[idx]
        label = self.labels[idx]

        # librosa decodes the file, mixes it down to mono and resamples it in one call
        X, sr = librosa.load(str(audio_file), sr=self.resample_rate, mono=True)

        # remove the silences
        X, _ = librosa.effects.trim(X, top_db=20)

        # one hop length for all three features
        hop_length = self.n_fft // 2

        mfccs = librosa.feature.mfcc(y=X, sr=sr, n_mfcc=self.n_mfcc, n_fft=self.n_fft, hop_length=hop_length)
        mfccs = torch.tensor(mfccs, dtype=torch.float32)

        mel_spec = librosa.feature.melspectrogram(y=X, sr=sr, n_fft=self.n_fft, hop_length=hop_length)
        mel_spec = torch.tensor(mel_spec, dtype=torch.float32)

        chroma = librosa.feature.chroma_stft(y=X, sr=sr, n_fft=self.n_fft, hop_length=hop_length)
        chroma = torch.tensor(chroma, dtype=torch.float32)

        return mfccs, mel_spec, chroma, label

    def set_indices(self, indices):
        """Restrict the dataset to the given positions of the full file list (``None`` resets)."""
        self.indices = indices


In [13]:
# Index the corpora again with the multi-feature dataset class, overriding its defaults with 40 MFCCs and a 128-sample FFT window.
dataset2 = EmotionDataset2(root_dir=Path(os.getenv("PATH_DATASETS")), resample_rate=16_000, n_mfcc=40, n_fft=128)

Loading CREMA-D...
Loading ESD...
Loading JL-Corpus...
Loading RAVDESS Actors...
Loading RAVDESS Speech...
Loading SAVE-E...
Loading TESS...
Loaded all datasets


In [14]:
# First sample of the multi-feature dataset; note that this rebinds `mfccs` and `label` from the earlier cell.
mfccs, mel_spec, chroma, label = dataset2[0]

In [15]:
# Per the output below, all three features share the second dimension (281) and differ only in the first.
mfccs.shape, mel_spec.shape, chroma.shape

(torch.Size([40, 281]), torch.Size([128, 281]), torch.Size([12, 281]))

In [16]:
# Stack the three feature maps along dim 0; the output below shows 40 + 128 + 12 = 180 rows.
torch.cat([mfccs, mel_spec, chroma], dim=0).shape

torch.Size([180, 281])