# **Environment Setup**

In [None]:
!pip install ipython-autotime
%load_ext autotime

In [2]:
import tensorflow as tf
import zipfile
import matplotlib.pyplot as plt
import numpy as np
import os
import shutil
import re
import pandas as pd
import librosa

time: 4.1 s (started: 2022-09-26 18:21:45 +00:00)


# **Import Dataset from Kaggle**

In [3]:
!mkdir /root/.kaggle
!cp /content/drive/MyDrive/kaggle.json /root/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

time: 966 ms (started: 2022-09-26 18:21:50 +00:00)


**Download RAVDESS**

In [4]:
!kaggle datasets download uwrfkaggler/ravdess-emotional-speech-audio -p /content/dataset

Downloading ravdess-emotional-speech-audio.zip to /content/dataset
 99% 425M/429M [00:13<00:00, 21.1MB/s]
100% 429M/429M [00:13<00:00, 33.9MB/s]
time: 14.9 s (started: 2022-09-26 18:21:51 +00:00)


**Download CREMA-D**

In [5]:
!kaggle datasets download ejlok1/cremad -p /content/dataset

Downloading cremad.zip to /content/dataset
 98% 441M/451M [00:08<00:00, 58.3MB/s]
100% 451M/451M [00:08<00:00, 56.6MB/s]
time: 9.51 s (started: 2022-09-26 18:22:05 +00:00)


**Download TESS**

In [6]:
!kaggle datasets download ejlok1/toronto-emotional-speech-set-tess -p /content/dataset

Downloading toronto-emotional-speech-set-tess.zip to /content/dataset
 96% 411M/428M [00:01<00:00, 292MB/s]
100% 428M/428M [00:01<00:00, 254MB/s]
time: 2.81 s (started: 2022-09-26 18:22:15 +00:00)


**Download SAVEE**

In [7]:
!kaggle datasets download barelydedicated/savee-database -p /content/dataset

Downloading savee-database.zip to /content/dataset
 98% 210M/215M [00:00<00:00, 343MB/s]
100% 215M/215M [00:00<00:00, 305MB/s]
time: 1.84 s (started: 2022-09-26 18:22:18 +00:00)


**Extract RAVDESS**

In [8]:
# Unpack RAVDESS into its own folder; sort_ravdess is later pointed at
# /content/dataset/ravdess.
with zipfile.ZipFile("/content/dataset/ravdess-emotional-speech-audio.zip","r") as zip_ref:
  zip_ref.extractall("/content/dataset/ravdess")

time: 8.59 s (started: 2022-09-26 18:22:20 +00:00)


**Extract CREMA-D**

In [9]:
# Unpack CREMA-D into its own folder; sort_cremad is later pointed at the
# AudioWAV folder inside /content/dataset/cremad.
with zipfile.ZipFile("/content/dataset/cremad.zip","r") as zip_ref:
  zip_ref.extractall("/content/dataset/cremad")

time: 6.63 s (started: 2022-09-26 18:22:28 +00:00)


**Extract SAVEE**

In [10]:
# Unpack SAVEE into its own folder; sort_savee is later pointed at the
# AudioData folder inside /content/dataset/savee.
with zipfile.ZipFile("/content/dataset/savee-database.zip","r") as zip_ref:
  zip_ref.extractall("/content/dataset/savee")

time: 2.91 s (started: 2022-09-26 18:22:35 +00:00)


**Extract TESS**

In [11]:
# Unpack TESS into its own folder; sort_tess is later pointed at the
# "TESS Toronto emotional speech set data" folder inside /content/dataset/tess.
with zipfile.ZipFile("/content/dataset/toronto-emotional-speech-set-tess.zip","r") as zip_ref:
  zip_ref.extractall("/content/dataset/tess")

time: 6.29 s (started: 2022-09-26 18:22:38 +00:00)


In [12]:
# Delete the four downloaded archives; their contents were extracted by the
# preceding cells.
!rm -r /content/dataset/ravdess-emotional-speech-audio.zip
!rm -r /content/dataset/cremad.zip
!rm -r /content/dataset/toronto-emotional-speech-set-tess.zip
!rm -r /content/dataset/savee-database.zip
# NOTE(review): presumably this folder duplicates the actor folders extracted
# next to it, and leaving it in place would make sort_ravdess treat it as one
# more sub-folder of /content/dataset/ravdess — confirm against the archive.
!rm -r /content/dataset/ravdess/audio_speech_actors_01-24

time: 846 ms (started: 2022-09-26 18:22:44 +00:00)


In [13]:
# Create one output folder per emotion class under /content/converted_images.
# os.makedirs(..., exist_ok=True) also creates the parent folder and keeps the
# cell idempotent, whereas the previous `!mkdir` lines errored on every re-run.
for emotion in ("neutral", "calm", "happy", "sad",
                "angry", "fearful", "disgust", "surprised"):
  os.makedirs(os.path.join("/content/converted_images", emotion), exist_ok=True)

time: 1.13 s (started: 2022-09-26 18:22:45 +00:00)


# **Data Extraction and Preparation**

**CREMA-D Sorting**

In [14]:
def get_label_cremad(file):
  """Return the CREMA-D emotion code found in a clip's file name.

  Only the final path component is searched, so a folder whose name happens
  to contain one of the codes cannot influence the result.

  Args:
    file: File name or full path of a clip.

  Returns:
    The first of "ANG", "DIS", "FEA", "HAP", "NEU", "SAD" (tested in that
    order) that occurs in the file name, or None when none of them does.
  """
  labels = ["ANG","DIS","FEA","HAP","NEU","SAD"]
  # Callers may pass full paths; drop the directories before matching.
  name = os.path.basename(file)
  for label in labels:
    if label in name:
      return label

  return None

def sort_cremad(path,dest):
  """Move every CREMA-D clip in `path` into its emotion folder under `dest`.

  Args:
    path: Folder that directly contains the .wav clips.
    dest: Root folder holding one sub-folder per emotion; an emotion folder
      that does not exist yet is created.

  Raises:
    KeyError: If a .wav file name contains no known emotion code.
  """
  labels = {"ANG":os.path.join(dest,"angry"),
            "DIS":os.path.join(dest,"disgust"),
            "FEA":os.path.join(dest,"fearful"),
            "HAP":os.path.join(dest,"happy"),
            "NEU":os.path.join(dest,"neutral"),
            "SAD":os.path.join(dest,"sad")}
  # Snapshot the listing first: the files are moved out of `path` below.
  clips = [entry for entry in os.scandir(path)
           if entry.is_file() and entry.name.endswith(".wav")]
  for clip in clips:
    # Label from the file name only, so folder names in `path` cannot match.
    image_label = get_label_cremad(clip.name)
    if image_label is None:
      raise KeyError("no CREMA-D emotion code in file name: " + clip.name)
    # labels[...] already contains `dest`; joining it with `dest` again
    # produced dest/dest/<emotion> whenever `dest` was a relative path.
    target_dir = labels[image_label]
    # Without an existing folder shutil.move would rename the clip to the
    # folder path itself, and every later clip would overwrite it.
    os.makedirs(target_dir, exist_ok=True)
    shutil.move(clip.path,target_dir)

time: 1.83 ms (started: 2022-09-26 18:22:46 +00:00)


In [15]:
sort_cremad("/content/dataset/cremad/AudioWAV","/content/converted_images")

time: 295 ms (started: 2022-09-26 18:22:46 +00:00)


**RAVDESS Sorting**

In [16]:
def sort_ravdess(path,dest):
  """Move RAVDESS clips from the sub-folders of `path` into `dest`/<emotion>.

  The character at index 7 of each file name is read as a 1-based emotion
  number and mapped onto: neutral, calm, happy, sad, angry, fearful, disgust,
  surprised.

  Args:
    path: Folder whose immediate sub-folders contain the clips.
    dest: Root folder that holds the per-emotion folders.
  """
  emotions = ("neutral","calm","happy","sad","angry","fearful","disgust","surprised")
  actor_dirs = [entry.path for entry in os.scandir(path) if entry.is_dir()]
  for actor_dir in actor_dirs:
    # Materialise the listing before moving anything out of the folder.
    clips = [entry for entry in os.scandir(actor_dir) if ".wav" in entry.path]
    for clip in clips:
      emotion_number = int(clip.name[7])
      shutil.move(clip.path, os.path.join(dest, emotions[emotion_number - 1]))

time: 1.39 ms (started: 2022-09-26 18:22:47 +00:00)


In [17]:
sort_ravdess("/content/dataset/ravdess","/content/converted_images")

time: 70.2 ms (started: 2022-09-26 18:22:47 +00:00)


**SAVEE Sorting**

In [18]:
def get_label_savee(file):
  """Return the SAVEE emotion prefix that `file` starts with.

  Args:
    file: File name of a clip (name only, not a path).

  Returns:
    One of "a", "d", "f", "h", "n", "sa", "su" when the name begins with it,
    otherwise None.
  """
  # re.match anchors at the start of the string, like the "^" patterns did.
  prefix = re.match("a|d|f|h|n|sa|su", file)
  return prefix.group(0) if prefix else None

def sort_savee(path,dest):
  """Move SAVEE clips from the sub-folders of `path` into `dest`/<emotion>.

  Each clip is renamed to "<counter>.wav", where the counter runs over all
  moved clips (presumably because the sub-folders reuse the same file names —
  confirm against the dataset). Folders and files are visited in name order
  so the numbering is reproducible.

  Args:
    path: Folder whose immediate sub-folders contain the clips.
    dest: Root folder holding one sub-folder per emotion; an emotion folder
      that does not exist yet is created.

  Raises:
    KeyError: If a .wav file name starts with no known emotion prefix.
  """
  counter = 0
  labels = {"a":os.path.join(dest,"angry"),
            "d":os.path.join(dest,"disgust"),
            "f":os.path.join(dest,"fearful"),
            "h":os.path.join(dest,"happy"),
            "n":os.path.join(dest,"neutral"),
            "sa":os.path.join(dest,"sad"),
            "su":os.path.join(dest,"surprised")}

  dirs = sorted(entry.path for entry in os.scandir(path) if entry.is_dir())
  for speaker_dir in dirs:
    # Snapshot the listing first: the files are moved out of the folder below.
    files = sorted((entry for entry in os.scandir(speaker_dir)
                    if entry.is_file() and entry.name.endswith(".wav")),
                   key=lambda entry: entry.name)
    for file in files:
      label = get_label_savee(file.name)
      if label is None:
        raise KeyError("no SAVEE emotion prefix in file name: " + file.name)
      # labels[...] already contains `dest`; joining it with `dest` again
      # produced dest/dest/<emotion> whenever `dest` was a relative path.
      target_dir = labels[label]
      os.makedirs(target_dir, exist_ok=True)
      new_file_name = os.path.join(target_dir,(str(counter) + ".wav"))
      shutil.move(file.path,new_file_name)
      counter+=1

time: 3.44 ms (started: 2022-09-26 18:22:47 +00:00)


In [19]:
sort_savee("/content/dataset/savee/AudioData","/content/converted_images")

time: 21.7 ms (started: 2022-09-26 18:22:47 +00:00)


**TESS Sorting**

In [20]:
def get_label_tess(file):
  """Return the first TESS emotion keyword contained in `file`.

  Args:
    file: File name of a clip.

  Returns:
    The first of "angry", "disgust", "fear", "happy", "neutral", "ps", "sad"
    (tested in that order) that occurs anywhere in `file`, or None when none
    of them does.
  """
  keywords = ("angry","disgust","fear","happy","neutral","ps","sad")
  return next((keyword for keyword in keywords if keyword in file), None)


def sort_tess(path,dest):
  """Move TESS clips from the sub-folders of `path` into `dest`/<emotion>.

  Args:
    path: Folder whose immediate sub-folders contain the clips.
    dest: Root folder holding one sub-folder per emotion; an emotion folder
      that does not exist yet is created.

  Raises:
    KeyError: If a .wav file name contains no known emotion keyword.
  """
  labels = {"angry":os.path.join(dest,"angry"),
            "disgust":os.path.join(dest,"disgust"),
            "fear":os.path.join(dest,"fearful"),
            "happy":os.path.join(dest,"happy"),
            "neutral":os.path.join(dest,"neutral"),
            "ps":os.path.join(dest,"surprised"),
            "sad":os.path.join(dest,"sad")}
  dirs = [entry.path for entry in os.scandir(path) if entry.is_dir()]
  for folder in dirs:
    # Snapshot the listing first: the files are moved out of the folder below.
    files = [entry for entry in os.scandir(folder)
             if entry.is_file() and entry.name.endswith(".wav")]
    for file in files:
      label = get_label_tess(file.name)
      if label is None:
        raise KeyError("no TESS emotion keyword in file name: " + file.name)
      # labels[...] already contains `dest`; joining it with `dest` again
      # produced dest/dest/<emotion> whenever `dest` was a relative path.
      target_dir = labels[label]
      # Without an existing folder shutil.move would rename the clip to the
      # folder path itself, and every later clip would overwrite it.
      os.makedirs(target_dir, exist_ok=True)
      shutil.move(file.path,target_dir)

time: 2.04 ms (started: 2022-09-26 18:22:47 +00:00)


In [21]:
sort_tess("/content/dataset/tess/TESS Toronto emotional speech set data","/content/converted_images")

time: 123 ms (started: 2022-09-26 18:22:47 +00:00)


**WAV to MFCCs**

In [22]:
def get_num_files(path):
  """Count the files that sit directly inside each sub-folder of `path`.

  Files placed directly in `path`, and files nested more than one level
  deep, are not counted.

  Args:
    path: Folder whose immediate sub-folders are inspected.

  Returns:
    Total number of files over all immediate sub-folders.
  """
  total = 0
  for folder in os.scandir(path):
    if folder.is_dir():
      total += sum(1 for entry in os.scandir(folder.path) if entry.is_file())
  return total

time: 1.05 ms (started: 2022-09-26 18:22:47 +00:00)


In [23]:
# Settings shared by convert_in_dir and the model's input layer.
# Rate passed to librosa.load as `sr` for every clip.
sampling_rate=44100
# Seconds of audio kept per clip (clips are cut or padded to this length).
audio_duration=2.5
# Number of MFCC coefficients requested from librosa.feature.mfcc.
n_mfcc = 25
# Frames per clip: 44100 * 2.5 = 110250 samples and 1 + 110250 // 512 = 216,
# assuming librosa's default hop length of 512 — revisit if the sample rate,
# duration or hop length changes.
mfcc_length = 216 #depends on input_length
# Number of files in the sorted emotion folders; sizes the arrays that
# convert_in_dir allocates.
num_files = get_num_files("/content/converted_images")

time: 18.2 ms (started: 2022-09-26 18:22:47 +00:00)


In [24]:
def convert_in_dir(dir,sampling_rate,audio_duration,num_files):
  """Turn every .wav in the sub-folders of `dir` into a fixed-size MFCC matrix.

  Each clip is loaded from 0.5 s onwards for at most `audio_duration`
  seconds, cut or zero-padded to exactly sampling_rate * audio_duration
  samples, and converted to MFCCs. The sub-folder name is used as the label.
  Reads the notebook-level settings `n_mfcc` and `mfcc_length`.

  Args:
    dir: Root folder with one sub-folder per label.
    sampling_rate: Rate the audio is resampled to when loaded.
    audio_duration: Seconds of audio kept per clip.
    num_files: Upper bound on the number of clips; sizes the output arrays.

  Returns:
    (X, Y): X has shape (n_clips, n_mfcc, mfcc_length, 1) and Y holds the
    label string of each clip, where n_clips is the number of .wav files
    actually converted.

  Raises:
    IndexError: If more than `num_files` .wav files are found.
    ValueError: If a clip's MFCC matrix is not (n_mfcc, mfcc_length).
  """
  # fix_length needs an integer sample count; 44100 * 2.5 is the float 110250.0.
  input_length = int(round(sampling_rate * audio_duration))
  X = np.empty(shape=(num_files,n_mfcc, mfcc_length, 1))
  Y = np.empty(shape=(num_files),dtype="object")
  counter = 0
  # Sorted traversal makes the row order independent of the file system.
  subdirs = sorted(subdir.path for subdir in os.scandir(dir) if subdir.is_dir())
  for subdir in subdirs:
    label = os.path.basename(subdir)
    files = sorted(wav.path for wav in os.scandir(subdir)
                   if wav.is_file() and wav.name.endswith(".wav"))
    for wav in files:
      data, _ = librosa.load(wav, sr=sampling_rate
                               ,res_type="kaiser_fast"
                               ,duration=audio_duration
                               ,offset=0.5
                              )

      # Keyword arguments: current librosa releases reject positional
      # `size` / `y`, and older releases accept the keywords too.
      data = librosa.util.fix_length(data, size=input_length)
      MFCC = librosa.feature.mfcc(y=data, sr=sampling_rate, n_mfcc=n_mfcc)
      MFCC = np.expand_dims(MFCC, axis=-1)
      X[counter,] = MFCC
      Y[counter] = label
      counter+=1
      # Periodic progress instead of one output line per clip.
      if counter % 500 == 0:
        print(counter, "/", num_files)

  # num_files may include non-.wav files; drop the rows that were never
  # filled, since np.empty leaves them as arbitrary memory.
  return X[:counter],Y[:counter]

time: 4.43 ms (started: 2022-09-26 18:22:47 +00:00)


In [None]:
X,Y = convert_in_dir("/content/converted_images", sampling_rate,audio_duration,num_files)

**Train and Test Set building**

In [26]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split. random_state pins the shuffle so that a
# Restart & Run All reproduces the same train/test membership.
train_x,test_x,train_y,test_y = train_test_split(X,Y,shuffle=True,test_size=0.2,stratify=Y,random_state=42)

time: 174 ms (started: 2022-09-26 18:32:58 +00:00)


In [27]:
train_y[:10]

array(['fearful', 'happy', 'sad', 'neutral', 'happy', 'neutral', 'angry',
       'angry', 'happy', 'sad'], dtype=object)

time: 8.12 ms (started: 2022-09-26 18:32:58 +00:00)


In [28]:
from sklearn.preprocessing import OneHotEncoder

one_hot = OneHotEncoder()

# Learn the label -> column mapping from the training labels only.
train_y = one_hot.fit_transform(train_y.reshape(-1,1)).toarray()
# Reuse that mapping for the test labels. Re-fitting on test_y would give a
# different column layout whenever the test split lacks one of the classes.
test_y = one_hot.transform(test_y.reshape(-1,1)).toarray()

time: 8.56 ms (started: 2022-09-26 18:32:58 +00:00)


In [29]:
train_x.shape

(9729, 25, 216, 1)

time: 6.93 ms (started: 2022-09-26 18:32:58 +00:00)


In [30]:
# Standardise each MFCC position using statistics of the training split only,
# so nothing about the test split leaks into the scaling.
# NOTE: this cell overwrites train_x / test_x; re-running it without
# re-running the split cell standardises already-standardised data.
mean = np.mean(train_x, axis=0)
std = np.std(train_x, axis=0)
# A position that is constant across the training set has std == 0; dividing
# by it would put NaN/inf into both splits. Leave such positions centred only.
std[std == 0] = 1.0

train_x = (train_x - mean)/std
test_x = (test_x - mean)/std

time: 564 ms (started: 2022-09-26 18:32:58 +00:00)


# **Training and Testing with 2D CNN**

In [37]:
# 2D CNN over the (n_mfcc, mfcc_length, 1) MFCC "image":
# four identical conv blocks followed by a small dense classifier.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape=(n_mfcc,mfcc_length,1)))

# Conv block = 3x3 convolution (32 filters) -> batch norm -> max-pool -> dropout.
for conv_block in range(4):
  model.add(tf.keras.layers.Convolution2D(32, 3, padding="same",activation="relu"))
  model.add(tf.keras.layers.BatchNormalization())
  model.add(tf.keras.layers.MaxPool2D())
  model.add(tf.keras.layers.Dropout(0.2))

model.add(tf.keras.layers.Flatten())

# Classifier head.
# NOTE(review): Dropout is applied both before and after BatchNormalization
# here — confirm that the double dropout is intended.
model.add(tf.keras.layers.Dense(64,activation="relu"))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.2))
# One softmax output per emotion folder (8 classes).
model.add(tf.keras.layers.Dense(8,activation="softmax"))

time: 314 ms (started: 2022-09-26 18:51:10 +00:00)


In [None]:
model.summary()

In [54]:
# Softmax outputs with one-hot targets -> categorical cross-entropy.
# Precision and Recall are tracked alongside accuracy, in that order.
model.compile(optimizer="adam",loss="categorical_crossentropy",metrics=["accuracy",tf.keras.metrics.Precision(),tf.keras.metrics.Recall()])

time: 13.5 ms (started: 2022-09-26 18:58:15 +00:00)


In [55]:
# Multiply the learning rate by 0.25 once val_accuracy has gone 8 epochs
# without improving; verbose=1 logs each reduction.
lr_plateau = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_accuracy",patience=8,verbose=1,factor=0.25)

time: 863 µs (started: 2022-09-26 18:58:17 +00:00)


In [56]:
# Keep only the weights (not the full model) of the epoch with the highest
# val_accuracy seen so far, written to /content/model_checkpoints.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="/content/model_checkpoints",
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

time: 11.7 ms (started: 2022-09-26 18:58:17 +00:00)


In [None]:
# Train for up to 100 epochs with learning-rate reduction and best-weights
# checkpointing.
# NOTE(review): the test split doubles as validation data, so both callbacks
# (and the checkpointed "best" weights) are tuned on the test set; the
# reported val_* metrics are therefore not an unbiased test estimate.
model.fit(train_x,train_y,validation_data=(test_x,test_y),epochs=100,callbacks=[lr_plateau,model_checkpoint])

In [None]:
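# Recorded result — accuracy: 0.6954, precision: 0.8074, recall: 0.5721
# (the last two labels are presumed from the metric order passed to
# model.compile; the run and data split they come from were not recorded).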