## IMPORTING LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram
import seaborn as sns

import librosa
import librosa.display

import tensorflow as tf
import keras
from keras import regularizers
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Embedding, LSTM
from keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from keras.utils import np_utils
from tensorflow.keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

from sklearn.metrics import confusion_matrix,accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from scipy.io.wavfile import read as wavread
import sys
import json
import argparse
import pickle
import glob 
import IPython.display as ipd
import os
import time
import warnings

warnings.filterwarnings(action="ignore")
%matplotlib inline

## READING DATASET

In [4]:
# Root folders of the four speech-emotion corpora read by the cells below.
# Every value ends with "/" because file and folder names are appended to it
# by plain string concatenation.
TESS = "/kaggle/input/toronto-emotional-speech-set-tess/tess toronto emotional speech set data/TESS Toronto emotional speech set data/"
RAV = "/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/"
SAVEE="/kaggle/input/surrey-audiovisual-expressed-emotion-savee/ALL/"
# NOTE(review): this one is a relative path while the other three are absolute;
# confirm it resolves correctly from the notebook's working directory.
CREMA = "../input/cremad/AudioWAV/"

In [5]:
# Lookup from the two characters at positions [-8:-6] of a SAVEE file name
# to the label assigned to that file. Every SAVEE label carries the "male_" prefix.
savee_codes = {
    '_a': 'male_angry',
    '_d': 'male_disgust',
    '_f': 'male_fear',
    '_h': 'male_happy',
    '_n': 'male_neutral',
    'sa': 'male_sad',
    'su': 'male_surprise',
}

dir_list = sorted(os.listdir(SAVEE))

# File names whose code is not in the table are tagged 'male_error'.
emotion = [savee_codes.get(name[-8:-6], 'male_error') for name in dir_list]
path = [SAVEE + name for name in dir_list]

# One row per file: label, corpus name, full path.
SAVEE_df = pd.DataFrame({'labels': emotion}).assign(source='SAVEE', path=path)
SAVEE_df["labels"].value_counts()

In [6]:
# Numeric emotion code (third dash-separated field of the file name) -> emotion name.
# Codes 1 and 2 are both mapped to 'neutral'.
emotion_names = {1:'neutral', 2:'neutral', 3:'happy', 4:'sad', 5:'angry', 6:'fear', 7:'disgust', 8:'surprise'}

dir_list = sorted(os.listdir(RAV))

emotion, gender, path = [], [], []
for actor_dir in dir_list:
    for wav_name in os.listdir(RAV + actor_dir):
        # Strip the extension, then split the stem into its dash-separated fields.
        fields = wav_name.split('.')[0].split('-')
        emotion.append(int(fields[2]))
        # The seventh field decides the gender: even -> female, odd -> male.
        actor_id = int(fields[6])
        gender.append("male" if actor_id % 2 else "female")
        path.append(RAV + actor_dir + '/' + wav_name)

# Combine gender and emotion name into labels such as 'male_angry'.
emotion_col = pd.Series(emotion).replace(emotion_names)
gender_col = pd.Series(gender)
RAVDESS_df = pd.DataFrame({'labels': gender_col + '_' + emotion_col})
RAVDESS_df['source'] = 'RAVDESS'
RAVDESS_df['path'] = path
RAVDESS_df["labels"].value_counts()

In [7]:
# Class balance of the RAVDESS labels.
plt.figure(figsize=(22,5))
sns.set_style("darkgrid")
# Pass the column through `x=`; a positional argument is interpreted as `data`
# by recent seaborn releases and does not produce per-label counts.
sns.countplot(x=RAVDESS_df["labels"])
plt.title("Count Plot",size=20)
# Render the count plot before the next figure is opened
# (the bare plt.plot() used previously draws nothing).
plt.show()

# Waveform and audio player for one example recording.
filename = RAV + 'Actor_16/03-01-07-01-02-01-16.wav'
data, sampling_rate = librosa.load(filename)
plt.figure(figsize=(15, 5))
plt.title("RAVDESS")
# Newer librosa releases provide `waveshow` and no longer ship `waveplot`;
# use whichever one the installed version offers.
draw_wave = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot
draw_wave(data, sr=sampling_rate)
ipd.Audio(filename)

In [8]:
# TESS folder name -> label. The folder names are matched exactly,
# including their inconsistent capitalisation.
tess_folder_labels = {
    'OAF_angry': 'female_angry',
    'YAF_angry': 'female_angry',
    'OAF_disgust': 'female_disgust',
    'YAF_disgust': 'female_disgust',
    'OAF_Fear': 'female_fear',
    'YAF_fear': 'female_fear',
    'OAF_happy': 'female_happy',
    'YAF_happy': 'female_happy',
    'OAF_neutral': 'female_neutral',
    'YAF_neutral': 'female_neutral',
    'OAF_Pleasant_surprise': 'female_surprise',
    'YAF_pleasant_surprised': 'female_surprise',
    'OAF_Sad': 'female_sad',
    'YAF_sad': 'female_sad',
}

dir_list = sorted(os.listdir(TESS))
path = []
emotion = []

for folder in dir_list:
    # The label depends only on the folder; unrecognised folders yield 'Unknown'.
    folder_label = tess_folder_labels.get(folder, 'Unknown')
    for wav_name in os.listdir(TESS + folder):
        emotion.append(folder_label)
        path.append(TESS + folder + "/" + wav_name)

# One row per file: label, corpus name, full path.
TESS_df = pd.DataFrame({'labels': emotion}).assign(source='TESS', path=path)
TESS_df.labels.value_counts()

In [9]:
# Class balance of the TESS labels.
plt.figure(figsize=(15,5))
sns.set_style("darkgrid")
# Pass the column through `x=`; a positional argument is interpreted as `data`
# by recent seaborn releases and does not produce per-label counts.
sns.countplot(x=TESS_df["labels"])
plt.title("Count Plot",size=20)
# Render the count plot before the next figure is opened
# (the bare plt.plot() used previously draws nothing).
plt.show()

# Waveform and audio player for one example recording.
filename = TESS + 'YAF_angry/YAF_cab_angry.wav'

data, sampling_rate = librosa.load(filename)
plt.figure(figsize=(15, 5))
plt.title("TESS")
# Newer librosa releases provide `waveshow` and no longer ship `waveplot`;
# use whichever one the installed version offers.
draw_wave = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot
draw_wave(data, sr=sampling_rate)

ipd.Audio(filename)

In [10]:
dir_list = sorted(os.listdir(CREMA))

# Speaker ids (first underscore-separated field of the file name) that are
# labelled female; every other id is labelled male.
female = [1002,1003,1004,1006,1007,1008,1009,1010,1012,1013,1018,1020,1021,1024,1025,1028,1029,1030,1037,1043,1046,1047,1049,
          1052,1053,1054,1055,1056,1058,1060,1061,1063,1072,1073,1074,1075,1076,1078,1079,1082,1084,1089,1091]

# Emotion code (third underscore-separated field) -> emotion name.
crema_codes = {
    'SAD': 'sad',
    'ANG': 'angry',
    'DIS': 'disgust',
    'FEA': 'fear',
    'HAP': 'happy',
    'NEU': 'neutral',
}

gender = []
emotion = []
path = []
for wav_name in dir_list:
    fields = wav_name.split('_')
    speaker_gender = 'female' if int(fields[0]) in female else 'male'
    gender.append(speaker_gender)
    code = fields[2]
    if code in crema_codes:
        # Labels take the form '<gender>_<emotion>', e.g. 'female_sad'.
        emotion.append(speaker_gender + '_' + crema_codes[code])
    else:
        emotion.append('Unknown')
    path.append(CREMA + wav_name)

# One row per file: label, corpus name, full path.
CREMA_df = pd.DataFrame({'labels': emotion}).assign(source='CREMA', path=path)
CREMA_df.labels.value_counts()

In [11]:
# Stack the four per-corpus tables row-wise. Each table keeps its own row
# index here, so index values repeat in the combined frame.
corpus_frames = [SAVEE_df, RAVDESS_df, TESS_df, CREMA_df]
ref = pd.concat(corpus_frames, axis=0)
print(ref.labels.value_counts())
ref.head()

### DATA VISUALIZATION

Let's take two samples from RAVDESS, one male and one female angry voice, and analyze each of them using its wave plot and spectrograms.

In [12]:
# Visualise one RAVDESS recording: waveform, MFCCs, log-mel spectrogram and STFT magnitudes.
sample_idx = 9
sample_label = RAVDESS_df['labels'][sample_idx]

# Load 2 seconds of audio starting 1 second into the file.
X, sample_rate = librosa.load(RAVDESS_df["path"][sample_idx], res_type="kaiser_fast", duration=2, offset=1)
mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13)
# melspectrogram returns a power spectrogram by default, so it is converted with
# power_to_db. The signal is passed as `y=` because newer librosa releases no
# longer accept it positionally.
log_S = librosa.power_to_db(librosa.feature.melspectrogram(y=X, sr=sample_rate, n_mels=128))

n_fft = 2048
ft = np.abs(librosa.stft(X, hop_length=n_fft+1))

# Newer librosa releases provide `waveshow` and no longer ship `waveplot`;
# use whichever one the installed version offers.
draw_wave = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot

plt.figure(figsize=(20,15))
plt.subplot(4,1,1)
draw_wave(X, sr=sample_rate)
plt.title(f"Audio for {sample_label}")

plt.figure(figsize=(20,15))
plt.subplot(4,1,2)
librosa.display.specshow(mfcc, x_axis="time")
plt.colorbar(format="%+2.0f dB")
plt.title(f"MFCC for {sample_label}")

plt.figure(figsize=(20,15))
plt.subplot(4,1,3)
librosa.display.specshow(log_S, sr=sample_rate, y_axis="mel")
plt.title(f"Log mel spectrogram for {sample_label}")

plt.figure(figsize=(20,15))
# Fourth panel position, matching the other sample-visualisation cells.
plt.subplot(4,1,4)
plt.plot(ft)
plt.title(f"Spectrum for {sample_label}")
plt.xlabel("Frequency")
plt.ylabel("Amplitude")

In [13]:
# Visualise a second RAVDESS recording: waveform, MFCCs, log-mel spectrogram and STFT magnitudes.
sample_idx = 71
sample_label = RAVDESS_df['labels'][sample_idx]

# Load 2 seconds of audio starting 1 second into the file.
X, sample_rate = librosa.load(RAVDESS_df["path"][sample_idx], res_type="kaiser_fast", duration=2, offset=1)
mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13)
# melspectrogram returns a power spectrogram by default, so it is converted with
# power_to_db. The signal is passed as `y=` because newer librosa releases no
# longer accept it positionally.
log_S = librosa.power_to_db(librosa.feature.melspectrogram(y=X, sr=sample_rate, n_mels=128))

n_fft = 2048
ft = np.abs(librosa.stft(X, hop_length=n_fft+1))

# Newer librosa releases provide `waveshow` and no longer ship `waveplot`;
# use whichever one the installed version offers.
draw_wave = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot

plt.figure(figsize=(20,15))
plt.subplot(4,1,1)
draw_wave(X, sr=sample_rate)
plt.title(f"Audio for {sample_label}")

plt.figure(figsize=(20,15))
plt.subplot(4,1,2)
librosa.display.specshow(mfcc, x_axis="time")
plt.colorbar(format="%+2.0f dB")
plt.title(f"MFCC for {sample_label}")

plt.figure(figsize=(20,15))
plt.subplot(4,1,3)
librosa.display.specshow(log_S, sr=sample_rate, y_axis="mel")
plt.title(f"Log mel spectrogram for {sample_label}")

plt.figure(figsize=(20,15))
plt.subplot(4,1,4)
plt.plot(ft)
plt.title(f"Spectrum for {sample_label}")
plt.xlabel("Frequency")
plt.ylabel("Amplitude")

In [14]:
# Visualise one TESS recording: waveform, MFCCs, log-mel spectrogram and STFT magnitudes.
sample_idx = 600
# Read the label from the table instead of hard-coding it, so the titles
# always match the recording that is actually plotted.
sample_label = TESS_df['labels'][sample_idx]

# Load 2 seconds of audio starting 1 second into the file.
X, sample_rate = librosa.load(TESS_df["path"][sample_idx], res_type="kaiser_fast", duration=2, offset=1)
mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13)
# melspectrogram returns a power spectrogram by default, so it is converted with
# power_to_db. The signal is passed as `y=` because newer librosa releases no
# longer accept it positionally.
log_S = librosa.power_to_db(librosa.feature.melspectrogram(y=X, sr=sample_rate, n_mels=128))

n_fft = 2048
ft = np.abs(librosa.stft(X, hop_length=n_fft+1))

# Newer librosa releases provide `waveshow` and no longer ship `waveplot`;
# use whichever one the installed version offers.
draw_wave = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot

plt.figure(figsize=(20,15))
plt.subplot(4,1,1)
draw_wave(X, sr=sample_rate)
plt.title(f"Audio for {sample_label}")


plt.figure(figsize=(20,15))
plt.subplot(4,1,2)
librosa.display.specshow(mfcc, x_axis="time")
plt.colorbar(format="%+2.0f dB")
plt.title(f"MFCC for {sample_label}")

plt.figure(figsize=(20,15))
plt.subplot(4,1,3)
librosa.display.specshow(log_S, sr=sample_rate, y_axis="mel")
plt.title(f"Log mel spectrogram for {sample_label}")

plt.figure(figsize=(20,15))
plt.subplot(4,1,4)
plt.plot(ft)
plt.title(f"Spectrum for {sample_label}")
plt.xlabel("Frequency")
plt.ylabel("Amplitude")

## FEATURE EXTRACTION

In [15]:
# Extract one feature vector per audio file listed in `ref`.
# The vectors are collected in a list and turned into a DataFrame once at the
# end; appending to a DataFrame row by row gets slower as the frame grows.
feature_rows = []
for wav_path in ref.path:
    # Load 2.5 seconds of audio starting 0.5 seconds into the file, resampled to 44.1 kHz.
    X, sample_rate = librosa.load(wav_path, res_type='kaiser_fast', duration=2.5, sr=44100, offset=0.5)

    # Average the 13 MFCC coefficients within each frame (axis=0), which
    # yields one value per time frame.
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
    feature_rows.append(mfccs)

# A single object column named 'feature'; each cell holds one array.
df = pd.DataFrame({'feature': pd.Series(feature_rows, dtype=object)})

df.head()

In [16]:
# Give the combined table a fresh 0..n-1 index and discard the old one.
ref = ref.reset_index(drop=True)

In [17]:
# Expand the per-file feature arrays into one numeric column per position,
# then place those columns next to the label/source/path columns of `ref`.
feature_matrix = pd.DataFrame(df['feature'].values.tolist())
df = pd.concat([ref, feature_matrix], axis=1)
df.head()

In [18]:
# Replace every missing value in the table with 0.
df = df.fillna(value=0)
print(df.shape)
df.head()

## TRAIN TEST SPLIT

In [19]:
# Features are every column except the three metadata columns; the target is `labels`.
# 20% of the rows are held out, shuffled with a fixed seed.
feature_cols = df.drop(['path','labels','source'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    feature_cols,
    df.labels,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

### NORMALIZATION

In [20]:
# Standardise each feature column using statistics computed on the training
# set only, then apply the same shift and scale to the test set.
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)

# A column that is constant in the training set has zero standard deviation;
# dividing by it would produce NaN. Use a scale of 1 for such columns so they
# simply become 0 after centring.
std[std == 0] = 1

X_train = (X_train - mean)/std
X_test = (X_test - mean)/std

### BASELINE TEST

#### DECISION TREE

In [21]:
# Baseline 1: a decision tree with default hyper-parameters.
start_time=time.time()
# Fix the seed so the baseline is reproducible, using the same value (42)
# as the train/test split and the random forest.
dtree=DecisionTreeClassifier(random_state=42)
dtree.fit(X_train,y_train)
prediction=dtree.predict(X_test)
print(classification_report(y_test,prediction))
# Elapsed wall-clock time in seconds for fitting, predicting and reporting.
print(time.time()-start_time)

#### RANDOM FOREST

In [22]:
# Baseline 2: a large random forest of shallow, regularised trees.
start_time=time.time()
# n_jobs=-1 builds and evaluates the 22000 trees on all available cores.
# With random_state fixed, the fitted model does not depend on the number of jobs.
rand_forest=RandomForestClassifier(criterion="gini",max_features="log2",max_depth=10,max_leaf_nodes=100,min_samples_leaf=3,
                                   min_samples_split=20,n_estimators=22000,random_state=42,n_jobs=-1)
rand_forest.fit(X_train,y_train)
print(classification_report(y_test,rand_forest.predict(X_test)))
# Elapsed wall-clock time in seconds for fitting, predicting and reporting.
print(time.time()-start_time)

#### SEQUENTIAL MODEL

In [23]:
# Convert features and targets to plain NumPy arrays for Keras.
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# One-hot encode the target.
# The encoder is fitted on the training labels only and then reused for the
# test labels, so both sets share the same class-to-column mapping. Refitting
# on the test labels could assign different integers to the same class and
# would also leave the pickled encoder fitted on the test set.
lb = LabelEncoder()
train_codes = lb.fit_transform(y_train)
test_codes = lb.transform(y_test)  # raises ValueError on a label unseen in training
num_classes = len(lb.classes_)
y_train = np_utils.to_categorical(train_codes, num_classes=num_classes)
y_test = np_utils.to_categorical(test_codes, num_classes=num_classes)

print(X_train.shape)
print(lb.classes_)

# Pickle the fitted encoder for future use; `with` closes the file even if dump fails.
filename = 'labels'
with open(filename,'wb') as outfile:
    pickle.dump(lb,outfile)

In [24]:
# Append a trailing axis of size 1 as the third dimension, matching the
# (length, 1) input shape expected by the Conv1D model.
X_train = X_train[:, :, np.newaxis]
X_test = X_test[:, :, np.newaxis]
X_train.shape

In [25]:
# 1-D convolutional classifier over the per-file feature vector.
# The output width follows the one-hot target instead of being hard-coded,
# so the model still fits if the label set changes.
num_classes = y_train.shape[1]

model = Sequential()
# Block 1: two 256-filter convolutions, then batch norm, dropout and pooling.
model.add(Conv1D(256, 8, padding='same',input_shape=(X_train.shape[1],1)))  # X_train.shape[1] = No. of Columns
model.add(Activation('relu'))
model.add(Conv1D(256, 8, padding='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(MaxPooling1D(pool_size=(8)))
# Block 2: four 128-filter convolutions, then batch norm, dropout and pooling.
model.add(Conv1D(128, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(128, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(128, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(128, 8, padding='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(MaxPooling1D(pool_size=(8)))
# Block 3: two 64-filter convolutions feeding the classification head.
model.add(Conv1D(64, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(64, 8, padding='same'))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(num_classes))
model.add(Activation('softmax'))

# `learning_rate` is the current argument name; `lr` is a deprecated alias.
# NOTE(review): `decay` is kept as in the original; confirm the installed
# TensorFlow version still accepts it.
opt = tf.keras.optimizers.RMSprop(learning_rate=0.00001, decay=1e-6)
model.summary()

In [26]:
# Compile with categorical cross-entropy and train for 80 epochs.
# The held-out split is passed as validation data, so it is evaluated after every epoch.
model.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)
start_time = time.time()
model_history = model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=80,
    validation_data=(X_test, y_test),
)
# Elapsed training time in seconds.
print(time.time() - start_time)