In [2]:
#Importing required libraries 

# Keras
import keras
from keras import regularizers
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Embedding, LSTM
from keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint

# sklearn
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Other  
import librosa
import librosa.display
import json
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
import seaborn as sns
import glob 
import os
import pickle
import IPython.display as ipd  # To play audio in notebook

Using TensorFlow backend.


# Data preparation & processing


# Data Preparation

In [4]:
# Read the CSV of labels and audio paths that the label-extraction step wrote.
df = pd.read_csv(filepath_or_buffer="All_Data_Frames.csv")

# Show the first five rows as a quick sanity check.
df.head(5)

Unnamed: 0,labels,source,path
0,male_neutral,SAVEE,/home/bukya/Desktop/Speech_Emotion_Recognition...
1,male_fear,SAVEE,/home/bukya/Desktop/Speech_Emotion_Recognition...
2,male_surprise,SAVEE,/home/bukya/Desktop/Speech_Emotion_Recognition...
3,male_sad,SAVEE,/home/bukya/Desktop/Speech_Emotion_Recognition...
4,male_disgust,SAVEE,/home/bukya/Desktop/Speech_Emotion_Recognition...


# Feature Extraction

In [5]:
# MFCC (Mel-frequency cepstral coefficients) is a well-known speech feature.
# For every audio file we compute a 13-coefficient MFCC matrix and average it
# over the coefficient axis (axis=0), which leaves one value per time frame.

# Collect one feature vector per file in a plain list and build the data frame
# once at the end; appending rows to a DataFrame one at a time inside the loop
# re-allocates the frame on every insert.
mfcc_rows = []

# loop feature extraction over the entire dataset
for path in df.path:
    # Load a 2.5 s window starting 0.5 s into the clip, resampled to 44.1 kHz.
    X, sample_rate = librosa.load(path
                                  , res_type='kaiser_fast'
                                  , duration=2.5
                                  , sr=44100
                                  , offset=0.5
                                 )

    # mean as the feature. Could do min and max etc as well.
    mfccs = np.mean(librosa.feature.mfcc(y=X,
                                        sr=sample_rate,
                                        n_mfcc=13), axis=0)
    mfcc_rows.append(mfccs)

# One row per audio file; each cell holds that file's 1-D feature vector.
# Clips shorter than the window yield shorter vectors, so lengths may differ.
feature_df = pd.DataFrame({'feature': mfcc_rows})

# check feature data frame
print(len(feature_df))
feature_df.head()

12162


Unnamed: 0,feature
0,"[-3.0082576, -5.508383, -11.619889, -11.603205..."
1,"[-10.467623, -8.993829, -11.7763815, -13.74424..."
2,"[-12.981858, -8.290445, -9.102726, -10.725917,..."
3,"[-23.589674, -24.579994, -22.594236, -21.48121..."
4,"[-22.956581, -23.019234, -20.55793, -19.553026..."


# Data processing

In [6]:
# Spread each per-file feature vector across its own set of columns (one
# column per position in the vector) and attach them to the label/path frame.
# Vectors shorter than the longest one are left as NaN in the trailing columns.
final_df = pd.concat(
    [df, pd.DataFrame(feature_df['feature'].values.tolist())],
    axis=1,
)
final_df.head(5)

Unnamed: 0,labels,source,path,0,1,2,3,4,5,6,...,206,207,208,209,210,211,212,213,214,215
0,male_neutral,SAVEE,/home/bukya/Desktop/Speech_Emotion_Recognition...,-3.008258,-5.508383,-11.619889,-11.603205,-13.251027,-12.154624,-12.164827,...,-28.975243,-27.88574,-27.087734,-29.09494,-26.330254,-25.438536,-25.739513,-26.417332,-25.851837,-25.928373
1,male_fear,SAVEE,/home/bukya/Desktop/Speech_Emotion_Recognition...,-10.467623,-8.993829,-11.776381,-13.744241,-15.344474,-15.923333,-13.486449,...,-8.035086,-7.168881,-6.481255,-6.759146,-6.833916,-6.347439,-5.493447,-5.551433,-0.891208,2.769547
2,male_surprise,SAVEE,/home/bukya/Desktop/Speech_Emotion_Recognition...,-12.981858,-8.290445,-9.102726,-10.725917,-10.041571,-13.318968,-15.079294,...,-25.084118,-26.953909,-28.918743,-28.394785,-26.758858,-27.620451,-27.888905,-28.871309,-27.902435,-28.278313
3,male_sad,SAVEE,/home/bukya/Desktop/Speech_Emotion_Recognition...,-23.589674,-24.579994,-22.594236,-21.481213,-20.949923,-20.414589,-20.267546,...,-8.499668,-10.080903,-12.700766,-17.040066,-20.24037,-23.302591,-24.621037,-23.829395,-12.847005,-5.907684
4,male_disgust,SAVEE,/home/bukya/Desktop/Speech_Emotion_Recognition...,-22.956581,-23.019234,-20.55793,-19.553026,-22.532879,-23.454952,-21.624464,...,-17.982821,-18.645363,-21.921246,-23.019463,-21.649454,-23.983215,-24.639437,-26.931631,-27.579979,-27.450191


In [7]:
# Missing feature values (from vectors shorter than the widest one) become 0.
final_df = final_df.fillna(value=0)

# Report (rows, columns), then preview the first five rows.
print(final_df.shape)
final_df.head(5)

(12162, 219)


Unnamed: 0,labels,source,path,0,1,2,3,4,5,6,...,206,207,208,209,210,211,212,213,214,215
0,male_neutral,SAVEE,/home/bukya/Desktop/Speech_Emotion_Recognition...,-3.008258,-5.508383,-11.619889,-11.603205,-13.251027,-12.154624,-12.164827,...,-28.975243,-27.88574,-27.087734,-29.09494,-26.330254,-25.438536,-25.739513,-26.417332,-25.851837,-25.928373
1,male_fear,SAVEE,/home/bukya/Desktop/Speech_Emotion_Recognition...,-10.467623,-8.993829,-11.776381,-13.744241,-15.344474,-15.923333,-13.486449,...,-8.035086,-7.168881,-6.481255,-6.759146,-6.833916,-6.347439,-5.493447,-5.551433,-0.891208,2.769547
2,male_surprise,SAVEE,/home/bukya/Desktop/Speech_Emotion_Recognition...,-12.981858,-8.290445,-9.102726,-10.725917,-10.041571,-13.318968,-15.079294,...,-25.084118,-26.953909,-28.918743,-28.394785,-26.758858,-27.620451,-27.888905,-28.871309,-27.902435,-28.278313
3,male_sad,SAVEE,/home/bukya/Desktop/Speech_Emotion_Recognition...,-23.589674,-24.579994,-22.594236,-21.481213,-20.949923,-20.414589,-20.267546,...,-8.499668,-10.080903,-12.700766,-17.040066,-20.24037,-23.302591,-24.621037,-23.829395,-12.847005,-5.907684
4,male_disgust,SAVEE,/home/bukya/Desktop/Speech_Emotion_Recognition...,-22.956581,-23.019234,-20.55793,-19.553026,-22.532879,-23.454952,-21.624464,...,-17.982821,-18.645363,-21.921246,-23.019463,-21.649454,-23.983215,-24.639437,-26.931631,-27.579979,-27.450191


In [8]:
# Hold out 25% of the rows for testing. The feature matrix is everything except
# the metadata columns; the target is the `labels` column. The fixed
# random_state keeps the shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    final_df.drop(columns=['path', 'labels', 'source']),
    final_df['labels'],
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

# Peek at a few training rows before normalisation.
X_train[150:160]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
4950,-18.611179,-17.616539,-18.411484,-18.987419,-17.404621,-16.747272,-17.733747,-18.055025,-17.93121,-15.913172,...,-22.899403,-21.647816,-19.758656,-18.879402,-19.397377,-20.171659,-22.689243,-24.612814,-24.153776,-22.703135
3860,-23.275175,-22.550425,-18.601215,-19.310427,-19.504612,-20.461906,-23.478741,-25.597187,-28.436279,-27.643888,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9761,-1.533947,-4.030602,-9.614023,-12.045173,-9.992992,-11.92625,-14.008465,-13.561555,-14.024568,-15.151947,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7620,-4.531077,-3.933792,-4.567834,-5.871509,-5.282475,-6.490459,-8.156466,-9.188803,-8.681725,-8.212409,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11586,-20.621702,-21.587507,-20.563646,-20.703459,-21.205715,-18.608534,-18.446669,-16.211845,-14.257651,-15.160404,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7914,-17.514988,-18.551867,-17.043016,-16.977903,-19.369633,-19.562126,-22.008749,-20.178385,-17.989597,-19.336285,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9513,-18.740368,-18.82493,-16.149488,-16.963457,-18.229979,-18.183952,-19.274342,-18.395123,-16.951286,-16.672031,...,-17.88213,-19.390713,-17.779472,-19.165974,0.0,0.0,0.0,0.0,0.0,0.0
5835,-19.066849,-18.328381,-17.710285,-18.043192,-18.25248,-18.710625,-16.626352,-17.831005,-18.028343,-17.859104,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5389,-20.76059,-20.047138,-18.961346,-19.468687,-19.316292,-18.162563,-18.102333,-19.914133,-20.931385,-19.215496,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11222,-18.252924,-17.727373,-19.222475,-18.469971,-17.572325,-17.850542,-17.932026,-20.5889,-18.612183,-15.990726,...,-18.065437,-18.13509,-19.665306,-20.741905,-20.273037,-18.371035,-15.576723,-17.512489,-17.008547,-18.195284


In [9]:
# Standardise every feature column using statistics computed on the training
# split only, then apply the same shift and scale to the test split.
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)

X_train = X_train.sub(mean, axis='columns').div(std, axis='columns')
X_test = X_test.sub(mean, axis='columns').div(std, axis='columns')

# Peek at the same training rows after normalisation.
X_train[150:160]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
4950,0.185439,0.302201,0.437356,0.388014,0.49772,0.54096,0.459522,0.431126,0.434075,0.579169,...,-0.871456,-0.803882,-0.685581,-0.624922,-0.671214,-0.713581,-0.856821,-0.978781,-0.961893,-0.871796
3860,-0.13997,-0.055768,0.423287,0.364109,0.342614,0.266639,0.036545,-0.123282,-0.338554,-0.28399,...,0.540423,0.526891,0.526358,0.527078,0.51202,0.512089,0.5129,0.501302,0.49637,0.491865
9761,1.376928,1.287903,1.089695,0.90179,1.045145,0.896987,0.733797,0.761431,0.721401,0.635181,...,0.540423,0.526891,0.526358,0.527078,0.51202,0.512089,0.5129,0.501302,0.49637,0.491865
7620,1.167816,1.294927,1.463874,1.358686,1.393065,1.298414,1.164653,1.082863,1.114358,1.1458,...,0.540423,0.526891,0.526358,0.527078,0.51202,0.512089,0.5129,0.501302,0.49637,0.491865
11586,0.045164,0.014095,0.277771,0.261014,0.21697,0.403508,0.407033,0.566614,0.704259,0.634559,...,0.540423,0.526891,0.526358,0.527078,0.51202,0.512089,0.5129,0.501302,0.49637,0.491865
7914,0.261921,0.23434,0.538829,0.536732,0.352584,0.333086,0.144774,0.275043,0.429781,0.327293,...,0.540423,0.526891,0.526358,0.527078,0.51202,0.512089,0.5129,0.501302,0.49637,0.491865
9513,0.176426,0.214528,0.605085,0.537801,0.436759,0.434863,0.346095,0.406126,0.506146,0.523332,...,-0.562113,-0.665129,-0.564183,-0.642408,0.51202,0.512089,0.5129,0.501302,0.49637,0.491865
5835,0.153647,0.250555,0.48935,0.457893,0.435097,0.395969,0.541054,0.447593,0.426931,0.435985,...,0.540423,0.526891,0.526358,0.527078,0.51202,0.512089,0.5129,0.501302,0.49637,0.491865
5389,0.035474,0.125853,0.396583,0.352396,0.356523,0.436443,0.432385,0.294467,0.213417,0.33618,...,0.540423,0.526891,0.526358,0.527078,0.51202,0.512089,0.5129,0.501302,0.49637,0.491865
11222,0.210435,0.29416,0.37722,0.426308,0.485333,0.459485,0.444924,0.244867,0.383991,0.573463,...,-0.573415,-0.587941,-0.679855,-0.73857,-0.724629,-0.604171,-0.427447,-0.551805,-0.530506,-0.601032


In [10]:
# A few preparation steps to get the data into the format Keras expects:
# plain NumPy arrays for the features and one-hot vectors for the targets.
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# One-hot encode the target.
# The encoder is fitted on the training labels ONLY. The test labels are mapped
# with the already-fitted encoder (`transform`, not `fit_transform`) so that
# both splits share the same label -> integer mapping, and so that the pickled
# encoder below reflects the training classes. `transform` raises ValueError if
# the test split contains a label that was never seen during fitting.
lb = LabelEncoder()
y_train = np_utils.to_categorical(lb.fit_transform(y_train))
# Pin the one-hot width to the number of fitted classes so y_test has the same
# number of columns as y_train even if the test split lacks the last class.
y_test = np_utils.to_categorical(lb.transform(y_test), num_classes=len(lb.classes_))

print(X_train.shape)
print(lb.classes_)

# Pickle the lb object for future use; the context manager closes the file
# even if pickling fails.
filename = 'labels'
with open(filename, 'wb') as outfile:
    pickle.dump(lb, outfile)

(9121, 216)
['female_angry' 'female_disgust' 'female_fear' 'female_happy'
 'female_neutral' 'female_sad' 'female_surprise' 'male_angry'
 'male_disgust' 'male_fear' 'male_happy' 'male_neutral' 'male_sad'
 'male_surprise']


In [11]:
# Conv1D takes 3-D input, so insert a singleton channel axis at position 2.
X_train = X_train[:, :, np.newaxis]
X_test = X_test[:, :, np.newaxis]
X_train.shape

(9121, 216, 1)

# Model

In [24]:
# CNN model
#
# A stack of 1-D convolutions over the per-frame feature sequence, ending in a
# softmax over the emotion classes. Every convolution uses a kernel of width 8
# with 'same' padding, so only the two pooling layers shorten the sequence.

# Derive the input length and the number of classes from the prepared arrays
# instead of hard-coding them, so the model follows the data.
n_steps = X_train.shape[1]    # No. of feature columns per sample
n_classes = y_train.shape[1]  # width of the one-hot encoded target

model = Sequential()

# Block 1: two 256-filter convolutions, then normalise, regularise and pool.
model.add(Conv1D(256, 8, padding='same', input_shape=(n_steps, 1)))
model.add(Activation('relu'))
model.add(Conv1D(256, 8, padding='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(MaxPooling1D(pool_size=8))

# Block 2: four 128-filter convolutions, then normalise, regularise and pool.
model.add(Conv1D(128, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(128, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(128, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(128, 8, padding='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(MaxPooling1D(pool_size=8))

# Block 3: two 64-filter convolutions feeding the classifier head.
model.add(Conv1D(64, 8, padding='same'))
model.add(Activation('relu'))
model.add(Conv1D(64, 8, padding='same'))
model.add(Activation('relu'))

# Classifier head: flatten and project onto one unit per target class.
model.add(Flatten())
model.add(Dense(n_classes))  # Target class number
model.add(Activation('softmax'))

# Use the documented optimizer class name (`RMSprop`) rather than the
# lowercase alias; the hyper-parameters are unchanged.
opt = keras.optimizers.RMSprop(lr=0.00001, decay=1e-6)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 216, 256)          2304      
_________________________________________________________________
activation_1 (Activation)    (None, 216, 256)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 216, 256)          524544    
_________________________________________________________________
batch_normalization_1 (Batch (None, 216, 256)          1024      
_________________________________________________________________
activation_2 (Activation)    (None, 216, 256)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 216, 256)          0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 27, 256)          

In [25]:
# Compile with categorical cross-entropy (targets are one-hot) and track accuracy.
model.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

# Train in mini-batches of 16 for up to 100 epochs. The held-out test split is
# passed as the validation data, so its metrics are reported after every epoch.
model_history = model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=100,
    validation_data=(X_test, y_test),
)

Train on 9121 samples, validate on 3041 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100

# Model serialisation

In [26]:
# Persist the trained model (architecture + weights) under ./saved_models
model_name = 'Emotion_Model.h5'
save_dir = os.path.join(os.getcwd(), 'saved_models')

# Create the target directory when it is missing; an existing one is reused.
os.makedirs(save_dir, exist_ok=True)

model_path = os.path.join(save_dir, model_name)
model.save(model_path)
print('Save model and weights at %s ' % model_path)

# Also write the architecture on its own as JSON in the working directory
model_json = model.to_json()
with open("model_json.json", "w") as json_file:
    json_file.write(model_json)

Save model and weights at /home/bukya/Learning_Purpose/Speech_Emotion_Recognition/saved_models/Emotion_Model.h5 


# Model validation

In [12]:
# Rebuild the model from the serialised architecture. The context manager
# guarantees the file handle is closed even if reading fails.
with open('model_json.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# load weights into new model
loaded_model.load_weights("saved_models/Emotion_Model.h5")
print("Loaded model from disk")

# A model rebuilt from JSON has to be compiled before it can be evaluated.
# Use the documented optimizer class name (`RMSprop`) rather than the
# lowercase alias; the hyper-parameters match the ones used for training.
opt = keras.optimizers.RMSprop(lr=0.00001, decay=1e-6)
loaded_model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

# evaluate() returns [loss, accuracy]; report the accuracy as a percentage.
score = loaded_model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

Loaded model from disk
accuracy: 43.93%


In [13]:
# Run the reloaded model over the test set, then reduce each row of class
# probabilities to the index of its most likely class.
preds = np.argmax(
    loaded_model.predict(X_test, batch_size=16, verbose=1),
    axis=1,
)
preds




array([11,  3, 11, ...,  0,  5,  1])

In [14]:
# Predicted class indices -> label strings, wrapped in a one-column frame.
preds = pd.DataFrame({
    'predictedvalues': lb.inverse_transform(preds.astype(int).flatten()),
})

# Ground truth: recover the class index from each one-hot row, then decode it
# with the same encoder.
actual = pd.DataFrame({
    'actualvalues': lb.inverse_transform(
        y_test.argmax(axis=1).astype(int).flatten()
    ),
})

# Put actual and predicted labels side by side (joined on the row index).
finaldf = actual.join(preds)
finaldf[170:180]

Unnamed: 0,actualvalues,predictedvalues
170,male_sad,female_disgust
171,female_surprise,female_sad
172,male_angry,male_happy
173,female_disgust,female_disgust
174,male_angry,male_angry
175,female_fear,female_happy
176,male_surprise,female_angry
177,female_fear,female_sad
178,female_happy,male_sad
179,female_neutral,female_neutral


In [15]:
# Save the actual/predicted label pairs (without the index column).
finaldf.to_csv(path_or_buf='Predictions.csv', index=False)

# Count how many rows were assigned to each predicted label.
finaldf.groupby(by='predictedvalues').count()

Unnamed: 0_level_0,actualvalues
predictedvalues,Unnamed: 1_level_1
female_angry,319
female_disgust,502
female_fear,229
female_happy,425
female_neutral,221
female_sad,348
female_surprise,117
male_angry,109
male_disgust,124
male_fear,64


# Model Accuracy and Confusion Matrix

In [16]:
# Reload the saved predictions and collect the sorted set of actual labels.
finaldf = pd.read_csv("Predictions.csv")
classes = np.sort(finaldf['actualvalues'].unique())

# Confusion matrix (rows: actual labels, columns: predicted labels)
c = confusion_matrix(finaldf['actualvalues'], finaldf['predictedvalues'])

# Overall fraction of rows whose predicted label equals the actual label.
print("Model Accuracy:", accuracy_score(finaldf['actualvalues'], finaldf['predictedvalues']))

Model Accuracy: 0.43932916803683


In [17]:
# Per-class precision / recall / F1, labelled with the sorted actual classes.
classes = np.sort(finaldf['actualvalues'].unique())
report = classification_report(
    finaldf['actualvalues'],
    finaldf['predictedvalues'],
    target_names=classes,
)
print(report)

                 precision    recall  f1-score   support

   female_angry       0.54      0.60      0.56       287
 female_disgust       0.33      0.62      0.43       267
    female_fear       0.50      0.41      0.45       282
   female_happy       0.40      0.58      0.48       293
 female_neutral       0.56      0.52      0.54       236
     female_sad       0.47      0.60      0.53       277
female_surprise       0.88      0.84      0.86       123
     male_angry       0.71      0.38      0.49       204
   male_disgust       0.30      0.18      0.23       200
      male_fear       0.39      0.13      0.19       197
     male_happy       0.30      0.22      0.25       202
   male_neutral       0.37      0.39      0.38       221
       male_sad       0.21      0.14      0.17       209
  male_surprise       0.45      0.51      0.48        43

       accuracy                           0.44      3041
      macro avg       0.46      0.44      0.43      3041
   weighted avg       0.45   