In [1]:
# Imports, ignore warnings
import warnings
warnings.filterwarnings('ignore')
import os
from tensorflow import keras
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from keras.models import model_from_json
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

Using TensorFlow backend.


In [2]:
# Read in the JSON file
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()

In [3]:
# Load the model from JSON 
loaded_model = model_from_json(loaded_model_json)

In [11]:
# Load weights into new model
loaded_model.load_weights('saved_models/Emotion_Voice_Detection_Model.h5')
print('Loaded model from disk')
loaded_model.summary()

Loaded model from disk
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_7 (Conv1D)            (None, 216, 128)          768       
_________________________________________________________________
activation_8 (Activation)    (None, 216, 128)          0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 216, 128)          82048     
_________________________________________________________________
activation_9 (Activation)    (None, 216, 128)          0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 216, 128)          0         
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 27, 128)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (N

In [13]:
loaded_model.layers[17].get_config()

{'name': 'activation_14',
 'trainable': True,
 'dtype': 'float32',
 'activation': 'softmax'}

In [5]:
o = keras.optimizers.RMSprop(lr = 0.00001, decay = 1e-6)
loaded_model.compile(loss ='categorical_crossentropy', optimizer = o, metrics = ['accuracy'])

In [6]:
# Method for reading in the audio files and extracting features
"""
- d is the directory the audio files, default is the current working directory.
- dur is the duration in seconds that will be read in.
- For this CNN to work, dur must be 2.5
"""

def readAudioFiles(d, dur, sample_rate):
    if d is None:
        d = 'dir'
        
    df = pd.DataFrame(columns=['feature'])
    file_names = []
    i = 0
    for audiofile in os.listdir(d):
        # Load file using librosa
        print(audiofile, "loaded")
        file_names.append(audiofile)
        X, sr = librosa.load(os.path.join(d, audiofile), res_type = 'kaiser_fast', duration = dur , sr = sample_rate, offset = 0.5)
        sr = np.array(sr)
        # Extract the MFCCS
        mfccs = np.mean(librosa.feature.mfcc(y = X, 
                                            sr = sr, 
                                            n_mfcc = 13),
                        axis=0)
        feature = mfccs
        # Add to data frame
        df.loc[i] = [feature]
        i += 1
    df = pd.DataFrame(df['feature'].values.tolist())
    df = shuffle(df)
    df = df.fillna(0)
    return df, file_names 

In [7]:
audio_features, file_names = readAudioFiles(d = 'the-office-audio-clips', dur = 2.5, sample_rate = 44100)    

daffyduck.wav loaded
deranged.wav loaded
weapons.wav loaded
gamble.wav loaded
cage.wav loaded
punish.wav loaded
shesaid.wav loaded
smile.wav loaded
wild.wav loaded


In [8]:
audio_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
0,1.501406,0.817227,-0.033943,-0.573337,-0.992338,-1.820299,-0.825532,-1.126159,-2.236412,-1.79086,...,1.436221,2.606878,2.107405,2.170962,2.021762,1.636188,1.21244,0.841189,0.434088,0.131884
7,-2.409396,-2.300485,-2.479992,0.562576,1.272716,1.013227,0.298986,0.323682,-0.840398,-0.688106,...,-7.117177,-7.068227,-4.295846,-1.048497,0.295969,0.091689,-0.28353,-0.673306,-0.760514,1.282261
3,-10.703444,-8.510207,-5.300148,-6.360087,-7.391901,-5.837784,-4.275815,-2.047449,-1.596384,-0.740989,...,-1.705452,-2.427559,-4.075689,-5.216367,-2.87422,-3.190225,-3.929511,-3.21832,-1.793323,-0.007453
4,3.229875,-0.25044,-3.86676,-1.847083,-1.318632,-3.685853,-2.19231,-0.730599,-2.023446,-5.832206,...,-13.898515,-12.222639,-13.219418,-14.195075,-11.62548,-12.519917,-13.353108,-11.850022,-11.955689,-13.36469
2,-4.842369,-6.932052,-10.699755,-11.399866,-12.337184,-12.564977,-13.07259,-11.631197,-11.871105,-12.651745,...,-9.536854,-12.701858,-13.665248,-11.29546,-13.528756,-13.409475,-12.887518,-15.160089,-15.461849,-14.582693


In [9]:
audio_features_cnn = np.expand_dims(audio_features, axis = 2)
audio_features_cnn

array([[[ 1.50140584e+00],
        [ 8.17227423e-01],
        [-3.39427739e-02],
        ...,
        [ 8.41189027e-01],
        [ 4.34088051e-01],
        [ 1.31884351e-01]],

       [[-2.40939617e+00],
        [-2.30048466e+00],
        [-2.47999215e+00],
        ...,
        [-6.73305631e-01],
        [-7.60513604e-01],
        [ 1.28226066e+00]],

       [[-1.07034435e+01],
        [-8.51020718e+00],
        [-5.30014801e+00],
        ...,
        [-3.21832037e+00],
        [-1.79332304e+00],
        [-7.45307468e-03]],

       ...,

       [[-1.17263899e+01],
        [-8.36099339e+00],
        [-3.34873962e+00],
        ...,
        [-3.63800478e+00],
        [-5.55429268e+00],
        [-4.98490620e+00]],

       [[-4.59554720e+00],
        [-1.88232219e+00],
        [-1.98679745e+00],
        ...,
        [-9.92149353e+00],
        [-8.15657139e+00],
        [-8.68582344e+00]],

       [[-9.50154686e+00],
        [-9.01928425e+00],
        [-8.81004429e+00],
        ...,
        

In [34]:
preds = loaded_model.predict(audio_features_cnn, 
                             batch_size = 32, 
                             verbose = 1)



In [27]:
def sumProbs(preds):
    file = []
    for i in range(9):
        temp = []
        p_angry = preds[i][0] + preds[i][5]
        p_calm = preds[i][1] + preds[i][6]
        p_fearful = preds[i][2] + preds[i][7]
        p_happy = preds[i][3] + preds[i][8]
        p_sad = preds[i][4] + preds[i][9]
        temp.append(p_angry)
        temp.append(p_calm)
        temp.append(p_fearful)
        temp.append(p_happy)
        temp.append(p_sad)
        file.append(temp)
    return np.array(file)

In [28]:
new_preds = sumProbs(preds)
new_preds

array([[1.80615461e-05, 1.06655435e-26, 3.30796744e-11, 9.99981880e-01,
        3.18238669e-16],
       [1.00000000e+00, 1.02080774e-33, 9.87227540e-22, 2.03328760e-11,
        2.36926342e-23],
       [3.85580026e-02, 4.48244957e-13, 5.29361665e-02, 9.08505678e-01,
        1.79429264e-07],
       [1.70362946e-05, 9.76058027e-19, 9.99903798e-01, 7.92013889e-05,
        5.93343730e-09],
       [9.76949930e-01, 4.43267412e-35, 2.30886597e-16, 2.30501033e-02,
        2.19294882e-15],
       [4.25142935e-04, 6.61814413e-20, 6.71178782e-07, 9.99574125e-01,
        9.72960251e-14],
       [1.15296552e-06, 0.00000000e+00, 2.01270804e-22, 9.99998808e-01,
        4.37406585e-35],
       [7.03883066e-04, 4.93223050e-17, 8.06314886e-01, 1.91383645e-01,
        1.59765396e-03],
       [8.83158743e-02, 3.08008779e-20, 8.39973653e-13, 3.67920683e-03,
        9.08004940e-01]], dtype=float32)

In [30]:
arg_max = new_preds.argmax(axis = 1)
print(arg_max)

[3 0 3 2 0 3 3 2 4]


In [32]:
emotions = {
    0: 'angry', 
    1: 'calm',
    2 : 'fearful',
    3 : 'happy',
    4 : 'sad',
}

def inverseTransform(preds, emotion_dict):
    decoded = []
    preds = preds.tolist()
    for i in range(9):
        key = preds[i]
        filename = file_names[i]
        val = emotion_dict[key]
        print('file name:', filename, '/', 'CNN prediction:', key, '/', 'predicted emotion:', val)
        decoded.append(val) 
    return filename, key, val
        

In [33]:
pred_emo = inverseTransform(arg_max, emotions)

file name: daffyduck.wav / CNN prediction: 3 / predicted emotion: happy
file name: deranged.wav / CNN prediction: 0 / predicted emotion: angry
file name: weapons.wav / CNN prediction: 3 / predicted emotion: happy
file name: gamble.wav / CNN prediction: 2 / predicted emotion: fearful
file name: cage.wav / CNN prediction: 0 / predicted emotion: angry
file name: punish.wav / CNN prediction: 3 / predicted emotion: happy
file name: shesaid.wav / CNN prediction: 3 / predicted emotion: happy
file name: smile.wav / CNN prediction: 2 / predicted emotion: fearful
file name: wild.wav / CNN prediction: 4 / predicted emotion: sad
