In [4]:
import pandas as pd
import numpy as np
import random
import re

import os, sys, glob, pickle

import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

import soundfile as sf

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import IPython.display as ipd

import keras
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint
from tqdm import tqdm, tqdm_pandas

import warnings
# Unless warning options were passed to the interpreter (sys.warnoptions is
# non-empty), install a blanket "ignore" filter for every warning category.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# DeprecationWarning is ignored unconditionally, regardless of the check above.
warnings.filterwarnings("ignore", category=DeprecationWarning) 


In [20]:
import keras
from keras import regularizers
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Embedding, LSTM
from keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from keras.utils import np_utils, to_categorical
from keras.callbacks import (EarlyStopping, LearningRateScheduler,
                             ModelCheckpoint, TensorBoard, ReduceLROnPlateau)
from keras import losses, models, optimizers
from keras.activations import relu, softmax
from keras.layers import (Convolution2D, GlobalAveragePooling2D, BatchNormalization, Flatten, Dropout,
                          GlobalMaxPool2D, MaxPool2D, concatenate, Activation, Input, Dense)

# sklearn
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Other  
from tqdm import tqdm, tqdm_pandas
import scipy
from scipy.stats import skew
import librosa
import librosa.display
import json
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
import seaborn as sns
import glob 
import os
import sys
import IPython.display as ipd  # To play sound in the notebook
import warnings
# Ignore all warnings, unless warning options were passed to the interpreter
# (in which case sys.warnoptions is non-empty and the user's choice is kept).
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [5]:
# Load the clip metadata from combined_data.csv and preview five random rows.
# The `Emotion` and `Path` columns are the ones used by later cells.
audio_df = pd.read_csv('combined_data.csv')
audio_df.sample(5)

Unnamed: 0,Emotion,Path
8222,female_disgust,../SER/CREMA/1061_IWL_DIS_XX.wav
4797,male_fear,../SER/CREMA/1019_TAI_FEA_XX.wav
10010,male_calm,../SER/CREMA/1083_IOM_NEU_XX.wav
10827,female_disgust,../SER/SER-Ravdess-data/Actor_02/03-01-07-01-0...
8022,male_sad,../SER/CREMA/1059_DFA_SAD_XX.wav


In [11]:
def prepare_data(df, n, mfcc, aug=0):
    """Turn every audio file listed in ``df.Path`` into a fixed-size 2D feature map.

    Each file is loaded with librosa at 44.1 kHz (skipping the first 0.5 s and
    reading at most 2.5 s), cropped or zero-padded to exactly 2.5 s, and then
    converted to either MFCCs or a log-mel spectrogram.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``Path`` column holding the audio file paths.
    n : int
        Number of feature bands per frame: MFCC coefficients when
        ``mfcc == 1``, mel bands otherwise.
    mfcc : int
        ``1`` selects MFCC features; any other value selects the log-mel
        spectrogram.
    aug : int, optional
        Accepted so that callers passing ``aug=0`` keep working. No
        augmentation is implemented in this function and the value is ignored.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), n, 216, 1)``.
    """
    sampling_rate = 44100
    audio_duration = 2.5
    # Must be an int: it is used as a slice bound, a pad width and a randint bound.
    input_length = int(sampling_rate * audio_duration)

    # 216 frames = 1 + 110250 // 512 with librosa's default hop length (512)
    # and centered framing.
    X = np.empty(shape=(df.shape[0], n, 216, 1))

    for idx, file_path in enumerate(tqdm(df.Path)):
        data, _ = librosa.load(
            file_path,
            sr=sampling_rate,
            res_type="kaiser_fast",
            duration=audio_duration,
            offset=0.5,
        )

        # Random crop for long clips, random-position zero padding for short ones.
        if len(data) > input_length:
            offset = np.random.randint(len(data) - input_length)
            data = data[offset:offset + input_length]
        elif len(data) < input_length:
            missing = input_length - len(data)
            offset = np.random.randint(missing)
            data = np.pad(data, (offset, missing - offset), "constant")

        if mfcc == 1:
            # Audio is passed by keyword: librosa >= 0.10 rejects it positionally.
            features = librosa.feature.mfcc(y=data, sr=sampling_rate, n_mfcc=n)
        else:
            # melspectrogram returns a power spectrogram by default, so the
            # dB conversion has to be power_to_db; sr must match the loaded audio.
            melspec = librosa.feature.melspectrogram(y=data, sr=sampling_rate, n_mels=n)
            features = librosa.power_to_db(melspec)

        X[idx] = np.expand_dims(features, axis=-1)

    return X


In [12]:
# Number of MFCC coefficients per frame; also the height of the CNN input.
n_mfcc = 30
# Build the feature tensor for every clip in audio_df.
# mfcc=1 selects MFCC features (any other value selects the log-mel branch).
mfcc = prepare_data(audio_df, n=n_mfcc, mfcc=1)

100%|██████████| 12162/12162 [15:25<00:00, 13.14it/s]


In [13]:
# Shape check: prepare_data allocates its result as (rows, n, 216, 1).
mfcc.shape

(12162, 30, 216, 1)

In [18]:
def get_2d_conv_model(n, nclass=14, n_frames=216):
    """Build and compile a deep 2D convolutional classifier.

    Architecture: four identical blocks of
    Conv2D(32, 4x10, same) -> BatchNorm -> ReLU -> MaxPool -> Dropout(0.2),
    followed by Flatten -> Dense(64) -> Dropout -> BatchNorm -> ReLU ->
    Dropout -> Dense(nclass, softmax).

    Parameters
    ----------
    n : int
        Height of the input feature map (number of feature bands).
    nclass : int, optional
        Number of output classes. Defaults to 14, the value previously
        hard-coded here.
    n_frames : int, optional
        Width of the input feature map (number of time frames). Defaults to
        216, the value previously hard-coded here.

    Returns
    -------
    A compiled Keras model taking inputs of shape ``(n, n_frames, 1)``, using
    Adam(0.001), categorical cross-entropy loss and the ``'acc'`` metric.
    """
    inp = Input(shape=(n, n_frames, 1))

    x = inp
    # Four identical convolutional blocks; each MaxPool2D() call uses the
    # layer's default pooling settings.
    for _ in range(4):
        x = Convolution2D(32, (4, 10), padding="same")(x)
        x = BatchNormalization()(x)
        x = Activation("relu")(x)
        x = MaxPool2D()(x)
        x = Dropout(rate=0.2)(x)

    # Dense head. NOTE(review): dropout is applied both before the batch norm
    # and after the activation; the order is kept exactly as in the original.
    x = Flatten()(x)
    x = Dense(64)(x)
    x = Dropout(rate=0.2)(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = Dropout(rate=0.2)(x)

    out = Dense(nclass, activation=softmax)(x)
    model = models.Model(inputs=inp, outputs=out)

    model.compile(
        optimizer=optimizers.Adam(0.001),
        loss=losses.categorical_crossentropy,
        metrics=['acc'],
    )
    return model


In [14]:
# Label encoder for the emotion strings.
# NOTE(review): `lb` is created again in the train/test split cell below.
lb = LabelEncoder()

In [21]:
# Hold out 25% of the clips for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    mfcc,
    audio_df.Emotion,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

# One-hot encode the targets. The encoder is fitted on the training labels
# only and then reused for the test labels, so both sets share one
# label -> index mapping and one one-hot width. Refitting on the test labels
# could silently produce a different mapping; with transform(), a test label
# unseen during fitting raises ValueError instead.
lb = LabelEncoder()
y_train = to_categorical(lb.fit_transform(y_train))
y_test = to_categorical(lb.transform(y_test), num_classes=len(lb.classes_))

# Model definition (no feature normalization is performed in this cell).
# NOTE(review): this is a second definition of get_2d_conv_model; it shadows
# the one defined in an earlier cell. Consider keeping only one copy.
def get_2d_conv_model(n, nclass=14, n_frames=216):
    """Build and compile a deep 2D convolutional classifier.

    Architecture: four identical blocks of
    Conv2D(32, 4x10, same) -> BatchNorm -> ReLU -> MaxPool -> Dropout(0.2),
    followed by Flatten -> Dense(64) -> Dropout -> BatchNorm -> ReLU ->
    Dropout -> Dense(nclass, softmax).

    Parameters
    ----------
    n : int
        Height of the input feature map (number of feature bands).
    nclass : int, optional
        Number of output classes. Defaults to 14, the value previously
        hard-coded here.
    n_frames : int, optional
        Width of the input feature map (number of time frames). Defaults to
        216, the value previously hard-coded here.

    Returns
    -------
    A compiled Keras model taking inputs of shape ``(n, n_frames, 1)``, using
    Adam(0.001), categorical cross-entropy loss and the ``'acc'`` metric.
    """
    inp = Input(shape=(n, n_frames, 1))

    x = inp
    # Four identical convolutional blocks; each MaxPool2D() call uses the
    # layer's default pooling settings.
    for _ in range(4):
        x = Convolution2D(32, (4, 10), padding="same")(x)
        x = BatchNormalization()(x)
        x = Activation("relu")(x)
        x = MaxPool2D()(x)
        x = Dropout(rate=0.2)(x)

    # Dense head. NOTE(review): dropout is applied both before the batch norm
    # and after the activation; the order is kept exactly as in the original.
    x = Flatten()(x)
    x = Dense(64)(x)
    x = Dropout(rate=0.2)(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = Dropout(rate=0.2)(x)

    out = Dense(nclass, activation=softmax)(x)
    model = models.Model(inputs=inp, outputs=out)

    model.compile(
        optimizer=optimizers.Adam(0.001),
        loss=losses.categorical_crossentropy,
        metrics=['acc'],
    )
    return model

# Instantiate the CNN for n_mfcc-band inputs and train it for 20 epochs,
# passing the held-out split as validation data.
model = get_2d_conv_model(n=n_mfcc)
model_history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=16,
    epochs=20,
    verbose=2,
)

Epoch 1/20
571/571 - 246s - loss: 2.2318 - acc: 0.2492 - val_loss: 2.0140 - val_acc: 0.2775 - 246s/epoch - 431ms/step
Epoch 2/20
571/571 - 222s - loss: 1.7199 - acc: 0.3936 - val_loss: 1.5338 - val_acc: 0.4656 - 222s/epoch - 390ms/step
Epoch 3/20
571/571 - 227s - loss: 1.5318 - acc: 0.4604 - val_loss: 1.8667 - val_acc: 0.3982 - 227s/epoch - 397ms/step
Epoch 4/20
571/571 - 221s - loss: 1.4251 - acc: 0.4924 - val_loss: 1.5869 - val_acc: 0.4193 - 221s/epoch - 388ms/step
Epoch 5/20
571/571 - 221s - loss: 1.3449 - acc: 0.5225 - val_loss: 1.2327 - val_acc: 0.5518 - 221s/epoch - 386ms/step
Epoch 6/20
571/571 - 212s - loss: 1.2908 - acc: 0.5391 - val_loss: 1.2217 - val_acc: 0.5409 - 212s/epoch - 370ms/step
Epoch 7/20
571/571 - 209s - loss: 1.2369 - acc: 0.5583 - val_loss: 1.6602 - val_acc: 0.4357 - 209s/epoch - 365ms/step
Epoch 8/20
571/571 - 195s - loss: 1.2131 - acc: 0.5621 - val_loss: 1.3528 - val_acc: 0.5051 - 195s/epoch - 341ms/step
Epoch 9/20
571/571 - 197s - loss: 1.1720 - acc: 0.5802 -

In [22]:
# Score the held-out clips and keep only the index of the most likely class.
preds = np.argmax(
    model.predict(X_test, batch_size=16, verbose=1),
    axis=1,
)
preds



array([ 8,  5, 12, ...,  5,  1,  9], dtype=int64)

In [23]:
# Predicted class indices -> emotion labels, as a one-column frame.
preds = pd.DataFrame(
    {'predictedvalues': lb.inverse_transform(preds.astype(int).flatten())}
)

# Ground truth: undo the one-hot encoding, then map indices back to labels.
actual = pd.DataFrame(
    {'actualvalues': lb.inverse_transform(y_test.argmax(axis=1).astype(int).flatten())}
)

# Side-by-side table of actual vs. predicted labels (joined on the row index).
finaldf = actual.join(preds)

In [24]:
# Spot-check ten random rows of actual vs. predicted labels.
finaldf.sample(10)

Unnamed: 0,actualvalues,predictedvalues
989,female_sad,female_sad
1854,female_calm,female_happy
2952,female_calm,female_calm
890,female_angry,female_angry
1452,female_sad,female_sad
2622,male_happy,male_angry
1020,male_happy,male_fear
1587,female_sad,female_calm
782,female_angry,male_angry
2891,female_fear,female_sad


In [90]:
# Path of the new recording to classify with the trained model.
nwpath = ['../SER/output16.wav']

In [91]:
# Wrap the path list in a DataFrame with a 'Path' column, the column that
# prepare_data iterates over.
newinput = pd.DataFrame({'Path': nwpath})
newinput

Unnamed: 0,Path
0,../SER/output16.wav


In [92]:
# Must match the number of MFCC coefficients the model was built with.
n_mfcc = 30
# Same feature extraction as for the training data, applied to the new clip.
# mfcc=1 selects MFCC features.
mfcc2 = prepare_data(newinput, n=n_mfcc, mfcc=1)

100%|██████████| 1/1 [00:00<00:00, 15.05it/s]


In [93]:
# Score the new clip and keep only the index of the most likely class.
preds2 = np.argmax(
    model.predict(mfcc2, batch_size=16, verbose=1),
    axis=1,
)
preds2



array([0], dtype=int64)

In [94]:
# Map the predicted class index back to its emotion label and print it.
final = lb.inverse_transform(preds2.astype(int).flatten())
print(final)

['female_angry']
