In [1]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Conv1D, MaxPool1D, Flatten, Dense
from keras.layers import Conv2D, MaxPool2D, Flatten, Dense
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical
from keras.models import Model, Sequential
from keras.layers import Dense
from keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization, Conv1D, MaxPooling1D
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

In [2]:
# Emotion labels keyed by the zero-padded two-digit code ('01'..'08') that
# appears as the third dash-separated field of each audio file name.
emotions = {
    f"{code:02d}": label
    for code, label in enumerate(
        (
            "neutral",
            "calm",
            "happy",
            "sad",
            "angry",
            "fearful",
            "disgust",
            "surprised",
        ),
        start=1,
    )
}

# Only clips whose emotion is listed here are kept when the data is loaded.
observed_emotions = ["calm", "happy", "fearful"]

In [3]:
def extract_feature(file_name, mfcc, chroma, mel):
    """Build a 1-D feature vector for one audio file.

    Each enabled feature family is averaged over time and the results are
    concatenated in the fixed order MFCC -> chroma -> mel spectrogram.

    Args:
        file_name: Path of an audio file readable by ``soundfile``.
        mfcc: If truthy, include 40 time-averaged MFCC coefficients.
        chroma: If truthy, include the time-averaged chromagram computed from
            the magnitude STFT of the signal.
        mel: If truthy, include the time-averaged mel spectrogram.

    Returns:
        A 1-D ``numpy.ndarray`` holding the concatenated features; it is empty
        when every flag is falsy.
    """
    # Only the read needs the file handle; release it before the heavy work.
    with soundfile.SoundFile(file_name) as sound_file:
        signal = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # soundfile returns (frames, channels) for multi-channel audio, while
    # librosa expects time on the last axis. Downmix to mono so every file
    # yields a feature vector of the same length.
    if signal.ndim > 1:
        signal = signal.mean(axis=1)

    # Start from an empty float64 array so the output dtype matches what
    # np.hstack produced when features were appended one at a time.
    parts = [np.array([])]
    if mfcc:
        mfcc_mean = np.mean(
            librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40).T, axis=0
        )
        parts.append(mfcc_mean)
    if chroma:
        stft = np.abs(librosa.stft(signal))
        chroma_mean = np.mean(
            librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0
        )
        parts.append(chroma_mean)
    if mel:
        # The signal must be passed as ``y=``: recent librosa releases accept
        # keyword arguments only here and reject a positional signal.
        mel_mean = np.mean(
            librosa.feature.melspectrogram(y=signal, sr=sample_rate).T, axis=0
        )
        parts.append(mel_mean)
    return np.hstack(parts)

def load_data(test_size, data_glob="../input/ravdess-emotional-song-audio/Actor_*/*.wav"):
    """Extract features for the observed emotions and split them into train/test.

    Args:
        test_size: Passed straight to ``train_test_split`` as ``test_size``.
        data_glob: Glob pattern selecting the audio files. The emotion code is
            read from the third dash-separated field of each file's base name.

    Returns:
        The ``train_test_split`` result in the order
        ``x_train, x_test, y_train, y_test``; the ``x`` parts come from a
        NumPy array of feature vectors and the ``y`` parts from a list of
        emotion-name strings.

    Raises:
        ValueError: If no file matching ``data_glob`` has an emotion listed in
            ``observed_emotions``.
        KeyError: If a file name carries an emotion code missing from
            ``emotions``.
    """
    x, y = [], []
    # glob returns paths in a filesystem-dependent order. Sorting makes the
    # sample order, and therefore the seeded split below, reproducible.
    for file in sorted(glob.glob(data_glob)):
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        x.append(extract_feature(file, mfcc=True, chroma=True, mel=True))
        y.append(emotion)

    if not x:
        # Fail with an actionable message rather than the opaque error
        # train_test_split raises on an empty dataset.
        raise ValueError(
            f"No audio files for emotions {observed_emotions} matched {data_glob!r}"
        )
    return train_test_split(np.array(x), y, test_size=test_size, random_state=0)

# Extract features for every clip with an observed emotion and hold out 20% as the test split.
x_train, x_test, y_train, y_test = load_data(test_size = 0.2) 

In [4]:
# Sanity check: display the distinct emotion labels present in the training split.
set(y_train)

{'calm', 'fearful', 'happy'}

In [5]:
# Make sure every split is a NumPy array before encoding and reshaping.
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

lb = LabelEncoder()

# Fit the label -> integer mapping on the training labels only, then reuse that
# same mapping for the test labels. Re-fitting on the test split could assign
# different integers (or a different one-hot width) if the two splits do not
# contain the same set of classes.
y_train = to_categorical(lb.fit_transform(y_train), num_classes=len(lb.classes_))
y_test = to_categorical(lb.transform(y_test), num_classes=len(lb.classes_))

In [6]:
# Conv1D expects (samples, steps, channels); append a trailing channel axis of size 1
# to the 2-D (samples, features) matrices.
x_traincnn = x_train[:, :, np.newaxis]
x_testcnn = x_test[:, :, np.newaxis]

In [7]:
# Report the length of each per-clip feature vector (second dimension of x_train).
print(f'Features extracted: {x_train.shape[1]}')

Features extracted: 180


In [8]:
# 1-D CNN over the per-clip feature vector: five Conv1D/MaxPool1D stages
# followed by three dense layers and a 3-way softmax output.
model = Sequential([
    # Stage 1: the only stage with batch normalisation; also declares the input
    # as 180 steps with a single channel.
    Conv1D(filters=32, kernel_size=5, activation='relu', input_shape=[180, 1]),
    BatchNormalization(),
    MaxPool1D(pool_size=8, strides=4),
    Dropout(0.25),

    # Stage 2
    Conv1D(filters=32, kernel_size=5, activation='relu'),
    MaxPool1D(pool_size=8, strides=4),
    Dropout(0.25),

    # Stage 3: from here on the pooling uses "same" padding.
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPool1D(pool_size=8, strides=3, padding="same"),
    Dropout(0.25),

    # Stage 4
    Conv1D(filters=128, kernel_size=2, activation='relu', padding="same"),
    MaxPool1D(pool_size=8, strides=4, padding="same"),

    # Stage 5
    Conv1D(filters=256, kernel_size=3, activation='relu', padding="same"),
    MaxPool1D(pool_size=8, strides=2, padding="same"),

    Flatten(),

    # Classifier head: one output unit per emotion class.
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    Dense(3, activation='softmax'),
])

In [9]:
# Print the layer-by-layer output shapes and parameter counts of the network.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 176, 32)           192       
_________________________________________________________________
batch_normalization (BatchNo (None, 176, 32)           128       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 43, 32)            0         
_________________________________________________________________
dropout (Dropout)            (None, 43, 32)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 39, 32)            5152      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 8, 32)             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 8, 32)             0

In [10]:
# Multi-class setup: one-hot targets with categorical cross-entropy, tracked by accuracy.
model.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [11]:
# Train the network; the held-out test split is passed as validation data, so the
# returned history also records its loss/accuracy for every epoch.
cnnhistory = model.fit(
    x_traincnn,
    y_train,
    batch_size=16,
    epochs=700,
    validation_data=(x_testcnn, y_test),
)

Epoch 1/700
Epoch 2/700
Epoch 3/700
Epoch 4/700
Epoch 5/700
Epoch 6/700
Epoch 7/700
Epoch 8/700
Epoch 9/700
Epoch 10/700
Epoch 11/700
Epoch 12/700
Epoch 13/700
Epoch 14/700
Epoch 15/700
Epoch 16/700
Epoch 17/700
Epoch 18/700
Epoch 19/700
Epoch 20/700
Epoch 21/700
Epoch 22/700
Epoch 23/700
Epoch 24/700
Epoch 25/700
Epoch 26/700
Epoch 27/700
Epoch 28/700
Epoch 29/700
Epoch 30/700
Epoch 31/700
Epoch 32/700
Epoch 33/700
Epoch 34/700
Epoch 35/700
Epoch 36/700
Epoch 37/700
Epoch 38/700
Epoch 39/700
Epoch 40/700
Epoch 41/700
Epoch 42/700
Epoch 43/700
Epoch 44/700
Epoch 45/700
Epoch 46/700
Epoch 47/700
Epoch 48/700
Epoch 49/700
Epoch 50/700
Epoch 51/700
Epoch 52/700
Epoch 53/700
Epoch 54/700
Epoch 55/700
Epoch 56/700
Epoch 57/700
Epoch 58/700
Epoch 59/700
Epoch 60/700
Epoch 61/700
Epoch 62/700
Epoch 63/700
Epoch 64/700
Epoch 65/700
Epoch 66/700
Epoch 67/700
Epoch 68/700
Epoch 69/700
Epoch 70/700
Epoch 71/700
Epoch 72/700
Epoch 73/700
Epoch 74/700
Epoch 75/700
Epoch 76/700
Epoch 77/700
Epoch 78

In [12]:
# NOTE(review): History is imported but not referenced in this cell — confirm whether a later cell needs it.
from keras.callbacks import History 
# Report the validation accuracy recorded for the final training epoch
# (the last entry of the history, not necessarily the best epoch).
print("Accuracy = ", cnnhistory.history['val_accuracy'][-1])

Accuracy =  0.9189189076423645
