In [1]:
import keras
from keras.layers import Activation, Dense, Dropout, Conv2D, \
                         Flatten, MaxPooling2D
from keras.models import Sequential
import librosa
import librosa.display
import numpy as np
import pandas as pd
import random

import warnings
# Suppress every warning for the rest of the session: no category or module
# filter is given, so deprecation notices and pandas warnings are hidden too.
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
# Read the metadata table from 'metadata.csv' (path is relative to the
# notebook's working directory).
data = pd.read_csv('metadata.csv')
# Preview the first three rows.
data.head(3)

Unnamed: 0,sr,audio,classID,fold
0,0,LJ001-0001.wav,1,11
1,1,LJ001-0002.wav,1,11
2,2,LJ001-0003.wav,1,11


In [3]:
# (rows, columns) of the metadata table read from 'metadata.csv'.
data.shape

(20522, 4)

In [4]:
# Keep only the columns used downstream: file name, class label and fold.
# The explicit .copy() makes valid_data an independent frame, so adding a
# column to it later is an ordinary assignment instead of a write to a
# selection of `data` (the pattern pandas flags with SettingWithCopyWarning).
valid_data = data[['audio', 'classID', 'fold']].copy()
valid_data.shape

(20522, 3)

In [5]:
# Relative location of every clip, built as 'fold<fold>/<audio>'.
fold_dir = 'fold' + valid_data['fold'].astype('str')
file_name = valid_data['audio'].astype('str')
valid_data['path'] = fold_dir + '/' + file_name

In [6]:
# Relative path of every clip, in the row order of valid_data.
audio_files = [record.path for record in valid_data.itertuples()]

In [None]:
# Build the dataset: one (mel-spectrogram, class id) pair per usable clip.
D = [] # Dataset

for rel_path, class_id in zip(audio_files, valid_data.classID):
    try:
        # Load at most 2.97 s of audio, then compute its mel-spectrogram.
        y, sr = librosa.load('dataset/audio/' + rel_path, duration=2.97)
        ps = librosa.feature.melspectrogram(y=y, sr=sr)
    except Exception as exc:
        # A single unreadable file must not abort the whole pass: report it
        # (with the underlying error) and move on. Catching Exception rather
        # than using a bare `except` lets a kernel interrupt / Ctrl-C stop
        # this long-running loop.
        print('Something is wrong with ' + rel_path + ': ' + repr(exc))
        continue

    # The network expects 128x128 inputs, so any spectrogram of another shape
    # is dropped. NOTE(review): presumably these are clips shorter than the
    # requested 2.97 s -- confirm against the data.
    if ps.shape != (128, 128):
        continue

    D.append((ps, class_id))

In [8]:
# Number of (spectrogram, label) pairs collected in D.
print("Number of samples: ", len(D))

Number of samples:  15929


In [10]:
# Split the collected samples into train/test partitions and convert them to
# the arrays the CNN consumes.
SPLIT_SEED = 42      # fixed seed so the split is identical on every run
TRAIN_SIZE = 14500   # samples used for training; the remainder is the test set
NUM_CLASSES = 4      # width of the one-hot label vectors

# Shuffle a copy with a dedicated seeded generator. Shuffling D itself (via an
# alias) would reorder it in place, so re-running this cell would keep
# producing a different split.
dataset = list(D)
random.Random(SPLIT_SEED).shuffle(dataset)

# Fail early with a clear message instead of an opaque unpacking error when
# there are too few samples to leave anything for the test partition.
assert len(dataset) > TRAIN_SIZE, (
    'need more than %d samples for a non-empty test set, got %d'
    % (TRAIN_SIZE, len(dataset)))

train = dataset[:TRAIN_SIZE]
test = dataset[TRAIN_SIZE:]

# Separate features from labels.
X_train, y_train = zip(*train)
X_test, y_test = zip(*test)

# Reshape for CNN input: add a trailing single-channel axis.
X_train = np.array([x.reshape( (128, 128, 1) ) for x in X_train])
X_test = np.array([x.reshape( (128, 128, 1) ) for x in X_test])

# One-Hot encoding for classes
y_train = np.array(keras.utils.to_categorical(y_train, NUM_CLASSES))
y_test = np.array(keras.utils.to_categorical(y_test, NUM_CLASSES))

In [13]:
# CNN classifier: three convolutional stages followed by a small dense head
# with a 4-way softmax output. Input is a 128x128 single-channel array.
input_shape = (128, 128, 1)

model = Sequential([
    # Stage 1: 24 filters of 5x5, then 4x2 max-pooling and ReLU.
    Conv2D(24, (5, 5), strides=(1, 1), input_shape=input_shape),
    MaxPooling2D((4, 2), strides=(4, 2)),
    Activation('relu'),

    # Stage 2: 48 filters of 5x5, then 4x2 max-pooling and ReLU.
    Conv2D(48, (5, 5), padding="valid"),
    MaxPooling2D((4, 2), strides=(4, 2)),
    Activation('relu'),

    # Stage 3: 48 filters of 5x5 and ReLU, no pooling.
    Conv2D(48, (5, 5), padding="valid"),
    Activation('relu'),

    # Flatten the feature maps; dropout before and after the hidden layer.
    Flatten(),
    Dropout(rate=0.5),

    Dense(64),
    Activation('relu'),
    Dropout(rate=0.5),

    # Output layer: one unit per class, normalised with softmax.
    Dense(4),
    Activation('softmax'),
])

In [14]:
# Configure training: Adam optimizer, categorical cross-entropy loss, and
# accuracy reported as the metric.
model.compile(optimizer="Adam", loss="categorical_crossentropy", metrics=['accuracy'])

# Train for 12 epochs in batches of 128.
# Note: (X_test, y_test) is passed here as validation data and is used again
# below for the final evaluation.
model.fit(
    x=X_train,
    y=y_train,
    epochs=12,
    batch_size=128,
    validation_data=(X_test, y_test),
)

# Evaluate on the test arrays; the first two entries of the result are
# printed as loss and accuracy.
score = model.evaluate(x=X_test, y=y_test)
test_loss, test_accuracy = score[0], score[1]

print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Instructions for updating:
Use tf.cast instead.
Train on 14500 samples, validate on 1429 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Test loss: 0.02798131039738264
Test accuracy: 0.9958012596221134


In [15]:
# Write the fitted model to 'voice_model.h5' (path is relative to the
# notebook's working directory).
model.save('voice_model.h5')