In [1]:
import pandas as pd
import numpy as np
import os

In [130]:
# Each sub-directory of alexa/ holds the recordings of one speaker.
# os.listdir returns entries in arbitrary order and also lists stray files
# (e.g. .DS_Store), so keep directories only and sort them: the row order of
# the data frame built from this list is then reproducible across machines.
speakers = sorted(
    entry for entry in os.listdir('alexa/')
    if os.path.isdir(os.path.join('alexa', entry))
)

In [131]:
# Build one record per audio file and create the frame in a single call.
# DataFrame.append copied the whole frame on every iteration (quadratic) and
# was removed in pandas 2.0, so the rows are gathered in a plain list first.
records = []
for speaker in speakers:
    for file in os.listdir('alexa/{}/'.format(speaker)):
        records.append({
            'filepath': 'alexa/{}/{}'.format(speaker, file),
            'speaker': speaker,
        })
# passing columns= keeps both columns present even when no file was found
df = pd.DataFrame(records, columns=['filepath', 'speaker'])
print(len(speakers))
df.head() # last expression of the cell, so the notebook renders it

87


Unnamed: 0,filepath,speaker
0,alexa/anfcucvo/1.wav,anfcucvo
1,alexa/anfcucvo/2.wav,anfcucvo
2,alexa/anfcucvo/3.wav,anfcucvo
3,alexa/anfcucvo/4.wav,anfcucvo
4,alexa/anfcucvo/5.wav,anfcucvo


In [132]:
from sklearn.model_selection import train_test_split

In [188]:
# Hold out 26% of the files for testing.
# stratify=df['speaker'] keeps every speaker's share the same in both parts,
# so no speaker ends up only in the test set or only in the training set.
# NOTE: no random_state is passed, so the split differs on every run.
train, test = train_test_split(
    df,
    test_size=0.26,
    stratify=df['speaker'],
)
train.head()

Unnamed: 0,filepath,speaker
283,alexa/uqctddis/1.wav,uqctddis
81,alexa/fsyeviyq/8.wav,fsyeviyq
131,alexa/jvvfnxlp/5.wav,jvvfnxlp
54,alexa/fegormwx/2.wav,fegormwx
89,alexa/ghmcwtzk/3.wav,ghmcwtzk


In [189]:
import librosa

In [190]:
def extract_features(filename):
    """Return the time-averaged MFCC vector of one audio file.

    Parameters
    ----------
    filename : str
        Path of the audio file; passed unchanged to ``librosa.load``.

    Returns
    -------
    list of float
        One value per MFCC coefficient (40 are requested via ``n_mfcc=40``),
        each being that coefficient's mean over the whole recording.
    """
    signal, sample_rate = librosa.load(filename, res_type='kaiser_fast')

    # mfcc() gives one row per coefficient; transposing and averaging over
    # axis 0 collapses the time dimension and leaves one mean per coefficient
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
    mean_per_coefficient = np.mean(coefficients.T, axis=0)

    return mean_per_coefficient.tolist()

In [191]:
# test out the function on one of the files
# .iloc picks by position: the index labels of `train` are a random subset of
# df's labels, so a fixed label such as 77 may have been sent to the test set
# and would then raise KeyError
print(len(extract_features(train['filepath'].iloc[0])))

40


In [192]:
# run extract_features on every training file path;
# the result is a Series (same index as train) whose values are feature lists
train_features = train['filepath'].map(extract_features)


In [193]:
# same extraction for the held-out files
test_features = test['filepath'].map(extract_features)
# each entry of train_features is the list of mean MFCCs of one file
train_features.head()

283    [-325.49041748046875, 110.84760284423828, -12....
81     [-556.0848388671875, 83.43140411376953, -15.43...
131    [-349.3893127441406, 110.5346908569336, -8.676...
54     [-444.859375, 108.34062194824219, -51.28759765...
89     [-369.7653503417969, 97.99247741699219, -16.91...
Name: filepath, dtype: object

In [194]:
# peek at the first feature vectors extracted for the held-out files
test_features.head()

123    [-642.3672485351562, 104.11151123046875, -9.53...
115    [-413.44793701171875, 71.04056549072266, 0.688...
202    [-388.9583435058594, 90.2202377319336, -9.0481...
185    [-444.04962158203125, 111.02970886230469, -23....
106    [-326.7193908691406, 112.73077392578125, -22.1...
Name: filepath, dtype: object

In [195]:
# length of one feature vector: a plain list with 40 elements.
# .iloc picks by position; the label 77 exists in train_features only when
# row 77 of df happened to be assigned to the training split.
len(train_features.iloc[0])

40

In [196]:
# X holds the feature vectors, Y the labels (speaker names).
# Features and labels come from the same rows in the same order, so
# X_train[i] belongs to Y_train[i] (and likewise for the test lists).
X_train, X_test = train_features.tolist(), test_features.tolist()
Y_train, Y_test = train['speaker'].tolist(), test['speaker'].tolist()

In [197]:
# X_train is a list of feature lists (one list of mean MFCCs per file),
# index-aligned with Y_train; show the first sample of each
print("Speaker: {}".format(Y_train[0]))
# the original format string ended in a stray ": " after the value
print("Features: {}".format(X_train[0]))

Speaker: uqctddis
Features: [-325.49041748046875, 110.84760284423828, -12.021147727966309, 29.381074905395508, -15.489127159118652, 17.974807739257812, -6.429633617401123, 7.366815567016602, 5.389136791229248, -1.46623957157135, 8.568319320678711, -3.801391124725342, 0.2132442742586136, -9.337488174438477, 4.982039451599121, -1.8447786569595337, -3.6032490730285645, 0.048032861202955246, -5.138862609863281, -0.9659779667854309, -6.731682300567627, -1.43858003616333, -4.894558906555176, -3.35678768157959, -0.269686222076416, -3.4032187461853027, -1.5623409748077393, -2.1458749771118164, -2.9577226638793945, -4.80684232711792, -2.878754138946533, -2.670773983001709, -4.641262531280518, -5.370326042175293, -4.078849792480469, -4.332222938537598, -4.500255107879639, -3.626905918121338, -4.698450565338135, -2.2406246662139893]: 


In [198]:
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical

In [199]:
# one-hot encode the speaker labels
lb = LabelEncoder()
# learn the speaker -> integer mapping from the training labels only
Y_train_ids = lb.fit_transform(Y_train)
num_classes = len(lb.classes_)
Y_train_encoded = to_categorical(Y_train_ids, num_classes=num_classes)
# Reuse that mapping for the test labels. Calling fit_transform again would
# re-fit the encoder on the test set, so a speaker could receive a different
# index (and the matrix a different width) whenever the label sets differ.
Y_test_encoded = to_categorical(lb.transform(Y_test), num_classes=num_classes)

In [200]:
# one-hot row of the first training sample
print(Y_train_encoded[0])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [201]:
print(len(X_train[1])) # length of one feature vector: 40 mean MFCC values

40


In [202]:
from sklearn.preprocessing import StandardScaler

In [203]:
# sanity checks before scaling and training
print(type(X_train[0])) # each sample is still a plain Python list
print(len(Y_train_encoded[0])) # width of a one-hot training row
print(len(Y_test_encoded[0])) # width of a one-hot test row; must equal the training width

<class 'list'>
87
87


In [204]:
# scaler used below to standardize the MFCC feature columns
ss = StandardScaler()


In [205]:
# learn the per-feature mean and standard deviation from the training data only
X_train_scaled = ss.fit_transform(X_train)
# Apply those same statistics to the test data. Re-fitting on the test set
# would scale it differently from the data the model is trained on and would
# let test-set statistics leak into the evaluation.
X_test_scaled = ss.transform(X_test)

In [206]:
# (number of training samples, number of features per sample)
X_train_scaled.shape

(273, 40)

In [207]:
# number of held-out samples
len(X_test_scaled)

96

In [208]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.callbacks import EarlyStopping

In [209]:
# Fully connected classifier: a scaled MFCC vector goes in, one softmax
# probability per speaker comes out.
# Input and output widths are read from the prepared arrays instead of being
# hard-coded (previously 40 and 87), so the cell keeps working when the number
# of features or speakers changes.
n_features = X_train_scaled.shape[1]
n_speakers = Y_train_encoded.shape[1]

model = Sequential()

model.add(Dense(40, input_shape=(n_features,), activation = 'relu'))
model.add(Dropout(0.01))

model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.01))

model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.01))

# one output unit per speaker; softmax matches the one-hot targets
model.add(Dense(n_speakers, activation = 'softmax'))

# 'rmsprop' is the optimizer Keras falls back to when none is given; it is
# spelled out here so the training setup is visible in the notebook
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [210]:
# train for 100 epochs; the held-out split is passed as validation data,
# so Keras evaluates on it after every epoch
history = model.fit(
    X_train_scaled,
    Y_train_encoded,
    batch_size=256,
    epochs=100,
    validation_data=(X_test_scaled, Y_test_encoded),
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100


Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
