In [86]:
import pandas as pd
import numpy as np
import os

In [87]:
speakers = os.listdir('alexa/')

In [88]:
df = pd.DataFrame(columns=['filepath', 'speaker'])
for speaker in speakers:
    files = os.listdir('alexa/{}/'.format(speaker))
    for file in files:
        filepath = 'alexa/{}/{}'.format(speaker, file)
        df = df.append({'filepath':filepath, 'speaker':speaker}, ignore_index=True)
print(len(speakers))
df.head() # this is just a print statement

87


Unnamed: 0,filepath,speaker
0,alexa/alastair/1.wav,alastair
1,alexa/alastair/2.wav,alastair
2,alexa/alastair/3.wav,alastair
3,alexa/alastair/5.wav,alastair
4,alexa/anfcucvo/1.wav,anfcucvo


In [89]:
from sklearn.model_selection import train_test_split

In [137]:
train, test = train_test_split(df, test_size = 0.29, stratify = df['speaker'])
# the stratify parameter makes the function split data evenly over the speakers column
# this is so we dont get all files of the same speaker in the test set and not the training set
train.head()

Unnamed: 0,filepath,speaker
4,alexa/anfcucvo/1.wav,anfcucvo
225,alexa/onnnswlx/1.wav,onnnswlx
359,alexa/zgmrhuwb/3.wav,zgmrhuwb
260,alexa/tiurjmpp/2.wav,tiurjmpp
348,alexa/xpzqxqrd/4.wav,xpzqxqrd


In [138]:
import librosa

In [139]:
def extract_features(filename):
    
    X, sample_rate = librosa.load(filename, res_type='kaiser_fast')
    
    # librosa returns an array of 40 arrays, one for each mfcc
    # np.mean takes the mean of each array, so we will be left with an array of size 40
    # the n_mfcc=40 parameter means return 40 mfccs
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    
    # plp = librosa.beat.plp(y=X, sr=22050, onset_envelope=None, hop_length=512, win_length=99, tempo_min=30, tempo_max=300, prior=None)
    
    return mfccs.tolist()#  + plp.tolist()

In [140]:
# test out the function on one of the files
# print(len(extract_features(train['filepath'][37])))

In [141]:
# apply the extract features to every element in train and test
train_features = train['filepath'].apply(extract_features)

In [142]:
# train_features is now an array of arrays
test_features = test['filepath'].apply(extract_features)
train_features.head()

4      [-451.0007019042969, 129.68942260742188, -10.4...
225    [-425.0965576171875, 89.916015625, -7.29105377...
359    [-514.9548950195312, 133.59536743164062, -34.7...
260    [-334.6752014160156, 131.853515625, -24.814403...
348    [-446.0290222167969, 89.11070251464844, -16.11...
Name: filepath, dtype: object

In [144]:
# len(train_features[37])
# just an array with 40 elements

In [145]:
# split into X and Y where X is the features and Y is the label (name of speaker)
# remember that each array is still in the same order as above 
# so each element in X_train corresponds to an element in Y_train at the same index
X_train = train_features.tolist()
X_test = test_features.tolist()
Y_train = train['speaker'].tolist()
Y_test = test['speaker'].tolist()

In [146]:
# now X_train is a 2d array, and each array is the long array of mfccs
print("Speaker: {}".format(Y_train[0]))
print("Features: {}: ".format(X_train[0]))

Speaker: anfcucvo
Features: [-451.0007019042969, 129.68942260742188, -10.475382804870605, 13.48437213897705, -22.18383026123047, 13.31363296508789, 2.3849048614501953, 7.671524524688721, 2.464989423751831, 1.4831621646881104, 12.884814262390137, -2.120051145553589, 4.12436580657959, -1.3452571630477905, 2.779601812362671, 0.5196439623832703, 1.7318799495697021, 0.8280112743377686, -7.266208648681641, 2.65681791305542, -1.4635851383209229, -3.649433135986328, 0.9350355267524719, -1.1745051145553589, -1.3960975408554077, -1.873644471168518, -2.640820026397705, -2.5397894382476807, -3.0769588947296143, -0.5750591158866882, -0.9709802865982056, -0.4318140745162964, -1.121940016746521, -4.129611968994141, -3.1608803272247314, -3.086773633956909, 0.12607285380363464, -1.4484682083129883, -0.6509028673171997, -2.0082321166992188]: 


In [147]:
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical

In [148]:
# hot encode y 
lb = LabelEncoder()

Y_train_encoded = to_categorical(lb.fit_transform(Y_train))
Y_test_encoded = to_categorical(lb.fit_transform(Y_test))

In [179]:
print("label: " + str(Y_train[44]))
print("encoded label: " + str(Y_test_encoded[35]))


label: kxrnhrcj
encoded label: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [162]:
from sklearn.preprocessing import StandardScaler

In [163]:
print(len(Y_train_encoded[0])) # number of unique speakers
print(len(Y_test_encoded[0])) # number of unique speakers
# THESE NEED TO BE THE SAME OR THERE WILL BE AN ERROR. if they are different then increase test_size in the train_test_split line above

87
87


In [165]:
ss = StandardScaler()

X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.fit_transform(X_test)

In [166]:
X_train_scaled.shape

(261, 40)

In [168]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.callbacks import EarlyStopping

In [172]:
model = Sequential()

model.add(Dense(40, input_shape=(40,), activation = 'relu'))
model.add(Dropout(0.01))

model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.01))  

model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.01))    

model.add(Dense(len(Y_train_encoded[0]), activation = 'softmax'))

model.compile(loss='categorical_crossentropy', metrics=['accuracy'])

In [173]:
history = model.fit(X_train_scaled, Y_train_encoded, batch_size=256, epochs=100, validation_data=(X_test_scaled, Y_test_encoded))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100


Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [183]:
alastair = extract_features("test/alastair.wav") # test sample of alastair saying alexa - seperate from training data

labels = Y_train_encoded.tolist() # list of the labels from training

to_predict = [alastair] # list of data for the model to predict, just one item for now
predictions = model.predict(to_predict) # returns a list of predictions 
al = predictions[0].tolist() # take the first element which is the prediciton for the first element in to_predict, remember this is still one hot encoded so it is a big array of 0s and 1s
al_float = [int(i) for i in al] # convert it to floats

print(al_float) # encoded label, if the prediction is right it should be [1,0,0,0, ...] since alastair is the first file in the alexa folder

prediction = labels.index(al_float) # index of predicted label
actual = Y_train.index('alastair') # index of alastair in training set

print(prediction)
print(actual)

if(prediction == actual):
    print("Alastair predicted correctly")
else:
    print(":(")

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
223
223
Alastair predicted correctly
