In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Each sub-directory of alexa/ is named after one speaker and holds that speaker's clips.
# Keep directories only (a stray file such as .DS_Store would break the per-speaker
# os.listdir call later) and sort, because os.listdir returns entries in arbitrary order.
speakers = sorted(
    name for name in os.listdir('alexa/')
    if os.path.isdir(os.path.join('alexa', name))
)

In [3]:
# Collect one record per audio file and build the frame in a single call.
# DataFrame.append was removed in pandas 2.0, and calling it in a loop copied the
# whole frame on every iteration.
records = []
for speaker in speakers:
    speaker_dir = 'alexa/{}/'.format(speaker)
    for file in os.listdir(speaker_dir):
        records.append({'filepath': 'alexa/{}/{}'.format(speaker, file), 'speaker': speaker})

# Passing columns= keeps the expected schema even if no files were found
df = pd.DataFrame(records, columns=['filepath', 'speaker'])
print(len(speakers))
df.head()  # display the first five rows

87


Unnamed: 0,filepath,speaker
0,alexa/alastair/1.wav,alastair
1,alexa/alastair/2.wav,alastair
2,alexa/alastair/3.wav,alastair
3,alexa/alastair/5.wav,alastair
4,alexa/anfcucvo/1.wav,anfcucvo


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# stratify=df['speaker'] splits each speaker's files proportionally between the two sets,
# so we don't end up with all files of one speaker in the test set and none in training.
# random_state pins the shuffle so re-running the notebook reproduces the same split.
train, test = train_test_split(
    df,
    test_size=0.29,
    stratify=df['speaker'],
    random_state=42,
)
train.head()

Unnamed: 0,filepath,speaker
294,alexa/vgemoinn/4.wav,vgemoinn
41,alexa/dnkhkmfq/1.wav,dnkhkmfq
128,alexa/jvvfnxlp/2.wav,jvvfnxlp
367,alexa/zzgleilo/3.wav,zzgleilo
173,alexa/kxiphqej/1.wav,kxiphqej


In [6]:
import librosa

In [7]:
def extract_features(filename):
    """Summarise an audio file as a list of 40 time-averaged MFCC values.

    Parameters
    ----------
    filename : str
        Path of the audio file to load with librosa.

    Returns
    -------
    list of float
        One mean value per MFCC coefficient (40 entries).
    """
    signal, rate = librosa.load(filename, res_type='kaiser_fast')

    # n_mfcc=40 asks for 40 coefficients; the result holds one row per coefficient
    coefficients = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=40)

    # Transpose, then average over axis 0, leaving a single mean per coefficient
    coefficient_means = np.mean(coefficients.T, axis=0)

    return coefficient_means.tolist()

In [8]:
# test out the function on one of the files
# print(len(extract_features(train['filepath'][37])))

In [9]:
# Run extract_features on every training file path; the result is a Series
# holding one feature list per row, indexed like `train`
train_paths = train['filepath']
train_features = train_paths.apply(extract_features)

In [10]:
# Same extraction for the test file paths; like train_features, the result is a
# Series whose values are feature lists
test_paths = test['filepath']
test_features = test_paths.apply(extract_features)
train_features.head()

294    [-399.54132080078125, 129.24896240234375, -26....
41     [-470.98406982421875, 124.64154052734375, 1.92...
128    [-406.5012512207031, 83.7170181274414, 12.6531...
367    [-421.73419189453125, 104.6003189086914, -18.9...
173    [-587.8046264648438, 80.37097930908203, -14.78...
Name: filepath, dtype: object

In [11]:
# len(train_features[37])
# just an array with 40 elements

In [12]:
# X holds the feature vectors, Y the matching speaker names.
# Row order is untouched, so X_train[i] belongs to Y_train[i] (likewise for test).
X_train, X_test = train_features.tolist(), test_features.tolist()
Y_train, Y_test = train['speaker'].tolist(), test['speaker'].tolist()

In [13]:
# X_train is now a list of feature vectors (one list of MFCC means per file);
# show the first one next to its label
print("Speaker: {}".format(Y_train[0]))
print("Features: {}".format(X_train[0]))  # dropped the stray trailing ": " from the format string

Speaker: vgemoinn
Features: [-399.54132080078125, 129.24896240234375, -26.860309600830078, 25.929628372192383, -9.587641716003418, 8.800304412841797, -0.8646549582481384, 7.1240410804748535, -0.9060578346252441, 4.101642608642578, 13.863786697387695, -1.5559594631195068, 5.053192615509033, -3.6457841396331787, 1.2464885711669922, -1.319087266921997, 2.4215571880340576, 4.819954872131348, -0.2557099759578705, 3.449850082397461, -0.3846716284751892, 0.3831017017364502, -0.18064607679843903, 2.589801073074341, -0.454228013753891, -3.7038257122039795, 0.2175537347793579, 1.5566868782043457, -0.5052340030670166, -0.9919822812080383, 0.4876585304737091, -0.30659282207489014, 0.18528155982494354, -2.0200259685516357, -1.7635644674301147, -1.5095770359039307, -1.8632760047912598, -1.0368870496749878, -1.17448890209198, -1.1344026327133179]: 


In [14]:
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical

In [15]:
# One-hot encode the speaker names.
# The encoder is fitted on the training labels ONLY and then reused for the test labels,
# so a given speaker always maps to the same column. Re-fitting on the test labels (as
# before) only gave matching columns when both splits contained the same set of speakers.
lb = LabelEncoder()
lb.fit(Y_train)

# Pin the width so both matrices have one column per known speaker
n_classes = len(lb.classes_)

Y_train_encoded = to_categorical(lb.transform(Y_train), num_classes=n_classes)
# transform() raises ValueError if the test split contains a speaker absent from training
Y_test_encoded = to_categorical(lb.transform(Y_test), num_classes=n_classes)

In [16]:
# Show a label next to the one-hot row of that SAME sample
# (previously the row came from an unrelated test sample)
print("label: " + str(Y_train[44]))
print("encoded label: " + str(Y_train_encoded[44]))


label: ghmcwtzk
encoded label: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
n_train_classes = len(Y_train_encoded[0])
n_test_classes = len(Y_test_encoded[0])
print(n_train_classes) # number of unique speakers
print(n_test_classes) # number of unique speakers

# The two widths must match or training/validation will fail later with a shape error;
# fail here instead, with a message that says how to fix it.
assert n_train_classes == n_test_classes, (
    "train/test one-hot widths differ ({} vs {}); increase test_size in the "
    "train_test_split call above".format(n_train_classes, n_test_classes)
)

87
87


In [19]:
ss = StandardScaler()

# Learn the per-feature mean/std from the training data only...
X_train_scaled = ss.fit_transform(X_train)
# ...and apply those same statistics to the test data. Calling fit_transform here would
# re-fit the scaler on the test set, scaling it differently from the training data and
# leaving `ss` holding test-set statistics.
X_test_scaled = ss.transform(X_test)

In [20]:
# Dimensions of the scaled training matrix: (number of samples, features per sample)
X_train_scaled.shape

(261, 40)

In [21]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.callbacks import EarlyStopping

In [22]:
# The output layer needs one unit per one-hot column (i.e. per speaker)
output_units = len(Y_train_encoded[0])

# Fully connected classifier: 40 input features -> 40 -> 128 -> 128 -> softmax,
# with a small dropout after every hidden layer
model = Sequential([
    Dense(40, input_shape=(40,), activation='relu'),
    Dropout(0.01),
    Dense(128, activation='relu'),
    Dropout(0.01),
    Dense(128, activation='relu'),
    Dropout(0.01),
    Dense(output_units, activation='softmax'),
])

# No optimizer is passed, so Keras uses its default
model.compile(loss='categorical_crossentropy', metrics=['accuracy'])

In [23]:
# Train for 100 epochs; the test split is passed as validation data so it is
# evaluated alongside training
history = model.fit(
    X_train_scaled,
    Y_train_encoded,
    batch_size=256,
    epochs=100,
    validation_data=(X_test_scaled, Y_test_encoded),
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100


Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [44]:
def predict_speaker(file):
    """Predict which known speaker produced an audio sample.

    Parameters
    ----------
    file : str, os.PathLike or sequence of float
        Either the path of an audio file, or a feature vector already produced
        by ``extract_features``. Both forms are accepted because the notebook
        calls this function both ways.

    Returns
    -------
    str
        Name of the speaker whose class gets the highest predicted probability.
    """
    # Use the argument that was actually passed in (not a notebook global)
    if isinstance(file, (str, os.PathLike)):
        features = extract_features(file)
    else:
        features = file

    # The model was trained on standardised features, so the sample has to go
    # through the same scaler object before prediction. Shape it as one row.
    to_predict = ss.transform(np.asarray(features, dtype=float).reshape(1, -1))

    # model.predict returns one row of per-class probabilities per input row
    probabilities = model.predict(to_predict)[0]

    # LabelEncoder numbers classes in sorted order of the unique labels, so one-hot
    # column i corresponds to the i-th sorted speaker name. Take the most probable
    # column; truncating probabilities with int() almost never reproduces a one-hot row.
    class_names = np.unique(Y_train)
    return str(class_names[int(np.argmax(probabilities))])
    


In [42]:
# Test sample of alastair saying "alexa" - separate from the training data
alastair = extract_features("test/alastair.wav")

predicted_name = predict_speaker(alastair)
print("speaker: " + predicted_name)


[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
speaker: alastair


In [34]:
import pyaudio
import wave
import time

In [45]:
CHUNK = 1024                  # frames requested per stream.read call
FORMAT = pyaudio.paInt16      # sample format passed to PyAudio
CHANNELS = 2
RATE = 44100                  # sampling rate in Hz
RECORD_SECONDS = 3
WAVE_OUTPUT_FILENAME = "test/audio_input.wav"

# Count down BEFORE opening the input stream. The stream starts capturing as soon as it
# is opened, so opening it first lets input pile up (or overflow) during the countdown.
print("say 'Alexa' in: 3")
for i in range(2, 0, -1):
    time.sleep(1)
    print(i)
time.sleep(1)

p = pyaudio.PyAudio()
frames = []
try:
    sample_width = p.get_sample_size(FORMAT)
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)
    try:
        print("* recording")

        # Number of CHUNK-sized reads that cover RECORD_SECONDS of audio
        for _ in range(int(RATE / CHUNK * RECORD_SECONDS)):
            frames.append(stream.read(CHUNK))

        print("* done recording")
    finally:
        # Release the device even if a read fails
        stream.stop_stream()
        stream.close()
finally:
    p.terminate()

# The context manager closes the file (and finalises the WAV header) even on error
with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(sample_width)
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))

print(predict_speaker(WAVE_OUTPUT_FILENAME))

say 'Alexa' in: 3
2
1
* recording
* done recording
alastair
