In [13]:
import pandas as pd
import numpy as np
import os
import librosa
import pyaudio
import wave
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.callbacks import EarlyStopping

In [14]:
speakers = os.listdir('alexa/')
speakers = speakers[:4]

df = pd.DataFrame(columns=['filepath', 'speaker'])
for speaker in speakers:
    files = os.listdir('alexa/{}/'.format(speaker))
    for file in files:
        filepath = 'alexa/{}/{}'.format(speaker, file)
        df = df.append({'filepath':filepath, 'speaker':speaker}, ignore_index=True)
print(len(speakers))
df.head() # this is just a print statement

4


Unnamed: 0,filepath,speaker
0,alexa/aaeli/1.wav,aaeli
1,alexa/aaeli/2.wav,aaeli
2,alexa/aaeli/3.wav,aaeli
3,alexa/aaeli/4.wav,aaeli
4,alexa/aaharley/1.wav,aaharley


In [15]:

train, test = train_test_split(df, test_size = 0.29, stratify = df['speaker'])
# the stratify parameter makes the function split data evenly over the speakers column
# this is so we dont get all files of the same speaker in the test set and not the training set
train.head()

Unnamed: 0,filepath,speaker
6,alexa/aaharley/3.wav,aaharley
0,alexa/aaeli/1.wav,aaeli
2,alexa/aaeli/3.wav,aaeli
12,alexa/alex/1.wav,alex
8,alexa/alastair/1.wav,alastair


In [16]:
MFCCS = 12

def extract_features(filename):
    
    X, sample_rate = librosa.load(filename, res_type='kaiser_fast')
    
    # librosa returns an array of 40 arrays, one for each mfcc
    # np.mean takes the mean of each array, so we will be left with an array of size 40
    # the n_mfcc=40 parameter means return 40 mfccs
    mfccs = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=MFCCS)
    
    mfccs_mean = np.mean(mfccs.T, axis=0)
    
    # plp = librosa.beat.plp(y=X, sr=22050, onset_envelope=None, hop_length=512, win_length=99, tempo_min=30, tempo_max=300, prior=None)
    
    return mfccs_mean.tolist()#  + plp.tolist()

In [17]:
# apply the extract features to every element in train and test
train_features = train['filepath'].apply(extract_features)

# train_features is now an array of arrays
test_features = test['filepath'].apply(extract_features)

train_features.head()

6     [-585.1529541015625, 62.63404846191406, -0.797...
0     [-494.1591491699219, 77.5472412109375, 33.6363...
2     [-436.8819274902344, 95.32469177246094, 20.774...
12    [-496.4476318359375, 129.58126831054688, 61.19...
8     [-761.903076171875, 59.643253326416016, 4.3486...
Name: filepath, dtype: object

In [18]:
# split into X and Y where X is the features and Y is the label (name of speaker)
# remember that each array is still in the same order as above 
# so each element in X_train corresponds to an element in Y_train at the same index
X_train = train_features.tolist()
X_test = test_features.tolist()
Y_train = train['speaker'].tolist()
Y_test = test['speaker'].tolist()

In [19]:
# now X_train is a 2d array, and each array is the long array of mfccs
print("Speaker: {}".format(Y_train[0]))
print("Features: {}: ".format(X_train[0]))

Speaker: aaharley
Features: [-585.1529541015625, 62.63404846191406, -0.7975379228591919, 3.3995730876922607, 11.82490062713623, -3.0479846000671387, -3.4039061069488525, 0.6697409749031067, -3.783257007598877, 0.4936215877532959, 5.158707141876221, 0.361592561006546]: 


In [20]:

# hot encode y 
lb = LabelEncoder()

Y_train_encoded = to_categorical(lb.fit_transform(Y_train))
Y_test_encoded = to_categorical(lb.fit_transform(Y_test))

In [21]:
ss = StandardScaler()

X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.fit_transform(X_test)


In [22]:
model = Sequential()

model.add(Dense(MFCCS, input_shape=(MFCCS,), activation = 'relu'))
model.add(Dropout(0.2))

# model.add(Dense(256, activation = 'relu'))
# model.add(Dropout(0.01))  

model.add(Dense(512, activation = 'relu'))
model.add(Dropout(0.3))    

model.add(Dense(len(Y_train_encoded[0]), activation = 'softmax'))

model.compile(loss='categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train_scaled, Y_train_encoded, epochs=20, validation_data=(X_test_scaled, Y_test_encoded))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [47]:
def predict_speaker(file_path):
      
    labels = Y_train_encoded.tolist() # list of the encoded labels from training 
    
    features = ss.transform([extract_features(file_path)])
    to_predict = np.array(features) # list of data for the model to predict, just one item for now
    
    predictions = model.predict(to_predict) # returns a list of predictions 
    pred = predictions[0].tolist() # take the first element which is the prediciton for the first element in to_predict, remember this is still one hot encoded so it is a big array of 0s and 1s
    
    m = max(pred)
    print(m)
    print(pred)
    p = [1 if i==max for i in pred] # convert it to floats
    print(p)
    try:
        prediction_ind = labels.index(p) # index of predicted label (encoded)

        prediction = Y_train[prediction_ind]
    except:
        prediction = "could not identify speaker"
        
    return prediction
    


In [48]:
predict_speaker("test/alex.wav")

0.9807298183441162
[0.010649635456502438, 0.0047612059861421585, 0.0038593050558120012, 0.9807298183441162]
[0, 0, 0, 0]


'could not identify speaker'

In [44]:
predict_speaker("test/Harley.wav")

[0, 0, 0, 0]


'could not identify speaker'

In [45]:
predict_speaker("test/eli.wav")

[0, 0, 0, 0]


'could not identify speaker'

In [46]:
predict_speaker("test/alastair.wav")

[0, 0, 0, 0]


'could not identify speaker'

In [28]:
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100
RECORD_SECONDS = 2
WAVE_OUTPUT_FILENAME = "test/audio_input.wav"

p = pyaudio.PyAudio()

stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK)
print("say 'Alexa' in: 3")
for i in range(2,0,-1):
    time.sleep(1)
    print (i)
time.sleep(1)

print("* recording")

frames = []

for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    frames.append(data)

print("* done recording")

stream.stop_stream()
stream.close()
p.terminate()

wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(p.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
wf.close()

print(predict_speaker(WAVE_OUTPUT_FILENAME))


say 'Alexa' in: 3
2
1
* recording
* done recording
[[0.20175    0.11879814 0.4442187  0.23523316]]
could not identify speaker
