In [10]:
import pandas as pd
import numpy as np
import os
import librosa
import pyaudio
import wave
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten

In [11]:
# Root folder holding one sub-folder of recordings per speaker.
DIR = 'alexa/'  # alternative: 'team/'

# Keep at most the first 25 entries, in whatever order os.listdir returns them
# (the order is arbitrary, so the selection can differ between machines).
speakers = os.listdir(DIR)
speakers = speakers[:25]

# Collect one record per audio file and build the frame in a single call.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling
# it inside a loop copies the whole frame on every iteration (quadratic time).
records = []
for speaker in speakers:
    for file in os.listdir(DIR + '{}/'.format(speaker)):
        records.append({'filepath': DIR + '{}/{}'.format(speaker, file),
                        'speaker': speaker})

# Passing columns= keeps both columns present even when no files were found.
df = pd.DataFrame(records, columns=['filepath', 'speaker'])
# df.head()

In [12]:

# Hold out 29% of the files for testing. stratify keeps each speaker's share of
# files the same in both splits, so no speaker ends up only in the test set.
# random_state pins the shuffle so the split (and every number derived from it)
# is the same each time the notebook is re-run.
train, test = train_test_split(df, test_size=0.29, stratify=df['speaker'], random_state=42)
# train.head()

In [13]:
# Number of MFCC coefficients requested from librosa for every frame.
MFCCS = 12


def _frame_average(coefficients):
    """Average a feature matrix over its frames, giving one value per coefficient row."""
    return np.mean(coefficients.T, axis=0).tolist()


def extract_features(filename):
    """Turn one audio file into a flat feature list.

    The file is loaded with librosa, MFCCS MFCCs are computed per frame, and
    their first- and second-order deltas are derived from them. Each of the
    three matrices is averaged over time, and the three averages are
    concatenated in the order MFCC, delta, delta-delta.

    Args:
        filename: path of the audio file to read.

    Returns:
        A list of floats: the MFCC means followed by the delta means and the
        delta-delta means.
    """
    signal, rate = librosa.load(filename, res_type='kaiser_fast')

    # One row per coefficient, one column per frame.
    base = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=MFCCS)
    first_order = librosa.feature.delta(base)
    second_order = librosa.feature.delta(base, order=2)

    return [
        value
        for matrix in (base, first_order, second_order)
        for value in _frame_average(matrix)
    ]

In [14]:
# Run extract_features on every file path of the training split, then of the
# test split. Each result is a pandas Series holding one feature list per file,
# indexed like the split it came from.
train_features = train.loc[:, 'filepath'].apply(extract_features)
test_features = test.loc[:, 'filepath'].apply(extract_features)

# train_features.head()

In [15]:
# Convert both splits to plain Python lists: X_* holds the feature lists and
# Y_* holds the speaker names. Row order is untouched, so X_train[i] and
# Y_train[i] (likewise X_test[i] and Y_test[i]) describe the same file.
X_train, X_test = train_features.tolist(), test_features.tolist()
Y_train, Y_test = train['speaker'].tolist(), test['speaker'].tolist()

In [16]:
# now X_train is a 2d array, and each array is the long array of mfccs
# print("Speaker: {}".format(Y_train[0]))
# print("Features: {}: ".format(X_train[0]))

In [17]:

# One-hot encode the speaker labels.
# The encoder is fitted on the training labels only and then reused for the test
# labels. Refitting it on Y_test could assign different integer codes to the
# same speaker whenever the test split does not contain exactly the same set of
# speakers as the training split.
lb = LabelEncoder()
lb.fit(Y_train)

# num_classes fixes the one-hot width to the number of training speakers, so
# both matrices always have the same number of columns.
Y_train_encoded = to_categorical(lb.transform(Y_train), num_classes=len(lb.classes_))
Y_test_encoded = to_categorical(lb.transform(Y_test), num_classes=len(lb.classes_))

In [18]:
# Standardise the features using statistics learned from the training data only.
# The test data must go through transform(), not fit_transform(): refitting on
# the test set scales it with different means/variances than the model was
# trained on, and leaves `ss` fitted to the test set for predict_speaker().
ss = StandardScaler()

X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)


In [19]:


# Fully connected classifier: the input width matches the 3*MFCCS features per
# file, and the softmax output has one unit per column of the one-hot labels.
model = Sequential([
    Dense(3 * MFCCS, activation='relu', input_shape=(3 * MFCCS,)),
    Dropout(0.1),
    Dense(2048, activation='relu'),
    Dropout(0.4),
    Dense(1024, activation='relu'),
    Dropout(0.4),
    Dense(len(Y_train_encoded[0]), activation='softmax'),
])

# No optimizer is passed, so Keras uses its default.
model.compile(metrics=['accuracy'], loss='categorical_crossentropy')

# Train for 50 epochs, reporting loss/accuracy on the test split after each one.
history = model.fit(
    X_train_scaled,
    Y_train_encoded,
    validation_data=(X_test_scaled, Y_test_encoded),
    epochs=50,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [20]:
def predict_speaker(file_path):
    """Predict which known speaker recorded the audio file at ``file_path``.

    The file is run through extract_features, scaled with the fitted scaler
    ``ss`` and classified by ``model``. The output unit with the highest
    probability is mapped back to a speaker name from ``Y_train``.

    Args:
        file_path: path of the audio file to classify.

    Returns:
        The predicted speaker name, or the string "could not identify speaker"
        when the model output has no single highest value (a tie, or NaN
        outputs).

    Raises:
        Any exception raised while loading the file or extracting features is
        propagated to the caller.
    """
    # LabelEncoder numbers classes in sorted order of the unique labels, so
    # position i of this list is the speaker behind output unit i of the model.
    speaker_names = sorted(set(Y_train))

    features = ss.transform([extract_features(file_path)])
    probabilities = model.predict(features)[0]  # single input -> single row

    # Keep the previous fallback for an ambiguous result instead of guessing.
    # A NaN output never equals the maximum, so it also lands here.
    if np.count_nonzero(probabilities == np.max(probabilities)) != 1:
        return "could not identify speaker"

    return speaker_names[int(np.argmax(probabilities))]

In [21]:
# Identify the speaker of the recording stored at test/alex.wav.
predict_speaker("test/alex.wav")

'alex'

In [22]:
# Identify the speaker of the recording stored at test/Harley.wav.
predict_speaker("test/Harley.wav")

'harley'

In [23]:
# Identify the speaker of the recording stored at test/eli.wav.
predict_speaker("test/eli.wav")

'eli'

In [42]:
# Identify the speaker of the recording stored at test/alastair.wav.
# NOTE(review): the saved output of this cell is an EOFError. Presumably the
# file is empty or truncated; verify the recording before relying on this cell.
predict_speaker("test/alastair.wav")



EOFError: 

In [41]:
# Audio capture settings read by record_voice().
# Number of frames requested from the input stream per read() call.
CHUNK = 1024
# Sample format constant handed to PyAudio when opening the stream.
FORMAT = pyaudio.paInt16
# Channel count used for the input stream and written to the WAV header.
CHANNELS = 2
# Sampling rate used for the input stream and written to the WAV header.
RATE = 44100
# Length of each recording, in seconds.
RECORD_SECONDS = 2
# Leftover debug output: prints the numeric value of FORMAT (8 in the saved run).
print(FORMAT)
def record_voice(output_filename):
    """Record RECORD_SECONDS of microphone audio into a WAV file.

    Prints a three-second countdown, captures audio with the module-level
    CHUNK / FORMAT / CHANNELS / RATE settings, and writes the frames to
    ``output_filename``.

    Args:
        output_filename: path of the WAV file to create (overwritten if it
            already exists).

    Returns:
        None.

    Raises:
        Errors from PyAudio (for example when no input device is available) or
        from writing the file are propagated. The audio stream and the PyAudio
        session are released even when that happens.
    """
    p = pyaudio.PyAudio()
    try:
        # Read the sample width while the PyAudio session is still alive.
        sample_width = p.get_sample_size(FORMAT)

        print("say 'Alexa' in: 3")
        for i in range(2, 0, -1):
            time.sleep(1)
            print(i)
        time.sleep(1)

        # Open the stream only after the countdown. An input stream starts
        # capturing as soon as it is opened, so opening it earlier lets audio
        # from the countdown pile up (or overflow) before the first read().
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        try:
            print("* recording")
            frames = [stream.read(CHUNK)
                      for _ in range(int(RATE / CHUNK * RECORD_SECONDS))]
            print("* done recording")
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    # The context manager closes the file even if writing fails part-way.
    with wave.open(output_filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
    print()

8


In [28]:
# Record a fresh clip from the microphone into this path, then classify it.
file = "test/audio_input.wav"
# Prints a countdown, records for RECORD_SECONDS seconds and writes the WAV file.
record_voice(file)
# Print the predicted speaker name (or the "could not identify speaker" message).
print(predict_speaker(file))

say 'Alexa' in: 3
2
1
* recording
* done recording

bfeciyuh


8
