In [0]:
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-driv/e-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

gpg: keybox '/tmp/tmpczf5qk8g/pubring.gpg' created
gpg: /tmp/tmpczf5qk8g/trustdb.gpg: trustdb created
gpg: key AD5F235DF639B041: public key "Launchpad PPA for Alessandro Strada" imported
gpg: Total number processed: 1
gpg:               imported: 1
··········
Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
Please enter the verification code: Access token retrieved correctly.


In [0]:
# Create the mount point (relative to the current working directory) and mount
# Google Drive onto it with google-drive-ocamlfuse.
!mkdir -p drive
!google-drive-ocamlfuse drive
import os
# NOTE(review): the mount point above is relative ('drive') while this path is
# absolute; the two only coincide when the cell is started from /content — confirm.
os.chdir('/content/drive')

In [0]:
# NOTE(review): relative chdir, so the result depends on the current working
# directory. The mount cell already ends in /content/drive, which makes this
# resolve to /content/drive/drive, and re-running the cell moves (or fails) again.
# Confirm a nested 'drive' folder is really intended.
os.chdir('drive')

In [0]:
# Enter the 'Speaker Classification' folder (relative to the current directory).
# Presumably the relative 'data/voice_XX.wav' paths loaded later are resolved from
# here — verify the folder layout on the mounted Drive.
os.chdir('Speaker Classification')

In [0]:
!pip install librosa
!pip install scipy



In [0]:
import scipy.io.wavfile as wav
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import scipy.stats as stats
from skimage.util.shape import view_as_windows
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.utils import shuffle
import tensorflow as tf
import librosa
import librosa.display
from scipy import misc
import pandas as pd
from matplotlib.pyplot import specgram
import math
import matplotlib.image as mpimg

# Load the .wav data.
# librosa.load returns an (audio, sample_rate) tuple for each file; every file is
# requested at 16 kHz.
print("Loading the data from .wav files...")
print("")
voices = [
    librosa.load('data/voice_{:02d}.wav'.format(file_number), sr = 16000)
    for file_number in range(1, 11)
]
# Keep the original per-recording names available for any cell that refers to them.
(voice_00, voice_01, voice_02, voice_03, voice_04,
 voice_05, voice_06, voice_07, voice_08, voice_09) = voices
print("Loaded")
print("")

# Inspect the data. Each audio array is 1-D (a single channel), as the printed
# shapes show.
for voice_file in voices:
    print('Sample Rate:', voice_file[1], '|', 'Shape of Array:', voice_file[0].shape)

# Only use the audio data (librosa's load function also outputs the sample rate).
all_voice_files = [voice_file[0] for voice_file in voices]

# Look at the attributes of each file: mean, max, min, standard deviation.
for indexer, voice in enumerate(all_voice_files):
    print("Sample", indexer, "|", "Mean:", np.mean(voice), "|", "Max:", np.max(voice), "|",
          "Min:", np.min(voice), "|", "Std Dev:", np.std(voice))
    
"""
Find all 200ms samples that have clear audio data in them
"""
# In each recording, keep the 200 ms windows whose mean absolute amplitude exceeds
# the recording-wide (mean + standard deviation) of the signed signal.
window_shape = 16000 // 5        # 200 ms at 16 kHz; integer division keeps this an int
window_step = window_shape // 2  # consecutive windows overlap by half a window

voice_data = []
voice_labels = []

for voice_number, voice in enumerate(all_voice_files):
    # The threshold depends only on the recording, so compute it once per
    # recording instead of once per window.
    loudness_threshold = np.mean(voice) + np.std(voice)
    # `step` selects every window_step-th window start, which is the same set of
    # windows as building all windows and slicing them with [::window_step].
    positive_full_array = view_as_windows(np.absolute(voice), window_shape, step=window_step)
    temp_full_array = view_as_windows(voice, window_shape, step=window_step)

    for window_index in range(len(temp_full_array)):
        if np.mean(positive_full_array[window_index]) > loudness_threshold:
            voice_data.append(temp_full_array[window_index])
            voice_labels.append(voice_number)

voice_data = np.array(voice_data)
print("Number of samples:", voice_data.shape)
#normalize the data
#voice_data_normalized = preprocessing.normalize(voice_data)

# Explicit integer dtype so the labels remain valid indices even if no window was kept.
voice_labels = np.array(voice_labels, dtype=int)
#one-hot encode the labels
voice_labels = np.eye(10)[voice_labels]

print("Number of labels:", voice_labels.shape)

# Determine how many windows of each speaker are in the dataset. Every row is
# one-hot, so the column sums are the per-speaker counts.
number_of_samples = voice_labels.sum(axis=0).astype(int).tolist()

for i in range(10):
    print("Number of samples from voice", i, ':', number_of_samples[i])

"""
split the data into sets
"""

# 60% train / 20% validation / 20% test: hold out 40% first, then cut the
# hold-out in half.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    voice_data, voice_labels, test_size = 0.40, random_state = 7)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size = 0.5, random_state = 7)

for set_name, features, targets in (("Training Set:", X_train, y_train),
                                    ("Testing Set:", X_test, y_test),
                                    ("Validation Set:", X_val, y_val)):
    print(set_name, features.shape, targets.shape)

print("Created raw audio data and labels from .wav files.")
print("Shape: ", X_train[0].shape)

def _mfcc_batch(windows, sample_rate):
    """Compute MFCCs for every window and stack them along a new leading axis.

    Each window's MFCC matrix gets a leading singleton axis and has its last two
    axes swapped, so the stacked result has shape (len(windows), 1, a, b) where
    (b, a) is the shape returned by librosa.feature.mfcc for one window.
    """
    return np.array([
        np.transpose(
            np.expand_dims(librosa.feature.mfcc(y=window, sr=sample_rate), axis=0),
            [0, 2, 1],
        )
        for window in windows
    ])


def generate_mfcc(sample_rate=16000):
    """Compute MFCC features for the module-level X_train, X_val and X_test splits.

    Args:
        sample_rate: sampling rate passed to librosa.feature.mfcc (default 16000,
            the rate the recordings were loaded at).

    Returns:
        Tuple (train_out, val_out, test_out) of stacked MFCC arrays, in that
        order. The three shapes are also printed, in the same order.
    """
    train_out = _mfcc_batch(X_train, sample_rate)
    val_out = _mfcc_batch(X_val, sample_rate)
    test_out = _mfcc_batch(X_test, sample_rate)

    print(train_out.shape)
    print(val_out.shape)
    print(test_out.shape)
    return train_out, val_out, test_out
        
        
# Compute the MFCC features; generate_mfcc returns them in (train, val, test) order.
X_train_out, X_val_out, X_test_out = generate_mfcc()

Loading the data from .wav files...

Loaded

Sample Rate: 16000 | Shape of Array: (2880000,)
Sample Rate: 16000 | Shape of Array: (2880000,)
Sample Rate: 16000 | Shape of Array: (2880000,)
Sample Rate: 16000 | Shape of Array: (2880000,)
Sample Rate: 16000 | Shape of Array: (2880000,)
Sample Rate: 16000 | Shape of Array: (2880000,)
Sample Rate: 16000 | Shape of Array: (2880000,)
Sample Rate: 16000 | Shape of Array: (2880000,)
Sample Rate: 16000 | Shape of Array: (2880000,)
Sample Rate: 16000 | Shape of Array: (2880000,)
Sample 0 | Mean: 4.7888716e-06 | Max: 0.91083825 | Min: -0.5682895 | Std Dev: 0.054258995
Sample 1 | Mean: -0.0002666064 | Max: 0.64925575 | Min: -0.8183496 | Std Dev: 0.07194531
Sample 2 | Mean: -2.390854e-06 | Max: 0.4160391 | Min: -0.39128813 | Std Dev: 0.054623123
Sample 3 | Mean: 3.47779e-06 | Max: 0.40846926 | Min: -0.43750376 | Std Dev: 0.043937676
Sample 4 | Mean: -1.792521e-06 | Max: 0.3158226 | Min: -0.27962488 | Std Dev: 0.03735458
Sample 5 | Mean: -1.8312595e

In [0]:
def _drop_singleton_axis(features):
    """Remove axis 1 while the array is still 4-D with a singleton axis 1.

    Arrays that were already squeezed are returned unchanged, so re-running this
    cell is a no-op instead of raising from np.squeeze.
    """
    if features.ndim == 4 and features.shape[1] == 1:
        return np.squeeze(features, axis=1)
    return features

X_train_out = _drop_singleton_axis(X_train_out)
X_val_out = _drop_singleton_axis(X_val_out)
X_test_out = _drop_singleton_axis(X_test_out)

print(X_train_out.shape)

(1616, 7, 20)


In [0]:
# Hyperparameters used by the model-definition, training and evaluation cells.
epochs = 100        # passed to model.fit as the number of training epochs
timesteps = 7       # first entry of the LSTM input_shape; matches axis 1 of the (1616, 7, 20) shape printed above
hidden_units = 512  # number of units in the LSTM layer
batch_size = 10     # passed to both model.fit and model.evaluate

In [0]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM

# One LSTM layer over (timesteps, 20) input sequences, followed by a 10-way softmax
# that matches the 10-column one-hot labels.
model = Sequential()
model.add(LSTM(hidden_units, input_shape=(timesteps, 20), dropout=0.25, recurrent_dropout=0.45))
model.add(Dense(10, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 512)               1091584   
_________________________________________________________________
dense_3 (Dense)              (None, 10)                5130      
Total params: 1,096,714
Trainable params: 1,096,714
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
# Train on the MFCC training split; verbose=2 logs one line per epoch.
model.fit(
    X_train_out,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=2,
)

Epoch 1/100
 - 5s - loss: 1.7600 - acc: 0.3923
Epoch 2/100
 - 4s - loss: 1.1455 - acc: 0.6176
Epoch 3/100
 - 4s - loss: 0.8980 - acc: 0.7116
Epoch 4/100
 - 4s - loss: 0.7551 - acc: 0.7420
Epoch 5/100
 - 4s - loss: 0.6908 - acc: 0.7649
Epoch 6/100
 - 4s - loss: 0.6464 - acc: 0.7729
Epoch 7/100
 - 4s - loss: 0.6173 - acc: 0.7877
Epoch 8/100
 - 4s - loss: 0.5976 - acc: 0.7946
Epoch 9/100
 - 4s - loss: 0.5931 - acc: 0.8001
Epoch 10/100
 - 4s - loss: 0.5507 - acc: 0.8261
Epoch 11/100
 - 4s - loss: 0.5189 - acc: 0.8224
Epoch 12/100
 - 4s - loss: 0.5028 - acc: 0.8168
Epoch 13/100
 - 4s - loss: 0.4921 - acc: 0.8373
Epoch 14/100
 - 4s - loss: 0.4809 - acc: 0.8366
Epoch 15/100
 - 4s - loss: 0.4888 - acc: 0.8274
Epoch 16/100
 - 4s - loss: 0.4675 - acc: 0.8453
Epoch 17/100
 - 4s - loss: 0.4862 - acc: 0.8366
Epoch 18/100
 - 4s - loss: 0.4645 - acc: 0.8416
Epoch 19/100
 - 4s - loss: 0.4210 - acc: 0.8540
Epoch 20/100
 - 4s - loss: 0.4218 - acc: 0.8564
Epoch 21/100
 - 4s - loss: 0.4241 - acc: 0.8583
E

 - 4s - loss: 0.3383 - acc: 0.8756
Epoch 47/100
 - 4s - loss: 0.3317 - acc: 0.8830
Epoch 48/100
 - 4s - loss: 0.3079 - acc: 0.8929
Epoch 49/100
 - 4s - loss: 0.3598 - acc: 0.8787
Epoch 50/100
 - 4s - loss: 0.3116 - acc: 0.8960
Epoch 51/100
 - 4s - loss: 0.3179 - acc: 0.8892
Epoch 52/100
 - 4s - loss: 0.3302 - acc: 0.8843
Epoch 53/100
 - 4s - loss: 0.3412 - acc: 0.8824
Epoch 54/100
 - 4s - loss: 0.3212 - acc: 0.8837
Epoch 55/100
 - 4s - loss: 0.3207 - acc: 0.8905
Epoch 56/100
 - 4s - loss: 0.3053 - acc: 0.8892
Epoch 57/100
 - 4s - loss: 0.3249 - acc: 0.8917
Epoch 58/100
 - 4s - loss: 0.3115 - acc: 0.8849
Epoch 59/100
 - 4s - loss: 0.2990 - acc: 0.8979
Epoch 60/100
 - 4s - loss: 0.3181 - acc: 0.8899
Epoch 61/100
 - 4s - loss: 0.2905 - acc: 0.9004
Epoch 62/100
 - 4s - loss: 0.3311 - acc: 0.8787
Epoch 63/100
 - 4s - loss: 0.3064 - acc: 0.8979
Epoch 64/100
 - 4s - loss: 0.3143 - acc: 0.8886
Epoch 65/100
 - 4s - loss: 0.2917 - acc: 0.8923
Epoch 66/100
 - 4s - loss: 0.3005 - acc: 0.8917
Epoch

 - 4s - loss: 0.2688 - acc: 0.9059
Epoch 92/100
 - 4s - loss: 0.2633 - acc: 0.9097
Epoch 93/100
 - 4s - loss: 0.2577 - acc: 0.9072
Epoch 94/100
 - 4s - loss: 0.2773 - acc: 0.9016
Epoch 95/100
 - 4s - loss: 0.2776 - acc: 0.8967
Epoch 96/100
 - 4s - loss: 0.2326 - acc: 0.9152
Epoch 97/100
 - 4s - loss: 0.2654 - acc: 0.9066
Epoch 98/100
 - 4s - loss: 0.2559 - acc: 0.9072
Epoch 99/100
 - 4s - loss: 0.2729 - acc: 0.9134
Epoch 100/100
 - 4s - loss: 0.2443 - acc: 0.9115


<keras.callbacks.History at 0x7f0ba883f550>

In [0]:
# Evaluate on the validation split; the result lists the loss followed by the compiled metric.
model.evaluate(X_val_out, y_val, batch_size=batch_size)



[0.2506263633373112, 0.9202226243346432]

In [0]:
# Evaluate on the held-out test split; the result lists the loss followed by the compiled metric.
model.evaluate(X_test_out, y_test, batch_size=batch_size)



[0.3131854184752016, 0.8923933141085566]