#### Imports

In [None]:
import IPython.display as ipd
# % pylab inline
import os
import pandas as pd
import librosa
import glob 
import librosa.display
import random

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from keras.utils.np_utils import to_categorical

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import metrics 

from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout 
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

from keras.callbacks import EarlyStopping

from keras import regularizers

from sklearn.preprocessing import LabelEncoder

from datetime import datetime

import os

: 

In [None]:
# pip install librosa

: 

#### Reading the files from our folder and creating a dataframe from it

In [None]:
#list the files
filelist = os.listdir('male') 
#read them into pandas
df_male = pd.DataFrame(filelist)

: 

In [None]:
# Adding the 1 label to the dataframe representing male
df_male['label']='1'

: 

In [None]:
# Renaming the column name to file
df_male = df_male.rename(columns={0:'file'})

: 

In [None]:
df_male.head()

: 

In [None]:
# Checking for a file that gets automatically generated and we need to drop 
df_male[df_male['file']=='.DS_Store']

: 

Doing the same for the female folder

In [None]:
filelist = os.listdir('female') 
#read them into pandas
df_female = pd.DataFrame(filelist)

: 

In [None]:
df_female['label']='0'

: 

In [None]:
df_female = df_female.rename(columns={0:'file'})

: 

In [None]:
df_female.head()

: 

In [None]:
# Checking for a file that gets automatically generated and we need to drop 
df_female[df_female['file']=='.DS_Store']

: 

In [None]:
# Dropping the system file by name rather than by a hard-coded row label:
# os.listdir() returns entries in arbitrary order, so the position of '.DS_Store'
# is not stable between runs or machines. If the file is absent nothing is dropped.
df_female.drop(df_female[df_female['file'] == '.DS_Store'].index, inplace=True)

: 

In [None]:
# Resetting the index since we dropped a row
df_female = df_female.reset_index(drop=True)

: 

Joining both dataframes together 

In [None]:
df = pd.concat([df_female, df_male], ignore_index=True)

: 

In [None]:
df.head()

: 

In [None]:
# Shuffle the rows with a fixed seed so the row order is reproducible. The features
# are extracted (and optionally cached to disk) in this row order while the speaker
# labels are read back from df, so both must see the same order on every run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

: 

We are going to split the data into train, validation and test sets (70% train, 20% validation, 10% test), and we check that the randomized splits have balanced classes.

In [None]:
df_train = df[:9188]

: 

In [None]:
df_train['label'].value_counts(normalize=True)

: 

In [None]:
df_validation = df[9188:11813]

: 

In [None]:
df_validation['label'].value_counts(normalize=True)

: 

In [None]:
df_test = df[11813:13125]

: 

In [None]:
df_test['label'].value_counts(normalize=True)

: 

#### Function to extract the features and label for each sound file by iterating through every row of the dataframe

In [None]:
# Although this function was modified and many parameters were explored, most of it
# came from Source 8 (sources are listed in the README)

def extract_features(files):
    """Extract time-averaged audio features and the label for one sound file.

    Parameters
    ----------
    files : row-like object exposing ``file`` and ``label`` attributes
        One row of the dataframe, as passed by ``DataFrame.apply(..., axis=1)``.
        ``file`` is the name of the audio file inside the local ``voice`` folder.

    Returns
    -------
    tuple
        ``(mfccs, chroma, mel, contrast, tonnetz, label)``. The first five items
        are the corresponding librosa features averaged along the frame axis;
        ``label`` is ``files.label`` passed through unchanged.
    """
    # Build the path component-wise so the separator is chosen by os.path.join
    file_name = os.path.join(os.path.abspath('voice'), str(files.file))

    # Loads the audio file as a floating point time series; the sample rate is
    # left at the librosa default (22050)
    X, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

    # Mel-frequency cepstral coefficients (MFCCs), 40 coefficients
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)

    # Magnitude of the short-time Fourier transform, reused by chroma and contrast
    stft = np.abs(librosa.stft(y=X))

    # Chromagram computed from the magnitude spectrogram
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)

    # Mel-scaled spectrogram. The audio must be passed as the keyword `y`:
    # recent librosa releases no longer accept it positionally.
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)

    # Spectral contrast
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)

    # Tonal centroid features (tonnetz), computed on the harmonic component
    tonnetz = np.mean(
        librosa.feature.tonnetz(y=librosa.effects.harmonic(y=X), sr=sample_rate).T,
        axis=0,
    )

    # The class of the file travels along with its features
    label = files.label

    return mfccs, chroma, mel, contrast, tonnetz, label

: 

In [None]:
# Code to start the timer to see how long it takes to extract the features
startTime = datetime.now()

: 

In [None]:
# Applying the function to every row of the full dataframe (not only the train rows);
# the features are split into train / validation / test later on
features_label = df.apply(extract_features, axis=1)

: 

In [None]:
# Code to see how long it took
print(datetime.now() - startTime)

: 

In [None]:
# Checking how the output looks
features_label

: 

In [None]:
# The commented-out line below saves the numpy array (in case our kernel restarts or
# anything else happens, because it takes a long time to extract the features)

# np.save('features_label', features_label)

: 

In [None]:
# The next code loads the saved numpy array of our extracted features
# features_label = np.load('features_label.npy', allow_pickle=True)

: 

In [None]:
# Join the five feature vectors of every file into one long vector to feed into
# our neural network; the label stored in position 5 of each entry is not included.
features = [
    np.concatenate((entry[0], entry[1], entry[2], entry[3], entry[4]), axis=0)
    for entry in features_label
]

: 

In [None]:
len(features)

: 

#### We will add the speaker id to our dataframe to have that as the label for our model and predict speakers from their voice

In [None]:
# Collect the speaker id for each row of our dataframe: it is the part of the
# file name that comes before the first '-'
speaker = [file_name.split('-')[0] for file_name in df['file']]

: 

In [None]:
# Now we create the speaker column in our dataframe and set it equal to our speaker list
df['speaker'] = speaker

: 

In [None]:
# Checking that it worked as expected
df.head()

: 

In [None]:
# Checking the number of speakers or the number of different people in our voice data
df['speaker'].nunique()

: 

In [None]:
# Setting our labels to be equal to our speaker list
labels = speaker

: 

In [None]:
# Checking the size of labels and making sure it matches the size of features
len(labels)

: 

#### Checking if we have balanced classes for the whole data

In [None]:
# They look somewhat balanced with a min of 56 and a max of 166, mean of 114 
# with standard deviation of 15.89 (calculated from scipy)
np.unique(labels, return_counts=True)

: 

#### One-hot encoding y and preprocessing X and y

In [None]:
X = np.array(features)

: 

In [None]:
y = np.array(labels)

: 

In [None]:
# Hot encoding y
lb = LabelEncoder()
y = to_categorical(lb.fit_transform(y))

: 

In [None]:
X.shape

: 

In [None]:
y.shape

: 

In [None]:
# Contiguous 70% / 20% / 10% split, analogous to a train test split with an extra
# validation set:
#   rows [0, 9188)      -> train
#   rows [9188, 11813)  -> validation
#   rows [11813, end)   -> test (never before seen data)
# Nothing is shuffled at this point; class balance of the splits was already
# checked on the dataframe (analogous to stratify).
X_train, X_val, X_test = np.split(X, [9188, 11813])
y_train, y_val, y_test = np.split(y, [9188, 11813])

: 

In [None]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_val = ss.transform(X_val)
X_test = ss.transform(X_test)

: 

In [None]:
# Simple fully connected classifier: three ReLU hidden layers, each followed by
# dropout, and a softmax output layer for categorical classification.
# We have 115 classes, hence 115 output units.
model = Sequential([
    Dense(193, input_shape=(193,), activation='relu'),
    Dropout(0.1),
    Dense(128, activation='relu'),
    Dropout(0.25),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(115, activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Early-stopping callback that watches the validation loss
early_stop = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=100,
    verbose=1,
    mode='auto',
)

: 

In [None]:
# fitting the model with the train data and validation with the validation data
# we used early stop with patience 100 because we did not want to use early stop
# I leave the early stop regularization code in case anyone wants to use it

history = model.fit(X_train, y_train, batch_size=256, epochs=100, 
                    validation_data=(X_val, y_val),
                    callbacks=[early_stop])
                    

: 

In [None]:
# Check out our train accuracy and validation accuracy over epochs.
# NOTE(review): `plt` is not bound by any import visible in this file (the pylab
# magic at the top is commented out) — confirm matplotlib.pyplot is available as plt.
train_accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']

# Set figure size.
plt.figure(figsize=(12, 8))

# Generate line plot of training and validation accuracy over epochs.
plt.plot(train_accuracy, label='Training Accuracy', color='#185fad')
plt.plot(val_accuracy, label='Validation Accuracy', color='orange')

# Set title and axis labels. The curves are accuracies, so the y axis is labelled
# 'Accuracy' (it previously read 'Categorical Crossentropy', which is the loss).
plt.title('Training and Validation Accuracy by Epoch', fontsize = 25)
plt.xlabel('Epoch', fontsize = 18)
plt.ylabel('Accuracy', fontsize = 18)
plt.xticks(range(0,100,5), range(0,100,5))

plt.legend(fontsize = 18);

: 

In [None]:
# We get our predictions from the test data. Sequential.predict_classes no longer
# exists in recent TensorFlow/Keras releases, so take the arg-max over the softmax
# probabilities instead; this yields the same integer class indices.
preds = np.argmax(model.predict(X_test), axis=-1)

: 

In [None]:
# We transform back our predictions to the speakers ids
preds = lb.inverse_transform(preds)

: 

In [None]:
# We slice our dataframe to our test dataframe. An explicit copy is taken so that
# adding a column to df_test afterwards works on an independent frame instead of a
# slice of df (avoids pandas' SettingWithCopyWarning).
df_test = df[11813:].copy()

: 

In [None]:
# We create a new column called preds and set it equal to our predictions
df_test['preds'] = preds

: 

In [None]:
# Checking how our test dataframe looks like now with our predictions
df_test

: 

In [None]:
# Checking how many speakers we got wrong
df_test[df_test['speaker'] != df_test['preds']]

: 

In [None]:
# Checking our model accuracy
1-round(len(df_test[df_test['speaker'] != df_test['preds']])/len(df_test),3)

: 

#### 99.8% accurate on test data for classification of speakers