In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# One sub-directory of 'alexa/' per speaker.
# os.listdir() returns entries in arbitrary order and also lists plain files
# (e.g. .DS_Store); a non-directory entry would make the per-speaker
# os.listdir('alexa/<name>/') call further down raise. Keep directories only
# and sort them so every run visits the speakers in the same order.
speakers = sorted(
    entry for entry in os.listdir('alexa/')
    if os.path.isdir(os.path.join('alexa/', entry))
)

In [3]:
# Collect one record per audio file, then build the DataFrame in a single call.
# DataFrame.append was removed in pandas 2.0, and it copied the whole frame on
# every call, so growing the frame row by row is quadratic in the file count.
records = []
for speaker in speakers:
    # sorted(): os.listdir() returns entries in arbitrary order
    for file in sorted(os.listdir('alexa/{}/'.format(speaker))):
        filepath = 'alexa/{}/{}'.format(speaker, file)
        records.append({'filepath': filepath, 'speaker': speaker})

# columns= keeps the expected two-column schema even if no files were found
df = pd.DataFrame(records, columns=['filepath', 'speaker'])

df.head()  # display the first few rows as a sanity check

Unnamed: 0,filepath,speaker
0,alexa/anfcucvo/1.wav,anfcucvo
1,alexa/anfcucvo/2.wav,anfcucvo
2,alexa/anfcucvo/3.wav,anfcucvo
3,alexa/anfcucvo/4.wav,anfcucvo
4,alexa/anfcucvo/5.wav,anfcucvo


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Hold out 25% of the clips for testing.
# stratify= makes the function split the data evenly over the speaker column,
# so we don't get all files of one speaker in the test set and none in training.
# random_state pins the shuffle so re-running the notebook reproduces the
# same split (and therefore the same rows in train/test).
train, test = train_test_split(
    df, test_size=0.25, stratify=df['speaker'], random_state=42
)
train.head()

Unnamed: 0,filepath,speaker
178,alexa/kxrnhrcj/2.wav,kxrnhrcj
142,alexa/kebwpdyu/4.wav,kebwpdyu
163,alexa/kwavzzrt/1.wav,kwavzzrt
157,alexa/kpkwyaut/3.wav,kpkwyaut
359,alexa/zgmrhuwb/3.wav,zgmrhuwb


In [6]:
import librosa

In [47]:
def extract_features(filename, n_mfcc=40):
    """Summarise an audio file as a vector of time-averaged MFCCs.

    Parameters
    ----------
    filename : str
        Path of the audio file; passed straight to ``librosa.load``.
    n_mfcc : int, optional
        Number of MFCCs to compute. Defaults to 40, the value this notebook
        has always used.

    Returns
    -------
    numpy.ndarray
        1-D array with ``n_mfcc`` entries; entry ``i`` is the mean of the
        i-th coefficient over all frames of the clip.
    """
    # load the waveform and its sample rate ('kaiser_fast' is the resampling
    # mode requested from librosa)
    X, sample_rate = librosa.load(filename, res_type='kaiser_fast')

    # librosa returns one array per coefficient (n_mfcc of them);
    # taking the mean of each one leaves a single vector of n_mfcc values
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=n_mfcc).T, axis=0)

    return mfccs

In [48]:
# test out the function on one of the files
# .iloc[0] picks the first training row by position; a hard-coded index label
# (such as 178) is only present in `train` for one particular random split
# and raises KeyError otherwise
print(extract_features(train['filepath'].iloc[0]))


[-3.1962180e+02  1.4823213e+02  7.4388347e+00  4.3691959e+01
 -1.2536559e+01  1.4930658e+01 -1.3931860e+01 -1.0716937e+01
  2.3703206e+00 -2.9629259e+00  1.0011257e+01 -3.0535655e+00
  4.6731682e+00 -6.5846086e-01  1.6229712e+00  2.2420251e+00
 -6.6143498e+00 -4.5536866e+00 -3.0322628e+00 -5.7504845e+00
 -8.9279156e+00 -1.6755651e+00 -2.0000737e+00  2.3309101e-01
  1.6361276e+00  2.6695242e+00  5.5133595e+00  2.7833445e+00
  2.9495533e+00  1.6522233e+00  1.3291792e+00 -1.1747522e+00
 -5.6325346e-01  2.9200637e-01  3.5991445e-01  2.6883512e+00
  4.2928919e-01  1.7083055e+00  8.9124268e-01  2.1304487e-01]


In [49]:
# Run extract_features on every file path of the train split, then the test split.
# Each result is a pandas Series (indexed like its split) whose elements are
# the per-file MFCC vectors.
train_features, test_features = (
    split['filepath'].apply(extract_features) for split in (train, test)
)
train_features.head()

178    [-319.6218, 148.23213, 7.4388347, 43.69196, -1...
142    [-423.4431, 97.035995, -35.56632, 24.627453, -...
163    [-408.52982, 154.50317, -17.65575, 22.240131, ...
157    [-437.1071, 121.908424, -18.949936, 15.910776,...
359    [-514.9549, 133.59537, -34.719406, 44.99738, -...
Name: filepath, dtype: object

In [51]:
# positional lookup: a hard-coded index label (such as 142) is only present
# in the training split for one particular random shuffle
train_features.iloc[0].shape
# just an array with 40 elements

(40,)

In [54]:
# split into X and Y where X is the features and Y is the label (name of speaker)
# The feature Series keep the row order of their split, so row i of X_train
# corresponds to entry i of Y_train (same for the test pair).
# np.stack joins the per-file vectors into one 2-D numeric array with one row
# per file; np.asarray on a Series of arrays would only give a 1-D object
# array whose elements are arrays.
X_train = np.stack(train_features.tolist())
X_test = np.stack(test_features.tolist())
Y_train = np.array(train['speaker'])
Y_test = np.array(test['speaker'])

In [55]:
# X_train[0] is the vector of mfcc means for the first training clip,
# and Y_train[0] is the speaker of that same clip
print("speaker: {}".format(Y_train[0]))
# format string ends right after the placeholder so nothing is printed after the vector
print("Features: {}".format(X_train[0]))

speaker: kxrnhrcj
Features: [-3.1962180e+02  1.4823213e+02  7.4388347e+00  4.3691959e+01
 -1.2536559e+01  1.4930658e+01 -1.3931860e+01 -1.0716937e+01
  2.3703206e+00 -2.9629259e+00  1.0011257e+01 -3.0535655e+00
  4.6731682e+00 -6.5846086e-01  1.6229712e+00  2.2420251e+00
 -6.6143498e+00 -4.5536866e+00 -3.0322628e+00 -5.7504845e+00
 -8.9279156e+00 -1.6755651e+00 -2.0000737e+00  2.3309101e-01
  1.6361276e+00  2.6695242e+00  5.5133595e+00  2.7833445e+00
  2.9495533e+00  1.6522233e+00  1.3291792e+00 -1.1747522e+00
 -5.6325346e-01  2.9200637e-01  3.5991445e-01  2.6883512e+00
  4.2928919e-01  1.7083055e+00  8.9124268e-01  2.1304487e-01]: 


In [None]:
# comment 3 by harley