In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Every sub-directory of 'alexa/' is treated as one speaker (the directory name
# is the speaker id). Non-directory entries (e.g. '.DS_Store') are skipped
# because the following cell calls os.listdir() on each entry, and the names are
# sorted because os.listdir() returns them in arbitrary order.
speakers = sorted(
    entry for entry in os.listdir('alexa/')
    if os.path.isdir(os.path.join('alexa/', entry))
)

In [3]:
# Collect one record per audio file and build the DataFrame in a single call.
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.
records = []
for speaker in speakers:
    # sorted() keeps the row order reproducible; os.listdir() order is arbitrary
    for file in sorted(os.listdir('alexa/{}/'.format(speaker))):
        filepath = 'alexa/{}/{}'.format(speaker, file)
        records.append({'filepath': filepath, 'speaker': speaker})

# passing columns= keeps both columns present even when no files were found
df = pd.DataFrame(records, columns=['filepath', 'speaker'])

df.head()  # displays the first five rows as the cell output

Unnamed: 0,filepath,speaker
0,alexa/anfcucvo/1.wav,anfcucvo
1,alexa/anfcucvo/2.wav,anfcucvo
2,alexa/anfcucvo/3.wav,anfcucvo
3,alexa/anfcucvo/4.wav,anfcucvo
4,alexa/anfcucvo/5.wav,anfcucvo


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Fixing random_state makes the split reproducible across kernel restarts;
# without it every run puts different rows (and index labels) in each set.
# stratify splits the rows proportionally over the 'speaker' column, so that a
# speaker's files do not all end up in only one of the two sets.
train, test = train_test_split(
    df, test_size=0.25, stratify=df['speaker'], random_state=42
)
train.head()

Unnamed: 0,filepath,speaker
178,alexa/kxrnhrcj/2.wav,kxrnhrcj
142,alexa/kebwpdyu/4.wav,kebwpdyu
163,alexa/kwavzzrt/1.wav,kwavzzrt
157,alexa/kpkwyaut/3.wav,kpkwyaut
359,alexa/zgmrhuwb/3.wav,zgmrhuwb


In [6]:
import librosa

In [7]:
def extract_features(filename, n_mfcc=40):
    """Load an audio file and compute its MFCCs.

    Parameters
    ----------
    filename : str
        Path of the audio file, passed to ``librosa.load``.
    n_mfcc : int, optional
        Number of MFCC coefficients to compute. Defaults to 40, the value
        that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        The 2-D array returned by ``librosa.feature.mfcc``: one row per
        coefficient and one column per analysis frame.
    """
    # 'kaiser_fast' selects librosa's faster resampling mode for loading
    X, sample_rate = librosa.load(filename, res_type='kaiser_fast')

    return librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=n_mfcc)

In [8]:
# Try the function on the first training file. The row is selected by position
# with .iloc because the index labels in `train` depend on the random split,
# so a hard-coded label (such as 178) may be missing from the training set.
print(extract_features(train['filepath'].iloc[0]))
# the result is a 2-D array with one row per MFCC coefficient,
# so it has to be flattened into a 1-D array later

[[-5.9909015e+02 -3.6271417e+02 -2.7202203e+02 ... -3.6679407e+02
  -3.6198358e+02 -3.6131876e+02]
 [ 0.0000000e+00  3.8621197e+01  6.0143829e+01 ...  1.4929198e+02
   1.5024278e+02  1.4371318e+02]
 [ 0.0000000e+00 -1.2240431e+01 -3.0983868e+01 ...  1.6902367e+01
   1.8225193e+01  1.4989624e+01]
 ...
 [ 0.0000000e+00 -4.5158615e+00 -2.9134605e+00 ... -3.1189102e-01
   4.3945408e-01  2.1566045e+00]
 [ 0.0000000e+00  6.4124022e+00  4.1545420e+00 ...  2.9092536e+00
   1.2940142e+00  1.5603128e-01]
 [ 0.0000000e+00  8.1574726e+00  8.6703691e+00 ... -4.1870975e-01
  -1.6172510e+00 -1.2464466e+00]]


In [9]:
# Run the MFCC extraction over every file path of both splits (train first,
# then test). Each result is a Series that keeps the split's row labels and
# whose elements are the 2-D arrays returned by extract_features, i.e. one
# feature matrix per audio file.
train_features, test_features = (
    split['filepath'].apply(extract_features) for split in (train, test)
)
train_features.head()

178    [[-599.09015, -362.71417, -272.02203, -277.916...
142    [[-675.10297, -675.10297, -663.4995, -659.2624...
163    [[-327.3413, -336.8754, -435.57803, -499.06454...
157    [[-352.7316, -367.68524, -400.95602, -431.2065...
359    [[-534.69324, -524.5771, -519.0945, -518.5205,...
Name: filepath, dtype: object

In [10]:
# Inspect the shape of the first training sample. It is selected by position
# because index labels such as 272 only exist in `train` for some random splits.
train_features.iloc[0].shape
# -> (number of MFCC coefficients, number of frames)

(40, 116)

In [29]:
# Collapse every 2-D MFCC matrix into a single 1-D feature vector.
# A comprehension replaces the manual append loop and keeps temporary loop
# variables out of the notebook namespace.
train_features_1d = [mfcc_matrix.flatten() for mfcc_matrix in train_features]

In [37]:
# Sanity check: show the first flattened training sample.
print(train_features_1d[0]) # a single 1-D array rather than a 2-D one

[-5.9909015e+02 -3.6271417e+02 -2.7202203e+02 ... -4.1870975e-01
 -1.6172510e+00 -1.2464466e+00]


In [38]:
# Flatten the test features the same way: one 1-D vector per audio file.
# A comprehension replaces the manual append loop and keeps temporary loop
# variables out of the notebook namespace.
test_features_1d = [mfcc_matrix.flatten() for mfcc_matrix in test_features]

In [42]:
# Build the model inputs: X holds the feature vectors, Y the speaker labels.
# The feature lists were produced by iterating the splits in row order, so the
# element at position i of X_train belongs to the label at position i of Y_train
# (and likewise for the test arrays).
# NOTE(review): this assumes every flattened vector has the same length, i.e.
# all recordings yield the same number of frames - TODO confirm.
X_train, X_test = np.array(train_features_1d), np.array(test_features_1d)
Y_train, Y_test = np.array(train['speaker']), np.array(test['speaker'])




In [45]:
# Each row of X_train is the flattened MFCC vector of one file, and Y_train
# holds the matching speaker label at the same position.
print("speaker: {}".format(Y_train[0]))
# the format string previously ended in ": ", which printed a dangling colon
# after the feature array
print("Features: {}".format(X_train[0]))

speaker: kxrnhrcj
Features: [-5.9909015e+02 -3.6271417e+02 -2.7202203e+02 ... -4.1870975e-01
 -1.6172510e+00 -1.2464466e+00]: 
