In [1]:
import numpy as np
from matplotlib import pyplot as pl
import os
import scipy.io.wavfile as wav
from python_speech_features import mfcc
import utils.mlp_backprop_momentum as mlp
import utils.k_fold_cross_validation as cv

%matplotlib inline

In [2]:
# Directory scanned for the .wav recordings.
PATH = 'vowels/'

# Speaker targets appended to every feature vector: one slot per class,
# +1 marks the class of the recording and -1 the two others.
MALE = [1, -1, -1]
FEMALE = [-1, 1, -1]
KID = [-1, -1, 1]


In [3]:
# Build the dataset: one row per recording, made of the time-averaged MFCC
# vector followed by the speaker target (MALE / FEMALE / KID).
mfccs = []

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# (and therefore anything order-dependent downstream) reproducible.
for filename in sorted(os.listdir(PATH)):
    if not filename.endswith('.wav'):
        continue

    sample_rate, X = wav.read(os.path.join(PATH, filename))
    # Collapse the MFCC matrix along axis 0 so each recording yields a single
    # fixed-length vector (axis 0 is presumably the frame axis -- see the
    # python_speech_features documentation).
    ceps = np.mean(mfcc(X, samplerate=sample_rate, nfft=1024), axis=0)

    # The speaker class is read from the file name: third character 'm' or 'f'
    # for male / female, second character 'k' for kid. Files matching none of
    # these are skipped.
    if filename[2] == 'm':
        speaker = MALE
    elif filename[2] == 'f':
        speaker = FEMALE
    elif filename[1] == 'k':
        speaker = KID
    else:
        continue

    ceps = np.append(ceps, speaker)
    mfccs.append(ceps)

if not mfccs:
    # Fail with an explicit message instead of an opaque numpy reduction error.
    raise ValueError("no usable .wav recordings found in %r" % PATH)

mfccs = np.array(mfccs)

# Rescale every MFCC column to [-1, 1]. The target columns already hold +/-1
# and are deliberately left untouched: min/max scaling them would turn a
# column into NaN (0/0) whenever one speaker class is absent from the data.
n_targets = len(MALE)
features = mfccs[:, :-n_targets]
targets = mfccs[:, -n_targets:]

col_min = features.min(axis=0)
col_range = features.max(axis=0) - col_min
col_range[col_range == 0] = 1  # constant column: map to -1 instead of NaN

features = (features - col_min) / col_range * 2 - 1
mfccs = np.hstack([features, targets])

# mfccs now holds one row per recording: the normalised mean MFCC values
# followed by the speaker target.

In [4]:
# Show the assembled matrix as a quick sanity check; the last three columns of
# each row are the speaker target appended in the previous cell.
mfccs

array([[ 0.46461765, -0.52752412,  0.36474408, ..., -1.        ,
        -1.        ,  1.        ],
       [ 0.43507974, -0.36574898, -0.10970382, ..., -1.        ,
        -1.        ,  1.        ],
       [ 0.27572612,  0.18993969, -0.4100183 , ..., -1.        ,
         1.        , -1.        ],
       ...,
       [ 0.1968931 , -0.43617008, -0.02637636, ...,  1.        ,
        -1.        , -1.        ],
       [ 1.        , -0.41982229, -0.22460435, ..., -1.        ,
        -1.        ,  1.        ],
       [ 0.33870224,  0.05426988, -0.44070258, ..., -1.        ,
        -1.        ,  1.        ]])

In [5]:
# Training: k-fold cross-validation of an MLP on the MFCC dataset.

dataset = mfccs
DATASET_SIZE = len(mfccs)
SPREAD = 0.7
N_INITS = 10

# Network definition.
N_NEURONS = [13, 6, 3]  # layer sizes handed to mlp.MLP (presumably input, hidden, output -- confirm)

# Optimisation settings.
EPOCHS = 200  # number of training iterations
LEARNING_RATE = 0.001
MOMENTUM = 0.5

# Number of cross-validation folds.
K = 5

network = mlp.MLP(N_NEURONS, activation='tanh')
cv_settings = {
    'k': K,
    'learning_rate': LEARNING_RATE,
    'momentum': MOMENTUM,
    'epochs': EPOCHS,
    'threshold': 0.5,
}

MSE_train, MSE_test, conf_mat = cv.k_fold_cross_validation(network, dataset, **cv_settings)