# Challenge SD207 - 2017
<p><em>Authors: Pengfei MI, Rui SONG</em></p>
<p><em>Date: 06/06/2017</em></p>

In [17]:
import numpy as np
from scipy.stats import mode
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import librosa
from presets import Preset
import time

In [2]:
# Define some useful functions
def load_sound_files(file_paths):
    """Load every audio file in ``file_paths`` and return the signals.

    Each path is read with ``librosa.load`` using its default arguments. The
    sample rate reported for each file is discarded; only the signal is kept.

    Returns a list with one entry per path, in input order.
    """
    return [librosa.load(path)[0] for path in file_paths]

In [3]:
# Read the file lists and encode the training labels as integers.
# Only the first column of test_files.txt is used (the audio paths).
X_test = pd.read_csv('test_files.txt', header=None)[0].values
# audio/train.txt is whitespace-separated: column 0 holds the audio path and
# column 1 the class label. The separator is a raw string because '\s' in a
# plain literal is an invalid escape sequence.
data = pd.read_csv('audio/train.txt', header=None, sep=r'\s+')
X_train = data[0].values
# pd.factorize numbers the labels in order of first appearance in train.txt.
y_train = pd.factorize(data[1])[0]
print(X_train)

['audio/b010_0_30.wav' 'audio/b010_60_90.wav' 'audio/b010_150_180.wav'
 'audio/b010_30_60.wav' 'audio/b010_120_150.wav' 'audio/b022_120_150.wav'
 'audio/b022_60_90.wav' 'audio/b022_180_210.wav' 'audio/b022_30_60.wav'
 'audio/b022_90_120.wav' 'audio/b022_150_180.wav' 'audio/b022_0_30.wav'
 'audio/b011_180_210.wav' 'audio/b011_90_120.wav' 'audio/b011_150_180.wav'
 'audio/b011_60_90.wav' 'audio/b011_120_150.wav' 'audio/b011_30_60.wav'
 'audio/b011_0_30.wav' 'audio/a112_90_120.wav' 'audio/a112_120_150.wav'
 'audio/a112_30_60.wav' 'audio/a112_60_90.wav' 'audio/a112_0_30.wav'
 'audio/b107_0_30.wav' 'audio/b107_30_60.wav' 'audio/b107_90_120.wav'
 'audio/b107_60_90.wav' 'audio/b110_210_240.wav' 'audio/b110_120_150.wav'
 'audio/b110_180_210.wav' 'audio/b110_150_180.wav' 'audio/b110_270_300.wav'
 'audio/b110_90_120.wav' 'audio/b110_30_60.wav' 'audio/b110_0_30.wav'
 'audio/b110_240_270.wav' 'audio/b110_60_90.wav' 'audio/a083_30_60.wav'
 'audio/a083_0_30.wav' 'audio/a083_150_180.wav' 'audio/a083_6

In [6]:
# Number of MFCC coefficients kept per frame. parse_audio_files uses the same
# value as the width of its feature matrix.
n_mfcc = 20


def extract_feature(file_name, n_fft=512, hop_length=512):
    """Return the frame-wise MFCCs of one audio file.

    Parameters
    ----------
    file_name : str
        Path of the audio file to read.
    n_fft : int, optional
        FFT window size passed to the mel spectrogram (default 512).
    hop_length : int, optional
        Hop between successive frames passed to the mel spectrogram
        (default 512).

    Returns
    -------
    numpy.ndarray
        The transposed output of ``librosa.feature.mfcc``, i.e. one row per
        frame and ``n_mfcc`` columns.
    """
    # sr=None is forwarded so librosa does not fall back to its default rate.
    X, sample_rate = librosa.load(file_name, sr=None)
    S = librosa.feature.melspectrogram(y=X, sr=sample_rate, n_fft=n_fft, hop_length=hop_length)
    # The MFCCs are computed from the precomputed log-power mel spectrogram,
    # so the raw signal does not need to be passed a second time.
    mfcc = librosa.feature.mfcc(sr=sample_rate, S=librosa.power_to_db(S), n_mfcc=n_mfcc)
    return mfcc.T

def parse_audio_files(files, file_labels):
    """Build frame-level features and labels from a list of audio files.

    Every file is passed to ``extract_feature``; each row it returns becomes
    one row of the feature matrix and is tagged with the label of the file it
    came from. Files that raise during extraction are reported and skipped.

    Parameters
    ----------
    files : sequence of str
        Audio file paths.
    file_labels : sequence of int
        Label of each file, aligned with ``files``.

    Returns
    -------
    features : numpy.ndarray, float64, shape (total_frames, n_mfcc)
    labels : numpy.ndarray of int, shape (total_frames,)
    """
    feature_chunks, label_chunks = [], []
    for i in range(len(files)):
        try:
            mfccs = extract_feature(files[i])
        except Exception as e:
            # Best effort: report the failure (with its cause) and move on.
            print("Error encountered while parsing file: %s (%s)" % (files[i], e))
            continue
        feature_chunks.append(mfccs)
        label_chunks.append(np.full(mfccs.shape[0], file_labels[i], dtype=int))
        print(files[i])

    if not feature_chunks:
        return np.empty((0, n_mfcc)), np.empty(0, dtype=int)

    # Stack once at the end: growing the arrays inside the loop copies all
    # previous rows on every iteration. The builtin int replaces np.int,
    # which no longer exists in recent NumPy releases.
    features = np.asarray(np.vstack(feature_chunks), dtype=np.float64)
    labels = np.concatenate(label_chunks)
    return features, labels

In [7]:
# Frame-level features (ft) and per-frame labels (lb) for the training files.
ft, lb = parse_audio_files(X_train, y_train)
print(ft.shape)
print(lb)

audio/b010_0_30.wav
audio/b010_60_90.wav
audio/b010_150_180.wav
audio/b010_30_60.wav
audio/b010_120_150.wav
audio/b022_120_150.wav
audio/b022_60_90.wav
audio/b022_180_210.wav
audio/b022_30_60.wav
audio/b022_90_120.wav
audio/b022_150_180.wav
audio/b022_0_30.wav
audio/b011_180_210.wav
audio/b011_90_120.wav
audio/b011_150_180.wav
audio/b011_60_90.wav
audio/b011_120_150.wav
audio/b011_30_60.wav
audio/b011_0_30.wav
audio/a112_90_120.wav
audio/a112_120_150.wav
audio/a112_30_60.wav
audio/a112_60_90.wav
audio/a112_0_30.wav
audio/b107_0_30.wav
audio/b107_30_60.wav
audio/b107_90_120.wav
audio/b107_60_90.wav
audio/b110_210_240.wav
audio/b110_120_150.wav
audio/b110_180_210.wav
audio/b110_150_180.wav
audio/b110_270_300.wav
audio/b110_90_120.wav
audio/b110_30_60.wav
audio/b110_0_30.wav
audio/b110_240_270.wav
audio/b110_60_90.wav
audio/a083_30_60.wav
audio/a083_0_30.wav
audio/a083_150_180.wav
audio/a083_60_90.wav
audio/a083_120_150.wav
audio/a083_90_120.wav
audio/a060_60_90.wav
audio/a060_0_30.wav
au

In [8]:
from sklearn.neural_network import MLPClassifier
# GridSearchCV is imported from sklearn.model_selection: the legacy
# sklearn.grid_search module was deprecated in 0.18 and removed in 0.20.
from sklearn.model_selection import GridSearchCV

np.random.seed(45)
# Only the L2 penalty (alpha) is searched: 7 log-spaced values from 1e1 to 1e3.
param_grid = {#"hidden_layer_sizes":[(128,256),(128,64)],
    "alpha": np.logspace(1, 3, 7),
    #"learning_rate_init":np.logspace(-4,-1,3),
    #"momentum": [0.6, 0.7, 0.8, 0.9]
}

clf = MLPClassifier(hidden_layer_sizes=(100, 100, 100), random_state=42, verbose=True)
# NOTE(review): each row of ft is a single frame, so a plain 5-fold split can
# put frames of the same recording in both the training and the validation
# fold. Consider a group-aware splitter keyed on the source file. TODO confirm.
gs = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1)
gs.fit(ft, lb)

gs.best_estimator_



Iteration 1, loss = 3.27946367
Iteration 1, loss = 3.29846243
Iteration 1, loss = 3.26561363
Iteration 1, loss = 3.25082357
Iteration 2, loss = 2.04563983
Iteration 2, loss = 2.04960629
Iteration 2, loss = 2.02911885
Iteration 2, loss = 2.03590766
Iteration 3, loss = 1.68956481
Iteration 3, loss = 1.68965393
Iteration 3, loss = 1.66820973
Iteration 3, loss = 1.68749659
Iteration 4, loss = 1.55637130
Iteration 4, loss = 1.55468670
Iteration 4, loss = 1.53424281
Iteration 4, loss = 1.56348838
Iteration 5, loss = 1.50608184
Iteration 5, loss = 1.50388101
Iteration 5, loss = 1.48223669
Iteration 5, loss = 1.51390161
Iteration 6, loss = 1.47827100
Iteration 6, loss = 1.47984241
Iteration 6, loss = 1.45933582
Iteration 6, loss = 1.49009628
Iteration 7, loss = 1.46817666
Iteration 7, loss = 1.46391490
Iteration 7, loss = 1.44593921
Iteration 7, loss = 1.47536864
Iteration 8, loss = 1.45548255
Iteration 8, loss = 1.43544307
Iteration 8, loss = 1.45206929
Iteration 8, loss = 1.46690577
Iteratio

MLPClassifier(activation='relu', alpha=10.0, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=42, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=True,
       warm_start=False)

In [10]:
# audio/dev.txt has the same layout as audio/train.txt: path, then label.
# The separator is a raw string because '\s' in a plain literal is an invalid
# escape sequence.
data_val = pd.read_csv('audio/dev.txt', header=None, sep=r'\s+')
X_val = data_val[0].values
# Encode the validation labels with the *training* vocabulary. Factorizing
# dev.txt on its own numbers classes by first appearance in that file, which
# only matches y_train if both files happen to list classes in the same order.
# A label that never occurs in train.txt is encoded as -1.
train_classes = pd.factorize(data[1])[1]
y_val = train_classes.get_indexer(data_val[1])
print(X_val)

['audio/b021_30_60.wav' 'audio/b021_150_180.wav' 'audio/b021_90_120.wav'
 'audio/b021_120_150.wav' 'audio/b021_60_90.wav' 'audio/b021_180_210.wav'
 'audio/b021_0_30.wav' 'audio/b019_180_210.wav' 'audio/b019_120_150.wav'
 'audio/b019_90_120.wav' 'audio/b019_150_180.wav' 'audio/b019_60_90.wav'
 'audio/b019_0_30.wav' 'audio/b019_30_60.wav' 'audio/b020_90_120.wav'
 'audio/b020_30_60.wav' 'audio/b020_150_180.wav' 'audio/b020_60_90.wav'
 'audio/b020_180_210.wav' 'audio/b020_0_30.wav' 'audio/b020_120_150.wav'
 'audio/a104_0_30.wav' 'audio/a104_150_180.wav' 'audio/a104_90_120.wav'
 'audio/a104_60_90.wav' 'audio/a104_30_60.wav' 'audio/a104_120_150.wav'
 'audio/a140_120_150.wav' 'audio/a140_150_180.wav' 'audio/a140_90_120.wav'
 'audio/a140_60_90.wav' 'audio/a140_30_60.wav' 'audio/a140_240_270.wav'
 'audio/a140_210_240.wav' 'audio/a140_180_210.wav' 'audio/a140_0_30.wav'
 'audio/a054_0_30.wav' 'audio/a054_60_90.wav' 'audio/a054_90_120.wav'
 'audio/a054_120_150.wav' 'audio/a054_30_60.wav' 'audio/a0

In [None]:
# Frame-level features and per-frame labels for the validation files.
ft_val, lb_val = parse_audio_files(X_val, y_val)
print(ft_val.shape)

In [12]:
# Per-frame validation labels produced by parse_audio_files.
print(lb_val)

[ 0  0  0 ..., 14 14 14]


In [14]:
def predict(clf, X_test):
    """Predict one label per audio file by majority vote over its frames.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict(features)``.
    X_test : iterable of str
        Audio file paths; each is passed to ``extract_feature``.

    Returns
    -------
    numpy.ndarray of int
        The most frequent frame-level prediction for every file, in input
        order. Ties go to the smallest label.
    """
    file_votes = []
    for x in X_test:
        frame_preds = np.asarray(clf.predict(extract_feature(x)))
        # Majority vote with np.unique rather than scipy.stats.mode(...).mode[0]:
        # recent SciPy returns a scalar mode for 1-D input, which cannot be
        # indexed. np.unique sorts the labels, so argmax keeps the smallest
        # label on ties, as mode does.
        classes, counts = np.unique(frame_preds, return_counts=True)
        file_votes.append(classes[np.argmax(counts)])
    # The builtin int replaces np.int, which no longer exists in recent NumPy.
    return np.array(file_votes, dtype=int)

In [18]:
# File-level accuracy on the training files. This line was previously
# labelled as the validation score.
y_train_pred = predict(gs.best_estimator_, X_train)
print("Score on training set: %f" % np.mean(y_train_pred == y_train))
print(y_train_pred)

# File-level accuracy on the validation files.
y_val_pred = predict(gs.best_estimator_, X_val)
print("Score on validation test: %f" % np.mean(y_val_pred == y_val))
print(y_val_pred)

Score on validation test: 0.864261
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  2  2  2  2  2  2  2  6  2  6  2  2
  2  2  2  2  2  2  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3
  3  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4 11  5  5 11 11  5
  5  5  5 11  5  5  5  5  5  5  5  5  5  5  5  6  6  6  6  6  6  6  6  6  6
  6  6  6  6  6  6  6  6  6  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
  7  7  7  7  7  7  8  8  8  8  8  8  8  8  8  6 14  6  6  8  8  8  8  8  8
  8  8  6  9  9  6  6  9 14  6  2  9 14  2  7 10  7 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 11 11 11 11 11 11 11 11 11 11
 11 11 11 11 11  1 11  1 11  1 12 12  8 11  1 11  4  4  2 11  2  4 12  4 12
 12  4 12  4 13 13 13 13 13 13 13  1 13 13  1 13 13 13 13  1  1  1 14 14 14
 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14  0  0  0  0  0  0  0  0  0 11
  0  3  0  0  0 14  0  8  0  1  1  1  1  1  1  1  1  