# Challenge SD207 - 2017
*<p>Author: Pengfei MI, Rui SONG</p>*
*<p>Date: 06/06/2017</p>*

In [1]:
# Basic libraries
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os
import platform
from time import time

# Librosa related: audio feature extraction
import librosa
import librosa.display

# Sklearn related: data preprocessing and classifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.base import clone

In [2]:
# Define some usefull functions
def load_sound_file(file_name):
    X, sr = librosa.load(os.path.join(FILEROOT, file_name), sr=None)
    return X

def extract_feature(file_name, feature_type): # Late fusion
    X, sample_rate = librosa.load(os.path.join(FILEROOT, file_name), sr=None)
    frame_mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mfcc=n_mfcc).T
    if feature_type == 'mfcc':
        frame_feature = frame_mfcc
    elif feature_type == 'mfcc_0':
        frame_feature = frame_mfcc[:, 1:]
    elif feature_type == "d_mfcc":
        frame_feature = librosa.feature.delta(frame_mfcc, width=width, order=1, trim=True)
    elif feature_type == "dd_mfcc":
        frame_feature = librosa.feature.delta(frame_mfcc, width=width, order=2, trim=True)
    elif feature_type == "mfcc_d":
        d_mfcc = librosa.feature.delta(frame_mfcc, width=width, order=1, trim=True)
        frame_feature = np.c_[frame_mfcc, d_mfcc]
    else:
        d_mfcc = librosa.feature.delta(mfcc, width=width, order=1, trim=True)
        dd_mfcc = librosa.feature.delta(mfcc, width=width, order=2, trim=True)
        frame_feature = np.c_[mfcc, d_mfcc, dd_mfcc]

    if n_fusion == -1:
        file_feature = frame_feature
    elif n_fusion == 0:
        file_feature = np.mean(frame_feature, axis=0).reshape((1, -1))
    else:
        n_frames = frame_feature.shape[0]/n_fusion
        fusion_mean = np.empty((n_frames, frame_feature.shape[1]))
        #fusion_var = fusion_mean = np.empty((n_frames, frame_feature.shape[1]))
        for i in range(n_frames):
            fusion_mean[i, :] = np.mean(frame_feature[n_fusion*i:n_fusion*(i+1)-1, :], axis=0)
            #fusion_var[i, :] = np.std(frame_feature[n_fusion*i:n_fusion*(i+1)-1, :], axis=0)
        #return np.c_[fusion_mean, fusion_var]
        file_feature = fusion_mean
    return file_feature
    
def parse_audio_files(file_names, file_labels, feature_type):
    # Try to detect the feature dimension
    n_features = extract_feature(file_names[0], feature_type).shape[1]
    features, labels = np.empty((0, n_features)), np.empty(0)
    for fn, fl in zip(file_names, file_labels):
        ff = extract_feature(fn, feature_type)
        features = np.vstack([features, ff])
        labels = np.append(labels, fl*np.ones(ff.shape[0]))
    return np.array(features), np.array(labels, dtype = np.int)

def cross_validation(clf, X, y, test_fold, feature_type="mfcc"):
    y_pred, y_pred_sum, y_pred_prod = np.empty_like(y), np.empty_like(y), np.empty_like(y)
    n_folds = len(np.unique(test_fold))
    for i in range(n_folds):
        t0 = time()
        new_clf = clone(clf, safe=True)
        X_train = X[test_fold != i]
        X_test = X[test_fold == i]
        y_train = y[test_fold != i]
        y_test = y[test_fold == i]
        print "Launching fold #%d/%d, train set size: %d, test set size: %d" % (i+1, n_folds, len(X_train), len(X_test))
        clf_train(new_clf, X_train, y_train, feature_type)
        test_pred, test_pred_sum, test_pred_prod = clf_predict(new_clf, X_test, feature_type)
        y_pred[test_fold == i] = test_pred
        y_pred_sum[test_fold == i] = test_pred_sum
        y_pred_prod[test_fold == i] = test_pred_prod
        print "fold#%d done in %0.3fs, score: %0.3f." % (i+1, time()-t0, accuracy_score(y_test, test_pred))
    t0 = time()
    print "Retraining classifier with whole train set..."
    clf_train(clf, X, y, feature_type)
    print "Done in %0.3fs." % (time() - t0)
    return y_pred, y_pred_sum, y_pred_prod

def clf_train(clf, files, file_labels, feature_type):
    X_train, y_train= parse_audio_files(files, file_labels, feature_type)
    clf.fit(X_train, y_train)
        
def predict_maj(clf, X_test, feature_type):
    y_pred = np.empty(0)
    for x in X_test:
        x_mfccs = extract_feature(x, feature_type)
        y_predicts = clf.predict(x_mfccs)
        y_pred = np.append(y_pred, sp.stats.mode(y_predicts).mode[0])
    return np.array(y_pred, dtype = np.int)

def predict_sum(clf, X_test, feature_type):
    y_pred = np.empty(0)
    for x in X_test:
        x_mfccs = extract_feature(x, feature_type)
        y_predicts = np.sum(clf.predict_proba(x_mfccs), axis=0)
        y_pred = np.append(y_pred, np.argmax(y_predicts))
    return np.array(y_pred, dtype = np.int)

def predict_prod(clf, X_test, feature_type):
    y_pred = np.empty(0)
    for x in X_test:
        x_mfccs = extract_feature(x, feature_type)
        y_predicts = np.prod(clf.predict_proba(x_mfccs), axis=0)
        y_pred = np.append(y_pred, np.argmax(y_predicts))
    return np.array(y_pred, dtype = np.int)

def clf_predict(clf, X_test, feature_type):
    y_pred = np.empty(0)
    y_pred_sum = np.empty(0)
    y_pred_prod = np.empty(0)
    for x in X_test:
        x_mfccs = extract_feature(x, feature_type)
        y_predicts = clf.predict(x_mfccs)
        y_predict_probas = clf.predict_proba(x_mfccs)
        y_pred = np.append(y_pred, sp.stats.mode(y_predicts).mode[0])
        y_pred_sum = np.append(y_pred_sum, np.argmax(np.sum(y_predict_probas, axis=0)))
        y_pred_prod = np.append(y_pred_prod, np.argmax(np.prod(y_predict_probas, axis=0)))
    return np.array(y_pred, dtype=np.int), np.array(y_pred_sum, dtype=np.int), np.array(y_pred_prod, dtype=np.int)

In [3]:
# Read data and preprocessing
print "Loading files..."
t0 = time()

# Define FILEROOT according to the platform
# My personal computer
if sys.platform == "darwin":
    FILEROOT = './'
# Node of Telecom
elif platform.node()[:4] == 'lame':
    FILEROOT = '/tmp/'
# The machines of Telecom
else:
    FILEROOT = '/tsi/plato/sons/sd207/'

# Load the cross validation folds
N_FOLDS = 3
train_files, train_scenes, test_fold = np.empty(0, dtype=str), np.empty(0), np.empty(0)
for i in range(N_FOLDS):
    files = pd.read_csv('train%s.txt' % str(i), sep='\s+', header=None)[0].values
    scenes = pd.read_csv('train%s.txt' % str(i), sep='\s+', header=None)[1].values
    print "Fold #%d: %d files from %d sources" % (i+1, len(files), len(np.unique([f.split('_')[0] for f in files])))
    train_files = np.append(train_files, files)
    train_scenes = np.append(train_scenes, scenes)
    test_fold = np.append(test_fold, i*np.ones_like(scenes))

scenes = np.unique(train_scenes)
n_scenes = len(scenes)
labels = pd.factorize(scenes, sort=True)[0]
n_labels = len(labels)
train_labels = pd.factorize(train_scenes, sort=True)[0]
test_files = pd.read_csv('test_files.txt', header=None)[0].values
test_labels = pd.read_csv('meta.txt', header=None)[0].values

print "%d scenes:" % n_scenes, scenes
print "Training set size: %d" % len(train_files)
print "Test set size: %d" % len(test_files)
print "Done in %0.3fs." % (time()-t0)

Loading files...
Fold #1: 290 files from 45 sources
Fold #2: 292 files from 43 sources
Fold #3: 290 files from 45 sources
15 scenes: ['beach' 'bus' 'cafe/restaurant' 'car' 'city_center' 'forest_path'
 'grocery_store' 'home' 'library' 'metro_station' 'office' 'park'
 'residential_area' 'train' 'tram']
Training set size: 872
Test set size: 298
Done in 0.020s.


In [4]:
# Train classifier
print "Doing cross validation..."
t0 = time()

np.random.seed(42)

feature_type = "mfcc"
n_mfcc = 40
n_fft = 512
hop_length = 512
n_fusion = -1

clf = QuadraticDiscriminantAnalysis()
y_pred, y_pred_sum, y_pred_prod = cross_validation(clf, train_files, train_labels, test_fold, feature_type)
print "Done in %0.3fs." % (time()-t0)

Doing cross validation...
Launching fold #1/3, train set size: 582, test set size: 290
fold#1 done in 69.493s, score: 0.703.
Launching fold #2/3, train set size: 580, test set size: 292
fold#2 done in 65.322s, score: 0.716.
Launching fold #3/3, train set size: 582, test set size: 290
fold#3 done in 65.652s, score: 0.603.
Retraining classifier with whole train set...
Done in 111.801s.
Done in 312.271s.


In [5]:
# Print cross validation results
t0 = time()
print '-'*60
print "Score on validation test (vote by majority): %f" % accuracy_score(train_labels, y_pred)
print classification_report(train_labels, y_pred, target_names=scenes)
print "Confusion matrix:"
print confusion_matrix(train_labels, y_pred)

print '-'*60
print "Score on validation test (vote by proba sum): %f" % accuracy_score(train_labels, y_pred_sum )
print classification_report(train_labels, y_pred_sum, target_names=scenes)
print "Confusion matrix:"
print confusion_matrix(train_labels, y_pred_sum)

print '-'*60
print "Score on validation test (vote by proba product): %f" % accuracy_score(train_labels, y_pred_prod)
print classification_report(train_labels, y_pred_prod, target_names=scenes)
print "Confusion matrix:"
print confusion_matrix(train_labels, y_pred_prod)
print "Done in %0.3fs." % (time()-t0)

------------------------------------------------------------
Score on validation test (vote by majority): 0.674312
                  precision    recall  f1-score   support

           beach       0.60      0.58      0.59        59
             bus       0.81      0.49      0.61        59
 cafe/restaurant       0.78      0.75      0.77        57
             car       0.68      0.98      0.81        59
     city_center       0.82      0.86      0.84        59
     forest_path       0.90      0.75      0.82        60
   grocery_store       0.79      0.75      0.77        59
            home       0.88      0.72      0.79        58
         library       0.66      0.84      0.74        58
   metro_station       0.63      0.57      0.60        56
          office       0.86      0.95      0.90        60
            park       0.33      0.31      0.32        58
residential_area       0.32      0.36      0.34        59
           train       0.97      0.55      0.70        55
            tr

  'precision', 'predicted', average, warn_for)


In [6]:
y_test_pred, y_test_pred_sum, y_test_pred_prod = clf_predict(clf, test_files, feature_type)

print "Score by maj: %f" % accuracy_score(test_labels, y_test_pred)
print "Score by sum: %f" % accuracy_score(test_labels, y_test_pred_sum)
print "Score by prod: %f" % accuracy_score(test_labels, y_test_pred_prod)

np.savetxt('y_test_pred_mfcc_mlp.txt', y_test_pred, fmt='%d')
#np.savetxt('y_test_pred_mfcc_mlp_sum.txt', y_test_pred_sum, fmt='%d')
#np.savetxt('y_test_pred_mfcc_mlp_prod.txt', y_test_pred_prod, fmt='%d')

Score by maj: 0.734899
Score by sum: 0.744966
Score by prod: 0.248322
