# Challenge SD207 - 2017
*<p>Author: Pengfei MI, Rui SONG</p>*
*<p>Date: 06/06/2017</p>*

In [1]:
# Basic libraries
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os
import platform
from time import time

# Librosa related: audio feature extraction
import librosa
import librosa.display

# Sklearn related: data preprocessing and classifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.base import clone
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.utils.estimator_checks import check_estimator
import inspect

In [2]:
# Define some useful functions
def load_sound_file(file_name):
    """Load the audio file ``file_name`` located under ``FILEROOT``.

    The file is read with ``sr=None`` and only the sample array is returned;
    the sampling rate reported by librosa is discarded.
    """
    full_path = os.path.join(FILEROOT, file_name)
    samples, _ = librosa.load(full_path, sr=None)
    return samples

def extract_feature(file_name, feature_type): # Late fusion
    """Compute frame-level MFCC-based features for one audio file.

    Reads the module-level names ``FILEROOT``, ``n_mfcc``, ``n_fft``,
    ``hop_length`` and ``width``.
    NOTE(review): the last four are not assigned in any visible cell --
    confirm they are defined before this function is called.

    Parameters
    ----------
    file_name : str
        Path of the audio file, relative to ``FILEROOT``.
    feature_type : str
        One of:
        ``'mfcc'``    - the MFCCs,
        ``'mfcc_0'``  - the MFCCs without the first coefficient,
        ``'d_mfcc'``  - first-order deltas only,
        ``'dd_mfcc'`` - second-order deltas only,
        ``'mfcc_d'``  - MFCCs followed by first-order deltas,
        ``'mfcc_dd'`` - MFCCs followed by first- and second-order deltas.

    Returns
    -------
    numpy.ndarray
        Array with one row per analysis frame and one column per feature.

    Raises
    ------
    ValueError
        If ``feature_type`` is not one of the values listed above.
    """
    signal, sample_rate = librosa.load(os.path.join(FILEROOT, file_name), sr=None)
    # Kept in librosa's native layout (coefficients x frames) until the very
    # end: librosa.feature.delta differentiates along the last axis, so it must
    # see time there. Taking deltas of the transposed matrix would instead
    # differentiate across coefficients.
    mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate, n_fft=n_fft,
                                hop_length=hop_length, n_mfcc=n_mfcc)

    def _delta(order):
        # Delta of the requested order, taken along the time axis.
        return librosa.feature.delta(mfcc, width=width, order=order, trim=True)

    if feature_type == 'mfcc':
        stacked = mfcc
    elif feature_type == 'mfcc_0':
        stacked = mfcc[1:]
    elif feature_type == "d_mfcc":
        stacked = _delta(1)
    elif feature_type == "dd_mfcc":
        stacked = _delta(2)
    elif feature_type == "mfcc_d":
        stacked = np.vstack([mfcc, _delta(1)])
    elif feature_type == "mfcc_dd":
        stacked = np.vstack([mfcc, _delta(1), _delta(2)])
    else:
        # Previously fell off the end and returned None, which only failed
        # later with an unrelated AttributeError.
        raise ValueError("Unknown feature_type: %r" % (feature_type,))

    # One row per frame, one column per feature.
    return stacked.T
    
def parse_audio_files(file_names, file_labels, feature_type):
    """Build a frame-level data set from a list of labelled audio files.

    Every file is passed through ``extract_feature``; each resulting frame
    becomes one sample carrying the label of the file it came from.

    Parameters
    ----------
    file_names : sequence of str
        Audio files to process.
    file_labels : sequence of int
        One label per entry of ``file_names``.
    feature_type : str
        Forwarded unchanged to ``extract_feature``.

    Returns
    -------
    features : numpy.ndarray of float64, one row per frame
    labels : numpy.ndarray of int, one entry per frame

    Raises
    ------
    ValueError
        If no file is given (there is nothing to stack).
    """
    feature_chunks, label_chunks = [], []
    for fn, fl in zip(file_names, file_labels):
        ff = extract_feature(fn, feature_type)
        feature_chunks.append(ff)
        # Repeat the file label once per frame of that file.
        label_chunks.append(np.full(ff.shape[0], fl))

    if not feature_chunks:
        raise ValueError("parse_audio_files needs at least one audio file")

    # Stack once at the end: growing the arrays inside the loop re-copies all
    # previous frames on every iteration. It also avoids extracting the first
    # file twice just to learn the feature dimension.
    features = np.vstack(feature_chunks).astype(np.float64)
    # Builtin int instead of the np.int alias, which recent NumPy removed.
    labels = np.concatenate(label_chunks).astype(int)
    return features, labels

def cross_validation(clf, X, y, test_fold, feature_type="mfcc"):
    """Cross-validate ``clf`` on predefined folds, then refit it on everything.

    For every distinct value in ``test_fold`` a fresh clone of ``clf`` is
    trained (via ``clf_train``) on the samples of the other folds and used
    (via ``clf_predict``) to predict the samples of that fold. Afterwards
    ``clf`` itself is trained on the complete set.

    Parameters
    ----------
    clf : estimator
        Classifier to evaluate; it is cloned for each fold and fitted in
        place at the end.
    X : numpy.ndarray
        Audio file names (boolean-mask indexing is used on it).
    y : numpy.ndarray
        One label per file.
    test_fold : array-like
        Fold identifier of every file.
    feature_type : str, optional
        Forwarded to ``clf_train`` / ``clf_predict``.

    Returns
    -------
    y_pred, y_pred_sum, y_pred_prod : numpy.ndarray
        Out-of-fold predictions for the three voting rules returned by
        ``clf_predict``, aligned with ``y``.
    """
    test_fold = np.asarray(test_fold)
    y_pred, y_pred_sum, y_pred_prod = np.empty_like(y), np.empty_like(y), np.empty_like(y)
    # Iterate over the fold ids actually present rather than assuming they
    # are exactly 0..n_folds-1; any other labelling would leave parts of the
    # np.empty_like outputs uninitialised.
    fold_ids = np.unique(test_fold)
    n_folds = len(fold_ids)
    for k, fold_id in enumerate(fold_ids):
        t0 = time()
        new_clf = clone(clf, safe=True)
        is_test = test_fold == fold_id
        is_train = ~is_test
        X_train, y_train = X[is_train], y[is_train]
        X_test, y_test = X[is_test], y[is_test]
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Launching fold #%d/%d, train set size: %d, test set size: %d"
              % (k+1, n_folds, len(X_train), len(X_test)))
        clf_train(new_clf, X_train, y_train, feature_type)
        test_pred, test_pred_sum, test_pred_prod = clf_predict(new_clf, X_test, feature_type)
        y_pred[is_test] = test_pred
        y_pred_sum[is_test] = test_pred_sum
        y_pred_prod[is_test] = test_pred_prod
        print("fold#%d done in %0.3fs, score: %0.3f."
              % (k+1, time()-t0, accuracy_score(y_test, test_pred)))
    t0 = time()
    print("Retraining classifier with whole train set...")
    clf_train(clf, X, y, feature_type)
    print("Done in %0.3fs." % (time() - t0))
    return y_pred, y_pred_sum, y_pred_prod

def clf_train(clf, files, file_labels, feature_type):
    """Fit ``clf`` in place on the frame-level samples built from ``files``.

    The (features, labels) pair returned by ``parse_audio_files`` is handed
    directly to ``clf.fit``; nothing is returned.
    """
    clf.fit(*parse_audio_files(files, file_labels, feature_type))
        
def predict_maj(clf, X_test, feature_type):
    """Predict one label per audio file by majority vote over its frames.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict``
    X_test : iterable of str
        Audio file names, each passed to ``extract_feature``.
    feature_type : str
        Forwarded to ``extract_feature``.

    Returns
    -------
    numpy.ndarray of int
        The most frequent frame-level prediction of every file; ties go to
        the smallest label.
    """
    votes = []
    for x in X_test:
        frame_preds = clf.predict(extract_feature(x, feature_type))
        # np.unique sorts the labels, and argmax keeps the first maximum, so a
        # tie resolves to the smallest label. This replaces
        # sp.stats.mode(...).mode[0], which needs scipy.stats to have been
        # imported by some other module and breaks when SciPy returns a
        # scalar mode.
        values, counts = np.unique(frame_preds, return_counts=True)
        votes.append(values[np.argmax(counts)])
    # Builtin int instead of the np.int alias, which recent NumPy removed.
    return np.array(votes, dtype=int)

def predict_sum(clf, X_test, feature_type):
    """Predict one label per audio file from the summed frame probabilities.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict_proba`` and ``classes_``
    X_test : iterable of str
        Audio file names, each passed to ``extract_feature``.
    feature_type : str
        Forwarded to ``extract_feature``.

    Returns
    -------
    numpy.ndarray of int
        For every file, the class whose probabilities summed over all frames
        are largest.
    """
    decisions = []
    for x in X_test:
        frame_probas = clf.predict_proba(extract_feature(x, feature_type))
        best_column = np.argmax(np.sum(frame_probas, axis=0))
        # predict_proba columns follow clf.classes_; translate the column
        # index into the label so the result stays correct even when the
        # training labels are not exactly 0..n_classes-1.
        decisions.append(clf.classes_[best_column])
    # Builtin int instead of the np.int alias, which recent NumPy removed.
    return np.array(decisions, dtype=int)

def predict_prod(clf, X_test, feature_type):
    """Predict one label per audio file from the product of frame probabilities.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict_proba`` and ``classes_``
    X_test : iterable of str
        Audio file names, each passed to ``extract_feature``.
    feature_type : str
        Forwarded to ``extract_feature``.

    Returns
    -------
    numpy.ndarray of int
        For every file, the class maximising the product of its per-frame
        probabilities.
    """
    decisions = []
    # Smallest positive normal float; keeps log() finite for zero probabilities.
    floor = np.finfo(np.float64).tiny
    for x in X_test:
        frame_probas = np.asarray(
            clf.predict_proba(extract_feature(x, feature_type)), dtype=np.float64)
        # Maximising the product is the same as maximising the sum of logs.
        # The literal product of many values below 1 underflows to 0.0 for
        # every class, after which argmax always answers column 0.
        log_scores = np.sum(np.log(np.maximum(frame_probas, floor)), axis=0)
        # Map the winning column back to its label through clf.classes_.
        decisions.append(clf.classes_[np.argmax(log_scores)])
    # Builtin int instead of the np.int alias, which recent NumPy removed.
    return np.array(decisions, dtype=int)

def clf_predict(clf, X_test, feature_type):
    """Predict one label per audio file with three frame-fusion rules at once.

    Features of each file are extracted a single time and reused for all
    three rules.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict``, ``predict_proba`` and
        ``classes_``
    X_test : iterable of str
        Audio file names, each passed to ``extract_feature``.
    feature_type : str
        Forwarded to ``extract_feature``.

    Returns
    -------
    y_pred : numpy.ndarray of int
        Majority vote over the frame-level predictions (ties go to the
        smallest label).
    y_pred_sum : numpy.ndarray of int
        Class with the largest sum of frame probabilities.
    y_pred_prod : numpy.ndarray of int
        Class with the largest product of frame probabilities.
    """
    maj_votes, sum_votes, prod_votes = [], [], []
    # Smallest positive normal float; keeps log() finite for zero probabilities.
    floor = np.finfo(np.float64).tiny
    for x in X_test:
        x_mfccs = extract_feature(x, feature_type)
        frame_preds = clf.predict(x_mfccs)
        frame_probas = np.asarray(clf.predict_proba(x_mfccs), dtype=np.float64)

        # Majority vote without sp.stats.mode, which needs scipy.stats to be
        # imported elsewhere and changed its return shape across SciPy
        # versions.
        values, counts = np.unique(frame_preds, return_counts=True)
        maj_votes.append(values[np.argmax(counts)])

        # predict_proba columns follow clf.classes_, so column indices are
        # translated back into labels.
        sum_votes.append(clf.classes_[np.argmax(np.sum(frame_probas, axis=0))])

        # Sum of logs instead of a raw product, which underflows to 0.0 for
        # every class once a file has many frames.
        log_scores = np.sum(np.log(np.maximum(frame_probas, floor)), axis=0)
        prod_votes.append(clf.classes_[np.argmax(log_scores)])

    # Builtin int instead of the np.int alias, which recent NumPy removed.
    return (np.array(maj_votes, dtype=int),
            np.array(sum_votes, dtype=int),
            np.array(prod_votes, dtype=int))

class AcousticSceneClassifier(BaseEstimator, ClassifierMixin):
    """Classifier adapted to acoustic scene classification.

    Samples given to ``fit`` / ``predict`` are audio file names (resolved
    against the module-level ``FILEROOT``). Each file is turned into a matrix
    of MFCC-based frame features, an ``MLPClassifier`` is trained on the
    frames, and the frame-level outputs of a file are merged into a single
    label at prediction time.

    Parameters
    ----------
    feature_type : str
        ``'mfcc'``, ``'mfcc_0'`` (first coefficient dropped), ``'d_mfcc'``,
        ``'dd_mfcc'``, ``'mfcc_d'`` or ``'mfcc_dd'``.
    n_mfcc, n_fft, hop_length : int
        Forwarded to ``librosa.feature.mfcc``.
    width : int
        Forwarded to ``librosa.feature.delta``.
    n_fusion : int
        ``-1`` keeps every frame, ``0`` averages the whole file into a single
        vector, ``k > 0`` averages consecutive groups of ``k`` frames.
    hidden_layer_sizes, alpha, learning_rate_init
        Forwarded to ``MLPClassifier``.

    Attributes
    ----------
    clf_ : MLPClassifier
        The underlying frame-level classifier.
    features_ : dict
        Per-file feature cache, emptied by ``set_params``.
    classes_ : numpy.ndarray
        Labels seen during ``fit`` (available once fitted).
    """
    def __init__(self, feature_type="mfcc", n_mfcc=20, n_fft=512, hop_length=512, width=3, n_fusion=-1,
                 hidden_layer_sizes=100, alpha=0.0001, learning_rate_init=0.001):
        # Plain assignments instead of copying the caller frame's locals via
        # inspect: that trick depends on interpreter internals and hides which
        # attributes exist.
        self.feature_type = feature_type
        self.n_mfcc = n_mfcc
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.width = width
        self.n_fusion = n_fusion
        self.hidden_layer_sizes = hidden_layer_sizes
        self.alpha = alpha
        self.learning_rate_init = learning_rate_init
        self.features_ = {}
        self.clf_ = MLPClassifier(hidden_layer_sizes=self.hidden_layer_sizes,
                                  alpha=self.alpha,
                                  learning_rate_init=self.learning_rate_init)

    def set_params(self, **parameters):
        """Set hyper-parameters, drop cached features and sync the inner MLP."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        # Cached features were computed with the previous settings.
        self.features_ = {}
        self.clf_.set_params(**{'hidden_layer_sizes': self.hidden_layer_sizes,
                                'alpha': self.alpha,
                                'learning_rate_init': self.learning_rate_init})
        return self

    def _frame_features(self, mfcc):
        """Turn a (coefficients x frames) MFCC matrix into (frames x features)."""
        def _delta(order):
            # Taken on the untransposed matrix: librosa differentiates along
            # the last axis, which must be time.
            return librosa.feature.delta(mfcc, width=self.width, order=order, trim=True)

        if self.feature_type == 'mfcc':
            stacked = mfcc
        elif self.feature_type == 'mfcc_0':
            stacked = mfcc[1:]
        elif self.feature_type == "d_mfcc":
            stacked = _delta(1)
        elif self.feature_type == "dd_mfcc":
            stacked = _delta(2)
        elif self.feature_type == "mfcc_d":
            stacked = np.vstack([mfcc, _delta(1)])
        elif self.feature_type == "mfcc_dd":
            stacked = np.vstack([mfcc, _delta(1), _delta(2)])
        else:
            raise ValueError("Unknown feature_type: %r" % (self.feature_type,))
        return stacked.T

    def _fuse_frames(self, frame_feature):
        """Apply the ``n_fusion`` averaging rule to a (frames x features) matrix."""
        if self.n_fusion == -1:
            return frame_feature
        if self.n_fusion < -1:
            raise ValueError("n_fusion must be -1, 0 or a positive integer, got %r"
                             % (self.n_fusion,))
        # Floor division keeps the window count an integer on Python 3 too.
        n_windows = frame_feature.shape[0] // self.n_fusion if self.n_fusion else 0
        if n_windows == 0:
            # n_fusion == 0, or a clip shorter than one window: a single
            # vector averaged over every frame.
            return np.mean(frame_feature, axis=0).reshape((1, -1))
        # Each window covers exactly n_fusion frames; the trailing remainder
        # is ignored. The earlier slice stopped one frame short of every
        # window and produced NaN for n_fusion == 1.
        usable = frame_feature[:n_windows * self.n_fusion]
        return usable.reshape((n_windows, self.n_fusion, -1)).mean(axis=1)

    def _extract_feature(self, file_name): # Late fusion
        """Return the (cached) feature matrix of one audio file."""
        if file_name not in self.features_:
            signal, sample_rate = librosa.load(os.path.join(FILEROOT, file_name), sr=None)
            mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate, n_fft=self.n_fft,
                                        hop_length=self.hop_length, n_mfcc=self.n_mfcc)
            self.features_[file_name] = self._fuse_frames(self._frame_features(mfcc))
        return self.features_[file_name]

    def _parse_audio_files(self, file_names, file_labels):
        """Stack the frames of all files and repeat each file label per frame.

        Returns ``(features, labels)``: a float64 matrix with one row per
        frame and an int vector of the same length.
        """
        feature_chunks, label_chunks = [], []
        for fn, fl in zip(file_names, file_labels):
            ff = self._extract_feature(fn)
            feature_chunks.append(ff)
            label_chunks.append(np.full(ff.shape[0], fl))
        if not feature_chunks:
            raise ValueError("at least one audio file is required")
        # Stack once at the end instead of re-copying inside the loop.
        features = np.vstack(feature_chunks).astype(np.float64)
        # Builtin int instead of the np.int alias, which recent NumPy removed.
        labels = np.concatenate(label_chunks).astype(int)
        return features, labels

    def fit(self, X, y):
        """Train the inner MLP on the frames of the audio files ``X``.

        ``X`` is a sequence of file names and ``y`` the matching labels.
        Returns ``self``.
        """
        X_train, y_train = self._parse_audio_files(X, y)
        self.clf_.fit(X_train, y_train)
        self.classes_ = self.clf_.classes_
        return self

    def predict(self, X, rule="maj"):
        """Predict one label per audio file.

        Parameters
        ----------
        X : iterable of str
            Audio file names.
        rule : str
            ``"maj"``: majority vote over frame predictions (ties go to the
            smallest label); ``"sum"``: class with the largest summed frame
            probability; any other value: class with the largest product of
            frame probabilities.

        Returns
        -------
        numpy.ndarray of int
        """
        decisions = []
        # Smallest positive normal float; keeps log() finite for zero probabilities.
        floor = np.finfo(np.float64).tiny
        for x in X:
            x_feature = self._extract_feature(x)
            if rule == "maj":
                frame_preds = self.clf_.predict(x_feature)
                # Replaces sp.stats.mode(...).mode[0], whose availability and
                # return shape depend on the installed SciPy.
                values, counts = np.unique(frame_preds, return_counts=True)
                decisions.append(values[np.argmax(counts)])
                continue

            frame_probas = np.asarray(self.clf_.predict_proba(x_feature), dtype=np.float64)
            if rule == "sum":
                scores = np.sum(frame_probas, axis=0)
            else:
                # Sum of logs: the raw product of many probabilities
                # underflows to 0.0 for every class.
                scores = np.sum(np.log(np.maximum(frame_probas, floor)), axis=0)
            # predict_proba columns follow classes_; return the label, not
            # the column index.
            decisions.append(self.clf_.classes_[np.argmax(scores)])
        return np.array(decisions, dtype=int)

In [3]:
# Read data and preprocessing
# Single-argument print(...) calls behave the same on Python 2 and 3.
print("Loading files...")
t0 = time()

# Define FILEROOT according to the platform
# My personal computer
if sys.platform == "darwin":
    FILEROOT = './'
# Node of Telecom
elif platform.node()[:4] == 'lame':
    FILEROOT = '/tmp/'
# The machines of Telecom
else:
    FILEROOT = '/tsi/plato/sons/sd207/'

# Load the cross validation folds
N_FOLDS = 3
fold_files, fold_scenes, fold_ids = [], [], []
for i in range(N_FOLDS):
    # Parse each fold list once; column 0 holds the file names and column 1
    # the scene names. The separator is a raw string so that \s is not
    # treated as a string escape.
    fold = pd.read_csv('train%s.txt' % str(i), sep=r'\s+', header=None)
    files, scenes = fold[0].values, fold[1].values
    print("Fold #%d: %d files from %d sources"
          % (i+1, len(files), len(np.unique([f.split('_')[0] for f in files]))))
    fold_files.append(files)
    fold_scenes.append(scenes)
    # Fold id of every file, built from the fold size rather than by
    # multiplying an array of scene names.
    fold_ids.append(np.full(len(scenes), i, dtype=int))

train_files = np.concatenate(fold_files)
train_scenes = np.concatenate(fold_scenes)
test_fold = np.concatenate(fold_ids)

scenes = np.unique(train_scenes)
n_scenes = len(scenes)
labels = pd.factorize(scenes, sort=True)[0]
n_labels = len(labels)
# sort=True makes the integer codes follow the alphabetical order of `scenes`.
train_labels = pd.factorize(train_scenes, sort=True)[0]
test_files = pd.read_csv('test_files.txt', header=None)[0].values
test_labels = pd.read_csv('meta.txt', header=None)[0].values

print("%d scenes: %s" % (n_scenes, scenes))
print("Training set size: %d" % len(train_files))
print("Test set size: %d" % len(test_files))
print("Done in %0.3fs." % (time()-t0))

Loading files...
Fold #1: 290 files from 45 sources
Fold #2: 292 files from 43 sources
Fold #3: 290 files from 45 sources
15 scenes: ['beach' 'bus' 'cafe/restaurant' 'car' 'city_center' 'forest_path'
 'grocery_store' 'home' 'library' 'metro_station' 'office' 'park'
 'residential_area' 'train' 'tram']
Training set size: 872
Test set size: 298
Done in 0.017s.


In [4]:
# Train classifier
# Single-argument print(...) calls behave the same on Python 2 and 3.
print("Doing cross validation...")
t0 = time()

np.random.seed(42)

# Every value of a parameter grid must be a list/array of candidates.
# A bare string such as "mfcc" is either rejected or iterated character by
# character, depending on the scikit-learn version.
# NOTE(review): the recorded output below comes from a much smaller grid
# (hop_length / n_fft only). The 40 in hop_length may be a typo for 4096;
# confirm before launching the full search.
# NOTE(review): cv=4 ignores the predefined folds held in `test_fold`;
# consider cv=PredefinedSplit(test_fold) if the folds must be respected.
params = [{'feature_type': ["mfcc"],
           'n_mfcc': [17, 20, 25, 40],
           'n_fft': [512, 1024, 2048, 4096],
           'hop_length': [512, 1024, 2048, 40],
           'n_fusion': [-1, 0, 3, 5],  # the missing comma here was a SyntaxError
           # One-layer networks are written as 1-tuples: (40) is just the int 40.
           'hidden_layer_sizes': [(40,), (40, 20), (40, 80, 40), (256,), (256, 256, 256)],
           'alpha': np.logspace(-5, 3, 7)}]

asc = AcousticSceneClassifier(n_fusion=-1, hidden_layer_sizes=(40,), alpha=0.1)
clf = GridSearchCV(asc, params, cv=4, n_jobs=-1, verbose=3)
clf.fit(train_files, train_labels)

print(clf.best_estimator_)
print("Done in %0.3fs." % (time()-t0))

Doing cross validation...
Fitting 4 folds for each of 4 candidates, totalling 16 fits
[CV] hop_length=4096, n_fft=4096 .....................................
[CV] hop_length=4096, n_fft=2048 .....................................
[CV] hop_length=4096, n_fft=4096 .....................................
[CV] hop_length=4096, n_fft=4096 .....................................
[CV] hop_length=4096, n_fft=4096 .....................................
[CV] hop_length=4096, n_fft=2048 .....................................
[CV] hop_length=4096, n_fft=2048 .....................................
[CV] hop_length=4096, n_fft=2048 .....................................
[CV] ...... hop_length=4096, n_fft=2048, score=0.755656, total= 1.3min
[CV] hop_length=2048, n_fft=4096 .....................................
[CV] ...... hop_length=4096, n_fft=2048, score=0.779817, total= 1.7min
[CV] hop_length=2048, n_fft=4096 .....................................
[CV] ...... hop_length=4096, n_fft=2048, score=0.671171, total

[Parallel(n_jobs=-1)]: Done   7 out of  16 | elapsed:  2.6min remaining:  3.3min


[CV] ...... hop_length=4096, n_fft=2048, score=0.616114, total= 2.8min
[CV] hop_length=2048, n_fft=2048 .....................................
[CV] ...... hop_length=2048, n_fft=4096, score=0.796380, total= 4.2min
[CV] ...... hop_length=2048, n_fft=4096, score=0.825688, total= 4.2min
[CV] ...... hop_length=2048, n_fft=4096, score=0.616114, total= 4.2min
[CV] ...... hop_length=2048, n_fft=4096, score=0.729730, total= 5.3min
[CV] ...... hop_length=2048, n_fft=2048, score=0.791855, total= 4.5min


[Parallel(n_jobs=-1)]: Done  13 out of  16 | elapsed:  6.9min remaining:  1.6min


[CV] ...... hop_length=2048, n_fft=2048, score=0.611374, total= 4.1min
[CV] ...... hop_length=2048, n_fft=2048, score=0.716216, total= 4.8min
[CV] ...... hop_length=2048, n_fft=2048, score=0.821101, total= 4.7min


[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed:  7.3min finished


AcousticSceneClassifier(alpha=0.1, feature_type='mfcc', hidden_layer_sizes=40,
            hop_length=2048, learning_rate_init=0.001, n_fft=4096,
            n_fusion=-1, n_mfcc=40, width=3)
Done in 538.626s.


In [6]:
# Label the test files with the best estimator found by the grid search,
# once per frame-fusion rule (majority vote, probability sum, probability product).
best_estimator = clf.best_estimator_
y_pred = best_estimator.predict(test_files)
y_pred_sum = best_estimator.predict(test_files, rule="sum")
y_pred_prod = best_estimator.predict(test_files, rule="prod")

In [None]:
# NOTE: this whole cell is one bare triple-quoted string, so none of the
# statements inside it are executed; it is a disabled report that would print
# accuracy, a classification report and a confusion matrix for each of the
# three voting rules.
# NOTE(review): the disabled code scores y_pred / y_pred_sum / y_pred_prod
# against train_labels, while the arrays of those names computed in the cell
# above are predictions for test_files -- confirm which predictions it was
# meant to evaluate before re-enabling it.
"""# Print cross validation results
t0 = time()
print '-'*60
print "Score on validation test (vote by majority): %f" % accuracy_score(train_labels, y_pred)
print classification_report(train_labels, y_pred, target_names=scenes)
print "Confusion matrix:"
print confusion_matrix(train_labels, y_pred)

print '-'*60
print "Score on validation test (vote by proba sum): %f" % accuracy_score(train_labels, y_pred_sum )
print classification_report(train_labels, y_pred_sum, target_names=scenes)
print "Confusion matrix:"
print confusion_matrix(train_labels, y_pred_sum)

print '-'*60
print "Score on validation test (vote by proba product): %f" % accuracy_score(train_labels, y_pred_prod)
print classification_report(train_labels, y_pred_prod, target_names=scenes)
print "Confusion matrix:"
print confusion_matrix(train_labels, y_pred_prod)
print "Done in %0.3fs." % (time()-t0)"""

In [7]:
# Report the test accuracy of each voting rule, then save the predictions.
# Single-argument print(...) calls behave the same on Python 2 and 3.
print("Score by maj: %f" % accuracy_score(test_labels, y_pred))
print("Score by sum: %f" % accuracy_score(test_labels, y_pred_sum))
print("Score by prod: %f" % accuracy_score(test_labels, y_pred_prod))

# Save the very arrays that were scored above. The y_test_pred* names passed
# here before are never assigned in this notebook (their only assignment was
# commented out), so the calls raised NameError.
np.savetxt('y_test_pred_mfcc_mlp_gs.txt', y_pred, fmt='%d')
np.savetxt('y_test_pred_mfcc_mlp_gs_sum.txt', y_pred_sum, fmt='%d')
np.savetxt('y_test_pred_mfcc_mlp_gs_prod.txt', y_pred_prod, fmt='%d')

Score by maj: 0.711409
Score by sum: 0.708054
Score by prod: 0.734899
