In [14]:
import sys
sys.path.insert(0, '../')

import pandas as pd
from preprocess import Audio_Processor
from data_utils import balanced_supersample, balanced_subsample
import matplotlib.pyplot as plt
import numpy as np
from data_utils import enumerate_strings
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate
from sklearn.multiclass import OneVsRestClassifier

In [2]:
from scipy import interp
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

def rocauc_curve(clf, X, y, 
                 folds=5, 
                 verbose=0, 
                 title='Receiver operating characteristic example', 
                 balance_fcn=None,
                 n_classes=1
                ):
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    cv = StratifiedKFold(n_splits=folds)

    if n_classes > 2:
        clf = OneVsRestClassifier(clf)
    
    i = 0
    for train, test in cv.split(X, y):
        if balance_fcn == None:
            probas_ = clf.fit(X[train], y[train]).predict_proba(X[test])
        else:
            X_train, y_train = balance_fcn(X[train], y[train])
            probas_ = clf.fit(X_train.values, y_train.values.squeeze()).predict_proba(X[test])
        # Compute ROC curve and area the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        if verbose > 1:
            plt.plot(fpr, tpr, lw=1, alpha=0.3,
                     label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
        else:
            plt.plot(fpr, tpr, lw=1, alpha=0.3)            

        i += 1
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
             label='Chance', alpha=.8)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    if verbose > 0:
        plt.plot(mean_fpr, mean_tpr, color='b',
                 label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
                 lw=2, alpha=.8)
    else:
        plt.plot(mean_fpr, mean_tpr, color='b',
                 lw=2, alpha=.8)        

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                     label=r'$\pm$ 1 std. dev.')
        
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title(title)
    if verbose > 0:
        plt.legend(loc="lower right")
    plt.show()    

In [3]:
SR = 16000
blocksize = int(SR/2)
overlap = int(SR/4)

In [4]:
path_to_db = '../../../data/FSDKaggle2018/'
dataset = pd.read_csv(path_to_db + 'meta/train.csv')
dataset = dataset.drop(['manually_verified', 'freesound_id', 'license'], axis=1)
ps = Audio_Processor(path_to_db + 'train/')
classes = dataset.label.unique()
h_classes = ['Human & Animal', 'Interacting Materials', 'Musical Instruments']
mapping = {'Hi-hat': h_classes[2],
           'Saxophone': h_classes[2],
           'Trumpet': h_classes[2],
           'Glockenspiel': h_classes[2],
           'Cello': h_classes[2],
           'Knock': h_classes[1],
           'Gunshot_or_gunfire': h_classes[1],
           'Clarinet': h_classes[2],
           'Computer_keyboard': h_classes[1],
           'Keys_jangling': h_classes[1],
           'Snare_drum': h_classes[2],
           'Writing': h_classes[1],
           'Laughter': h_classes[0],
           'Tearing': h_classes[1],
           'Fart': h_classes[1],
           'Oboe': h_classes[2],
           'Flute': h_classes[2],
           'Cough': h_classes[0],
           'Telephone': h_classes[1],
           'Bark': h_classes[0],
           'Chime': h_classes[2],
           'Bass_drum': h_classes[2],
           'Bus': h_classes[1],
           'Squeak': h_classes[0],
           'Scissors': h_classes[1],
           'Harmonica': h_classes[2],
           'Gong': h_classes[2],
           'Microwave_oven': h_classes[1],
           'Burping_or_eructation': h_classes[0],
           'Double_bass': h_classes[2],
           'Shatter': h_classes[1],
           'Fireworks': h_classes[1],
           'Tambourine': h_classes[2],
           'Cowbell': h_classes[2],
           'Electric_piano': h_classes[2],
           'Meow': h_classes[0],
           'Drawer_open_or_close': h_classes[1],
           'Applause': h_classes[1],
           'Acoustic_guitar': h_classes[2],
           'Violin_or_fiddle': h_classes[2],
           'Finger_snapping': h_classes[1]
           }
dataset['h_target'] = None
for index, row in dataset.iterrows():
    row['target'] = np.where(classes == row['label'])
    dataset.loc[index, 'h_target'] = mapping[row['label']]
dataset.columns = ['filename', 'target', 'h_target']
enum_map = enumerate_strings(dataset, ['filename'])

In [9]:
df = ps.preprocess_fold(dataset, 
                        kind='mfcc', 
                        blocksize=blocksize, 
                        overlap=overlap,
                        folds=4
                       )
df.head()

Unnamed: 0,mfcc_2_std,mfcc_2_mean,mfcc_2_noise,mfcc_3_std,mfcc_3_mean,mfcc_3_noise,mfcc_4_std,mfcc_4_mean,mfcc_4_noise,mfcc_5_std,...,sflat_noise,sroll_std,sroll_mean,sroll_noise,rmse_std,rmse_mean,rmse_noise,h_target,target,fold
0,6.277245,4.830221,4.344662,3.730741,4.74827,3.000848,4.242958,5.094922,4.498384,5.498527,...,-0.239049,0.060269,20.873332,48.78579,18.417743,32.212951,8.802563,2,23,3
1,10.097137,6.163587,7.883221,8.777986,8.594063,9.843226,10.881635,8.295204,6.299725,8.611447,...,0.394118,0.109677,42.895582,66.75651,10.925151,81.550981,8.917697,2,23,3
2,9.939776,7.657642,8.555211,9.54249,9.419632,5.612988,9.305904,9.516434,8.124083,9.034157,...,-0.081153,-0.161348,50.519282,59.806859,12.246634,89.261647,9.31407,2,23,3
3,14.264796,10.35449,11.384781,12.96699,13.092263,10.565021,8.894694,10.062756,10.724808,11.717766,...,0.245759,0.226735,38.719454,59.807349,12.524417,71.987151,9.524957,2,23,3
4,18.920682,17.171287,14.398867,15.073439,17.106221,21.394535,16.950549,17.392855,12.155673,15.193046,...,-0.091828,0.17082,32.704298,33.846153,8.092868,63.793389,6.825882,2,23,3


In [10]:
df = df[df['fold'] == 1]
X = df.drop(['target', 'h_target', 'fold'], axis=1)
y = df['h_target']

In [15]:
clf = Pipeline([
    ('scaler',MinMaxScaler()),
    ('classify', SVC(probability=True))
])
clf = OneVsRestClassifier(clf)

In [16]:
rocauc_curve(clf, X.values, y.values, folds=5, verbose=1, title='Unbalanced')



ValueError: multiclass format is not supported

In [None]:
rocauc_curve(clf, X.values, y.values, folds=5, title='Sub-Sampled', verbose=0, balance_fcn=balanced_subsample)

In [None]:
rocauc_curve(clf, X.values, y.values, folds=5, title='Super-Sampled', verbose=0, balance_fcn=balanced_supersample)

In [None]:
score_list = []
for fcn in [None, balanced_subsample, balanced_supersample]:
    print(fcn)
    if fcn:
        X_temp, y_temp = fcn(X, y)
        print(y_temp.shape)
        print(X_temp.shape)
    else:
        X_temp = X
        y_temp = y
    score_list.append(cross_validate(clf, 
                            X_temp, y_temp, 
                            cv=3, 
                            scoring=['accuracy', 'f1', 'recall', 'precision', 'roc_auc']
                        )
                     )

In [None]:
nun = pd.DataFrame(score_list[0])
nun['kind'] = 'Unbalanced'
sub = pd.DataFrame(score_list[1])
sub['kind'] = 'Subsample'
sup = pd.DataFrame(score_list[2])
sup['kind'] = 'Supersample'

In [None]:
comb = pd.concat([nun, sub, sup])

In [None]:
comb = comb.groupby('kind').mean()

In [None]:
comb.head()

In [None]:
comb.plot(y='fit_time',kind='bar')

In [None]:
comb.plot(y=['test_accuracy', 'test_f1', 'test_roc_auc'],kind='bar')