# Evaluating model performance

In [None]:
# Changing keras backend
import theano

In [68]:
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import os
import numpy as np
import pickle
import IPython
from scipy.stats import mode

import os
import dask.dataframe as dd
from sklearn.model_selection import train_test_split
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Convolution2D, MaxPooling2D, Dropout, Flatten, Activation, Conv2D, Dense
from keras import metrics
from sklearn.model_selection import train_test_split

import gc


In [20]:
def load_data(folder, subset=False, test=False):
    """
    Files are loaded into a large numpy object and held in memory. 
    This function also feature scales the data, with the scaledrs being saved
    to allow equivalent transformation of the test set
    """
    files = os.listdir(folder)

    n = len(files) # limiter, set to smaller number for non full set
    if subset:
        n = 10
    
    train_df_mfcc = pd.DataFrame()
    train_df_mfcc["index"] = files
    train_df_mfcc.set_index("index")

    #Create an empty structure to fit the data inside
    X = np.empty((n, 20, 87, 1))

    for i in range(len(files[:n])):
        file = files[i]
        this_mfcc = pd.read_parquet("{}/{}".format(folder, file)).values
        X[i] = np.expand_dims(this_mfcc, axis=-1)

    # establish stats (to normalise train and test within same range)
    if not test:
        stats = {}
        stats["train_mean"] = np.mean(X)
        stats["train_std"] = np.std(X)
        
        # save stats
        with open("train_stats.json", "w") as fp:
            json.dump(stats, fp)
        
    else:
        with open("train_stats.json") as fp:
            stats = json.load(fp)
    

    # Feature scale the samples
    X = X - stats["train_mean"]
    X = X / stats["train_std"]
    return X

In [22]:
X_test = load_data("../data/freesound-ml/audio_test_new_mfcc")

In [27]:
def get_base_name(s):
    return int(s.split("_")[0])

In [89]:
import scipy

In [148]:
def get_accuracy(model_suffix, X_test):
    model = keras.models.load_model("../data/freesound-ml/nn_{}.model".format(model_suffix))
    preds = pd.Series(model.predict_classes(X_test))
    
    # Get ground truth
    new_files = os.listdir("../data/freesound-ml/audio_test_new_mfcc/")
    y_test = pd.DataFrame()
    y_test["fn"] = new_files
    y_test["parent_index"] = y_test["fn"].apply(get_base_name)
    
    # Get the labels by joining on the parent's index
    y_test = y_test.join(df_test, on="parent_index", rsuffix="parent")
    ground_truth = y_test[y_test.columns[7:]].idxmax(axis=1)
    y_test["pred"] = preds
    #print(np.mean((preds == ground_truth).astype(int)))
    
    
    # Aggregate by parent index (the original, and take the MOST COMMON prediction 
    # from each of the 2-second chunks)
    results_agg = y_test.groupby('parent_index').agg(lambda x: scipy.stats.mode(x)[0])
    
    # bool series is true when they are equal, convert to int (0 or 1), take mean for accuracy metric
    return np.mean((results_agg["class"] == results_agg["pred"]).astype(int))

In [72]:
get_accuracy("2d_4", X_test)

0.4787283777466106

In [73]:
get_accuracy("2d_5", X_test)

0.4198223468910706

In [84]:
results = get_accuracy("2d_5", X_test)


0.42987377279102384


### The following "aggregation step" has now been implemented in the get_accuracy function

In [92]:
results_agg = results.groupby('parent_index').agg(lambda x: scipy.stats.mode(x)[0])



In [93]:
results_agg.head()

Unnamed: 0_level_0,fn,fname,label,class,filepath,0,1,2,3,4,...,33,34,35,36,37,38,39,40,None,pred
parent_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,4_0.parquet,00326aa9.wav,Oboe,29,../data/freesound-ml/audio_test/00326aa9.wav,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
5,5_0.parquet,0038a046.wav,Bass_drum,3,../data/freesound-ml/audio_test/0038a046.wav,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,8_0.parquet,007759c4.wav,Saxophone,30,../data/freesound-ml/audio_test/007759c4.wav,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,18
9,9_0.parquet,008afd93.wav,Saxophone,30,../data/freesound-ml/audio_test/008afd93.wav,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,30
12,12_0.parquet,00ae03f6.wav,Chime,7,../data/freesound-ml/audio_test/00ae03f6.wav,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7


In [95]:
np.mean((results_agg["class"] == results_agg["pred"]).astype(int))

0.5057736720554272

# The following grid search was performed on a GPU VM in Google Cloud:


In [None]:
for n_filters in [64, 32, 128]:intentionalsyntaxerror
    for dropout_rate in [0.3, 0.5]:    
        for n_layers in [2,3,4]:
            try:
                print("2d_{}_{}_{}".format(n_filters, dropout_rate, n_layers))
                res = train_model(n_filters=n_filters, kernel_size=(2,9), dropout_rate=dropout_rate, n_conv_layers=n_layers,
                           suffix="2d_{}_{}_{}".format(n_filters, dropout_rate, n_layers))

                # Save history object as csv
                h = pd.DataFrame([res.history[key] for key in res.history.keys()]).T
                h.columns = res.history.keys()
                h.to_csv("2d_{}_{}_{}.csv".format(n_filters, dropout_rate, n_layers), index=None)
            except:
                pass

In [100]:
results_fns= [x for x in os.listdir() if "2d_" in x]

In [103]:
 np.max(pd.read_csv(results_fns[0])["val_acc"])
    

0.5597899272825209

In [111]:
md = keras.models.load_model("../data/freesound-ml/nn_{}.model".format(results_fns[0][:-4]))

  return np.copy(kernel[slices])


In [118]:
get_accuracy(results_fns[0][:-4], X_test)

  return np.copy(kernel[slices])


0.11126694717157551




0.14780600461893764

In [140]:
results = []

for fn in results_fns:
    print(fn)
    model = keras.models.load_model("../data/freesound-ml/nn_{}.model".format(fn[:-4]))
    
    max_val_acc = np.max(pd.read_csv(fn)["val_acc"])
    true_test_acc = get_accuracy(fn[:-4], X_test)
    results.append([fn, max_val_acc, true_test_acc])
    

2d_128_0.3_2.csv


  return np.copy(kernel[slices])


0.11126694717157551




2d_32_0.3_2.csv
0.09957924263674614
2d_32_0.3_3.csv
0.060542309490416085
2d_32_0.5_2.csv
0.12622720897615708
2d_32_0.5_3.csv
0.08835904628330996
2d_64_0.3_2.csv
0.12318840579710146
2d_64_0.3_3.csv
0.08298270219728845
2d_64_0.5_2.csv
0.11524076671341749
2d_64_0.5_3.csv
0.08789153810191679


In [152]:
preds = model.predict(X_test)

In [153]:
preds[0]

array([0.04211148, 0.00428971, 0.00980724, 0.00968907, 0.01250777,
       0.00046169, 0.02469907, 0.00569606, 0.00309948, 0.00942499,
       0.01287755, 0.02550157, 0.01554736, 0.00476691, 0.01815393,
       0.02779944, 0.00434662, 0.04871554, 0.0166655 , 0.04613115,
       0.09142027, 0.0063581 , 0.00752467, 0.02371158, 0.02360239,
       0.02868497, 0.01480678, 0.1734222 , 0.00503807, 0.00056279,
       0.00380142, 0.01753699, 0.01303129, 0.06722847, 0.0197344 ,
       0.00681093, 0.02698777, 0.05781476, 0.04122413, 0.01632254,
       0.01208335], dtype=float32)

### ^ It appears that the trained model is predicting a relatively high probability for MANY classes... 

In [142]:
results = pd.DataFrame(results)

In [143]:
results.columns = ["fn", "best val_acc", "true_test_acc"]

In [144]:
results.head()

Unnamed: 0,fn,best val_acc,true_test_acc
0,2d_128_0.3_2.csv,0.55979,0.147806
1,2d_32_0.3_2.csv,0.546458,0.139338
2,2d_32_0.3_3.csv,0.508214,0.069284
3,2d_32_0.5_2.csv,0.542419,0.157044
4,2d_32_0.5_3.csv,0.460679,0.110085


# We see that the stonger CNNs dramatically overfit, despite high dropout rates

So, we have our best result, a very small CNN:

- n_filters=16
- kernel_size=(2,9)
- suffix="2d_4"
- dropout_rate=0.4
- n_conv_layers=2)

The biggest single gain in performance was from transitioning to a CNN structure. Next biggest was the use of a rectangular kernel, which allowed the model to find features more quickly.

In [149]:
get_accuracy("2d_4", X_test)



0.5612009237875288

In [None]:
print(best_accuracy)

# Next steps

If I were to continue developing this model, we would like to test the affects of manual feature engineering on the conv layers, i.e. if "derivative" style features (as used earlier in the project) are engineered, will the conv layers make good use of them ?

Further, I would investigate why the model is overfitting so strongly with the relatively similar networks trained above