<a id="loading_data"></a>
### Loading data

In [232]:
import numpy as np
np.random.seed(1001)

import os
import shutil
import pickler

import IPython
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm_notebook
from sklearn.cross_validation import StratifiedKFold
from tqdm import tqdm_notebook as tqdm
import IPython.display as ipd
import wave

import librosa
import numpy as np
import scipy
from keras import losses, models, optimizers
from keras.activations import relu, softmax
from keras.callbacks import (EarlyStopping, LearningRateScheduler,
                             ModelCheckpoint, TensorBoard, ReduceLROnPlateau)
from keras.layers import (Convolution1D, Dense, Dropout, GlobalAveragePooling1D, 
                          GlobalMaxPool1D, Input, MaxPool1D, concatenate, LSTM, GRU, Reshape, ConvLSTM2D,
                         TimeDistributed)
from keras.utils import Sequence, to_categorical
from keras.layers import (Convolution2D, GlobalAveragePooling2D, BatchNormalization, Flatten,
                          GlobalMaxPool2D, MaxPool2D, concatenate, Activation)
from keras.utils import Sequence, to_categorical
from keras import backend as K

from sklearn.cross_validation import train_test_split

%matplotlib inline
matplotlib.style.use('ggplot')

In [206]:
# Training metadata, and the sample submission that serves as the list of test
# files (both are later indexed by their "fname" column).
train = pd.read_csv(filepath_or_buffer="../input/freesound-audio-tagging/train.csv")
test = pd.read_csv(filepath_or_buffer="../input/freesound-audio-tagging/sample_submission.csv")

In [88]:
class Config(object):
    """Hyper-parameters plus the input dimensions derived from them.

    Attributes set from the constructor arguments: ``sampling_rate``,
    ``audio_duration``, ``n_classes``, ``use_mfcc``, ``n_mfcc``, ``n_folds``,
    ``learning_rate`` and ``max_epochs``.

    Derived attributes:
        audio_length: ``sampling_rate * audio_duration``.
        dim: ``(n_mfcc, 1 + floor(audio_length / 512), 1)`` when ``use_mfcc``
            is truthy, otherwise ``(audio_length, 1)``.
    """

    def __init__(self,
                 sampling_rate=16000, audio_duration=2, n_classes=41,
                 use_mfcc=False, n_folds=10, learning_rate=0.0001,
                 max_epochs=50, n_mfcc=20):
        # Audio / feature settings.
        self.sampling_rate = sampling_rate
        self.audio_duration = audio_duration
        self.use_mfcc = use_mfcc
        self.n_mfcc = n_mfcc

        # Training settings.
        self.n_classes = n_classes
        self.n_folds = n_folds
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs

        # Derived sizes.
        self.audio_length = self.sampling_rate * self.audio_duration
        if not self.use_mfcc:
            self.dim = (self.audio_length, 1)
        else:
            n_frames = 1 + int(np.floor(self.audio_length / 512))
            self.dim = (self.n_mfcc, n_frames, 1)

In [300]:
# Experiment settings; every field not listed here keeps its Config default.
config = Config(
    sampling_rate=16000,
    audio_duration=2,
    n_folds=3,
    learning_rate=0.001,
    n_mfcc=50,
)

In [90]:
def load(config, directory, meta=None):
    """Decode every ``.wav`` file found in ``directory``.

    Parameters
    ----------
    config : object exposing ``sampling_rate``; passed to librosa as ``sr``.
    directory : path of the folder to scan (non-recursive, ``os.listdir`` order).
    meta : optional DataFrame with ``label`` and ``manually_verified`` columns,
        keyed by file name either through an ``fname`` column or through its
        index.

    Returns
    -------
    (signals, labels, verified) : three lists. ``labels`` and ``verified``
        stay empty when ``meta`` is None.
    """
    has_meta = meta is not None
    if has_meta and 'fname' in meta:
        # Work on an indexed copy so the caller's frame is left untouched.
        meta = meta.copy().set_index('fname')

    signals, labels, verified_flags = [], [], []
    for fname in tqdm(os.listdir(directory)):
        if fname.endswith('.wav'):
            signal, _ = librosa.core.load(os.path.join(directory, fname),
                                          sr=config.sampling_rate)
            signals.append(signal)
            if has_meta:
                row = meta.loc[fname]
                labels.append(row.label)
                verified_flags.append(row.manually_verified)
    return signals, labels, verified_flags

In [91]:
%%dumpit
# NOTE(review): `%%dumpit` is not defined in the visible part of this notebook;
# presumably it is a cell magic registered by `import pickler` — confirm.
# Decode all training clips; labels and verified flags are looked up in `train`.
TrainDataX, TrainDataY, TrainDataMV = load(config, '../input/freesound-audio-tagging/audio_train', train)

In [92]:
# Keep only the .wav entries so this list lines up one-to-one with the clips
# returned by load(), which skips every other directory entry. The alignment
# also relies on os.listdir() yielding the same order as it did inside load().
TrainDataFnames = [fname
                   for fname in os.listdir('../input/freesound-audio-tagging/audio_train')
                   if fname.endswith('.wav')]

In [93]:
%%dumpit
# NOTE(review): `%%dumpit` is not defined in the visible part of this notebook;
# presumably it is a cell magic registered by `import pickler` — confirm.
# Decode all test clips; no metadata is passed, so the label lists come back empty.
TestDataX, _, _ = load(config, '../input/freesound-audio-tagging/audio_test')

In [94]:
# Keep only the .wav entries so this list lines up one-to-one with the clips
# returned by load(), which skips every other directory entry. The alignment
# also relies on os.listdir() yielding the same order as it did inside load().
TestDataFnames = [fname
                  for fname in os.listdir('../input/freesound-audio-tagging/audio_test')
                  if fname.endswith('.wav')]

In [95]:
def _as_object_array(items):
    """Pack a sequence into a 1-D object array holding one element per item.

    Clips differ in length, and calling np.asarray() on such a ragged list
    raises ValueError on recent NumPy releases. Filling a pre-allocated object
    array element by element works on old and new versions alike and is safe
    to re-run on an array that was already converted.
    """
    packed = np.empty(len(items), dtype=object)
    for position, item in enumerate(items):
        packed[position] = item
    return packed


# Variable-length waveforms -> object arrays; flat label/name lists -> plain arrays.
TrainDataX = _as_object_array(TrainDataX)
TrainDataY = np.asarray(TrainDataY)
TrainDataMV = np.asarray(TrainDataMV)
TrainDataFnames = np.asarray(TrainDataFnames)

TestDataX = _as_object_array(TestDataX)
TestDataFnames = np.asarray(TestDataFnames)

In [96]:
import random
import string

def random_id(N=5):
    """Return a random string of ``N`` lowercase ASCII letters and digits.

    Characters are drawn one at a time with ``random.choice`` from the
    stdlib ``random`` module.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = []
    for _ in range(N):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

In [97]:
class Preprocessor:
    """Convert variable-length clips into fixed-size model inputs.

    Each clip is cropped (if too long) or zero-padded (if too short) to
    ``config.audio_length`` samples at a random offset drawn from
    ``np.random``. It is then turned either into an MFCC matrix
    (``use_mfcc=True``) or into a column vector passed through
    ``preprocessing_fn`` (``use_mfcc=False``).

    Parameters
    ----------
    config : object exposing ``audio_length``, ``sampling_rate``, ``n_mfcc``
        and ``n_classes``.
    preprocessing_fn : callable applied to the fixed-length waveform. It is
        only used on the raw-waveform path; the MFCC path does not call it.
    use_mfcc : selects the MFCC path and the matching output shape.

    Notes
    -----
    The label -> index mapping is built from the module-level ``LABELS`` list
    at construction time, so ``LABELS`` must exist before instantiation.
    """

    def __init__(self, config, preprocessing_fn=lambda x: x, use_mfcc=False):
        self.config = config
        self.preprocessing_fn = preprocessing_fn
        self.use_mfcc = use_mfcc

        if self.use_mfcc:
            # One frame per 512 samples plus one; assumed to match the hop
            # length librosa.feature.mfcc uses by default — TODO confirm.
            self.dim = (config.n_mfcc, 1 + int(np.floor(config.audio_length/512)), 1)
        else:
            self.dim = (config.audio_length, 1)

        self.label_idx = {label: i for i, label in enumerate(LABELS)}

    def process(self, X, y=None):
        """Build the feature tensor, and one-hot labels when ``y`` is given.

        Parameters
        ----------
        X : sequence of 1-D waveforms.
        y : optional sequence of label names, parallel to ``X``.

        Returns
        -------
        ``res_X`` of shape ``(len(X), *self.dim)`` when ``y`` is None,
        otherwise ``(res_X, one_hot_labels)`` with ``config.n_classes``
        columns.

        Raises
        ------
        KeyError if a label in ``y`` is not present in ``LABELS``.
        """
        n_samples = len(X)
        res_X = np.empty((n_samples, *self.dim))

        res_y = None
        if y is not None:
            res_y = np.empty(n_samples)
        else:
            # Placeholder so the zip() below still yields one pair per clip.
            y = np.zeros(n_samples)

        input_length = self.config.audio_length
        for i, (data, answer) in tqdm(enumerate(zip(X, y)), total=n_samples):
            excess = len(data) - input_length
            if excess > 0:
                # Too long: keep a random window of input_length samples.
                offset = np.random.randint(excess)
                data = data[offset:(input_length + offset)]
            elif excess < 0:
                # Too short: zero-pad on both sides, random left padding.
                offset = np.random.randint(-excess)
                data = np.pad(data, (offset, -excess - offset), "constant")

            if self.use_mfcc:
                # Pass the signal by keyword: newer librosa releases no longer
                # accept it positionally, and older ones accept `y=` as well.
                data = librosa.feature.mfcc(y=data, sr=self.config.sampling_rate,
                                            n_mfcc=self.config.n_mfcc)
                data = np.expand_dims(data, axis=-1)
            else:
                data = self.preprocessing_fn(data)[:, np.newaxis]
            res_X[i,] = data

            if res_y is not None:
                res_y[i] = self.label_idx[answer]

        if res_y is not None:
            return res_X, to_categorical(res_y, num_classes=self.config.n_classes)
        return res_X

<a id="1d_normalization"></a>
#### Normalization

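Normalization is a crucial preprocessing step. The simplest method is min-max rescaling, which maps the values of the input to the range [0, 1]. The `audio_norm` function below does this and then subtracts 0.5, so its output is centered in [-0.5, 0.5].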

In [98]:
def audio_norm(data):
    """Min-max scale ``data`` and shift it so the minimum maps to -0.5.

    A small epsilon (1e-6) is added to the value range so that a constant
    input does not cause a division by zero; such an input maps to -0.5
    everywhere.
    """
    lowest, highest = np.min(data), np.max(data)
    scaled = (data - lowest) / (highest - lowest + 1e-6)
    return scaled - 0.5

* The dummy model is just for debugging purposes.
* Our 1D Conv model is fairly deep and is trained using the Adam optimizer with a learning rate of 0.0001.

In [244]:
# Class names in order of first appearance, and the name -> integer index map.
LABELS = list(train.label.unique())
label_idx = {label: i for i, label in enumerate(LABELS)}

# Index both frames by file name. The guards make the cell safe to re-run:
# once "fname" has become the index it is no longer a column, and calling
# set_index("fname") a second time would raise KeyError.
if "fname" in train.columns:
    train.set_index("fname", inplace=True)
if "fname" in test.columns:
    test.set_index("fname", inplace=True)

train["label_idx"] = train.label.apply(lambda x: label_idx[x])

In [302]:
# MFCC feature extractor. NOTE: Preprocessor.process() only calls
# preprocessing_fn on the raw-waveform path, so audio_norm is not applied
# while use_mfcc=True.
preproc = Preprocessor(config, preprocessing_fn=audio_norm, use_mfcc=True)

In [303]:
# Crop/pad every training clip at a random offset, extract the features and
# one-hot encode the labels.
ProcessedTrainDataX, ProcessedTrainDataY = preproc.process(TrainDataX, TrainDataY)

HBox(children=(IntProgress(value=0, max=9473), HTML(value='')))

In [304]:
# Same feature extraction for the test clips; no labels, so only X is returned.
ProcessedTestDataX = preproc.process(TestDataX)

HBox(children=(IntProgress(value=0, max=9400), HTML(value='')))

In [305]:
# Sanity check: expect (n_clips, n_mfcc, n_frames, 1).
ProcessedTestDataX.shape

(9400, 50, 63, 1)

In [306]:
# Sanity check: expect (n_clips, n_mfcc, n_frames, 1).
ProcessedTrainDataX.shape

(9473, 50, 63, 1)

In [307]:
# Hold out 20% of the *labelled* clips (fixed seed) for local evaluation.
# X_test / Y_test / F_test are this hold-out split, not the competition test
# set; the file names are split alongside so predictions can be matched back
# to the labels in `train`.
X_train, X_test, Y_train, Y_test, F_train, F_test = \
    train_test_split(ProcessedTrainDataX, ProcessedTrainDataY, TrainDataFnames, random_state=1337, train_size=0.8)

In [362]:
class ModelRNN:
    """Conv2D / GRU hybrid classifier over channels-first MFCC inputs.

    The network alternates three convolutional stacks with two
    TimeDistributed-GRU stacks, then flattens into a small dense head with a
    softmax over ``config.n_classes``.

    Parameters
    ----------
    config : object exposing ``n_mfcc``, ``audio_length``, ``n_classes`` and
        ``max_epochs``.
    learning_rate : learning rate handed to the Adam optimizer. Note that
        ``config.learning_rate`` is not used by this class.
    output_dir : parent folder; a fresh ``<ClassName>_<random id>`` sub-folder
        is created in it for checkpoints and the saved model.

    Raises
    ------
    FileExistsError (from ``os.makedirs``) if the generated sub-folder already
    exists.
    """

    def __init__(self, config, learning_rate=0.001, output_dir='output'):
        self.config = config
        self.learning_rate = learning_rate
        self.dim = (config.n_mfcc, 1 + int(np.floor(config.audio_length/512)), 1)
        self.model = self._build_model()
        self.id = random_id()
        self.output_dir = os.path.join(output_dir, self.__class__.__name__ + '_' + self.id)
        os.makedirs(self.output_dir)

    @staticmethod
    def _conv_block(x):
        """Conv2D(32, 4x10, same) -> BatchNorm -> ReLU -> MaxPool -> Dropout(0.1)."""
        x = Convolution2D(32, (4,10), padding="same", data_format='channels_first')(x)
        # NOTE(review): BatchNormalization is left at its default axis although
        # the conv/pool layers are channels_first — confirm axis=1 was not intended.
        x = BatchNormalization()(x)
        x = Activation("relu")(x)
        x = MaxPool2D(data_format='channels_first')(x)
        return Dropout(0.1)(x)

    @staticmethod
    def _gru_block(x):
        """TimeDistributed GRU(32) -> BatchNorm -> ReLU -> MaxPool -> Dropout(0.1)."""
        x = TimeDistributed(GRU(32, activation='linear', return_sequences=True))(x)
        x = BatchNormalization()(x)
        x = Activation("relu")(x)
        x = MaxPool2D(data_format='channels_first')(x)
        return Dropout(0.1)(x)

    def _build_model(self):
        """Assemble and compile the Keras model.

        Prints the tensor shape at the input and after each stack, followed by
        the total parameter count.
        """
        nclass = self.config.n_classes

        # Channels-first input: (1, n_mfcc, n_frames).
        inp = Input(shape=(1, self.dim[0], self.dim[1]))
        x = inp
        print(x.shape)

        stacks = (self._conv_block, self._gru_block,
                  self._conv_block, self._gru_block,
                  self._conv_block)
        for stack in stacks:
            x = stack(x)
            print(x.shape)

        x = Flatten()(x)

        x = Dense(64)(x)
        x = BatchNormalization()(x)
        x = Activation("relu")(x)
        out = Dense(nclass, activation=softmax)(x)

        model = models.Model(inputs=inp, outputs=out)
        print(model.count_params())

        opt = optimizers.Adam(self.learning_rate)
        model.compile(optimizer=opt, loss=losses.categorical_crossentropy, metrics=['acc'])
        return model

    def fit(self, X, Y, valX=None, valY=None, random_seed=1337):
        """Train with early stopping and keep the best checkpoint.

        Parameters
        ----------
        X, Y : training inputs and one-hot targets.
        valX, valY : optional validation set. If either is missing, 20% of
            ``X``/``Y`` is split off with ``random_seed``.
        random_seed : seed for that internal split.

        Side effects
        ------------
        Writes ``best.h5`` (lowest ``val_loss``) and ``model.h5`` into
        ``self.output_dir``; the model ends up holding the best weights.
        """
        if valX is None or valY is None:
            trainX, valX, trainY, valY = train_test_split(X, Y, train_size=0.8, random_state=random_seed)
        else:
            trainX, trainY = X, Y

        checkpoint_filename = os.path.join(self.output_dir, 'best.h5')

        checkpoint = ModelCheckpoint(checkpoint_filename,
                                     monitor='val_loss', verbose=1, save_best_only=True)
        early = EarlyStopping(monitor="val_loss", mode="min", patience=5)

        callbacks_list = [checkpoint, early]

        model = self.model

        # Use the config this instance was built with, not the notebook-level
        # global `config`, so the class does not depend on hidden kernel state.
        model.fit(trainX, trainY, callbacks=callbacks_list, validation_data=(valX, valY),
                  epochs=self.config.max_epochs, batch_size=64)

        # Restore the best epoch before saving the final model.
        model.load_weights(checkpoint_filename)

        model_filename = os.path.join(self.output_dir, 'model.h5')
        model.save(model_filename)

    def predict(self, X):
        """Return the class-probability matrix for ``X`` (batch size 128)."""
        predictions = self.model.predict(X, batch_size=128)
        return predictions

    def submission(self, X, fnames):
        """Return a DataFrame indexed by ``fname`` with the top-3 labels.

        The ``label`` column holds the three most probable class names from
        the module-level ``LABELS`` list, most probable first, joined by
        single spaces.
        """
        predictions = self.predict(X)
        top_3 = np.array(LABELS)[np.argsort(-predictions, axis=1)[:, :3]]
        predicted_labels = [' '.join(list(x)) for x in top_3]

        result = pd.DataFrame({'fname': fnames, 'label': predicted_labels})
        result.set_index("fname", inplace=True)
        return result

In [363]:
# Build and compile the network (this also creates a fresh output directory).
# The shapes printed below come from _build_model(); the final number is the
# parameter count.
model = ModelRNN(config)

(?, 1, 50, 63)
(?, 32, 25, 31)
(?, 32, 12, 16)
(?, 32, 6, 8)
(?, 32, 3, 16)
(?, 32, 1, 8)
113381


In [364]:
# Reshape (N, n_mfcc, n_frames, 1) -> (N, 1, n_mfcc, n_frames) to match the
# channels-first Input. No validation set is passed, so fit() splits off a
# further 20% of X_train for validation.
model.fit(X_train.reshape((-1, 1, X_train.shape[1], X_train.shape[2])), Y_train)

Train on 6062 samples, validate on 1516 samples
Epoch 1/50

Epoch 00001: val_loss improved from inf to 3.45227, saving model to output/ModelLSTM_y3oac/best.h5
Epoch 2/50

Epoch 00002: val_loss improved from 3.45227 to 2.91007, saving model to output/ModelLSTM_y3oac/best.h5
Epoch 3/50

Epoch 00003: val_loss improved from 2.91007 to 2.60222, saving model to output/ModelLSTM_y3oac/best.h5
Epoch 4/50

Epoch 00004: val_loss improved from 2.60222 to 2.51823, saving model to output/ModelLSTM_y3oac/best.h5
Epoch 5/50

Epoch 00005: val_loss improved from 2.51823 to 2.13056, saving model to output/ModelLSTM_y3oac/best.h5
Epoch 6/50

Epoch 00006: val_loss improved from 2.13056 to 2.05268, saving model to output/ModelLSTM_y3oac/best.h5
Epoch 7/50

Epoch 00007: val_loss improved from 2.05268 to 1.99106, saving model to output/ModelLSTM_y3oac/best.h5
Epoch 8/50

Epoch 00008: val_loss improved from 1.99106 to 1.89587, saving model to output/ModelLSTM_y3oac/best.h5
Epoch 9/50

Epoch 00009: val_loss im

In [368]:
# Score the hold-out split: predict the top-3 labels per clip ...
y_submission = model.submission(X_test.reshape((-1, 1, X_test.shape[1], X_test.shape[2])), F_test)
# ... split the space-joined label string back into an (n_clips, 3) array
# (assumes label names contain no spaces — TODO confirm) ...
pred = np.asarray([x.split(' ') for x in y_submission.values[:, 0]])
# ... and look up the true label of each held-out file by name.
ans = train.label.loc[F_test].values

In [369]:
# Top-3 accuracy on the hold-out split: each term is the hit rate at one rank,
# and since the three ranks hold distinct labels the rates simply add up.
# The ranks are not weighted, so first place counts the same as third.
np.mean(pred[:, 0] == ans) + np.mean(pred[:, 1] == ans) + np.mean(pred[:, 2] == ans)

0.7831134564643799

In [367]:
# One-hot encoded labels of the hold-out split (output of to_categorical).
Y_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)