The main idea of the solution is to use [VGGish](https://github.com/tensorflow/models/tree/1b728d473949c27ad93b90a16e2585ede407ad2f/research/audioset) preprocessing along with pre-trained convolutional [layers](https://drive.google.com/open?id=16JrWEedwaZFVZYvn1woPKCuWx85Ghzkp).<br>
There were two problems to handle:<br>
[AudioSet](https://research.google.com/audioset/) contains no background class, and all of its clips have a fixed length (10 sec).<br>
So the first step here is to separate out the background. This is done with a CatBoost classifier on 4 simple features of the raw signal: mean, std, min, max.
The second step is to handle files that are too short. It appears that most of the files shorter than 1 sec belong to a single class. Thus, we can detect them and predict them separately.

In [1]:
import os
import wave
import keras
import sklearn
import warnings
import vggish_input
import numpy as np
import tensorflow as tf
import pandas as pd
import catboost as cb
from vggish import VGGish
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.models import Model
from keras.optimizers import Adam
from scipy.stats import mode as Mode

warnings.filterwarnings("ignore")

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Length of the per-class counter used for short files; presumably the number of
# distinct labels in meta.txt (label 0 is handled by the background filter).
NUM_CLASSES = 8
# Output size of the neural nets: they only see labels 1..NUM_CLASSES-1, shifted down by one.
NUM_CLASSES_NET = NUM_CLASSES - 1
# Used both as the number of Keras training epochs and as the CatBoost iteration count.
NUM_EPOCH = 3
# Number of independently trained nets that vote on each prediction.
NUM_NETS = 5
# Seed NumPy and TensorFlow once, before any model is built.
np.random.seed(123)
tf.set_random_seed(123)

In [3]:
# Read only columns 0 and 4 of the tab-separated metadata file: the file name
# (used as the index) and its class label.
meta = pd.read_csv(
    './meta/meta.txt',
    sep='\t',
    header=None,
    names=['name', 'class'],
    usecols=[0, 4],
    index_col=0,
)
# Replace the textual class labels with integer codes; `le` is kept so the
# codes can be mapped back to names later.
le = LabelEncoder()
meta['class'] = le.fit_transform(meta['class'])
meta.head()

Unnamed: 0_level_0,class
name,Unnamed: 1_level_1
background_0001.wav,0
background_0001_time_stretch_0.wav,0
background_0001_time_stretch_1.wav,0
background_0001_time_stretch_10.wav,0
background_0001_time_stretch_11.wav,0


In [4]:
# Per-class counter, indexed by encoded label, of training files that are
# shorter than one second (filled in by prepare_data). The builtin `int` is
# used as dtype because the `np.int` alias no longer exists in current NumPy.
aposteriori_prob = np.zeros(NUM_CLASSES, dtype=int)

def prepare_data(df):
    """Build the training sets for the background filter and for the nets.

    Every file listed in ``df`` (index = file name under ``./audio/``, column
    ``class`` = encoded label) contributes one row of four raw-signal
    statistics (mean, std, min, max) to the filter set, labelled 0 for class 0
    and 1 otherwise.

    Files with a non-zero class are additionally handled as follows:
      * at least one second long (frame rate <= frame count): converted to
        VGGish examples, each labelled ``class - 1``;
      * shorter: only counted in the module-level ``aposteriori_prob`` array.

    Returns:
        X_net: array of VGGish examples with a trailing singleton axis added.
        Y_net: array of net labels, one per example.
        X_filter: list of ``[mean, std, min, max]`` rows, one per file.
        Y_filter: list of 0/1 labels, one per file.
    """
    X_net, Y_net, X_filter, Y_filter = [], [], [], []
    # len(df) is the number of rows; df.size would be rows * columns.
    for index, row in tqdm_notebook(df.iterrows(), total=len(df)):
        path = './audio/' + index
        label = row['class']
        # The context manager closes the file handle even if reading fails.
        with wave.open(path, mode="r") as wav:
            rate = wav.getframerate()
            length = wav.getnframes()
            # NOTE(review): the raw bytes are interpreted as int16 samples
            # regardless of the file's sample width / channel count - confirm
            # that all inputs are 16-bit.
            frames = np.frombuffer(wav.readframes(length), dtype=np.int16)
        X_filter.append([np.mean(frames), np.std(frames), np.min(frames), np.max(frames)])
        Y_filter.append(0 if label == 0 else 1)
        if label == 0:
            continue
        if rate <= length:
            examples = vggish_input.wavfile_to_examples(path)
            X_net.extend(examples)
            Y_net.extend([label - 1] * len(examples))
        else:
            # Too short for the net: remember which class it belongs to.
            aposteriori_prob[label] += 1
    X_net = np.array(X_net)[..., None]
    Y_net = np.array(Y_net)
    return X_net, Y_net, X_filter, Y_filter

# Build both training sets; as a side effect this fills the short-file counts
# in aposteriori_prob.
X_train, Y_train, X_filter, Y_filter = prepare_data(meta)
# Turn the counts into relative frequencies. The division creates a new float
# array and rebinds the name (the counts array itself has an integer dtype).
# NOTE(review): if no short file was counted the sum is 0 and every entry
# becomes NaN - confirm the training data always contains short files.
aposteriori_prob = aposteriori_prob / aposteriori_prob.sum()

HBox(children=(IntProgress(value=0, max=11307), HTML(value='')))




In [5]:
# Binary classifier (0 = class 0, 1 = any other class) trained on the four
# raw-signal statistics collected by prepare_data.
background_filter = cb.CatBoostClassifier(
    iterations=NUM_EPOCH,
    depth=4,
    learning_rate=0.1,
    loss_function='Logloss',
    custom_loss=['Accuracy'],
    random_seed=123,
    logging_level='Silent',
)
# The trailing semicolon keeps the notebook from echoing the fitted model.
background_filter.fit(X_filter, Y_filter, verbose=False);

To stabilize the result, 5 nets are trained and their predictions are combined by voting.

In [7]:
def get_model(model_bottom):
    """Attach a fresh classification head to ``model_bottom`` and compile it.

    The head starts from the output of the layer named ``conv4/conv4_2``,
    flattens it, applies two 512-unit ReLU dense layers (each followed by
    25% dropout) and ends in a softmax over ``NUM_CLASSES_NET`` classes.

    Returns:
        A compiled Keras ``Model`` sharing ``model_bottom``'s input, using
        Adam (AMSGrad variant) and sparse categorical cross-entropy.
    """
    relu = keras.backend.relu
    conv_features = model_bottom.get_layer(name="conv4/conv4_2").output
    tensor = Flatten(name='flatten_')(conv_features)
    # Two identical dense + dropout stages, differing only in layer names.
    hidden_stages = (
        ('vggish_fc1/fc1_1', 'dropout_1'),
        ('vggish_fc1/fc1_2', 'dropout_2'),
    )
    for dense_name, dropout_name in hidden_stages:
        tensor = Dense(512, activation=relu, name=dense_name)(tensor)
        tensor = Dropout(0.25, name=dropout_name)(tensor)
    outputs = Dense(NUM_CLASSES_NET, activation=keras.backend.softmax, name='vggish_fc2')(tensor)
    net = Model(model_bottom.input, outputs, name='my_VGGish')
    net.compile(
        optimizer=Adam(amsgrad=True),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return net

# Load the VGGish base without its top layers and freeze every layer, so only
# the newly added head of each net is trained.
model_v = VGGish(include_top=False)
for layer in model_v.layers:
    layer.trainable = False

# Train NUM_NETS heads on the same data; they are later used together for voting.
models = []
for i in range(NUM_NETS):
    # Blank line, banner, blank line - same output as three separate prints.
    print('', 'Training Net '+ str(i), '', sep='\n')
    model = get_model(model_v)
    model.fit(X_train, Y_train, epochs=NUM_EPOCH, verbose=2, batch_size=64)
    models.append(model)


Training Net 0

Epoch 1/3
 - 62s - loss: 0.0931 - acc: 0.9708
Epoch 2/3
 - 61s - loss: 0.0130 - acc: 0.9960
Epoch 3/3
 - 63s - loss: 0.0059 - acc: 0.9981

Training Net 1

Epoch 1/3
 - 61s - loss: 0.0976 - acc: 0.9691
Epoch 2/3
 - 63s - loss: 0.0107 - acc: 0.9967
Epoch 3/3
 - 63s - loss: 0.0065 - acc: 0.9982

Training Net 2

Epoch 1/3
 - 64s - loss: 0.0930 - acc: 0.9712
Epoch 2/3
 - 63s - loss: 0.0127 - acc: 0.9962
Epoch 3/3
 - 63s - loss: 0.0060 - acc: 0.9982

Training Net 3

Epoch 1/3
 - 64s - loss: 0.0940 - acc: 0.9710
Epoch 2/3
 - 62s - loss: 0.0124 - acc: 0.9962
Epoch 3/3
 - 58s - loss: 0.0063 - acc: 0.9978

Training Net 4

Epoch 1/3
 - 60s - loss: 0.0953 - acc: 0.9706
Epoch 2/3
 - 58s - loss: 0.0119 - acc: 0.9962
Epoch 3/3
 - 55s - loss: 0.0078 - acc: 0.9975


In [8]:
# Map every test file name to the label implied by its prefix (the text before
# the first underscore).
listdir_test = pd.Series({name: name.split('_')[0] for name in os.listdir('./test')})
# 'knocking_door' contains an underscore itself, so its prefix has to be restored.
listdir_test[listdir_test == 'knocking'] = 'knocking_door'
# Encoded labels for the files whose class can be read from the name
# ('unknown' files are excluded), indexed by file name.
listdir_test_short = listdir_test[listdir_test != 'unknown']
listdir_test_short = pd.Series(le.transform(listdir_test_short), index=listdir_test_short.index)

In [9]:
def predict(models):
    """Classify every file in ``./test`` and score the files with known labels.

    For each file listed in ``listdir_test``:
      1. the background filter is applied to four raw-signal statistics
         (mean, std, min, max); if it answers 0 the file is labelled 0;
      2. otherwise, if the file is at least one second long (frame rate <=
         frame count), every net in ``models`` averages its predictions over
         the file's VGGish examples, the nets vote with their argmax, and the
         winning class (shifted by +1) is reported together with the
         ensemble-mean probability of that class;
      3. otherwise the most frequent class among short training files
         (``aposteriori_prob``) is used.

    Args:
        models: iterable of trained Keras models used for voting.

    Returns:
        A tuple ``(accuracy, pred, pred_proba)`` where ``accuracy`` is the
        fraction of correctly predicted files among those whose label is
        known from the file name (NaN if there are none), ``pred`` is an
        integer array of predicted labels and ``pred_proba`` the matching
        array of probabilities, both in ``listdir_test`` order.
    """
    known, known_mask, pred, pred_proba = [], [], [], []
    # The fallback for too-short files does not depend on the file: compute it once.
    short_class = np.argmax(aposteriori_prob)
    short_proba = aposteriori_prob[short_class]
    for name in tqdm_notebook(listdir_test.index, total=listdir_test.size):
        is_known = listdir_test[name] != 'unknown'
        known_mask.append(is_known)
        if is_known:
            known.append(listdir_test_short[name])
        path = './test/' + name
        # The context manager closes the file handle even if reading fails.
        with wave.open(path, mode="r") as wav:
            rate = wav.getframerate()
            length = wav.getnframes()
            # NOTE(review): bytes are interpreted as int16 samples regardless
            # of the file's sample width - confirm all inputs are 16-bit.
            frames = np.frombuffer(wav.readframes(length), dtype=np.int16)
        features = np.array([np.mean(frames), np.std(frames), np.min(frames), np.max(frames)])
        filter_proba = background_filter.predict_proba([features])
        filter_ans = np.argmax(filter_proba)
        if filter_ans == 0:
            pred.append(filter_ans)
            pred_proba.append(filter_proba[0][filter_ans])
        elif rate <= length:
            examples = vggish_input.wavfile_to_examples(path)
            batch = np.array(examples[..., None])
            # One averaged class distribution per net (mean over the file's examples).
            net_probas = [
                np.mean(model.predict(batch), axis=0, dtype=np.float64)
                for model in models
            ]
            votes = [int(np.argmax(proba)) for proba in net_probas]
            # Majority vote; ties resolve to the smallest class index. Using
            # bincount avoids depending on the return shape of
            # scipy.stats.mode, which differs between SciPy versions.
            winner = int(np.argmax(np.bincount(votes, minlength=NUM_CLASSES_NET)))
            pred.append(winner + 1)
            pred_proba.append(np.mean(net_probas, axis=0)[winner])
        else:
            pred.append(short_class)
            pred_proba.append(short_proba)
    # Builtin int: the np.int alias no longer exists in current NumPy.
    pred = np.array(pred, dtype=int)
    pred_proba = np.array(pred_proba)
    known = np.array(known)
    known_mask = np.array(known_mask, dtype=bool)
    if len(known) == 0:
        # No labelled test files: accuracy is undefined rather than a crash.
        return float('nan'), pred, pred_proba
    correct = int(np.count_nonzero(pred[known_mask] == known))
    return correct / len(known), pred, pred_proba

In [10]:
expected_accuracy, pred_class, pred_proba = predict(models)
print('Expected accuracy: ' + str(expected_accuracy))

# Decode all predictions in one call: inverse_transform expects a 1-D array,
# and newer scikit-learn versions reject a scalar argument.
pred_labels = le.inverse_transform(pred_class)

# One line per test file: name, probability (3 decimals), predicted class name.
with open('result.txt', 'w') as fw:
    for name, proba, label in zip(listdir_test.index, pred_proba, pred_labels):
        fw.write(name + '\t' + '{:.3f}'.format(proba) + '\t' + label + '\n')

HBox(children=(IntProgress(value=0, max=610), HTML(value='')))


Expected accuracy: 0.959830866807611
