In [0]:
import os
import numpy as np 
import pandas as pd
import librosa
import matplotlib
import matplotlib.pyplot as plt

import keras 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU
from keras.layers import MaxPooling2D, AveragePooling2D, Convolution2D
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D, BatchNormalization, SpatialDropout2D
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer
from keras.optimizers import Adam, RMSprop
from keras.utils import Sequence, to_categorical

from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint, LearningRateScheduler

In [0]:
# Load the labelled training metadata and the submission template from the
# working directory (both are expected to carry an "fname" column that later
# cells use to locate the audio files).
train, sub = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "sample_submission.csv")
)

In [0]:
# Peek at the first ten training rows.
train.head(n=10)

In [0]:
# Peek at the first ten rows of the submission template.
sub.head(n=10)

In [0]:
# Class names in order of first appearance; a label's position in LABELS is
# its integer class id (the prediction cell indexes LABELS with argsort output).
LABELS = list(train["label"].unique())
label_idx = dict(zip(LABELS, range(len(LABELS))))
train["label_idx"] = train["label"].map(label_idx)

In [0]:
class Config(object):
    """Hyper-parameters plus the input geometry derived from them.

    Derived attributes:
        audio_length: clip length in samples, ``sampling_rate * audio_duration``.
        dim: per-example input shape. ``(n_mfcc, n_frames, 1)`` when
            ``use_mfcc`` is truthy, with
            ``n_frames = 1 + floor(audio_length / 512)``; otherwise
            ``(audio_length, 1)``.
    """

    def __init__(self,
                 sampling_rate=16000, audio_duration=2, n_classes=41,
                 use_mfcc=False, n_folds=10, learning_rate=0.0001, 
                 max_epochs=50, n_mfcc=20):
        # Training hyper-parameters.
        self.n_classes = n_classes
        self.n_folds = n_folds
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs

        # Audio / feature settings.
        self.sampling_rate = sampling_rate
        self.audio_duration = audio_duration
        self.use_mfcc = use_mfcc
        self.n_mfcc = n_mfcc

        # Fixed clip length, in samples, that every example is fitted to.
        self.audio_length = self.sampling_rate * self.audio_duration

        if not self.use_mfcc:
            # Raw waveform input: one channel of audio_length samples.
            self.dim = (self.audio_length, 1)
        else:
            # NOTE(review): 512 is presumably the MFCC hop length (librosa's
            # default hop_length) -- confirm it matches the feature extractor.
            n_frames = int(np.floor(self.audio_length / 512)) + 1
            self.dim = (self.n_mfcc, n_frames, 1)

In [0]:
def get_2d_conv_model(config):
    """Build and compile the two-branch 2D convolutional classifier.

    The input has shape ``(config.dim[0], config.dim[1], 1)``. Two branches
    with the same layer layout are applied to it, each max- and
    average-pooled, and the four pooled maps are concatenated and fed to a
    dense head ending in a ``config.n_classes``-way softmax.

    Args:
        config: object exposing ``dim``, ``n_classes`` and ``learning_rate``.

    Returns:
        A Keras ``Model`` compiled with Adam, categorical cross-entropy loss
        and an accuracy metric.
    """

    def conv_branch(tensor):
        # Two Conv2D -> BatchNorm -> ReLU stages: 64 filters, then 32.
        for n_filters in (64, 32):
            tensor = Convolution2D(n_filters, (4, 10), padding="same")(tensor)
            tensor = BatchNormalization()(tensor)
            tensor = Activation("relu")(tensor)
        return tensor

    def dense_stage(tensor, units):
        # Dense -> BatchNorm -> ReLU.
        tensor = Dense(units)(tensor)
        tensor = BatchNormalization()(tensor)
        return Activation("relu")(tensor)

    input_layer = Input(shape=(config.dim[0], config.dim[1], 1))

    # Both branches read the same input and share a layout but not weights.
    branches = [conv_branch(input_layer), conv_branch(input_layer)]

    # Order: max1, avg1, max2, avg2.
    pooled = []
    for branch in branches:
        pooled.append(MaxPooling2D()(branch))
        pooled.append(AveragePooling2D()(branch))

    y = concatenate(pooled)
    y = SpatialDropout2D(0.5)(y)
    y = Flatten()(y)

    y = dense_stage(y, 128)
    y = Dropout(0.5)(y)
    y = dense_stage(y, 64)

    out = Dense(config.n_classes, activation='softmax')(y)

    model = Model(inputs=input_layer, outputs=out)
    model.compile(
        optimizer=optimizers.Adam(config.learning_rate),
        loss='categorical_crossentropy',
        metrics=["accuracy"],
    )
    return model

In [0]:
# Settings for this run: 2 s clips at 44.1 kHz represented by 40 MFCCs.
config = Config(
    sampling_rate=44100,
    audio_duration=2,
    n_folds=10,
    learning_rate=0.001,
    use_mfcc=True,
    n_mfcc=40,
)

In [0]:
def _fit_to_length(data, input_length):
    """Randomly crop or zero-pad a 1-D signal to exactly ``input_length`` samples."""
    n_samples = len(data)
    if n_samples > input_length:
        # Too long: keep a random window of the target length.
        offset = np.random.randint(n_samples - input_length)
        return data[offset:offset + input_length]
    if n_samples < input_length:
        # Too short: place the signal at a random position inside the padding.
        offset = np.random.randint(input_length - n_samples)
    else:
        offset = 0
    return np.pad(data, (offset, input_length - n_samples - offset), "constant")


def prepare_data(df, config, data_dir):
    """Load every clip listed in ``df`` and convert it to an MFCC array.

    Args:
        df: DataFrame with an ``fname`` column of audio file names. Its index
            may be anything; rows are written to the output by position.
        config: object exposing ``sampling_rate``, ``audio_length``,
            ``n_mfcc`` and ``dim``.
        data_dir: directory containing the audio files (a trailing separator
            is optional).

    Returns:
        ``np.ndarray`` of shape ``(len(df), config.dim[0], config.dim[1], 1)``
        whose ``k``-th entry holds the features of the ``k``-th row of ``df``.

    Note:
        Cropping and padding offsets come from ``np.random``, so the result
        varies between calls unless the global NumPy seed is fixed.
    """
    X = np.empty(shape=(df.shape[0], config.dim[0], config.dim[1], 1))
    input_length = config.audio_length

    # Use the position, not the index label that ``iterrows`` yields: a label
    # is only a valid array index when the frame has a default 0..n-1 index.
    for position, fname in enumerate(df["fname"]):
        file_path = os.path.join(data_dir, fname)
        data, _ = librosa.load(file_path, sr=config.sampling_rate, res_type="kaiser_fast")

        data = _fit_to_length(data, input_length)

        # ``y`` is passed by keyword because recent librosa releases no longer
        # accept the audio as a positional argument.
        data = librosa.feature.mfcc(y=data, sr=config.sampling_rate, n_mfcc=config.n_mfcc)
        X[position] = np.expand_dims(data, axis=-1)
    return X

In [0]:
# Featurize the training clips, then the test clips; each call loads every
# file listed in the frame, so a marker is printed after each one finishes.
X_train = prepare_data(df=train, config=config, data_dir='audio_train/')
print("Done")
X_test = prepare_data(df=sub, config=config, data_dir='audio_test/')
print("Done")

# One-hot targets built from the integer class ids.
y_train = to_categorical(train["label_idx"], num_classes=config.n_classes)

In [0]:
# Standardize every feature position using statistics from the training set
# only; the same transform is then applied to the test set.
# NOTE: this cell rebinds X_train / X_test, so running it twice standardizes
# already-standardized data -- re-run the featurization cell first.
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)

# A position that is constant across the training set has std == 0; dividing
# by it would inject NaN/inf into the inputs. Use 1 there so the centred
# value is left unscaled.
std = np.where(std == 0, 1.0, std)

X_train = (X_train - mean) / std
X_test = (X_test - mean) / std

In [0]:
# Directory that receives the model checkpoint written during training.
PREDICTION_FOLDER = "predictions_2d_conv"
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# gap of a separate os.path.exists() test.
os.makedirs(PREDICTION_FOLDER, exist_ok=True)

In [0]:
# Derive the checkpoint path from PREDICTION_FOLDER so the folder name lives
# in one place and cannot drift from the directory created for it.
file_path = os.path.join(PREDICTION_FOLDER, "model.hdf5")

# Save the model each time the validation loss reaches a new minimum.
check_point = ModelCheckpoint(
    file_path,
    monitor="val_loss",
    verbose=1,
    save_best_only=True,
    mode="min",
)

# Stop once the validation loss has not improved for 20 consecutive epochs.
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=20)

In [0]:
model = get_2d_conv_model(config)

# NOTE(review): per the Keras docs, validation_split holds out the *last* 10%
# of the arrays before shuffling -- confirm train.csv is not ordered by label.
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=100,
    validation_split=0.1,
    shuffle=True,
    verbose=1,
    callbacks=[check_point, early_stop],
)

# Replace the last-epoch model with the checkpoint saved at the lowest
# validation loss.
model = load_model(file_path)

In [0]:
predictions = model.predict(X_test, batch_size=1024, verbose=1)

# Sorting the negated scores puts the highest-scoring class first; keep the
# three best column indices per row and translate them to label names.
top_3 = np.array(LABELS)[np.argsort(-predictions, axis=1)[:, :3]]

# One space-separated string of three labels per test clip.
predicted_labels = [' '.join(row) for row in top_3]

sub['label'] = predicted_labels
sub.to_csv("output.csv", index=False)

In [0]:
# Read the submission back from disk to check what was actually written.
output = pd.read_csv("output.csv")

In [0]:
# Peek at the first ten rows of the written submission.
output.head(n=10)

In [0]:
# Trigger a browser download of the submission when running in Google Colab.
# The import is guarded so that Run All still completes in other Jupyter
# environments, where the google.colab package does not exist.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download of output.csv")
else:
    files.download('output.csv')