In [1]:
import os
import shutil
import pandas as pd
import numpy as np
import audio_tagging_utils as utils
import matplotlib.pyplot as plt
from glob import glob

In [2]:
# Importing Keras and other pre-processing libraries
import tensorflow as tf
from tensorflow.python.client import device_lib
import tensorflow.keras.backend as K
import gc

# This is needed to get if the gpu is detected, as we carried out runs on our local machines.
# print(device_lib.list_local_devices())

A CNN within our project is fed the spectrograms of the .wav files as input images. These images need to be generated first, which is done by calling a function defined in the utility file. This function fetches each .wav file present in the input directory and converts it to its corresponding spectrogram, saving it as a .jpg image.


In [3]:
# Expected number of spectrogram images per split; when the folder content does not match,
# the images for that split are (re)generated from the .wav files.
expected_image_counts = {'train': 9473, 'test': 1600}

train_path = os.path.join('images', 'train')
test_path = os.path.join('images', 'test')

# makedirs also creates the parent 'images' folder, which os.mkdir would not do
# (it raises FileNotFoundError on a fresh checkout).
for image_dir in (train_path, test_path):
    os.makedirs(image_dir, exist_ok=True)

for split, image_dir in (('train', train_path), ('test', test_path)):
    if len(glob(os.path.join(image_dir, '*'))) != expected_image_counts[split]:
        utils.create_images(split, split)

As the code aims to be highly modular, each model is defined as a function; this makes the notebook more organised and readable, as each model is encapsulated.

In [4]:
from tensorflow.keras import layers, models, regularizers, optimizers
from tensorflow.python.keras.engine import training
from tensorflow.python.framework.ops import Tensor
from tensorflow.keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization, LeakyReLU, Conv2D, MaxPooling2D, LSTM, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.models import Sequential, Model

# Number of target classes; used as the size of the softmax output layer of every model below.
number_of_classes = 41

def spectrogram_2d_conv_pool_cnn(model_input: Tensor) -> training.Model:
    """Build the 2D convolutional classifier applied to `model_input`.

    The network stacks three stages of (same-padded 3x3 conv -> unpadded 3x3 conv ->
    2x2 max-pooling -> dropout), then flattens into a 512-unit dense layer with dropout
    and a softmax layer with `number_of_classes` outputs.

    Args:
        model_input: Keras input tensor the layers are chained onto.

    Returns:
        An uncompiled Keras model named 'spectrogram_2d_conv_pool_cnn'.
    """
    # One entry per stage: (filters of 1st conv, filters of 2nd conv, dropout rate).
    stage_specs = ((32, 64, 0.25), (64, 64, 0.5), (128, 128, 0.5))

    features = model_input
    for first_filters, second_filters, drop_rate in stage_specs:
        features = Conv2D(first_filters, (3, 3), padding='same', activation='relu')(features)
        features = Conv2D(second_filters, (3, 3), activation='relu')(features)
        features = MaxPooling2D(pool_size=(2, 2))(features)
        features = Dropout(drop_rate)(features)

    features = Flatten()(features)
    features = Dense(512, activation='relu')(features)
    features = Dropout(0.5)(features)
    # The original author flagged this layer with "TO-FIX THIS"; the reason is not recorded.
    outputs = Dense(number_of_classes, activation='softmax')(features)

    return Model(model_input, outputs, name='spectrogram_2d_conv_pool_cnn')

def kaggle_2d_conv_pool_cnn(model_input: Tensor) -> training.Model:
    """Build the 2D convolutional classifier with batch normalisation.

    Four identical stages of (32-filter 4x10 same-padded conv -> batch norm -> ReLU ->
    default max-pooling) are followed by a 64-unit dense layer (batch norm + ReLU) and a
    softmax layer with `number_of_classes` outputs.

    Args:
        model_input: Keras input tensor the layers are chained onto.

    Returns:
        An uncompiled Keras model named 'kaggle_2d_conv_pool_cnn'.
    """
    number_of_conv_stages = 4

    features = model_input
    for _ in range(number_of_conv_stages):
        features = Conv2D(32, (4,10), padding="same")(features)
        features = BatchNormalization()(features)
        features = Activation("relu")(features)
        features = MaxPooling2D()(features)

    features = Flatten()(features)
    features = Dense(64)(features)
    features = BatchNormalization()(features)
    features = Activation("relu")(features)
    outputs = Dense(number_of_classes, activation='softmax')(features)

    return Model(model_input, outputs, name='kaggle_2d_conv_pool_cnn')

def kaggle_1d_conv_pool_cnn(model_input: Tensor) -> training.Model:
    """Build the 1D convolutional classifier applied to `model_input`.

    Three stages of (two unpadded ReLU convs -> max-pooling -> dropout) are followed by a
    256-filter conv pair with global max-pooling, two dense layers (64 and 1028 units) and a
    softmax layer with `number_of_classes` outputs.

    Args:
        model_input: Keras input tensor the layers are chained onto.

    Returns:
        An uncompiled Keras model named 'kaggle_1d_conv_pool_cnn'.
    """
    # One entry per pooled stage: (filters, kernel size, pool size).
    pooled_stages = ((16, 9, 16), (32, 3, 4), (32, 3, 4))

    features = model_input
    for filters, kernel_size, pool_size in pooled_stages:
        for _ in range(2):
            features = Conv1D(filters, kernel_size, activation='relu', padding="valid")(features)
        features = MaxPooling1D(pool_size)(features)
        features = Dropout(rate=0.1)(features)

    # Last convolutional stage collapses the time axis with a global max-pool.
    for _ in range(2):
        features = Conv1D(256, 3, activation='relu', padding="valid")(features)
    features = GlobalMaxPooling1D()(features)
    features = Dropout(rate=0.2)(features)

    for units in (64, 1028):
        features = Dense(units, activation='relu')(features)
    outputs = Dense(number_of_classes, activation='softmax')(features)

    return Model(model_input, outputs, name='kaggle_1d_conv_pool_cnn')


def cnn_lstm(model_input: Tensor) -> training.Model:
    """Build a recurrent classifier: LSTM(512) -> Dense(64) -> softmax.

    Args:
        model_input: Keras input tensor the layers are chained onto.

    Returns:
        An uncompiled Keras model named 'lstm' with `number_of_classes` outputs.
    """
    recurrent_features = LSTM(512, activation='relu')(model_input)
    hidden = Dense(64, activation='relu')(recurrent_features)
    outputs = Dense(number_of_classes, activation='softmax')(hidden)

    return Model(model_input, outputs, name='lstm')

Now it is time to set up our training and evaluation pipeline for the model that uses the .jpg spectrograms. The pipeline implements K-fold validation during training and evaluates each model on the test set; it also generates test predictions using the best model obtained, as the best weights are saved during training.

In [5]:
from tensorflow.keras.layers import Input

number_of_splits = 5

# Shape of the input to the CNN: it matches the target_size=(64, 64) passed to
# flow_from_dataframe in the next cell, with 3 channels.
model_input = Input(shape=(64, 64, 3))

# NOTE: this rebinds the builder function's name to the built model, so the cell defining the
# builders must be re-run before this cell can be executed a second time.
spectrogram_2d_conv_pool_cnn = spectrogram_2d_conv_pool_cnn(model_input)
spectrogram_2d_conv_pool_cnn.compile(optimizers.Adam(0.001),loss="categorical_crossentropy",metrics=['accuracy'])
spectrogram_2d_conv_pool_cnn.summary()

# We generate a folder for each model; this keeps the runs clearly separated.
# makedirs also creates the parent 'runs' folder, which os.mkdir would not do.
run_dir = os.path.join('runs', spectrogram_2d_conv_pool_cnn.name)
os.makedirs(run_dir, exist_ok=True)

# Drop the TensorBoard logs of any previous run of this model.
logs_dir = os.path.join(run_dir, 'logs')
if os.path.exists(logs_dir):
    shutil.rmtree(logs_dir)

traindf_dir = os.path.join('meta', 'train.csv')
testdf_dir = os.path.join('meta', 'test.csv')
traindf = pd.read_csv(traindf_dir)
testdf = pd.read_csv(testdf_dir)

# The datasets list .wav file names; they are rewritten so that the .jpg files are looked up instead.
traindf["fname"] = traindf["fname"].apply(utils.append_ext)
testdf["fname"] = testdf["fname"].apply(utils.append_ext)

Model: "spectrogram_2d_conv_pool_cnn"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 64, 64, 3)]       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 64, 64, 32)        896       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 62, 62, 64)        18496     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 31, 31, 64)        0         
_________________________________________________________________
dropout (Dropout)            (None, 31, 31, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 31, 31, 64)        36928     
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 29

In [6]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from sklearn.model_selection import KFold
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Training spectrogram_2d_conv_pool_cnn with K-fold validation

datagen = ImageDataGenerator(rescale=1./255.)

# The class indices used to encode the labels need to be static (if not, the ensemble of
# predictions will not be correct). They are fixed up front from the full training set and
# passed explicitly to every labelled generator, instead of being read back from whichever
# generator happened to be built last.
class_names = sorted(traindf['label'].unique())
class_indices = {label: index for index, label in enumerate(class_names)}


def flow_images(dataframe, split, shuffle, labelled=True):
    """Create a batch generator over the spectrogram images listed in `dataframe`.

    Args:
        dataframe: frame with an 'fname' column (and a 'label' column when `labelled`).
        split: sub-folder of 'images' that holds the files ('train' or 'test').
        shuffle: whether the generator shuffles the samples.
        labelled: when True one-hot labels encoded with `class_names` are yielded,
            when False only the images are yielded (prediction mode).

    Returns:
        The iterator returned by `datagen.flow_from_dataframe`.
    """
    return datagen.flow_from_dataframe(
        dataframe=dataframe,
        directory=os.path.join('images', split),
        x_col="fname",
        y_col="label" if labelled else None,
        classes=class_names if labelled else None,
        batch_size=32,
        seed=42,
        shuffle=shuffle,
        class_mode="categorical" if labelled else None,
        target_size=(64,64))


# Using random_state = 0 for repeatability
kfold_validation = KFold(n_splits= number_of_splits, shuffle=True, random_state=0)

# Snapshot of the weights the model has when this cell starts. Every fold restarts from it;
# otherwise a fold would validate on samples the model was already trained on in earlier folds.
# Only the weights are reset: the optimizer state carries over between folds.
initial_weights = spectrogram_2d_conv_pool_cnn.get_weights()

# This structure is based on the one followed by the Kaggle notebook: https://www.kaggle.com/fizzbuzz/beginner-s-guide-to-audio-data
print(f'+-----------Training {spectrogram_2d_conv_pool_cnn.name} Model-----------+')
for i, (train_split_indexes, test_split_indexes) in enumerate(kfold_validation.split(traindf)):
    spectrogram_2d_conv_pool_cnn.set_weights(initial_weights)

    train_fold = traindf.iloc[train_split_indexes]
    val_fold = traindf.iloc[test_split_indexes]

    best_weights_file = os.path.join(run_dir, f'best_{i}.h5')
    checkpoint = ModelCheckpoint(best_weights_file, monitor='val_loss', verbose=1, save_best_only=True)
    early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
    tb = TensorBoard(log_dir= os.path.join(run_dir, 'logs', f'fold_{i}'), write_graph=True)

    callbacks_list = [checkpoint, early, tb]

    train_generator = flow_images(train_fold, 'train', shuffle=True)
    valid_generator = flow_images(val_fold, 'train', shuffle=True)

    # Sanity check: the generator must use exactly the static encoding defined above.
    assert train_generator.class_indices == class_indices, 'unexpected label encoding'

    STEP_SIZE_TRAIN = train_generator.n//train_generator.batch_size
    STEP_SIZE_VALID = valid_generator.n//valid_generator.batch_size

    history = spectrogram_2d_conv_pool_cnn.fit(train_generator,
                    callbacks=callbacks_list,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=valid_generator,
                    validation_steps=STEP_SIZE_VALID,
                    epochs=1)

    # Continue with the best weights saved by the checkpoint callback for this fold.
    spectrogram_2d_conv_pool_cnn.load_weights(best_weights_file)

    eval_generator = flow_images(testdf, 'test', shuffle=False)

    # Round up so that a trailing partial batch is evaluated too.
    STEP_SIZE_EVAL = int(np.ceil(eval_generator.n / eval_generator.batch_size))

    # It is important to reset the generator before evaluation
    eval_generator.reset()

    spectrogram_2d_conv_pool_cnn.evaluate(eval_generator, steps=STEP_SIZE_EVAL, verbose= 1)

    test_generator = flow_images(testdf, 'test', shuffle=False, labelled=False)

    # Round up: with floor division the last samples would get no prediction and the saved
    # array would no longer line up with the rows of the test dataframe.
    STEP_SIZE_TEST = int(np.ceil(test_generator.n / test_generator.batch_size))

    test_generator.reset()

    pred = spectrogram_2d_conv_pool_cnn.predict(test_generator, steps=STEP_SIZE_TEST, verbose= 1)

    np.save(os.path.join(run_dir, f'test_predictions_{i}.npy'), pred)

    # Plot the metrics recorded while training this fold.
    pd.DataFrame(history.history).plot()

+-----------Training spectrogram_2d_conv_pool_cnn Model-----------+
Found 7578 validated image filenames belonging to 41 classes.
Found 1895 validated image filenames belonging to 41 classes.
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train for 236 steps, validate for 59 steps


UnknownError:  Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[node spectrogram_2d_conv_pool_cnn/conv2d/Conv2D (defined at <ipython-input-6-c910c5733d00>:58) ]] [Op:__inference_distributed_function_1578]

Function call stack:
distributed_function


In [None]:
from sklearn import metrics

# Ensembling of the per-fold test predictions of the spectrogram model
run_dir = os.path.join('runs', spectrogram_2d_conv_pool_cnn.name)
pred_list = [
    np.load(os.path.join(run_dir, f'test_predictions_{i}.npy'))
    for i in range(number_of_splits)
]

# Geometric mean of the fold predictions, computed in log space (float64) so that the product
# of many small probabilities cannot underflow. log(0) = -inf maps back to exactly 0.
with np.errstate(divide='ignore'):
    mean_log_prediction = np.log(np.stack(pred_list).astype(np.float64)).mean(axis=0)
prediction = np.exp(mean_log_prediction)

# Translating the predicted class indices back to their string labels
predicted_class_indices = np.argmax(prediction,axis=-1)

labels = dict((v,k) for k,v in class_indices.items())
predicted_labels = [labels[k] for k in predicted_class_indices]

test = pd.read_csv(testdf_dir)
assert len(predicted_labels) == len(test), 'predictions do not cover the whole test set'

# Saving predictions: the file holds the PREDICTED labels, not the ground truth of the test csv.
predictions_df = pd.DataFrame({'fname': test['fname'], 'label': predicted_labels})
predictions_df.to_csv(os.path.join(run_dir, f'{spectrogram_2d_conv_pool_cnn.name}_predictions.csv'), index=False)

y_true = test['label']
y_pred = predicted_labels

print(f'+-----------Printing {spectrogram_2d_conv_pool_cnn.name} predictions evaluations-----------+')
print(metrics.classification_report(y_true, y_pred, digits=3))

Now we will train two further models that use, respectively, the MFCCs and the raw waveform obtained from the .wav files.

In [None]:
max_len = 2
n_mfcc = 40

# Raw .wav and MFCC inputs are obtained using different sample rates; we followed the values
# used in the Kaggle notebook previously cited.
kaggle_2d_sr = 44100
kaggle_1d_sr = 16000

# Seeds of the K-fold split of each model.
kaggle_2d_seed = 2
kaggle_1d_seed = 1

# NOTE: the two assignments below rebind the builder functions' names to the built models, so
# the cell defining the builders must be re-run before this cell can be executed a second time.
model_input = Input(shape=utils.mfcc_input_sizes(n_mfcc, kaggle_2d_sr, max_len))
kaggle_2d_conv_pool_cnn = kaggle_2d_conv_pool_cnn(model_input)
kaggle_2d_conv_pool_cnn.compile(optimizers.Adam(0.001),loss="categorical_crossentropy",metrics=['accuracy'])

model_input = Input(shape=utils.wav_input_sizes(kaggle_1d_sr, max_len))
kaggle_1d_conv_pool_cnn = kaggle_1d_conv_pool_cnn(model_input)
kaggle_1d_conv_pool_cnn.compile(optimizers.Adam(0.0001),loss="categorical_crossentropy",metrics=['accuracy'])

kaggle_1d_run_dir = os.path.join('runs', kaggle_1d_conv_pool_cnn.name)
kaggle_2d_run_dir = os.path.join('runs', kaggle_2d_conv_pool_cnn.name)

# One folder per model. makedirs also creates the parent 'runs' folder (os.mkdir would raise
# FileNotFoundError if it were missing); stale TensorBoard logs of earlier runs are removed.
for model_run_dir in (kaggle_1d_run_dir, kaggle_2d_run_dir):
    os.makedirs(model_run_dir, exist_ok=True)
    model_logs_dir = os.path.join(model_run_dir, 'logs')
    if os.path.exists(model_logs_dir):
        shutil.rmtree(model_logs_dir)

# Feeding the models into a list makes the code more modular.
models_to_train = [kaggle_2d_conv_pool_cnn, kaggle_1d_conv_pool_cnn]

# Re-read the dataframes, as .jpg was appended to 'fname'
traindf = pd.read_csv(traindf_dir)
testdf = pd.read_csv(testdf_dir)

In [None]:
from tensorflow.keras.utils import to_categorical

# Static label -> index encoding shared by every model and fold, built from the full training
# set so this cell does not depend on the spectrogram training cell having completed.
# Sorted unique labels is the ordering Keras' flow_from_dataframe documents for inferred
# classes, so the encoding is expected to match the spectrogram model's.
class_indices = {label: index for index, label in enumerate(sorted(traindf['label'].unique()))}

# Seed of the K-fold split of each model (fixed for repeatability).
kfold_seeds = {
    'kaggle_2d_conv_pool_cnn': kaggle_2d_seed,
    'kaggle_1d_conv_pool_cnn': kaggle_1d_seed,
}


def _build_arrays(model_name, dataframe, split):
    """Return the (inputs, labels) pair for `dataframe`: MFCCs for the 2D model, raw audio for the 1D model."""
    if model_name == 'kaggle_2d_conv_pool_cnn':
        return utils.create_mfcc_array(dataframe, split, sr= kaggle_2d_sr, max_len= max_len, n_mfcc= n_mfcc)
    if model_name == 'kaggle_1d_conv_pool_cnn':
        return utils.create_wav_array(dataframe, split, sr= kaggle_1d_sr, max_len= max_len)
    raise ValueError(f'Unsupported model: {model_name}')


def _model_input_shape(model_name):
    """Return the per-sample input shape (without the batch axis) of the given model."""
    if model_name == 'kaggle_2d_conv_pool_cnn':
        return tuple(utils.mfcc_input_sizes(n_mfcc, kaggle_2d_sr, max_len))
    if model_name == 'kaggle_1d_conv_pool_cnn':
        return tuple(utils.wav_input_sizes(kaggle_1d_sr, max_len))
    raise ValueError(f'Unsupported model: {model_name}')


def _one_hot(string_labels):
    """One-hot encode string labels with the static `class_indices` mapping."""
    encoded = [class_indices[label] for label in string_labels]
    # num_classes is fixed so the width is right even when a split misses the last class.
    return to_categorical(encoded, num_classes=len(class_indices))


for model in models_to_train:
    print(f'+-----------Training {model.name} Model-----------+')

    kfold_validation = KFold(n_splits= number_of_splits, shuffle=True, random_state=kfold_seeds[model.name])

    # Although this makes the code look ugly, it allows for better testability and repeatability
    run_dir = os.path.join('runs', model.name)
    os.makedirs(run_dir, exist_ok=True)
    model_test_inputandlabels_file = os.path.join(run_dir, 'test_input_labels.npz')

    # The test arrays are generated once and cached on disk; they are always read back from the
    # cache so that fresh and cached runs work on identical numpy arrays.
    if not os.path.exists(model_test_inputandlabels_file):
        X_test, y_test = _build_arrays(model.name, testdf, 'test')
        np.savez(model_test_inputandlabels_file, x_test=X_test, y_test=y_test)
    arr = np.load(model_test_inputandlabels_file)
    X_test, y_test = arr['x_test'], arr['y_test']

    input_shape = _model_input_shape(model.name)
    X_test = X_test.reshape((X_test.shape[0],) + input_shape)
    y_test_hot = _one_hot(y_test)

    # Snapshot of the weights the model has before its first fold. Every fold restarts from it;
    # otherwise a fold would validate on samples already used for training in earlier folds.
    # Only the weights are reset: the optimizer state carries over between folds.
    initial_weights = model.get_weights()

    for i, (train_split_indexes, test_split_indexes) in enumerate(kfold_validation.split(traindf)):
        model.set_weights(initial_weights)

        train_fold = traindf.iloc[train_split_indexes]
        val_fold = traindf.iloc[test_split_indexes]

        model_fold_inputandlabels_file = os.path.join(run_dir, f'fold{i}_input_labels.npz')

        # As a seed is used, splits should be consistent, allowing the data to be stored and
        # reused, which avoids the lengthy generation of the needed arrays.
        if not os.path.exists(model_fold_inputandlabels_file):
            # The models use different inputs, so the array generation depends on the model.
            X_train, y_train = _build_arrays(model.name, train_fold, 'train')
            X_val, y_val = _build_arrays(model.name, val_fold, 'train')
            np.savez(model_fold_inputandlabels_file, x_train=X_train, y_train=y_train, x_val=X_val, y_val=y_val)
        arr = np.load(model_fold_inputandlabels_file)
        X_train, y_train = arr['x_train'], arr['y_train']
        X_val, y_val = arr['x_val'], arr['y_val']

        best_weights_file = os.path.join(run_dir, f'best_{i}.h5')
        checkpoint = ModelCheckpoint(best_weights_file, monitor='val_loss', verbose=1, save_best_only=True)
        early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
        tb = TensorBoard(log_dir= os.path.join(run_dir, 'logs', f'fold_{i}'), write_graph=True)

        callbacks_list = [checkpoint, early, tb]

        X_train = X_train.reshape((X_train.shape[0],) + input_shape)
        X_val = X_val.reshape((X_val.shape[0],) + input_shape)

        # Very important data normalization. The statistics come from the training fold only and
        # are applied unchanged to the validation and test data, so that all three are scaled the
        # same way the model was trained on.
        mean = np.mean(X_train, axis=0)
        std = np.std(X_train, axis=0)
        std[std == 0] = 1  # constant features: avoid a division by zero (they become 0)

        X_train_norm = (X_train - mean)/std
        X_val_norm = (X_val - mean)/std
        X_test_norm = (X_test - mean)/std

        # Outputs are generated using consistent indices
        y_train_hot = _one_hot(y_train)
        y_val_hot = _one_hot(y_val)

        history = model.fit(X_train_norm, y_train_hot,
                    callbacks=callbacks_list,
                    validation_data=(X_val_norm, y_val_hot),
                    epochs=1)

        # Continue with the best weights saved by the checkpoint callback for this fold.
        model.load_weights(best_weights_file)

        model.evaluate(X_test_norm, y_test_hot)

        pred = model.predict(X_test_norm, verbose= 1)

        np.save(os.path.join(run_dir, f'test_predictions_{i}.npy'), pred)

        # Plot the metrics recorded while training this fold.
        pd.DataFrame(history.history).plot()

In [None]:
# Showing the obtained metrics for each model

for model in models_to_train:
    # Each model has its own run folder; without rebinding run_dir here every iteration would
    # read the predictions of whichever model was trained last.
    run_dir = os.path.join('runs', model.name)

    pred_list = [
        np.load(os.path.join(run_dir, f'test_predictions_{i}.npy'))
        for i in range(number_of_splits)
    ]

    # Geometric mean of the fold predictions, computed in log space (float64) so that the
    # product of many small probabilities cannot underflow. log(0) = -inf maps back to 0.
    with np.errstate(divide='ignore'):
        mean_log_prediction = np.log(np.stack(pred_list).astype(np.float64)).mean(axis=0)
    prediction = np.exp(mean_log_prediction)

    predicted_class_indices = np.argmax(prediction,axis=-1)

    labels = dict((v,k) for k,v in class_indices.items())
    predicted_labels = [labels[k] for k in predicted_class_indices]

    test = pd.read_csv(testdf_dir)
    assert len(predicted_labels) == len(test), 'predictions do not cover the whole test set'

    # Make a submission file holding the PREDICTED labels (not the ground truth of the test csv).
    predictions_df = pd.DataFrame({'fname': test['fname'], 'label': predicted_labels})
    predictions_df.to_csv(os.path.join(run_dir, f'{model.name}_predictions.csv'), index=False)

    y_true = test['label']
    y_pred = predicted_labels

    print(f'+-----------Printing {model.name} predictions evaluation-----------+')
    print(metrics.classification_report(y_true, y_pred, digits=3))

Time to ensemble our predictions.

In [None]:
# Ensembling all the models using geometric mean averaging.

# Names are passed as plain strings: wrapping them in braces outside an f-string builds a set,
# which os.path.join rejects with a TypeError.
ensembled_model_names = [spectrogram_2d_conv_pool_cnn.name] + [model.name for model in models_to_train]

pred_list = [
    np.load(os.path.join('runs', model_name, f'test_predictions_{i}.npy'))
    for model_name in ensembled_model_names
    for i in range(number_of_splits)
]

# Geometric mean computed in log space (float64): the plain product of this many float32
# probabilities can underflow to 0 for every class. log(0) = -inf maps back to exactly 0.
with np.errstate(divide='ignore'):
    mean_log_prediction = np.log(np.stack(pred_list).astype(np.float64)).mean(axis=0)
prediction = np.exp(mean_log_prediction)

# Generate predictions; the static class_indices mapping is used (as in the per-model cells)
# rather than whatever generator is left over from the spectrogram training loop.
predicted_class_indices = np.argmax(prediction,axis=-1)
labels = dict((v,k) for k,v in class_indices.items())
predicted_labels = [labels[k] for k in predicted_class_indices]

test = pd.read_csv(testdf_dir)
assert len(predicted_labels) == len(test), 'predictions do not cover the whole test set'

# The file holds the PREDICTED labels, not the ground truth of the test csv.
predictions_df = pd.DataFrame({'fname': test['fname'], 'label': predicted_labels})
predictions_df.to_csv('ensembled_predictions.csv', index=False)

y_true = test['label']
y_pred = predicted_labels

print(metrics.classification_report(y_true, y_pred, digits=3))