In [1]:
!python --version

Python 3.7.3


In [6]:
import keras
import tensorflow as tf
print(f'keras version is {keras.__version__}')
print(f'tensorflow version is {tf.__version__}')

keras version is 2.2.4
tensorflow version is 1.14.0


# Summary of the modle
1. Keras and tensorflow version listed above
2. Used transfer-learning of VGG19, with all weights locked
3. Applied onset method to crop the spectrogram data from the first peak
4. Tried normalization methods, which did not work very well compared to raw numbers. So ended up with raw spectrogram numbers
5. Also tried CNN plus LSTM, which performed poorlly.
6. Working environment is a VM on remote server. 8G RAM, no GPU. "Memory error" is a real concern, which is the reason to train 10 epoches a time and only harvest 6 second data with 50 dimensions 

In [1]:
import pandas as pd
import librosa
import numpy as np

In [2]:
desc = pd.read_csv('./data_description.csv')


In [3]:
train = desc.query('dataset=="train"')
test=desc.query('dataset=="test"')

In [4]:
def load_wav_file(name, path):
    wav, sr = librosa.load(path + name)
    return wav, sr

In [5]:
def zero_pad(arr, length):
    """Cuts arrays under length or and pads arrays over length with zeros"""
    l = len(arr)
    #print(l)
    #print(length)
    if l > length:
        result = arr[0:length]
    else:
        zero_pad = np.zeros(length - l)
        result = np.concatenate([arr, zero_pad])
    return result

In [6]:
wav_path="C:/Users/zhouqi/Desktop/audio/train/"
files = list(train['fname'])

In [7]:
def zero_pad_2d(arr, length):
    """Cuts 2d arrays under length or and pads arrays over length with zeros """
    l = arr.shape[1]
    if l > length:
        result = arr[:,0:length]
    else:
        zero_pad = np.zeros((arr.shape[0], length - arr.shape[1]))
        result = np.hstack((arr, zero_pad))
        
    return result

In [8]:
def get_features(files, wav_path, pad = 132300, feat_pad=50):
    """Librosa feature extractor:
       extracts raw, spectrograms, mel spectrograms and mfccs from wav files, pads
       and converts to np arrays"""
    spectrograms = []
    for file in files:
        wav, sr = load_wav_file(file, wav_path)
        o_env = librosa.onset.onset_strength(wav, sr=sr)
        times = librosa.frames_to_time(np.arange(len(o_env)), sr=sr)
        onset_frames = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr)
        start=int(times[onset_frames][0]*22050)
        wav=wav[start:]
        spectrogram = np.abs(librosa.stft(wav))
        padded_spectrogram = zero_pad_2d(spectrogram, feat_pad)
        spectrograms.append(padded_spectrogram)
    spectrograms = np.array(spectrograms)
    return spectrograms

In [9]:
one_hot = {'normal': 0, 'abnormal': 1}
y_data = np.array(train['outcome'].map(one_hot))
spec = get_features(files, wav_path)

In [38]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
x_train, x_test, y_train, y_test, train_filenames, test_filenames = train_test_split(spec, y_data, train['fname'].values, random_state=12, shuffle=True,test_size=0.25)

In [39]:
x_train_cnn = np.expand_dims(x_train, -1)
x_test_cnn = np.expand_dims(x_test, -1)
x_train_cnn_t = np.block([x_train_cnn,x_train_cnn,x_train_cnn])
x_test_cnn_t = np.block([x_test_cnn,x_test_cnn,x_test_cnn])

In [40]:
x_train_cnn.shape

(238, 1025, 50, 1)

In [2]:
import warnings                        # To ignore any warnings
warnings.filterwarnings("ignore")
from keras.models import Sequential
from keras import Input
from keras.layers import GlobalAvgPool1D, Dropout,Reshape, BatchNormalization, LSTM, Flatten
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping,ReduceLROnPlateau,ModelCheckpoint,TensorBoard,ProgbarLogger
from keras.utils import np_utils
from sklearn import metrics 
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import itertools
from keras.layers import Dense,GlobalAveragePooling2D
from keras.models import Model


Using TensorFlow backend.


In [14]:
from keras.applications.vgg19 import vgg19, preprocess_input
from keras import applications

In [15]:
model_v = applications.VGG19(weights = "imagenet", include_top=False, input_shape = (1025, 50, 3))




In [16]:
for layer in model_v.layers:
    layer.trainable = False

In [17]:
x = model_v.output
x = Flatten()(x)
x = Dense(1024, activation="relu")(x)
x = Dropout(0.25)(x)
x = Dense(512, activation="relu")(x)
predictions = Dense(1, activation="sigmoid")(x)

In [18]:
model_final = Model(input = model_v.input, output = predictions)

In [19]:
model_final.load_weights("./best_model_trained_vgg_onset_cv1.hdf5")

In [22]:
from sklearn import metrics
import tensorflow as tf
from keras import backend as K 

def auc(y_true, y_pred):
    auc = tf.metrics.auc(y_true, y_pred)[1]
    K.get_session().run(tf.local_variables_initializer())
    return auc

In [23]:
model_final.compile(optimizer='Adam',loss='binary_crossentropy',metrics=['acc',auc])

In [41]:
%%time
# saved model checkpoint file
best_model_file="./best_model_trained.hdf5"
#train_model_file=file_path+"/checkpoints/weights.best_{epoch:02d}-{loss:.2f}.hdf5"
MAX_PATIENT=18
MAX_EPOCHS=10
MAX_BATCH=32

# callbacks
# removed EarlyStopping(patience=MAX_PATIENT)
callback=[ReduceLROnPlateau(patience=MAX_PATIENT, verbose=1),
          ModelCheckpoint(filepath=best_model_file, monitor='loss', verbose=1, save_best_only=True)]

print ("training started..... please wait.")
# training

hist = model_final.fit(x_train_cnn_t, y_train, epochs=MAX_EPOCHS, batch_size=MAX_BATCH,
                   validation_data=(x_test_cnn_t, y_test),
                   verbose=2, callbacks=callback)
print ("training finised!")

training started..... please wait.
Train on 238 samples, validate on 80 samples
Epoch 1/10
 - 282s - loss: 0.0716 - acc: 0.9706 - auc: 0.9805 - val_loss: 0.0933 - val_acc: 0.9875 - val_auc: 0.9815

Epoch 00001: loss improved from inf to 0.07159, saving model to ./best_model_trained.hdf5
Epoch 2/10
 - 284s - loss: 0.0967 - acc: 0.9664 - auc: 0.9822 - val_loss: 0.1223 - val_acc: 0.9375 - val_auc: 0.9827

Epoch 00002: loss did not improve from 0.07159
Epoch 3/10
 - 283s - loss: 0.1055 - acc: 0.9622 - auc: 0.9829 - val_loss: 0.1315 - val_acc: 0.9375 - val_auc: 0.9835

Epoch 00003: loss did not improve from 0.07159
Epoch 4/10
 - 285s - loss: 0.0885 - acc: 0.9664 - auc: 0.9839 - val_loss: 0.1259 - val_acc: 0.9250 - val_auc: 0.9844

Epoch 00004: loss did not improve from 0.07159
Epoch 5/10
 - 284s - loss: 0.0602 - acc: 0.9748 - auc: 0.9850 - val_loss: 0.1430 - val_acc: 0.9375 - val_auc: 0.9854

Epoch 00005: loss improved from 0.07159 to 0.06023, saving model to ./best_model_trained.hdf5
Epoch

In [25]:
wav_path_test="C:/Users/zhouqi/Desktop/audio/test/"

In [26]:
test_files = list(test['fname'])
x_data_test = get_features(test_files, wav_path_test)

In [42]:
x_data_test_ = np.expand_dims(x_data_test, -1)
x_data_test_t = np.block([x_data_test_,x_data_test_,x_data_test_])

In [43]:
y_pred = model_final.predict(x_data_test_t)
y_pred = y_pred.flatten().tolist()

In [44]:
pred_output = pd.DataFrame(list(zip(test_files, y_pred)), 
               columns =['fname', 'outcome']) 

In [45]:
pred_output.to_csv('submit_pred_vgg_onset_cv3.csv', index=False)

In [32]:
vgg_onset=pd.read_csv('submit_pred_vgg10_onset.csv')
vgg_onset

Unnamed: 0,fname,outcome
0,HB_unlabelled_1.wav,0.159029
1,HB_unlabelled_2.wav,0.091783
2,HB_unlabelled_3.wav,0.157699
3,HB_unlabelled_4.wav,0.135544
4,HB_unlabelled_5.wav,0.040380
...,...,...
134,HB_unlabelled_135.wav,0.345753
135,HB_unlabelled_136.wav,0.310372
136,HB_unlabelled_137.wav,0.472512
137,HB_unlabelled_138.wav,0.305780


In [34]:
pd.set_option("display.max_rows", 999)

In [46]:
vgg_onset.merge(pred_output,left_index=True,right_index=True)


Unnamed: 0,fname_x,outcome_x,fname_y,outcome_y
0,HB_unlabelled_1.wav,0.159029,HB_unlabelled_1.wav,0.9939626
1,HB_unlabelled_2.wav,0.091783,HB_unlabelled_2.wav,0.001941949
2,HB_unlabelled_3.wav,0.157699,HB_unlabelled_3.wav,0.0187473
3,HB_unlabelled_4.wav,0.135544,HB_unlabelled_4.wav,0.004885316
4,HB_unlabelled_5.wav,0.04038,HB_unlabelled_5.wav,7.930398e-05
5,HB_unlabelled_6.wav,0.201888,HB_unlabelled_6.wav,0.6566176
6,HB_unlabelled_7.wav,0.710836,HB_unlabelled_7.wav,0.8978719
7,HB_unlabelled_8.wav,0.135265,HB_unlabelled_8.wav,0.8905975
8,HB_unlabelled_9.wav,0.111498,HB_unlabelled_9.wav,0.8876061
9,HB_unlabelled_10.wav,0.002578,HB_unlabelled_10.wav,0.0


In [47]:
pred_output.query('outcome > 0.5').outcome.count()

47

In [35]:
vgg_onset.query('outcome > 0.5').outcome.count()

19