In [None]:
#!pip install h5py
!pip install keras==2.1.2

import os
import numpy as np
from scipy.fftpack import fft
from scipy.io import wavfile
from scipy import signal
from glob import glob
import re
import pandas as pd
import gc
from scipy.io import wavfile
from sklearn.datasets import load_files

from keras import optimizers, losses, activations, models
from keras.layers import Convolution2D, Dense, Input, Flatten, Dropout, MaxPooling2D, BatchNormalization
from sklearn.model_selection import train_test_split
import keras

In [None]:
# Print the compute devices TensorFlow reports for this machine
# (e.g. to confirm whether a GPU is visible before training).
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

#import tensorflow as tf
#sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

In [None]:
# Number of samples per clip: the notebook works with 1-second clips at 16 kHz.
L = 16000

# The ten command words to predict.
legal_labels = 'yes no up down left right on off stop go'.split()

# Dataset roots; each is expected to contain one sub-directory per label.
train_data_path = '../data/train/'
valid_data_path = '../data/valid/'
test_data_path = '../data/test/'

# Single-argument print(...) calls produce the same output on Python 2 and 3.
print(train_data_path)
print(valid_data_path)
print(test_data_path)
print(legal_labels)
print(len(legal_labels))

In [None]:
#helpful functions for audio data
def custom_fft(y, fs):
    """Return the one-sided amplitude spectrum of a 1-D signal.

    Parameters
    ----------
    y : array-like with a ``shape`` attribute
        The signal; its first dimension gives the number of points N.
    fs : number
        Sampling rate of ``y``.

    Returns
    -------
    (xf, vals)
        ``xf`` holds N//2 evenly spaced frequencies from 0 to fs/2 and
        ``vals`` the magnitudes of the first N//2 FFT bins scaled by 2/N.
    """
    sample_count = y.shape[0]
    half = sample_count // 2
    spacing = 1.0 / fs
    spectrum = fft(y)
    # The FFT of a real signal is symmetric, so only the first half is kept;
    # np.abs turns the complex bins into magnitudes.
    magnitudes = 2.0 / sample_count * np.abs(spectrum[0:half])
    frequencies = np.linspace(0.0, 1.0 / (2.0 * spacing), half)
    return frequencies, magnitudes

def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    """Compute a log-scaled spectrogram of ``audio``.

    Parameters
    ----------
    audio : 1-D array of samples
    sample_rate : sampling rate used to convert milliseconds to samples
    window_size : window length in milliseconds
    step_size : value in milliseconds that is converted to samples and
        passed to ``signal.spectrogram`` as ``noverlap``
    eps : small constant added before the logarithm to avoid log(0)

    Returns
    -------
    (freqs, times, log_spec)
        ``log_spec`` is the float32 spectrogram transposed so that time is
        the first axis, with the natural logarithm applied.
    """
    samples_per_ms = sample_rate / 1e3
    window_len = int(round(window_size * samples_per_ms))
    overlap_len = int(round(step_size * samples_per_ms))
    freqs, times, spec = signal.spectrogram(
        audio,
        fs=sample_rate,
        window='hann',
        nperseg=window_len,
        noverlap=overlap_len,
        detrend=False,
    )
    time_major = spec.T.astype(np.float32)
    return freqs, times, np.log(time_major + eps)

#zero-pad audio shorter than 1 second so that it is exactly 1 second long
def pad_audio(samples, length=None):
    """Left-pad ``samples`` with zeros so it holds at least ``length`` items.

    Parameters
    ----------
    samples : 1-D array of audio samples
    length : int, optional
        Target number of samples. When omitted, the module-level ``L``
        is used, which is the behaviour existing callers rely on.

    Returns
    -------
    ``samples`` unchanged when it already has ``length`` or more items,
    otherwise a new array with zeros inserted in front of the data.
    """
    if length is None:
        length = L
    missing = length - len(samples)
    if missing <= 0:
        return samples
    # The zeros go before the signal (pad_width=(missing, 0)), not after it.
    return np.pad(samples, pad_width=(missing, 0), mode='constant', constant_values=(0, 0))

#randomly sample 1-second windows from audio that is longer than 1 second
def chop_audio(samples, L=16000, num=20):
    """Yield ``num`` randomly positioned windows of ``L`` samples each.

    Parameters
    ----------
    samples : 1-D array of audio samples, at least ``L`` items long
    L : int
        Window length in samples.
    num : int
        Number of windows to generate.

    Yields
    ------
    Slices ``samples[beg: beg + L]`` with ``beg`` drawn uniformly from every
    valid start position.

    Raises
    ------
    ValueError
        If ``samples`` is shorter than ``L`` (raised when iteration starts).
    """
    last_start = len(samples) - L
    if last_start < 0:
        raise ValueError(
            "audio has %d samples, fewer than the window length %d"
            % (len(samples), L))
    for _ in range(num):
        # np.random.randint excludes its upper bound, so add 1 to make the
        # final window reachable and to accept input of exactly L samples.
        beg = np.random.randint(0, last_start + 1)
        yield samples[beg: beg + L]

#transform label
def label_transform(labels):
    """One-hot encode directory labels after collapsing them to target classes.

    '_background_noise_' becomes 'silence', any label outside the
    module-level ``legal_labels`` becomes 'unknown', and every other label is
    kept as is. The result is the ``pd.get_dummies`` frame of the mapped
    labels, one row per input label.
    """
    def _to_class(label):
        if label == '_background_noise_':
            return 'silence'
        return label if label in legal_labels else 'unknown'

    mapped = [_to_class(label) for label in labels]
    return pd.get_dummies(pd.Series(mapped))
  
# define function to load train, test, and validation datasets
def load_dataset(path):
    """Return the paths of all files found under ``path`` as a numpy array.

    ``path`` is scanned with ``sklearn.datasets.load_files``. Only the
    'filenames' entry of the result is used, so ``load_content=False`` is
    passed to avoid reading every file's bytes into memory.
    """
    data = load_files(path, load_content=False)
    return np.array(data['filenames'])

Transform the training audio files into log-spectrogram features and one-hot labels

In [None]:
import time
start = time.time()

new_sample_rate = 16000
# Only the file paths are needed; load_content=False stops load_files from
# reading the raw bytes of every file into memory.
data = load_files(train_data_path, load_content=False)
fnames = data['filenames']
y_train = []
x_train = []

for fname in fnames:
    # Skip anything that is not a WAV recording.
    if not fname.endswith('wav'):
        continue

    sample_rate, samples = wavfile.read(fname)
    # Clips shorter than 1 second are zero-padded to full length.
    samples = pad_audio(samples)

    # Recordings longer than 1 second (e.g. the background-noise files)
    # are cut into random 1-second windows.
    if len(samples) > 16000:
        n_samples = chop_audio(samples)
    else:
        n_samples = [samples]

    # The label is the name of the directory that holds the file; taking it
    # from the path itself works whatever the depth of train_data_path.
    label = os.path.basename(os.path.dirname(fname))

    for sample in n_samples:
        # NOTE(review): the spectrogram uses new_sample_rate, not the
        # sample_rate read from the file; this assumes 16 kHz input - confirm.
        _, _, specgram = log_specgram(sample, sample_rate=new_sample_rate)
        x_train.append(specgram)
        y_train.append(label)

print(time.time() - start)

x_train = np.array(x_train)
# Append a trailing axis of size 1 to the stacked spectrograms.
x_train = x_train.reshape(tuple(list(x_train.shape) + [1]))
y_train = label_transform(y_train)
# Column order of the one-hot frame, i.e. class name for each output index.
label_index = y_train.columns.values
y_train = y_train.values
y_train = np.array(y_train)
gc.collect()
print("Done")

Transform the validation audio files into log-spectrogram features and one-hot labels

In [None]:
import time
start = time.time()

# Only the file paths are needed; load_content=False stops load_files from
# reading the raw bytes of every file into memory.
data = load_files(valid_data_path, load_content=False)
fnames = data['filenames']
new_sample_rate = 16000
y_valid = []
x_valid = []

for fname in fnames:
    # Skip anything that is not a WAV recording.
    if not fname.endswith('wav'):
        continue

    sample_rate, samples = wavfile.read(fname)
    # Clips shorter than 1 second are zero-padded to full length.
    samples = pad_audio(samples)

    # Recordings longer than 1 second (e.g. the background-noise files)
    # are cut into random 1-second windows.
    if len(samples) > 16000:
        n_samples = chop_audio(samples)
    else:
        n_samples = [samples]

    # The label is the name of the directory that holds the file; taking it
    # from the path itself works whatever the depth of valid_data_path.
    label = os.path.basename(os.path.dirname(fname))

    for sample in n_samples:
        # NOTE(review): the spectrogram uses new_sample_rate, not the
        # sample_rate read from the file; this assumes 16 kHz input - confirm.
        _, _, specgram = log_specgram(sample, sample_rate=new_sample_rate)
        x_valid.append(specgram)
        y_valid.append(label)

print(time.time() - start)

x_valid = np.array(x_valid)
# Append a trailing axis of size 1 to the stacked spectrograms.
x_valid = x_valid.reshape(tuple(list(x_valid.shape) + [1]))
y_valid = label_transform(y_valid)
# NOTE(review): label_index is reassigned from this split's columns. It
# matches the column order of the training labels only if both splits
# contain the same set of classes - confirm.
label_index = y_valid.columns.values
y_valid = y_valid.values
y_valid = np.array(y_valid)
gc.collect()
print("Done")

In [None]:
# Flatten each example into a 15939-long float32 vector, the input width the
# dense network below is declared with.
x_train = x_train.reshape(x_train.shape[0], 15939).astype('float32')
x_valid = x_valid.reshape(x_valid.shape[0], 15939).astype('float32')

Show the current array shapes

In [None]:
# Sanity check of the prepared arrays; the single-argument print(...) form
# gives the same output on Python 2 and Python 3.
print(x_train.shape)
print(y_train.shape)
print(x_valid.shape)
print(y_valid.shape)
print("Done")

Benchmark model (a fully connected network, not a CNN) and its summary

In [None]:
#seq model
from keras import layers
from keras import models

# Dropout rate applied after every hidden layer.
pdrop = .25

# Layer sizes come from the prepared arrays rather than literals.
# label_transform can add 'silence' and 'unknown' columns to the ten command
# words, so a fixed 10-unit output would not fit the one-hot targets then.
n_features = x_train.shape[1]
n_classes = y_train.shape[1]

# Fully connected benchmark: 200 -> 100 -> 60 -> 30 ReLU units followed by a
# softmax output with one unit per class.
model = models.Sequential()
model.add(Dense(200, batch_input_shape=(None, n_features), activation='relu'))
model.add(Dropout(pdrop))
model.add(Dense(100, activation='relu'))
model.add(Dropout(pdrop))
model.add(Dense(60, activation='relu'))
model.add(Dropout(pdrop))
model.add(Dense(30, activation='relu'))
model.add(Dropout(pdrop))
model.add(Dense(n_classes, activation='softmax'))

model.summary()

Define the optimizer, loss function, and evaluation metrics

In [None]:
# RMSprop optimizer with a learning rate of 0.001.
rms_lr = optimizers.RMSprop(lr=0.001)

# Categorical cross-entropy pairs with the one-hot label matrix built by
# label_transform; accuracy is reported as the metric.
model.compile(optimizer=rms_lr,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

Model training

In [None]:
# print(...) with a single argument behaves the same on Python 2 and 3.
print("ready")

In [None]:
import keras
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

# Checkpoint file name pattern: epoch number and validation accuracy.
filepath = '../model/benchmark-{epoch:02d}-{val_acc:.3f}.h5'

# Write a full-model snapshot whenever validation accuracy improves.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    verbose=0,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
)

# 50 epochs, mini-batches of 512, validated on the held-out split.
model.fit(
    x_train, y_train,
    batch_size=512,
    epochs=50,
    shuffle=True,
    verbose=1,
    validation_data=(x_valid, y_valid),
    callbacks=[checkpoint],
)

# Save the model as it stands after the last epoch.
model.save('../model/benchmark.h5')

In [None]:
# Shapes of the arrays used for training and validation; the single-argument
# print(...) form gives the same output on Python 2 and Python 3.
print(x_train.shape)
print(y_train.shape)
print(x_valid.shape)
print(y_valid.shape)

In [None]:

!pip install keras==2.1.2 
#!pip install h5py
from keras.models import load_model
# Reload the model written by the training cell (model.save to
# '../model/benchmark.h5'). The previous path, '../model/cnn.h5', is never
# produced by this notebook, and the test features below are flattened to
# the 15939-wide vectors the dense benchmark model takes.
model = load_model('../model/benchmark.h5')

Prediction on the test data

In [None]:
import time
start = time.time()

# Only the file paths are needed; load_content=False stops load_files from
# reading the raw bytes of every file into memory.
data = load_files(test_data_path, load_content=False)
fnames = data['filenames']
new_sample_rate = 16000
y_test = []
x_test = []

for fname in fnames:
    # Skip anything that is not a WAV recording.
    if not fname.endswith('wav'):
        continue

    sample_rate, samples = wavfile.read(fname)
    # Clips shorter than 1 second are zero-padded to full length.
    samples = pad_audio(samples)

    # Recordings longer than 1 second (e.g. the background-noise files)
    # are cut into random 1-second windows.
    if len(samples) > 16000:
        n_samples = chop_audio(samples)
    else:
        n_samples = [samples]

    # The label is the name of the directory that holds the file; taking it
    # from the path itself works whatever the depth of test_data_path.
    label = os.path.basename(os.path.dirname(fname))

    for sample in n_samples:
        # NOTE(review): the spectrogram uses new_sample_rate, not the
        # sample_rate read from the file; this assumes 16 kHz input - confirm.
        _, _, specgram = log_specgram(sample, sample_rate=new_sample_rate)
        x_test.append(specgram)
        y_test.append(label)

print(time.time() - start)

x_test = np.array(x_test)
# Append a trailing axis of size 1 to the stacked spectrograms.
x_test = x_test.reshape(tuple(list(x_test.shape) + [1]))
y_test = label_transform(y_test)
# NOTE(review): label_index is reassigned from this split's columns. It
# matches the model's output order only if the test data contains the same
# set of classes as the data the model was trained on - confirm.
label_index = y_test.columns.values
y_test = y_test.values
y_test = np.array(y_test)
gc.collect()
print("Done")

In [None]:
# Flatten each test example into a 15939-long float32 vector, matching the
# layout used for the training and validation features.
x_test = x_test.reshape(x_test.shape[0], 15939)

x_test = x_test.astype('float32')

# score[0] is the loss and score[1] the accuracy metric set at compile time.
score = model.evaluate(x_test, y_test, verbose=0)
# A formatted string prints identically on Python 2 and 3; a two-argument
# print(...) would show a tuple repr under Python 2.
print('Test loss: {}'.format(score[0]))
print('Test accuracy: {}'.format(score[1]))

# Per-class probabilities for every test example.
x_pred = model.predict(x_test)

In [None]:
import time
start = time.time()

output_path = "../result/sub-01.csv"

index = []
results = []
# NOTE(review): test_data_generator is not defined anywhere in the visible
# notebook, so it has to be supplied before this cell can run. It presumably
# yields (file names, model-ready batch) pairs - confirm.
# The loop variable is named batch_fnames so that it does not overwrite the
# notebook-level `fnames` list of test files.
for batch_fnames, imgs in test_data_generator(batch=32):
    predicts = model.predict(imgs)
    # Most probable class index per example, mapped to its label name.
    predicts = np.argmax(predicts, axis=1)
    predicts = [label_index[p] for p in predicts]
    index.extend(batch_fnames)
    results.extend(predicts)

# One row per predicted file: its name and the predicted label.
df = pd.DataFrame({'fname': index, 'label': results}, columns=['fname', 'label'])

df.to_csv(output_path, index=False)

print(time.time() - start)