# Import useful packages

In [2]:
#!pip install h5py
#keras 2.1.2 is used since h5py cannot be loaded in datalab
#!pip install keras==2.1.2

import os
import numpy as np
from scipy.fftpack import fft
from scipy.io import wavfile
from scipy import signal
from glob import glob
import re
import pandas as pd
import gc
from scipy.io import wavfile
from sklearn.datasets import load_files

from keras import optimizers, losses, activations, models
from keras.layers import Convolution2D, Dense, Input, Flatten, Dropout, MaxPooling2D, BatchNormalization
from sklearn.model_selection import train_test_split
import keras



# Check which devices are available; the output below shows that the GPU is ready

In [3]:
from tensorflow.python.client import device_lib

#list every device TensorFlow can see on this machine
local_devices = device_lib.list_local_devices()
print(local_devices)

#alternative check: open a session that logs device placement
#import tensorflow as tf
#sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 9166805881876366451
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 11266673869
locality {
  bus_id: 1
}
incarnation: 13368641323544195102
physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:04.0, compute capability: 3.7"
]


# Define the file paths and the selected commands

In [5]:
#target clip length in samples (1 second of audio at 16000 samples per second)
L = 16000

#labels to predict
legal_labels = 'yes no up down left right on off stop go'.split()

#dataset directories: training, validation and test splits
train_data_path = '../data/train/'
valid_data_path = '../data/valid/'
test_data_path = '../data/test/'

#echo the configuration; the call form of print gives the same output on
#Python 2 and also runs on Python 3
print(train_data_path)
print(valid_data_path)
print(test_data_path)
print(legal_labels)
print(len(legal_labels))

../data/train/
../data/valid/
../data/test/
['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
10


# Some helpful transformation functions

In [6]:
#helpful functions for audio data
def custom_fft(y, fs):
    """Return a one-sided amplitude spectrum of the signal ``y``.

    Parameters
    ----------
    y : array with a ``shape`` attribute; ``y.shape[0]`` is the sample count N.
    fs : sampling rate, used to scale the frequency axis.

    Returns
    -------
    (xf, vals) : ``xf`` holds ``N // 2`` evenly spaced values from 0 to
        ``fs / 2`` (both ends included); ``vals`` holds the magnitudes of the
        first ``N // 2`` FFT bins scaled by ``2 / N``.
    """
    n_samples = y.shape[0]
    half = n_samples // 2
    sample_period = 1.0 / fs
    spectrum = fft(y)
    # Only the first half of the bins is kept (the second half mirrors it for
    # real-valued input); np.abs turns each complex bin into its magnitude.
    freqs = np.linspace(0.0, 1.0 / (2.0 * sample_period), half)
    magnitudes = 2.0 / n_samples * np.abs(spectrum[0:half])
    return freqs, magnitudes

def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    """Compute a log-scaled spectrogram of ``audio``.

    Parameters
    ----------
    audio : 1-D array of samples.
    sample_rate : sampling rate of ``audio`` in Hz.
    window_size : length of each analysis window, in milliseconds.
    step_size : hop between the starts of consecutive windows, in milliseconds.
    eps : small constant added before the logarithm to avoid ``log(0)``.

    Returns
    -------
    (freqs, times, log_spec) : the frequency and time axes returned by
        ``scipy.signal.spectrogram`` and the float32 log-spectrogram,
        transposed so that rows are time frames and columns are frequencies.

    Raises
    ------
    ValueError : if the step is shorter than one sample or longer than the window.
    """
    nperseg = int(round(window_size * sample_rate / 1e3))
    hop = int(round(step_size * sample_rate / 1e3))
    if hop < 1 or hop > nperseg:
        raise ValueError("step_size must be at least one sample and no longer "
                         "than window_size")
    # scipy expects the number of samples shared by neighbouring windows, not
    # the hop. Passing the hop as noverlap only happens to be right when the
    # window is exactly twice the step (as with the defaults).
    noverlap = nperseg - hop
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

#left-pad audio shorter than L samples with zeros so that it is exactly L samples long
def pad_audio(samples):
    """Return ``samples`` zero-padded at the front up to the global length ``L``.

    Input that already has ``L`` or more samples is returned unchanged.
    """
    missing = L - len(samples)
    if missing <= 0:
        return samples
    return np.pad(samples, pad_width=(missing, 0), mode='constant', constant_values=(0, 0))

#randomly sample fixed-length clips from audio that has at least L samples
def chop_audio(samples, L=16000, num=20):
    """Yield ``num`` random contiguous clips of ``L`` samples from ``samples``.

    Parameters
    ----------
    samples : 1-D sequence of audio samples with at least ``L`` entries.
    L : clip length in samples.
    num : number of clips to yield.

    Raises
    ------
    ValueError : (on first iteration) if ``samples`` is shorter than ``L``.
    """
    last_start = len(samples) - L
    if last_start < 0:
        raise ValueError("chop_audio needs at least L=%d samples, got %d"
                         % (L, len(samples)))
    for _ in range(num):
        # randint's upper bound is exclusive, so add 1 to make the final window
        # reachable and to accept input that is exactly L samples long.
        beg = np.random.randint(0, last_start + 1)
        yield samples[beg: beg + L]

#map raw labels to target classes and one-hot encode them
def label_transform(labels):
    """One-hot encode ``labels`` after mapping them to the target classes.

    '_background_noise_' becomes 'silence', any label outside the global
    ``legal_labels`` becomes 'unknown', and every other label is kept.
    The returned DataFrame has one indicator column per distinct mapped label
    that actually occurs in ``labels``.
    """
    def _to_target(label):
        if label == '_background_noise_':
            return 'silence'
        return label if label in legal_labels else 'unknown'

    mapped = [_to_target(label) for label in labels]
    return pd.get_dummies(pd.Series(mapped))
  
# define function to list the files of a train, test, or validation dataset
def load_dataset(path):
    """Return the paths of all files found under the category folders of ``path``.

    Only the file names are returned, so ``load_content=False`` is passed to
    avoid reading every file's raw bytes into memory.
    """
    data = load_files(path, load_content=False)
    return np.array(data['filenames'])

# Transform the training audio data into log-spectrogram features

In [7]:
import time
start = time.time()

new_sample_rate = 16000
#only the file names are needed here (the audio is read with wavfile below),
#so do not load every file's raw bytes into memory
data = load_files(train_data_path, load_content=False)
fnames = data['filenames']
y_train = []
x_train = []

for fname in fnames:

    #ignore anything that is not a wav file
    if not fname.endswith('wav'):
        continue

    #if audio data is less than 1 second, complete it
    sample_rate, samples = wavfile.read(fname)
    samples = pad_audio(samples)

    #if audio data is larger than 1 second (e.g. the silence audio), chop it
    if len(samples) > 16000:
        n_samples = chop_audio(samples)
    else:
        n_samples = [samples]

    #the label is the name of the folder that directly contains the file;
    #taking it from the path itself keeps working if the data directory depth changes
    label = os.path.basename(os.path.dirname(fname))

    for sample in n_samples:
        #NOTE: no resampling is done; the file's own sample_rate is ignored and
        #the spectrogram is computed as if the audio were at new_sample_rate
        _, _, specgram = log_specgram(sample, sample_rate = new_sample_rate)
        x_train.append(specgram)
        y_train.append(label)

print(time.time() - start)

#stack the spectrograms and add a trailing channel axis for the CNN
x_train = np.array(x_train)
x_train = x_train.reshape(tuple(list(x_train.shape) + [1]))
#one-hot encode the labels; label_index keeps the column order of the encoding
y_train = label_transform(y_train)
label_index = y_train.columns.values
y_train = y_train.values
y_train = np.array(y_train)
gc.collect()
print("Done")

32.524930954
Done


# Transform the validation audio data into log-spectrogram features

In [8]:
import time
start = time.time()

#only the file names are needed here (the audio is read with wavfile below),
#so do not load every file's raw bytes into memory
data = load_files(valid_data_path, load_content=False)
fnames = data['filenames']
new_sample_rate = 16000
y_valid = []
x_valid = []

for fname in fnames:

    #ignore anything that is not a wav file
    if not fname.endswith('wav'):
        continue

    #if audio data is less than 1 second, complete it
    sample_rate, samples = wavfile.read(fname)
    samples = pad_audio(samples)

    #if audio data is larger than 1 second (e.g. the silence audio), chop it
    if len(samples) > 16000:
        n_samples = chop_audio(samples)
    else:
        n_samples = [samples]

    #the label is the name of the folder that directly contains the file;
    #taking it from the path itself keeps working if the data directory depth changes
    label = os.path.basename(os.path.dirname(fname))

    for sample in n_samples:
        #NOTE: no resampling is done; the file's own sample_rate is ignored and
        #the spectrogram is computed as if the audio were at new_sample_rate
        _, _, specgram = log_specgram(sample, sample_rate = new_sample_rate)
        x_valid.append(specgram)
        y_valid.append(label)

print(time.time() - start)

#stack the spectrograms and add a trailing channel axis for the CNN
x_valid = np.array(x_valid)
x_valid = x_valid.reshape(tuple(list(x_valid.shape) + [1]))
#one-hot encode the labels; label_index keeps the column order of the encoding
y_valid = label_transform(y_valid)
label_index = y_valid.columns.values
y_valid = y_valid.values
y_valid = np.array(y_valid)
gc.collect()
print("Done")

4.20760607719
Done


# Show the shapes of the training and validation data

In [9]:
#features are (samples, time frames, frequency bins, channels); labels are (samples, classes)
#the call form of print gives the same output on Python 2 and also runs on Python 3
print(x_train.shape)
print(y_train.shape)
print(x_valid.shape)
print(y_valid.shape)
print("Done")

(18538, 99, 161, 1)
(18538, 10)
(2577, 99, 161, 1)
(2577, 10)
Done


# CNN model and its architecture summary

In [10]:
#sequential CNN: four conv blocks followed by a small dense classifier
from keras import layers
from keras import models

model = models.Sequential([
    #block 1: conv -> dropout -> 3x3 max-pool (no batch normalization here)
    layers.Conv2D(32, (3, 3), padding='same', activation='relu', input_shape=(99, 161, 1)),
    layers.Dropout(0.2),
    layers.MaxPooling2D((3, 3)),

    #block 2: conv -> batch norm -> dropout -> 3x3 max-pool
    layers.Conv2D(64, (3, 3), padding='same', activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.2),
    layers.MaxPooling2D((3, 3)),

    #block 3: same layout as block 2
    layers.Conv2D(64, (3, 3), padding='same', activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.2),
    layers.MaxPooling2D((3, 3)),

    #block 4: same layout as block 2
    layers.Conv2D(64, (3, 3), padding='same', activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.2),
    layers.MaxPooling2D((3, 3)),

    #classifier head: 64 hidden units, then a softmax over the 10 output classes
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(10, activation='softmax'),
])

model.summary()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 99, 161, 32)       320       
_________________________________________________________________
dropout_1 (Dropout)          (None, 99, 161, 32)       0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 33, 53, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 33, 53, 64)        18496     
_________________________________________________________________
batch_normalization_1 (Batch (None, 33, 53, 64)        256       
_________________________________________________________________
dropout_2 (Dropout)          (None, 33, 53, 64)        0         
_________________________________________________________________
max

# Define optimizer, loss function and metrics to evaluate

In [11]:
#RMSprop optimizer with a learning rate of 0.001
rms_lr = optimizers.RMSprop(lr=0.001)

#one-hot targets: categorical cross-entropy loss, accuracy as the reported metric
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=rms_lr)

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


# Model training, first 250 epochs

In [12]:
#marker cell; the call form of print gives the same output on Python 2 and also runs on Python 3
print("ready")

ready


In [None]:
import keras
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

#checkpoint file names record the epoch number and the validation accuracy
filepath='../model/cnn-{epoch:02d}-{val_acc:.3f}.h5'

#save the full model (not only the weights) whenever val_acc improves
checkpoint = ModelCheckpoint(filepath,
                             monitor='val_acc',
                             mode='auto',
                             save_best_only=True,
                             save_weights_only=False,
                             verbose=0)

model.fit(x_train, y_train,
          validation_data=(x_valid, y_valid),
          epochs=250,
          batch_size=512,
          shuffle=True,
          verbose=1,
          callbacks=[checkpoint])

#also keep the model as it stands after the last epoch
model.save('../model/cnn.h5')

Train on 18538 samples, validate on 2577 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/25

In [15]:
#re-check the array shapes before the extra training run
#the call form of print gives the same output on Python 2 and also runs on Python 3
print(x_train.shape)
print(y_train.shape)
print(x_valid.shape)
print(y_valid.shape)

(18538, 99, 161, 1)
(18538, 10)
(2577, 99, 161, 1)
(2577, 10)


# Extra training with a reduced learning rate for 50 epochs

In [14]:

#if the environment lacks them, install the pinned dependencies first:
#!pip install keras==2.1.2
#!pip install h5py
from keras.models import load_model

#reload the model saved at the end of the first training run
base_model_path = '../model/cnn.h5'
model = load_model(base_model_path)

In [None]:
#recompile the loaded model with a smaller RMSprop learning rate
rms_lr = optimizers.RMSprop(lr=0.00025)
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=rms_lr)

#checkpoint file names record the epoch number and the validation accuracy
filepath='../model/cnn-base200-{epoch:02d}-{val_acc:.3f}.h5'

#save the full model (not only the weights) whenever val_acc improves
checkpoint = keras.callbacks.ModelCheckpoint(filepath,
                                             monitor='val_acc',
                                             mode='auto',
                                             save_best_only=True,
                                             save_weights_only=False,
                                             verbose=0)

model.fit(x_train, y_train,
          validation_data=(x_valid, y_valid),
          epochs=50,
          batch_size=512,
          shuffle=True,
          verbose=1,
          callbacks=[checkpoint])

#also keep the model as it stands after the last epoch
model.save('../model/cnn_300_reduced_lr.h5')

Train on 18538 samples, validate on 2577 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50

# Transform the test audio data into log-spectrogram features

In [None]:
import time
start = time.time()

#only the file names are needed here (the audio is read with wavfile below),
#so do not load every file's raw bytes into memory
data = load_files(test_data_path, load_content=False)
fnames = data['filenames']
new_sample_rate = 16000
y_test = []
x_test = []

for fname in fnames:

    #ignore anything that is not a wav file
    if not fname.endswith('wav'):
        continue

    #if audio data is less than 1 second, complete it
    sample_rate, samples = wavfile.read(fname)
    samples = pad_audio(samples)

    #if audio data is larger than 1 second (e.g. the silence audio), chop it
    if len(samples) > 16000:
        n_samples = chop_audio(samples)
    else:
        n_samples = [samples]

    #the label is the name of the folder that directly contains the file;
    #taking it from the path itself keeps working if the data directory depth changes
    label = os.path.basename(os.path.dirname(fname))

    for sample in n_samples:
        #NOTE: no resampling is done; the file's own sample_rate is ignored and
        #the spectrogram is computed as if the audio were at new_sample_rate
        _, _, specgram = log_specgram(sample, sample_rate = new_sample_rate)
        x_test.append(specgram)
        y_test.append(label)

print(time.time() - start)

#stack the spectrograms and add a trailing channel axis for the CNN
x_test = np.array(x_test)
x_test = x_test.reshape(tuple(list(x_test.shape) + [1]))
#one-hot encode the labels; label_index keeps the column order of the encoding
y_test = label_transform(y_test)
label_index = y_test.columns.values
y_test = y_test.values
y_test = np.array(y_test)
gc.collect()
print("Done")

In [None]:
#score holds [loss, accuracy], matching the loss and metrics given to model.compile
score = model.evaluate(x_test, y_test, verbose=0)
#format into one string: print('a', b) would print a tuple under the
#Python 2 print statement used by the rest of this notebook
print('Test loss: %s' % score[0])
print('Test accuracy: %s' % score[1])

#per-class probabilities for every test sample
y_pred = model.predict(x_test)

# Export the result

In [None]:
import time
start = time.time()

output_path="../result/final_result.csv"

#x_test holds one row per processed wav clip, so only wav files can be paired
#with the predictions; fnames may also list other files from the data folder
wav_fnames = [fname for fname in fnames if fname.endswith('wav')]
if len(wav_fnames) != len(y_pred):
    #happens when files longer than 1 second were chopped into several clips
    raise ValueError("cannot pair %d wav files with %d predictions"
                     % (len(wav_fnames), len(y_pred)))

#y_test is one-hot and y_pred holds per-class probabilities (both 2-D), so
#reduce each row to a single label name before storing it in a column.
#NOTE(review): assumes the model's output order matches label_index - confirm
#that the train and test label sets are identical
df = pd.DataFrame({
    'fname': wav_fnames,
    'label': label_index[np.argmax(y_test, axis=1)],
    'prediction': label_index[np.argmax(y_pred, axis=1)],
}, columns=['fname', 'label', 'prediction'])

df.to_csv(output_path, index=False)

print(time.time() - start)