In [1]:
import os, sys, re
import numpy as np
import pandas as pd
import hashlib
import random
from tqdm import tqdm
from python_speech_features import mfcc
import scipy.io.wavfile as wav

In [2]:
# Absolute location of the dataset directory, resolved relative to the
# current working directory.
path = os.path.abspath(os.path.join("..", "Datasets", "google_dataset"))

In [3]:
# Words the classifier should recognise; a word's index in this list is
# used as its class label.
keywords = ["up", "down"]
# Words that are not keywords; they all share the single extra label
# len(keywords).
non_keywords = ["happy"]

Collect the file paths of the audio samples for every keyword and non-keyword

In [4]:
# Gather the path of every audio file stored under each word's directory.
filenames = []

for w in (keywords + non_keywords):
    current_path = os.path.join(path, w)
    # os.walk descends into sub-directories, so each file has to be joined
    # with the directory it was actually found in (root) rather than with the
    # top-level word directory. Sorting keeps the listing order stable
    # between runs, since os.walk gives no ordering guarantee.
    for root, dirs, files in os.walk(current_path):
        dirs.sort()
        for file in sorted(files):
            filenames.append(os.path.join(root, file))


In [5]:
MAX_NUM_WAVS_PER_CLASS = 2**27 - 1  # ~134M

def which_set(filename, validation_percentage, testing_percentage):
    """Pick the data partition for a sample from its file name alone.

    The assignment is derived from a SHA-1 hash of the file's base name, so a
    given file always lands in the same partition no matter which other files
    exist or are added later. That stability keeps testing samples from
    drifting into the training set when the dataset grows or a run is
    restarted.

    Everything from '_nohash_' onwards is removed from the name before
    hashing. Files that only differ after that marker (for example
    'bobby_nohash_0.wav' and 'bobby_nohash_1.wav', words spoken by the same
    person) therefore always share a partition.

    Args:
        filename: File path of the data sample; only its base name is used.
        validation_percentage: How much of the data set to use for validation.
        testing_percentage: How much of the data set to use for testing.

    Returns:
        String, one of 'training', 'validation', or 'testing'.
    """
    # The directory part never takes part in the decision.
    stem = os.path.basename(filename)
    # Strip the '_nohash_' suffix so closely related recordings hash alike.
    group_key = re.sub(r'_nohash_.*$', '', stem).encode('utf-8')
    digest = hashlib.sha1(group_key).hexdigest()
    # Fold the digest into [0, MAX_NUM_WAVS_PER_CLASS] and rescale it to a
    # pseudo-percentage that depends on nothing but the name.
    bucket = int(digest, 16) % (MAX_NUM_WAVS_PER_CLASS + 1)
    percentage_hash = bucket * (100.0 / MAX_NUM_WAVS_PER_CLASS)

    # Validation takes the lowest slice, testing the next one, training the rest.
    if percentage_hash < validation_percentage:
        return 'validation'
    if percentage_hash < (testing_percentage + validation_percentage):
        return 'testing'
    return 'training'

In [6]:
# Spot-check the split assignment on one randomly drawn file; the draw is
# unseeded, so the displayed result can differ between runs.
which_set(random.choice(filenames), validation_percentage=10, testing_percentage=10)

'testing'

In [7]:
def compute_mfcc(signal, num_features=40):
    """Compute MFCC features for one audio signal.

    The extraction settings are fixed: a 16000 Hz sample rate is assumed
    regardless of the signal's real rate, frames are 30 ms long with a 10 ms
    step, the filterbank spans 20-4000 Hz, and the first coefficient is not
    replaced by the frame energy.

    Args:
        signal: Audio samples passed straight to `mfcc`.
        num_features: Used both as the number of cepstral coefficients and as
            the number of filterbank filters.

    Returns:
        Whatever `mfcc` returns for these settings; the notebook shows a
        (98, 40) array for a 16000-sample clip.
    """
    mfcc_options = dict(
        samplerate=16000,
        winlen=0.030,
        winstep=0.01,
        numcep=num_features,
        nfilt=num_features,
        lowfreq=20,
        highfreq=4000,
        appendEnergy=False,
    )
    return mfcc(signal, **mfcc_options)

In [8]:
# Try the feature extractor on one randomly chosen clip.
# NOTE(review): the sample rate returned in `fs` is not used by the cells
# shown here; compute_mfcc always assumes 16000 Hz.
fs, sig = wav.read(random.choice(filenames))
feats = compute_mfcc(sig)

In [9]:
# Shape of the MFCC array computed for the sampled clip.
feats.shape

(98, 40)

In [10]:
# Shape of the raw sample array read from the sampled clip.
sig.shape

(16000,)

In [11]:
def generate_sets(filenames, validation_percentage=10, testing_percentage=10,
                  min_nb_frames=98):
    '''
    Build the training, validation and testing sets from audio files.

    Each file is read, converted to MFCC features, labelled from the name of
    the directory that contains it, and routed to a split by which_set. Each
    data sample is a tuple (features, label).

    Args:
        filenames: Iterable of paths to wav files; the parent directory name
            of each path is taken as the spoken word.
        validation_percentage: Share of the data to use for validation.
        testing_percentage: Share of the data to use for testing.
        min_nb_frames: Clips whose feature array has fewer frames than this
            are skipped. Longer clips are kept unchanged.

    Returns:
        Tuple (training, validation, testing) of lists of (features, label).
        The label is the word's index in `keywords`, or len(keywords) for any
        other word.
    '''

    non_keywords_label = len(keywords)

    training, validation, testing = [], [], []
    for filename in tqdm(filenames):
        _, signal = wav.read(filename)
        feats = compute_mfcc(signal)

        if feats.shape[0] < min_nb_frames:
            continue
        # Name of the containing directory, taken with os.path so it also
        # works with the platform's native path separator.
        kw = os.path.basename(os.path.dirname(filename))
        if kw in keywords:
            label = keywords.index(kw)
        else:
            label = non_keywords_label

        grp = which_set(filename, validation_percentage, testing_percentage)

        # Compare by value: `is` tests object identity and only matched
        # because the interpreter happens to intern these literals.
        if grp == 'training':
            training.append((feats, label))
        elif grp == 'validation':
            validation.append((feats, label))
        else:
            testing.append((feats, label))
    return training, validation, testing

In [12]:
# Extract features for every listed file and split them with the default
# 10 % validation / 10 % testing shares.
training, validation, testing = generate_sets(filenames)

100%|██████████| 6476/6476 [00:20<00:00, 322.22it/s]


In [13]:
def get_X_y(grp):
    """Turn a list of (features, label) pairs into model-ready arrays.

    Args:
        grp: Non-empty sequence of (features, label) tuples in which every
            feature array can be reshaped to (98, 40, 1).

    Returns:
        Tuple (X, y): X has shape (n, 98, 40, 1) with a trailing channel
        axis, and y is a column vector of shape (n, 1).
    """
    features, labels = zip(*grp)
    # Give every sample an explicit single-channel axis before stacking.
    with_channel = [sample.reshape(98, 40, 1) for sample in features]
    inputs = np.array(with_channel).reshape(-1, 98, 40, 1)
    targets = np.array(labels).reshape(-1, 1)
    return inputs, targets

In [14]:
# NOTE(review): `a` and `b` are not used by any later cell shown here; the
# testing split is converted again into X_test / y_test further down.
a, b = get_X_y(testing)

In [15]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import BatchNormalization, Conv2D, AveragePooling2D, Dense, Flatten
from keras.utils import to_categorical
from sklearn.metrics import accuracy_score

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [23]:
def create_model():
    """Build and compile the convolutional keyword classifier.

    The network takes (98, 40, 1) inputs and stacks two 45-filter 3x3
    convolutions (the second one dilated by 2), each followed by batch
    normalisation, then 2x2 average pooling and a softmax layer with one
    unit per keyword plus one for non-keywords.

    Returns:
        A Sequential model compiled with categorical cross-entropy loss, the
        Adam optimizer and the accuracy metric.
    """
    n_outputs = len(keywords) + 1
    layer_stack = [
        Conv2D(45, kernel_size=(3, 3), activation='relu', input_shape=(98, 40, 1),
               data_format='channels_last', padding='same'),
        BatchNormalization(axis=-1),
        Conv2D(45, kernel_size=(3, 3), activation='relu', padding='same',
               dilation_rate=2),
        BatchNormalization(axis=-1),
        AveragePooling2D(pool_size=(2, 2)),
        Flatten(),
        Dense(units=n_outputs, activation='softmax'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [24]:
# Convert the training and validation splits into arrays for the model.
X_train, y_train = get_X_y(training)
X_validation, y_validation = get_X_y(validation)

In [25]:
# One-hot encode the integer labels. The width is pinned to the model's output
# size (one unit per keyword plus one for non-keywords) instead of being
# inferred from the largest label present, so a split that happens to lack the
# highest class still matches the network's output shape.
n_classes = len(keywords) + 1
y_train = to_categorical(y_train, num_classes=n_classes)
y_validation = to_categorical(y_validation, num_classes=n_classes)

In [26]:
# Build a fresh model and train it for 2 epochs in batches of 20, reporting
# validation metrics after each epoch.
model = create_model()
model.fit(X_train, y_train, batch_size=20, epochs=2, verbose=1, validation_data=(X_validation, y_validation))

Train on 4484 samples, validate on 634 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1045e87b8>

Test Score

In [27]:
# Convert the held-out testing split; y_test stays as integer labels in a
# column vector because it is compared against predicted class indices.
X_test, y_test = get_X_y(testing)

In [28]:
# Predicted class index per sample: the position of the largest softmax
# output. This replaces Sequential.predict_classes, which is deprecated and
# absent from newer Keras/TensorFlow releases, and gives the same result for
# a multi-unit softmax output.
y_pred = np.argmax(model.predict(X_test, verbose=1), axis=-1)



In [29]:
# Fraction of test samples whose predicted class index equals the true label.
accuracy_score(y_test, y_pred)

0.8637770897832817

In [None]:
# NOTE(review): bare number in a cell that was never executed. It is
# presumably an accuracy noted from another run; confirm and move it into a
# markdown cell that says which configuration produced it.
0.804953560371517