In [1]:
import os, sys, re
from itertools import islice 
import numpy as np
import pandas as pd
import hashlib
import random
import math
from tqdm import tqdm
from python_speech_features import mfcc
import scipy.io.wavfile as wav

In [2]:
# Target words the classifier has to recognise; a word's position in this list
# is used as its integer class label (see generate_sets).
keywords = ['up', 'down', 'left', 'right']
# Other spoken words; generate_sets gives all of them the single extra label
# len(keywords).
non_keywords = ['happy', 'bed', 'bird', 'cat', 'dog', 'marvin', 'sheila', 'house', 'tree', 'wow']

Collect the file paths of the keyword and non-keyword audio samples

In [3]:
# Root folder of the speech dataset; every word has its own sub-folder.
path = os.path.abspath("Datasets/google_dataset/")
filenames = []

for w in (keywords + non_keywords):
    current_path = os.path.join(path, w)
    # Take only the files sitting directly inside the word folder. The previous
    # recursive walk joined files found in nested folders to `current_path`,
    # producing paths that do not exist; and generate_sets derives the label
    # from the immediate parent folder, so nested files could not be labelled
    # correctly anyway. A missing folder yields no files.
    _, _, files = next(os.walk(current_path), (current_path, [], []))
    filenames.extend(os.path.join(current_path, file) for file in files)

In [4]:
MAX_NUM_WAVS_PER_CLASS = 2**27 - 1  # ~134M

def which_set(filename, validation_percentage, testing_percentage):
    """Assign a file to the 'training', 'validation' or 'testing' partition.

    The assignment is a pure function of the file name and the two
    percentages: the name is hashed and the hash is mapped onto a value
    between 0 and 100. Adding or removing other files therefore never moves
    an existing file to a different partition.

    Everything from '_nohash_' onwards is stripped from the base name before
    hashing, so related recordings such as 'bobby_nohash_0.wav' and
    'bobby_nohash_1.wav' always end up in the same partition.

    Args:
    filename: File path of the data sample (only the base name is used).
    validation_percentage: How much of the data set to use for validation.
    testing_percentage: How much of the data set to use for testing.

    Returns:
    String, one of 'training', 'validation', or 'testing'.
    """
    # Group key: the base name without the '_nohash_...' suffix.
    group_key = re.sub(r'_nohash_.*$', '', os.path.basename(filename))
    digest = hashlib.sha1(group_key.encode('utf-8')).hexdigest()

    # Fold the SHA-1 digest into a bounded bucket, then scale it to a
    # percentage that is compared against the cumulative thresholds.
    bucket = int(digest, 16) % (MAX_NUM_WAVS_PER_CLASS + 1)
    percentage_hash = bucket * (100.0 / MAX_NUM_WAVS_PER_CLASS)

    if percentage_hash < validation_percentage:
        return 'validation'
    if percentage_hash < (testing_percentage + validation_percentage):
        return 'testing'
    return 'training'

In [5]:
# Spot-check the partitioning on one randomly chosen file. No random seed is
# set in the visible cells, so the file picked (and the result) can change
# between runs.
which_set(random.choice(filenames), validation_percentage=10, testing_percentage=10)

'testing'

In [6]:
def compute_mfcc(signal, num_features=40):
    """Compute the MFCC features of an audio signal.

    Args:
    signal: Audio samples, passed straight to `mfcc`. The sample rate is
        fixed to 16000 Hz inside this function.
    num_features: Used both as the number of cepstral coefficients (numcep)
        and as the number of filters (nfilt).

    Returns:
    The value returned by `mfcc`; in this notebook a 16000-sample clip
    produced an array of shape (98, 40).
    """
    mfcc_options = dict(
        samplerate=16000,
        winlen=0.030,
        winstep=0.01,
        numcep=num_features,
        lowfreq=20,
        highfreq=4000,
        appendEnergy=False,
        nfilt=num_features,
    )
    return mfcc(signal, **mfcc_options)

In [7]:
# Peek at the first few collected paths.
filenames[0:5]

['/aimlx/Datasets/google_dataset/up/b7a0754f_nohash_4.wav',
 '/aimlx/Datasets/google_dataset/up/748cb308_nohash_0.wav',
 '/aimlx/Datasets/google_dataset/up/e7334395_nohash_0.wav',
 '/aimlx/Datasets/google_dataset/up/4f781a59_nohash_1.wav',
 '/aimlx/Datasets/google_dataset/up/cd8b1781_nohash_0.wav']

In [8]:
# Read one randomly chosen clip and compute its MFCC matrix as a sanity check.
# The clip is picked without a fixed seed, so the shapes shown below may vary.
fs, sig = wav.read(random.choice(filenames))
feats = compute_mfcc(sig)

In [9]:
# Shape of the MFCC matrix computed for the sample clip.
feats.shape

(98, 40)

In [10]:
# Number of audio samples in the sample clip.
sig.shape

(16000,)

In [None]:
# Histogram of the number of MFCC frames per audio file:
# count[n] = how many files produce exactly n frames.
count = {}
for file in tqdm(filenames):
    fs, signal = wav.read(file)
    # compute_mfcc's second positional parameter is `num_features`, not the
    # sample rate. Passing `fs` there asked for 16000 cepstra/filters, so the
    # default of 40 features is used instead.
    features = compute_mfcc(signal)
    nb_frames = features.shape[0]
    count[nb_frames] = count.get(nb_frames, 0) + 1

In [None]:
# Bar chart of the frame-count histogram built in `count`.
# NOTE(review): `plt` is not imported in any visible cell; presumably
# `import matplotlib.pyplot as plt` is required for this cell to run. Confirm.
plt.bar(count.keys(), count.values(), 1.0, color='g')

In [11]:
def load_negative_samples(neg_filenames, validation_neg_percentage, label):
    '''
    Load the negative samples and split them into a training and a validation list.

    Each file in `neg_filenames` is read with `np.load` (pickling disabled) and
    paired with `label`. The pairs are shuffled in place with the global
    `random` generator, then the first part becomes the training list and the
    remaining `validation_neg_percentage` percent (rounded down) becomes the
    validation list.

    Returns a tuple (training_samples, validation_samples) of lists of
    (features, label) tuples.
    '''
    neg_feats = [(np.load(fname, allow_pickle=False), label) for fname in neg_filenames]
    random.shuffle(neg_feats)

    total = len(neg_feats)
    nb_validation = int(total * (validation_neg_percentage / 100))
    nb_training = total - nb_validation

    # Consume one shared iterator twice so the two lists are consecutive,
    # non-overlapping chunks of the shuffled samples.
    stream = iter(neg_feats)
    training_part = list(islice(stream, nb_training))
    validation_part = list(islice(stream, nb_validation))
    return training_part, validation_part


def generate_sets(filenames, neg_filenames, validation_percentage=10, testing_percentage=10, validation_neg_percentage=20):
    '''
    Build the training, validation and testing sets.

    Every element of the returned lists is a tuple (features, label), where
    `features` is the MFCC matrix from `compute_mfcc` and `label` is the index
    of the word in `keywords`, or `len(keywords)` for any other word.

    Args:
    filenames: Paths of the wav files; the name of the folder that directly
        contains a file is taken as its word.
    neg_filenames: Paths of the .npy negative samples; they are labelled as
        non-keywords and added to the training and validation sets only.
    validation_percentage: Share of the wav files used for validation.
    testing_percentage: Share of the wav files used for testing.
    validation_neg_percentage: Share of the negative samples used for validation.

    Returns:
    Tuple (training, validation, testing) of lists of (features, label).
    '''

    non_keywords_label = len(keywords)

    training, validation, testing = [], [], []
    # Clips that yield fewer frames than this are skipped.
    min_nb_frames = 98
    for filename in tqdm(filenames):
        _, signal = wav.read(filename)
        feats = compute_mfcc(signal)

        if feats.shape[0] < min_nb_frames:
            continue

        # Word = name of the containing folder; os.path handles any separator.
        kw = os.path.basename(os.path.dirname(filename))
        label = keywords.index(kw) if kw in keywords else non_keywords_label

        grp = which_set(filename, validation_percentage, testing_percentage)

        # Compare by value: `is` on strings only works by interning accident.
        if grp == 'training':
            training.append((feats, label))
        elif grp == 'validation':
            validation.append((feats, label))
        else:
            testing.append((feats, label))

    training_neg, validation_neg = load_negative_samples(neg_filenames, validation_neg_percentage, non_keywords_label)

    training.extend(training_neg)
    validation.extend(validation_neg)

    return training, validation, testing

In [12]:
# Folder holding the pre-computed negative samples (.npy feature files).
path = os.path.abspath("Datasets/negative_samples/")
neg_filenames = []

# os.walk descends into sub-folders, so each file must be joined to the folder
# it was found in (`dirpath`); joining to `path` gave wrong paths for nested files.
for dirpath, _, files in os.walk(path):
    neg_filenames.extend(
        os.path.join(dirpath, file) for file in files if file.endswith('.npy')
    )

In [13]:
# Build the three partitions with the default split percentages
# (10% validation, 10% testing, 20% of the negative samples for validation).
training, validation, testing = generate_sets(filenames, neg_filenames)

100%|██████████| 26827/26827 [00:57<00:00, 470.14it/s]


In [15]:
def get_X_y(grp):
    """Turn a list of (features, label) tuples into model-ready arrays.

    Each feature matrix is reshaped to (98, 40, 1).

    Returns:
    Tuple (X, y) where X has shape (n, 98, 40, 1) and y has shape (n, 1).
    """
    feature_list, label_list = zip(*grp)
    stacked = np.array([feat.reshape(98, 40, 1) for feat in feature_list])
    labels = np.array(label_list)
    return stacked.reshape(-1, 98, 40, 1), labels.reshape(-1, 1)

In [17]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import BatchNormalization, Conv2D, AveragePooling2D, Dense, Flatten, Input, Add, Lambda
from keras.utils import to_categorical
from keras.models import Model
from sklearn.metrics import accuracy_score

In [18]:
def res_net():
    """Build and compile the small dilated residual CNN.

    Architecture: a stem of two Conv2D+BatchNormalization pairs, then five
    residual blocks (two Conv2D+BatchNormalization pairs whose output is added
    to the block input), average pooling, and a softmax layer with
    len(keywords) + 1 outputs. All convolutions use 7 filters of size 3x3 with
    'same' padding. Except for the very first convolution, the dilation rate
    of the k-th convolution (counting from 0) is 2 ** floor(k / 3).

    Returns:
    The compiled Keras Model (categorical cross-entropy, Adam, accuracy).
    """

    def conv_bn(tensor, conv_index=None):
        # One Conv2D followed by BatchNormalization. When `conv_index` is
        # given, the convolution is dilated according to its position.
        extra = {}
        if conv_index is not None:
            extra['dilation_rate'] = int(math.pow(2, np.floor(conv_index / 3)))
        out = Conv2D(7, kernel_size=(3,3), activation='relu', data_format='channels_last',
                     padding='same', kernel_initializer='glorot_uniform', **extra)(tensor)
        return BatchNormalization(axis=-1)(out)

    input_data = Input(shape=(98,40,1))

    # Stem: the first convolution keeps the default dilation rate.
    x = conv_bn(input_data)
    x = conv_bn(x, conv_index=1)

    # Residual blocks; `conv_index` keeps counting convolutions across blocks.
    conv_index = 2
    for _ in range(5):
        branch = conv_bn(x, conv_index=conv_index)
        branch = conv_bn(branch, conv_index=conv_index + 1)
        conv_index += 2

        merged = Add()([branch, x])
        # Identity layer kept so the layer graph matches the original model.
        x = Lambda(lambda t: t)(merged)

    x = AveragePooling2D(pool_size=(2,2),data_format='channels_last')(x)
    x = Flatten()(x)
    x = Dense(units=len(keywords) + 1, activation='softmax')(x)

    model = Model(inputs=input_data, outputs=x)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

def dnn_model():
    """Build and compile the plain convolutional classifier.

    Three Conv2D(5x5, relu) + BatchNormalization + AveragePooling2D(2x2) stages
    with 32, 64 and 128 filters, followed by a 128-unit dense layer and a
    softmax layer with len(keywords) + 1 outputs. The input shape is
    (98, 40, 1).

    Returns:
    The compiled Keras Sequential model (categorical cross-entropy, Adam,
    accuracy).
    """
    # First stage declares the input shape and the data format explicitly.
    layers = [
        Conv2D(32, kernel_size=(5,5), activation='relu', input_shape=(98, 40, 1), data_format='channels_last'),
        BatchNormalization(axis=-1),
        AveragePooling2D(pool_size=(2, 2)),
    ]

    # Remaining convolutional stages differ only in their number of filters.
    for nb_filters in (64, 128):
        layers.extend([
            Conv2D(nb_filters, kernel_size=(5,5), activation='relu'),
            BatchNormalization(axis=-1),
            AveragePooling2D(pool_size=(2, 2)),
        ])

    # Classification head.
    layers.extend([
        Flatten(),
        Dense(units=128, activation='relu'),
        Dense(units=len(keywords) + 1, activation='softmax'),
    ])

    model = Sequential(layers)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [19]:
# Model inputs and one-hot targets for the training and validation partitions.
X_train, y_train = get_X_y(training)
X_validation, y_validation = get_X_y(validation)

# Fix the one-hot width explicitly. Without `num_classes`, to_categorical sizes
# the output from the largest label present, so a partition that lacks the last
# class would get a narrower matrix than the model's softmax output.
num_classes = len(keywords) + 1
y_train = to_categorical(y_train, num_classes=num_classes)
y_validation = to_categorical(y_validation, num_classes=num_classes)

In [21]:
# Instantiate the plain CNN classifier.
model = dnn_model()
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_4 (Conv2D)            (None, 94, 36, 32)        832       
_________________________________________________________________
batch_normalization_4 (Batch (None, 94, 36, 32)        128       
_________________________________________________________________
average_pooling2d_4 (Average (None, 47, 18, 32)        0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 43, 14, 64)        51264     
_________________________________________________________________
batch_normalization_5 (Batch (None, 43, 14, 64)        256       
_________________________________________________________________
average_pooling2d_5 (Average (None, 21, 7, 64)         0         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 17, 3, 128)        204928    
__________

In [22]:
# Train for 3 epochs with mini-batches of 20 samples; the validation partition
# is evaluated by Keras during training.
model.fit(X_train, y_train, batch_size=20, epochs=3, verbose=1, validation_data=(X_validation, y_validation))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 58273 samples, validate on 12284 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f7a9b080f60>

Test Score

In [23]:
# Test inputs and integer labels; y_test is deliberately left as class indices
# (shape (n, 1)) rather than one-hot, to be compared with argmax predictions.
X_test, y_test = get_X_y(testing)

In [24]:
# Predict class probabilities for the test set, then keep the index of the most
# probable class for each sample.
y_pred = model.predict(X_test, verbose=1)
y_pred = np.argmax(y_pred, axis=1)



In [25]:
# Fraction of test samples whose predicted class index equals the true label.
accuracy_score(y_test, y_pred)

0.9187898089171974

In [None]:
# NOTE(review): bare literal in an unexecuted cell; presumably a test accuracy
# noted from an earlier run. Confirm, and move it to a markdown note if so.
0.9434713375796179

In [26]:
def generates_frames(features, shift=5, frame_length=98, num_mffc=40):
    """Cut a feature matrix into overlapping fixed-length windows.

    Windows start at rows 0, shift, 2*shift, ... for as long as a window ends
    strictly before the last row; a final window made of the last
    `frame_length` rows is always appended, so the end of the input is
    always covered.

    Args:
    features: 2-D array with one row per time step and `num_mffc` columns.
        It is expected to have at least `frame_length` rows.
    shift: Number of rows between the starts of consecutive windows; must be
        positive.
    frame_length: Number of rows in each window.
    num_mffc: Number of columns of `features`.

    Returns:
    Array of shape (n_windows, frame_length, num_mffc, 1).

    Raises:
    ValueError: If `shift` is not positive (the original loop never
        terminated in that case).
    """
    if shift <= 0:
        raise ValueError("shift must be positive, got %r" % (shift,))

    window_size = features.shape[0]
    # Start positions whose window ends strictly before the last row.
    frames = [features[start: start + frame_length, :]
              for start in range(0, window_size - frame_length, shift)]
    # Final window aligned with the end of the input.
    frames.append(features[-frame_length:, :])
    return np.array(frames).reshape(-1, frame_length, num_mffc, 1)

def compute_mfcc_frames(signal, shape=(98,40)):
    """Compute MFCCs for a signal and return them as model-sized windows.

    The MFCC matrix is computed with shape[1] coefficients and its shape is
    printed. If it has at most shape[0] rows, it is left-padded with zero rows
    up to shape[0] and returned as a single window. Otherwise it is cut into
    sliding windows of shape[0] rows with a shift of one row.

    Returns:
    Array of shape (n_windows, shape[0], shape[1], 1).
    """
    nb_rows, nb_cols = shape[0], shape[1]
    features = mfcc(signal, samplerate=16000, winlen=0.030, winstep=0.01, numcep=nb_cols,
                    lowfreq=20, highfreq=4000, appendEnergy=False, nfilt=nb_cols)
    print(features.shape)

    # Long input: slide a window over it one row at a time.
    if features.shape[0] > nb_rows:
        return generates_frames(features, shift=1, frame_length=nb_rows, num_mffc=nb_cols)

    # Short (or exactly fitting) input: prepend zero rows to reach nb_rows.
    missing_rows = nb_rows - features.shape[0]
    zero_rows = np.zeros((missing_rows, nb_cols))
    padded = np.concatenate((zero_rows, features), axis=0)
    return padded.reshape(1, nb_rows, nb_cols, 1)

In [52]:
# Load a recording to try sliding-window detection on. The path is relative to
# the working directory, and `fs` / `sig` overwrite the values set by the
# earlier sanity-check cell.
fs, sig = wav.read('phrase1_down.wav')

In [53]:
# Cut the recording into model-sized MFCC windows (also prints the MFCC shape).
frames = compute_mfcc_frames(sig)

(212, 40)


In [54]:
# (number of windows, frames per window, coefficients, channels)
frames.shape

(115, 98, 40, 1)

In [55]:
# Classify every window of the recording and keep the most probable class index.
# Note: this overwrites the `y_pred` computed for the test set.
y_pred = model.predict(frames, verbose=1)
y_pred = np.argmax(y_pred, axis=1)



In [56]:
# Per-window class indices: 0-3 index into `keywords`, 4 is the non-keyword class.
y_pred

array([1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1,
       1, 1, 1, 1, 4])

In [347]:
# NOTE(review): re-assigns `keywords` to the same list defined in the
# configuration cell near the top. The execution count (347) shows this cell
# was run out of order; it looks like a leftover that can be removed.
keywords = ['up', 'down', 'left', 'right']

In [28]:
# Scratch cell: evaluates the literal 1 and has no effect on the analysis.
1

1