# Purpose of this notebook:

This notebook recreates the functionality from 'speech_commands' example from the [Tensorflow](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/speech_commands) library to make dataloading and training for keyword detection on microcontrollers more flexible. (At least in my opinion) it provides easie access points to do data augmentation, transfer learning and experiment with different model architectures.

In [2]:
import numpy as np
import sys
import glob
import matplotlib.pyplot as plt
import math
import os
import random

import tensorflow as tf
import tensorflow_io as tfio
from tensorflow.lite.experimental.microfrontend.python.ops import audio_microfrontend_op as frontend_op

In [1]:
import audiomentations

In [3]:
training_meta = dict(
    wanted_words = ['licht', 'party', 'aus'],
    data_path = '/home/average-joe/coding_data/keyword_detection_nano/dataset/',
    epochs = 5,
    learning_rate = 1e-3,
    batch_size = 32,
)

audio_meta = dict(
    sample_rate = 16000,
    clip_duration = 1000,
    window_size_ms = 30,
    window_stride = 20,
    feature_bin_count = 40,
)

desired_samples = int(audio_meta['sample_rate'] * audio_meta['clip_duration'] / 1000)
window_size_samples = int(audio_meta['sample_rate'] * audio_meta['window_size_ms'] / 1000)
window_stride_samples = int(audio_meta['sample_rate'] * audio_meta['window_stride'] / 1000)
length_minus_window = desired_samples - window_size_samples
spectrogram_lenght = 1 + int(length_minus_window / window_stride_samples)

audio_meta['desired_samples'] = desired_samples
audio_meta['spectrogram_lenght'] = spectrogram_lenght
audio_meta['fingerprint_size'] = spectrogram_lenght * audio_meta['feature_bin_count']

augmentation_meta = dict(
    background_frequency = 0.8,
    background_volume_range = 0.1,
    time_shift_ms = 100.0,
    silence_percentage = 0.2,
    unknown_percentage = 0.2,
)

meta_dict = dict(
    audio = audio_meta,
    augmentation = augmentation_meta,
    training = training_meta
)

In [37]:
class KeywordDataset(tf.keras.utils.Sequence):
    def __init__(self,
                 fns,
                 background_fns,
                 meta_dict,
                 batch_size,
                 is_validation = False
                ):
        self.batch_size = batch_size
        self.words = meta_dict['training']['wanted_words']
        self.vocab = {word: i for i,word in enumerate(['silence', 'unknown'] + self.words)}
        self.unknown_fns = fns[1]
        self.audio_meta = meta_dict['audio']
        self.augmentation_meta = meta_dict['augmentation']
        self.is_validation = is_validation
        self.augment = audiomentations.Compose([
            #audiomentations.PitchShift(min_semitones=2, max_semitones=2, p=1.),
            audiomentations.BandPassFilter(p = .5),
            audiomentations.HighPassFilter(p = .5),
            audiomentations.LowPassFilter(p = .5),
            audiomentations.RoomSimulator(leave_length_unchanged = True, p = .5),
            audiomentations.TanhDistortion(p = .5)
        ])

        self.items = self.prepare_items(fns[0])
        self.background_data = self.prepare_background_data(background_fns)

    def __len__(self):
        return math.ceil(len(self.items) / self.batch_size)

    def __getitem__(self, idx):
        items = self.items[idx * self.batch_size: (idx + 1) * self.batch_size]
        xs, ys = [], []
        for fn in items:
            label = self.get_label(fn)
            audio = self.get_audio(fn, label).numpy().flatten()
            if not self.is_validation:
                audio = self.augment(audio, sample_rate = self.audio_meta['sample_rate'])
            spectro = self.get_spectrogram(audio)       
            xs.append(spectro)
            ys.append(self.vocab[label])
        return np.stack(xs), np.stack(ys)
    
    def on_epoch_end(self):
        if not self.is_validation:
            random.shuffle(self.items)
        
    def prepare_items(self, items):
        """ 
        Add the same amoung of placeholders for silence as there are unknowns.
        Return a shuffled list of items.
        To-Do: move this to the filename retrieval
        """
        items = items + ['silence_placeholder'] * len(self.unknown_fns) + self.unknown_fns
        random.shuffle(items)
        return items

    def prepare_background_data(self,fns):
        background_data = []
        for fn in fns:
            file = tf.io.read_file(fn)
            audio, _ = tf.audio.decode_wav(file, desired_channels=1)
            if len(audio) < self.audio_meta['desired_samples']:
                continue
            background_data.append(audio)
        return background_data
        
    def get_label(self, fn):
        if fn == 'silence_placeholder':
            return 'silence'
        else:
            folder = fn.split('/')[-2]
            if folder in self.words:
                return folder
            return 'unknown'
    
    def load_audio(self, fn):
        file = tf.io.read_file(fn)
        audio, _ = tf.audio.decode_wav(contents = file, desired_channels = 1, desired_samples = self.audio_meta['desired_samples'])     
        return audio
    
    def get_timeshift_params(self):
        time_shift = self.augmentation_meta['time_shift_ms']
        background_frequency = self.augmentation_meta['background_frequency']
        background_volume_range = self.augmentation_meta['background_volume_range']
        
        time_shift_amount = np.random.randint(-time_shift, time_shift) if time_shift > 0 else 0
        if time_shift_amount > 0:
            time_shift_padding = [[time_shift_amount, 0], [0,0]]
            time_shift_offset = [0,0]
        else:
            time_shift_padding = [[0,-time_shift_amount], [0,0]]
            time_shift_offset = [-time_shift_amount, 0]
            
        return time_shift_padding, time_shift_offset
    
    def get_random_background(self, label):
        background_sample = random.choice(self.background_data)

        background_offset = np.random.randint(0, len(background_sample) - self.audio_meta['desired_samples'])
        background_clipped = background_sample[background_offset:(background_offset + self.audio_meta['desired_samples'])]
        background_reshaped = tf.reshape(background_clipped, [self.audio_meta['desired_samples'],1])
        
        if label == 'silence':
            background_volume = np.random.uniform(0,1)
        elif np.random.uniform(0,1) < self.augmentation_meta['background_frequency']:
            background_volume = np.random.uniform(0, self.augmentation_meta['background_volume_range'])
        else:
            background_volume = 0

        background_mul = tf.multiply(background_reshaped, background_volume)
        return background_mul
    
    def get_audio(self, fn, label):
        if self.is_validation and label != 'silence':
            return self.load_audio(fn)
        background_mul = self.get_random_background(label)
        if label == 'silence':
            return background_mul

        foreground = self.load_audio(fn)
        time_shift_padding, time_shift_offset = self.get_timeshift_params()
        
        padded_foreground = tf.pad(tensor = foreground, paddings = time_shift_padding, mode = 'CONSTANT')
        sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [self.audio_meta['desired_samples'], -1])
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1., 1.)
        
        return background_clamp
        
    def get_spectrogram(self, audio):
        int_16_input = tf.cast(tf.multiply(audio, 32768), tf.int16)
        micro_frontend = frontend_op.audio_microfrontend(
            int_16_input,
            sample_rate = self.audio_meta['sample_rate'],
            window_size = self.audio_meta['window_size_ms'],
            window_step = self.audio_meta['window_stride'],
            num_channels = self.audio_meta['feature_bin_count'],
            out_scale = 1,
            out_type = tf.float32
        )
        spectro = tf.multiply(micro_frontend, (10. / 256.)).numpy().flatten()
        return spectro

In [38]:
def get_fns(path, wanted_words, val_pct = 0.2, unknown_pct = 0.2, seed = None):
    wanted_words_fns = {}
    unknown_words_fns = []
    background_fns = []
    
    """ Get all .wav files contained at the provided path and add them to the appropriate list """
    fns = glob.glob(os.path.join(path,'*','*.wav'))
    for fn in fns:
        folder = os.path.split(os.path.dirname(fn))[-1]
        if folder == '_background_noise_':
            background_fns.append(fn)
        elif folder in wanted_words:
                if wanted_words_fns.get(folder, False):
                    wanted_words_fns[folder].append(fn)
                else:
                    wanted_words_fns[folder] = [fn]
        else:
            unknown_words_fns.append(fn)
            
    """ Split wanted/unknown in training and validation """
    training_words = []
    validation_words = []
    for key in wanted_words_fns.keys():
        word_fns = wanted_words_fns[key]
        random.shuffle(word_fns)
        n_val_word = int(len(word_fns) * val_pct)
        validation_words.extend(word_fns[:n_val_word])
        training_words.extend(word_fns[n_val_word:])
    
    n_val_unknown = int(len(unknown_words_fns) * val_pct)
    validation_unknowns = unknown_words_fns[:n_val_unknown]
    training_unknowns = unknown_words_fns[n_val_unknown:]
    
    n_training = len(training_words)
    n_training_unknown = int(n_training * unknown_pct)
    training_unknowns = random.sample(training_unknowns, k = n_training_unknown)
    
    n_validation = len(validation_words)
    n_validation_unknown = int(n_validation * unknown_pct)
    validation_unknowns = random.sample(validation_unknowns, k = n_validation_unknown)
    
    training_fns = [training_words, training_unknowns]
    validation_fns = [validation_words, validation_unknowns]
    return training_fns, validation_fns, background_fns

In [39]:
training_fns, validation_fns, background_fns = get_fns(training_meta['data_path'], training_meta['wanted_words'])

training_ds = KeywordDataset(
    training_fns,
    background_fns,
    meta_dict,
    training_meta['batch_size'],
    is_validation = False
)
validation_ds = KeywordDataset(
    validation_fns,
    background_fns,
    meta_dict,
    training_meta['batch_size'] * 2, ## a bigger batch size is possible since no gradients are used
    is_validation = True
)

In [40]:
## Sequential API: doesn't work
#model = tf.keras.Sequential([
#    tf.keras.layers.Reshape((spectrogram_lenght,feature_bin_count,1), input_shape = (fingerprint_size, )),
#    tf.keras.layers.Conv2D(filters = 8, kernel_size = (8,10), strides = (2,2), padding = 'same', activation="relu"),
#    ##tf.keras.layers.Dropout(0.5),
#    ##tf.keras.layers.Flatten(),
#    tf.keras.layers.Reshape((4000,)),
#    tf.keras.layers.Dense(5, activation = "softmax"),
#])

In [41]:
## Functional API: does work!
fingerprint_size = audio_meta['fingerprint_size']
spectrogram_length = audio_meta['spectrogram_lenght']
feature_bin_count = audio_meta['feature_bin_count']

n_labels = len(training_ds.vocab)

inputs = tf.keras.Input(shape = (fingerprint_size,))
x = tf.keras.layers.Reshape(target_shape = [-1, spectrogram_length, feature_bin_count, 1])(inputs)
x = tf.keras.layers.Conv2D(filters = 8, 
                           kernel_size = (8, 10), 
                           strides = (2, 2), 
                           padding = 'same', 
                           activation = 'relu')(x)
x = tf.keras.layers.ReLU()(x)
x = tf.keras.layers.Dropout(0.7)(x)
x = tf.keras.layers.Flatten()(x)
out = tf.keras.layers.Dense(n_labels, 
                            activation = 'softmax')(x)

model = tf.keras.Model(inputs = inputs, outputs = out)

In [42]:
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1960)]            0         
                                                                 
 reshape_3 (Reshape)         (None, 1, 49, 40, 1)      0         
                                                                 
 conv2d_3 (Conv2D)           (None, 1, 25, 20, 8)      648       
                                                                 
 re_lu_3 (ReLU)              (None, 1, 25, 20, 8)      0         
                                                                 
 dropout_3 (Dropout)         (None, 1, 25, 20, 8)      0         
                                                                 
 flatten_3 (Flatten)         (None, 4000)              0         
                                                                 
 dense_3 (Dense)             (None, 5)                 2000

In [43]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate = training_meta['learning_rate']),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(), 
    metrics = ['accuracy']
)

In [44]:
history = model.fit(
    training_ds,
    validation_data = validation_ds,
    epochs = 25,
    verbose = 1,
    shuffle = False, ## is handled by dataloader
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [45]:
export_name = 'functional_augmented_859'
export_dir = f'saved_model/{export_name}'
tf.saved_model.save(model, export_dir)



INFO:tensorflow:Assets written to: saved_model/functional_augmented_859/assets


INFO:tensorflow:Assets written to: saved_model/functional_augmented_859/assets


In [46]:
REP_DATA_SIZE = 100
def representative_dataset_gen():
    for i in range(REP_DATA_SIZE):
            fn = random.choice(validation_ds.items)
            label = validation_ds.get_label(fn)
            audio = validation_ds.get_audio(fn, label)
            spectro = validation_ds.get_spectrogram(audio).reshape(1,1960)
            
            yield [spectro]

In [47]:
converter = tf.lite.TFLiteConverter.from_saved_model(export_dir)

converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.inference_input_type = tf.compat.v1.lite.constants.INT8 
converter.inference_output_type = tf.compat.v1.lite.constants.INT8

converter.representative_dataset = representative_dataset_gen
tflite_model = converter.convert()
tflite_model_size = open(f"models/{export_name}.tflite", "wb").write(tflite_model)
print(f"Quantized modelsize: {tflite_model_size}")

Quantized modelsize: 23752


2022-08-09 16:33:33.074444: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:362] Ignored output_format.
2022-08-09 16:33:33.074472: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:365] Ignored drop_control_dependency.
2022-08-09 16:33:33.075026: I tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: saved_model/functional_augmented_859
2022-08-09 16:33:33.076320: I tensorflow/cc/saved_model/reader.cc:81] Reading meta graph with tags { serve }
2022-08-09 16:33:33.076334: I tensorflow/cc/saved_model/reader.cc:122] Reading SavedModel debug info (if present) from: saved_model/functional_augmented_859
2022-08-09 16:33:33.079187: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:354] MLIR V1 optimization pass is not enabled
2022-08-09 16:33:33.079990: I tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2022-08-09 16:33:33.109710: I tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel 

In [48]:
!xxd -i models/{export_name}.tflite > models/{export_name}.cc
#REPLACE_TEXT = MODEL_TFLITE.replace('/', '_').replace('.', '_')
#!sed -i 's/'{REPLACE_TEXT}'/g_model/g' {MODEL_TFLITE_MICRO}

In [19]:
!code models/{export_name}.cc