In [1]:
%%capture
import numpy as np
from pprint import pprint
import tensorflow as tf
import random

##import re
##import shutil
##import glob
##import os

!pip install tensorflow-io
!pip install audiomentations

In [2]:
%%capture
!wget https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz
DATASET_DIR =  'dataset/'
!mkdir dataset
!tar -xf speech_commands_v0.02.tar.gz -C 'dataset'
!rm -r -f speech_commands_v0.02.tar.gz

!apt-get update && apt-get -qq install xxd

!rm KeywordDataset.py
!wget https://raw.githubusercontent.com/ben-karr/KeywordDetectionTinyML/master/KeywordDataset.py

In [3]:
from KeywordDataset import get_fns, KeywordDataset, get_meta, get_pretrain_words, get_model

# Prepare settings

At first we build a dictionary to hold metadata about training and data handling. At least pass the path to the dataset as `data_path` and `wanted_words` as a list. You can check the default values for the used keywords:

In [4]:
# Preview the settings dictionary generated for the keywords 'yes' and 'no'.
pprint(get_meta(data_path='dataset/', wanted_words=['yes', 'no']))

{'audio': {'clip_duration': 1000,
           'desired_samples': 16000,
           'feature_bin_count': 40,
           'fingerprint_size': 1960,
           'sample_rate': 16000,
           'spectrogram_lenght': 49,
           'window_size_ms': 30,
           'window_stride': 20},
 'augmentation': {'background_frequency': 0.8,
                  'background_volume_range': 0.1,
                  'silence_percentage': 0.2,
                  'time_shift_ms': 100.0,
                  'unknown_percentage': 0.2},
 'training': {'batch_size': 32,
              'data_path': 'dataset/',
              'epochs': 5,
              'excluded_words': [],
              'learning_rate': 0.001,
              'wanted_words': ['yes', 'no']}}


If you want to use pretraining you can use `get_pretrain_words`. It pulls `n` words from the provided `path` ignoring the `excluded_words` so you can fine tune on these words while the pretrained model has not seen them yet.

In [5]:
data_path = 'dataset/'
# These keywords are kept out of pretraining so they can be used for fine-tuning.
excluded_words = ['yes', 'no']
# Pull 5 pretraining keywords from the dataset, ignoring the excluded ones.
wanted_words = get_pretrain_words(data_path, excluded_words, shuffle=True, n=5)

meta_dict = get_meta(
    data_path=data_path,
    wanted_words=wanted_words,
    excluded_words=excluded_words,
    epochs=1,
)
pprint(meta_dict)

{'audio': {'clip_duration': 1000,
           'desired_samples': 16000,
           'feature_bin_count': 40,
           'fingerprint_size': 1960,
           'sample_rate': 16000,
           'spectrogram_lenght': 49,
           'window_size_ms': 30,
           'window_stride': 20},
 'augmentation': {'background_frequency': 0.8,
                  'background_volume_range': 0.1,
                  'silence_percentage': 0.2,
                  'time_shift_ms': 100.0,
                  'unknown_percentage': 0.2},
 'training': {'batch_size': 32,
              'data_path': 'dataset/',
              'epochs': 1,
              'excluded_words': ['yes', 'no'],
              'learning_rate': 0.001,
              'wanted_words': ['sheila', 'nine', 'five', 'down', 'follow']}}


# Load item / background filenames
To load the .wav files into memory when training, the KeywordDataset needs a list of filenames to build batches from. `get_fns` collects all .wav files from subfolders in a given directory. Besides the filenames contained in folders named like one of the `wanted_words`, it adds words that are neither wanted nor excluded as `unknown` words, as well as placeholders for silence (`silence_placeholder`), in the required ratio (given by `silent_pct`, `unknown_pct`). This ratio applies to both the training and the validation split, whose size is determined by `val_pct`.

_Note:_ When generating the list of filenames every audio file is loaded into memory to check if it has the required length and skipped if not. Since this takes some time you could save the results (as a python list or .csv); you could also do this if you want to fix the training / validation split for model comparison.

In [6]:
# Build the training / validation / background filename lists from the settings in meta_dict.
training_fns, validation_fns, background_fns = get_fns(
    path=meta_dict['training']['data_path'],
    wanted_words=meta_dict['training']['wanted_words'],
    excluded_words=meta_dict['training']['excluded_words'],
    desired_samples=meta_dict['audio']['desired_samples'],
    val_pct=0.2,
    silent_pct=0.2,
    unknown_pct=0.2,
)

In [7]:
# Peek at the first ten entries of the training filename list.
print(training_fns[:10])

['dataset/down/a6285644_nohash_1.wav', 'silence_placeholder', 'dataset/down/0819edb0_nohash_1.wav', 'dataset/four/f9643d42_nohash_3.wav', 'silence_placeholder', 'silence_placeholder', 'dataset/down/b36c27c2_nohash_0.wav', 'dataset/five/784e281a_nohash_1.wav', 'dataset/down/9b02d503_nohash_1.wav', 'dataset/down/8f4c551f_nohash_0.wav']


# Build training / validation datasets
Other than the filenames, the class gets all required metadata from the provided `meta_dict`. For validation we use double the batch size of that in training since not using gradients in validation frees up memory.

In [8]:
# The training split uses the configured batch size.
training_ds = KeywordDataset(
    training_fns,
    background_fns,
    meta_dict,
    meta_dict['training']['batch_size'],
    is_validation=False,
)
# The validation split uses twice the training batch size.
validation_ds = KeywordDataset(
    validation_fns,
    background_fns,
    meta_dict,
    meta_dict['training']['batch_size'] * 2,
    is_validation=True,
)

The KeywordDataset class copies most of the data loading process from various parts of [input_data.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/speech_commands/input_data.py). One thing it adds is augmentations (in addition to the default background noise and shifting the audio signal a bit back or forth), in particular those of the [audiomentations library](https://github.com/iver56/audiomentations). By default it uses the following steps, but I spent next to no time finding well-performing augmentations / settings, so it would be a good idea to tinker with them some more.

In [9]:
# Show the augmentation steps currently attached to the training dataset.
training_ds.augment.transforms

[<audiomentations.augmentations.clipping_distortion.ClippingDistortion at 0x7fdf0de55490>,
 <audiomentations.augmentations.high_pass_filter.HighPassFilter at 0x7fdf0de55dd0>,
 <audiomentations.augmentations.low_pass_filter.LowPassFilter at 0x7fdf0de55890>,
 <audiomentations.augmentations.gain_transition.GainTransition at 0x7fdf0de55bd0>,
 <audiomentations.augmentations.pitch_shift.PitchShift at 0x7fdf0de55510>,
 <audiomentations.augmentations.seven_band_parametric_eq.SevenBandParametricEQ at 0x7fdf0de55c50>,
 <audiomentations.augmentations.polarity_inversion.PolarityInversion at 0x7fdf0de55390>,
 <audiomentations.augmentations.time_mask.TimeMask at 0x7fdf0de552d0>,
 <audiomentations.augmentations.add_gaussian_noise.AddGaussianNoise at 0x7fdf0de55990>]

You can easily change this via the `training_ds.augment` attribute, e.g.:
```python
import audiomentations
augs = audiomentations.Compose([
  audiomentations.ClippingDistortion(p=0.5),
  audiomentations.LowPassFilter(p=0.5),
  …
])
training_ds.augment = augs
```
or remove the augmentations (other than background noise and time shift) altogether:
```python
training_ds.augment = lambda samples, sample_rate: samples
```

If you want to have a quick listen to the results of your augmentations, you can check the audio before the spectrogram is created with the `get_audio` method.

In [10]:
from IPython.display import Audio

# Read the sample rate once so augmentation and playback always agree
# (the player previously used a hard-coded 16000).
sample_rate = meta_dict['audio']['sample_rate']

fn = training_ds.items[2]
label = training_ds.get_label(fn)
# Waveform before the audiomentations pipeline is applied, flattened to 1-D.
raw_audio = training_ds.get_audio(fn, label).numpy().flatten()
audio = training_ds.augment(raw_audio, sample_rate=sample_rate)

print(f'Label: {label}')
Audio(audio, rate=sample_rate)

Label: down


# Building a model
Since the data is now ready for training, we need to create a model. You can build your own Keras model, but make sure to use the Functional API, since inference on the Arduino doesn't work with models built with the Sequential API (at least for me). Alternatively, you can use `get_model` to receive an implementation of the `tiny` models from the original [tensorflow speech_command example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/speech_commands/models.py) (`tiny_conv` or `tiny_embedding_conv`).

In [11]:
# The number of vocabulary entries is passed to get_model as the label count.
n_labels = len(training_ds.vocab)
model = get_model(n_labels, meta_dict, arch='tiny_conv', dropout=0.5)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1960)]            0         
                                                                 
 reshape (Reshape)           (None, 1, 49, 40, 1)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, 25, 20, 8)      648       
                                                                 
 dropout (Dropout)           (None, 1, 25, 20, 8)      0         
                                                                 
 flatten (Flatten)           (None, 4000)              0         
                                                                 
 dense (Dense)               (None, 7)                 28007     
                                                                 
Total params: 28,655
Trainable params: 28,655
Non-trainable p

# Train the model
Compile the model with the desired optimizer, loss and metrics and train for a few epochs.

In [12]:
# NOTE(review): SparseCategoricalCrossentropy defaults to from_logits=False, so
# the model's last layer must output probabilities — confirm in get_model.
model.compile(
    optimizer=tf.keras.optimizers.Adam(
        learning_rate=meta_dict['training']['learning_rate'],
    ),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Pretrain for the number of epochs configured in meta_dict.
history = model.fit(
    training_ds,
    validation_data=validation_ds,
    epochs=meta_dict['training']['epochs'],
    verbose=1,
    shuffle=False,  # shuffling is handled by the dataloader
)



# Save model
Use the save method of the Keras model or `tf.keras.models.save_model` if you want to use `get_model` for transfer learning; otherwise you have to change the head of the model yourself.

In [None]:
# The same path is passed as `pretrain_path` when the model is loaded for fine-tuning.
model.save('pretrain_1epochs')

# Prepare settings and dataset for transfer learning
We can now fine-tune on the words that we excluded before...


In [13]:
# New settings for fine-tuning: the target keywords are now 'yes' and 'no'.
meta_dict = get_meta(data_path='dataset/', wanted_words=['yes', 'no'])
pprint(meta_dict)

{'audio': {'clip_duration': 1000,
           'desired_samples': 16000,
           'feature_bin_count': 40,
           'fingerprint_size': 1960,
           'sample_rate': 16000,
           'spectrogram_lenght': 49,
           'window_size_ms': 30,
           'window_stride': 20},
 'augmentation': {'background_frequency': 0.8,
                  'background_volume_range': 0.1,
                  'silence_percentage': 0.2,
                  'time_shift_ms': 100.0,
                  'unknown_percentage': 0.2},
 'training': {'batch_size': 32,
              'data_path': 'dataset/',
              'epochs': 5,
              'excluded_words': [],
              'learning_rate': 0.001,
              'wanted_words': ['yes', 'no']}}


get the new filenames...

In [14]:
# Rebuild the filename lists for the fine-tuning keywords held in meta_dict.
training_fns, validation_fns, background_fns = get_fns(
    path=meta_dict['training']['data_path'],
    wanted_words=meta_dict['training']['wanted_words'],
    excluded_words=meta_dict['training']['excluded_words'],
    desired_samples=meta_dict['audio']['desired_samples'],
    val_pct=0.2,
    silent_pct=0.2,
    unknown_pct=0.2,
)

and create the datasets:

In [15]:
# Fine-tuning datasets: training uses the configured batch size ...
training_ds = KeywordDataset(
    training_fns,
    background_fns,
    meta_dict,
    meta_dict['training']['batch_size'],
    is_validation=False,
)
# ... and validation uses twice that size.
validation_ds = KeywordDataset(
    validation_fns,
    background_fns,
    meta_dict,
    meta_dict['training']['batch_size'] * 2,
    is_validation=True,
)

# Load the pretrained model
`get_model` loads the pretrained model from a given path, removes the head (last Dense layer) and adds a Dense layer that has the right amount of output nodes.

In [17]:
# Label count for the fine-tuning vocabulary, passed to get_model together
# with the path of the saved pretrained model.
n_labels = len(training_ds.vocab)
fine_tune_model = get_model(n_labels, meta_dict, pretrain_path='pretrain_1epochs')
fine_tune_model.summary()

In [None]:
# NOTE(review): SparseCategoricalCrossentropy defaults to from_logits=False, so
# the model's last layer must output probabilities — confirm in get_model.
fine_tune_model.compile(
    optimizer=tf.keras.optimizers.Adam(
        learning_rate=meta_dict['training']['learning_rate'],
    ),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

As you can see in the summary: `get_model` freezes all but the last layer (`Non-trainable params: 648`). We should train the model in this fashion for a few epochs so the untrained layer can 'catch up' with the pretrained ones.

In [None]:
# First fine-tuning stage: two epochs on the model as loaded by get_model.
fine_tune_history = fine_tune_model.fit(
    training_ds,
    validation_data=validation_ds,
    epochs=2,
    verbose=1,
    shuffle=False,  # shuffling is handled by the dataloader
)

Epoch 1/2
Epoch 2/2


Unfreeze the model and train some more.

In [None]:
fine_tune_model.trainable = True
# Keras fixes the trainable state of every layer at compile time, so the model
# has to be compiled again; otherwise the next fit() keeps training with the
# previously frozen configuration. The settings mirror the earlier compile
# call; a lower learning rate is a common choice for this stage.
fine_tune_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=meta_dict['training']['learning_rate']),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)
fine_tune_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1960)]            0         
                                                                 
 reshape (Reshape)           (None, 1, 49, 40, 1)      0         
                                                                 
 conv2d (Conv2D)             (None, 1, 25, 20, 8)      648       
                                                                 
 dropout (Dropout)           (None, 1, 25, 20, 8)      0         
                                                                 
 flatten (Flatten)           (None, 4000)              0         
                                                                 
 dense_1 (Dense)             (None, 4)                 16004     
                                                                 
Total params: 16,652
Trainable params: 16,652
Non-trainable

In [None]:
# Second fine-tuning stage: three more epochs after unfreezing the model.
fine_tune_history_thawn = fine_tune_model.fit(
    training_ds,
    validation_data=validation_ds,
    epochs=3,
    verbose=1,
    shuffle=False,  # shuffling is handled by the dataloader
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


# Save model for TFLite
from: [CustomDatasetKWSModel](https://colab.research.google.com/github/tinyMLx/colabs/blob/master/4-6-8-CustomDatasetKWSModel.ipynb) of [Deploying TinyML](https://learning.edx.org/course/course-v1:HarvardX+TinyML3+1T2022/home).

In [None]:
# Base name reused for the SavedModel directory, the .tflite file and the .cc file.
export_name = 'model_fromPre_wAugs'
export_dir = f'saved_model/{export_name}'
# Write the fine-tuned model in SavedModel format; the TFLite converter reads it from export_dir.
tf.saved_model.save(fine_tune_model, export_dir)

Build a dataset generator for quantization

In [None]:
REP_DATA_SIZE = 100
# Shuffle a copy: `random.shuffle` works in place, so shuffling
# `validation_ds.items` directly would also reorder the list held by the dataset.
items = list(validation_ds.items)
random.shuffle(items)


def representative_dataset_gen():
    """Yield calibration inputs for the TFLite converter.

    Walks over at most ``REP_DATA_SIZE`` of the shuffled validation items and,
    for each one, yields a single-element list holding its spectrogram
    reshaped to ``(1, fingerprint_size)``.
    """
    for fn in items[:REP_DATA_SIZE]:
        label = validation_ds.get_label(fn)
        audio = validation_ds.get_audio(fn, label)
        spectro = validation_ds.get_spectrogram(audio).reshape(1, meta_dict['audio']['fingerprint_size'])

        yield [spectro]

Do quantization:

In [None]:
converter = tf.lite.TFLiteConverter.from_saved_model(export_dir)

# Quantize with int8 input and output tensors; the representative dataset
# supplies the sample inputs the converter uses during conversion.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8
converter.representative_dataset = representative_dataset_gen

tflite_model = converter.convert()
# The context manager closes (and flushes) the file before the next cell reads it.
with open(f"{export_name}.tflite", "wb") as tflite_file:
    tflite_model_size = tflite_file.write(tflite_model)
print(f"Quantized modelsize: {tflite_model_size}")



Quantized modelsize: 19424


# Save for Arduino

In [None]:
# Dump the .tflite file as a C array into a .cc file, then print that file.
!xxd -i {export_name}.tflite > {export_name}.cc
!cat {export_name}.cc

unsigned char model_fromPre_wAugs_tflite[] = {
  0x1c, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x14, 0x00, 0x20, 0x00,
  0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00, 0x00, 0x00,
  0x18, 0x00, 0x1c, 0x00, 0x14, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
  0x18, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x34, 0x01, 0x00, 0x00,
  0x28, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00,
  0x04, 0x00, 0x00, 0x00, 0xf8, 0x05, 0x00, 0x00, 0x34, 0x04, 0x00, 0x00,
  0xa8, 0x02, 0x00, 0x00, 0xe8, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
  0x30, 0x01, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x80, 0x4b, 0x00, 0x00,
  0x7c, 0x4b, 0x00, 0x00, 0x68, 0x4a, 0x00, 0x00, 0x04, 0x4a, 0x00, 0x00,
  0xac, 0x46, 0x00, 0x00, 0xd8, 0x45, 0x00, 0x00, 0xe8, 0x06, 0x00, 0x00,
  0x70, 0x06, 0x00, 0x00, 0x60, 0x4b, 0x00, 0x00, 0x5c, 0x4b, 0x00, 0x00,
  0x58, 0x4b, 0x00, 0x00, 0x54, 0x4b, 0x00, 0x00, 0x50, 0x4b, 0x00, 0x00,
  0xb4, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x0