In [1]:
import os
import os.path as path
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display
from sklearn.model_selection import train_test_split

In [2]:
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

In [3]:
data_dir = 'speech_commands'
commands = np.array(tf.io.gfile.listdir(str(data_dir)))
commands_directories = np.array([tf.io.gfile.isdir(path.join(data_dir, command)) for command in commands])
commands = commands[commands_directories]

In [4]:
filenames = tf.io.gfile.glob(data_dir + '/*/*')
filenames = tf.random.shuffle(filenames).numpy()
num_samples = len(filenames)
num_examples_per_label = np.array([len(tf.io.gfile.listdir(path.join(data_dir, command))) for command in commands])
print('Number of total examples:', num_samples)
print('Average number of examples per label:', num_examples_per_label.mean())
print('Standard Deviation of examples per label:', num_examples_per_label.std())

Number of total examples: 105836
Average number of examples per label: 2939.8888888888887
Standard Deviation of examples per label: 1099.0900725847364


In [5]:
train_files, val_files = train_test_split(filenames, test_size=0.3, random_state=seed)
val_files, test_files = train_test_split(val_files, test_size=0.33, random_state=seed)

print('Training set size:', len(train_files))
print('Validation set size:', len(val_files))
print('Test set size:', len(test_files))

Training set size: 74085
Validation set size: 21273
Test set size: 10478


In [6]:
def decode_audio(audio_binary):
    audio, _ = tf.audio.decode_wav(audio_binary)
    return tf.squeeze(audio, axis=-1)

def get_label(file_path):
    parts = tf.strings.split(file_path, os.path.sep)
    return parts[-2]

def get_waveform_and_label(file_path):
    label = get_label(file_path)
    audio_binary = tf.io.read_file(file_path)
    waveform = decode_audio(audio_binary)
    return waveform, label

In [7]:
def get_spectrogram(waveform):
    # zero-padding for an audio waveform with less than 16,000 samples
    input_len = 16000
    waveform = waveform[:input_len]
    zero_padding = tf.zeros([16000] - tf.shape(waveform), dtype=tf.float32)
    # cast the waveform tensors to float32
    waveform = tf.cast(waveform, dtype=tf.float32)
    # concatenate the waveform with zero_padding, which ensures all
    # audio clips are of the same length.decode_audio
    equal_length = tf.concat([waveform, zero_padding], axis=0)
    # convert the waveform to a spectrogram via a STFT.
    spectrogram = tf.signal.stft(equal_length, frame_length=255, frame_step=128)
    # obtain the magnitude of the STFT.
    spectrogram = tf.abs(spectrogram)
    # add a channels dimension, so that the spectrogram can be used
    # as image-like input data with conv layers (which expect
    # shape (batch_size, height, width, channels)).
    spectrogram = spectrogram[..., tf.newaxis]
    return spectrogram

def get_spectrogram_and_label_id(audio, label):
    spectrogram = get_spectrogram(audio)
    label_id = tf.argmax(label == commands)
    return spectrogram, label_id

In [8]:
AUTOTUNE = tf.data.AUTOTUNE

In [9]:
def preprocess_dataset(files):
    files_ds = tf.data.Dataset.from_tensor_slices(files)
    output_ds = files_ds.map(map_func=get_waveform_and_label, num_parallel_calls=AUTOTUNE)
    output_ds = output_ds.map(map_func=get_spectrogram_and_label_id, num_parallel_calls=AUTOTUNE)
    return output_ds

In [10]:
train_ds = preprocess_dataset(train_files)
val_ds = preprocess_dataset(val_files)
test_ds = preprocess_dataset(test_files)

In [11]:
batch_size = 64
train_ds = train_ds.batch(batch_size)
val_ds = val_ds.batch(batch_size)

In [12]:
train_ds = train_ds.cache().prefetch(AUTOTUNE)
val_ds = val_ds.cache().prefetch(AUTOTUNE)