In [1]:
!pip install tensorflow_io

Collecting tensorflow_io
  Downloading tensorflow_io-0.32.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (28.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.0/28.0 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow_io
Successfully installed tensorflow_io-0.32.0


In [2]:
import os
import wave
import librosa
import random
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import IPython.display as ipd
import tensorflow as tf
import tensorflow_io as tfio
from functools import reduce

# Speech Recognition

In [3]:
# Root folder of the speech data on the mounted Google Drive.
# (A previous location was 'drive/MyDrive/Kaggle/speech_recognition/'.)
path = '/content/gdrive/MyDrive/speech_recognition/'

# Words to recognise; a word's position in this list is used as its integer label.
classes = [
    'yes', 'no',
    'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
    'up', 'down',
]

In [4]:
# Mount Google Drive at /content/gdrive; `path` points inside this mount,
# so this cell must run before any cell that reads files under `path`.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Building dataset

In [44]:
# Define paths to labeled data and split into Train, Validation and Test using the Kaggle split lists.
# For every class at most 80% / 10% / 10% of FILES_TO_LOAD files go to train / val / test.
tf_dic_train = {}
tf_dic_val = {}
tf_dic_test = {}
FILES_TO_LOAD = 500

with open(os.path.join(path, "testing_list.txt"), 'r') as file:
  test_names = file.read().splitlines()
with open(os.path.join(path, "validation_list.txt"), 'r') as file:
  val_names = file.read().splitlines()

# Sets give O(1) membership tests; the lists above are kept for any later use.
test_name_set = set(test_names)
val_name_set = set(val_names)

# Maximum number of files kept per class for each split.
split_caps = {
  'train': FILES_TO_LOAD*0.8,
  'val': FILES_TO_LOAD*0.1,
  'test': FILES_TO_LOAD*0.1,
}


def _labeled_dataset(file_paths, label_index):
  """Build a dataset of (file path, integer label) pairs for a single class.

  Args:
    file_paths: list of path strings (may be empty).
    label_index: integer label attached to every path.

  Returns:
    A tf.data.Dataset yielding (string scalar, int32 scalar) pairs.
  """
  # from_tensor_slices keeps the paths literal and in the given order. list_files
  # would interpret them as glob patterns, shuffle them, and raise on an empty list.
  paths = tf.constant(file_paths, dtype=tf.string)
  labels = tf.fill((len(file_paths),), label_index)
  return tf.data.Dataset.from_tensor_slices((paths, labels))


for i, label in enumerate(classes):
  label_dir = os.path.join(path, label)
  split_files = {'train': [], 'val': [], 'test': []}

  # os.listdir returns entries in arbitrary order; sorting makes the capped
  # subset of files identical on every run.
  for filename in sorted(os.listdir(label_dir)):
    # The split lists identify files as "<label>/<filename>".
    key = "/".join([label, filename])
    if key in test_name_set:
      split = 'test'
    elif key in val_name_set:
      split = 'val'
    else:
      split = 'train'

    # Files beyond the per-split cap are ignored.
    if len(split_files[split]) < split_caps[split]:
      split_files[split].append(os.path.join(label_dir, filename))

  tf_dic_train[label] = _labeled_dataset(split_files['train'], i)
  tf_dic_val[label] = _labeled_dataset(split_files['val'], i)
  tf_dic_test[label] = _labeled_dataset(split_files['test'], i)

datasets_train = list(tf_dic_train.values())
datasets_val = list(tf_dic_val.values())
datasets_test = list(tf_dic_test.values())

In [45]:
def dataset_reduce(datasets, buffer_size=None):
  """Concatenate several datasets into one and shuffle the result.

  Args:
    datasets: non-empty sequence of datasets with compatible element structure.
    buffer_size: shuffle buffer size. When None (default) the number of elements
      of the merged dataset is used, so the shuffle is uniform over all elements.

  Returns:
    The concatenated dataset with a shuffle applied.

  Raises:
    ValueError: if `datasets` is empty.
  """
  datasets = list(datasets)
  if not datasets:
    raise ValueError("dataset_reduce needs at least one dataset")

  merged_dataset_reduce = reduce(lambda d1, d2: d1.concatenate(d2), datasets)

  if buffer_size is None:
    # The inputs are concatenated one after another, so a buffer smaller than the
    # merged dataset only mixes neighbouring inputs. Use the full size instead.
    n_elements = int(merged_dataset_reduce.cardinality())
    # cardinality() is negative when unknown/infinite and 0 when empty; shuffle
    # needs a positive buffer, so fall back to the previous fixed size of 1000.
    buffer_size = n_elements if n_elements > 0 else 1000

  return merged_dataset_reduce.shuffle(buffer_size=buffer_size)

In [46]:
# Merge the per-class datasets of each split into a single shuffled dataset.
# The inputs are rebuilt from the tf_dic_* dicts so this cell can be re-run:
# passing the already merged datasets_* back into dataset_reduce would fail.
datasets_train = dataset_reduce(list(tf_dic_train.values()))
datasets_val = dataset_reduce(list(tf_dic_val.values()))
datasets_test = dataset_reduce(list(tf_dic_test.values()))

In [47]:
# Number of (file path, label) examples in the train / validation / test splits
len(datasets_train), len(datasets_val), len(datasets_test)

(5171, 650, 650)

# Preprocessing

In [48]:
def load_wav_16k_mono(filename):
    """Read a WAV file and return it as a single-channel waveform resampled to 16 kHz.

    Args:
        filename: path of the WAV file to read.

    Returns:
        The decoded waveform with the channel axis removed, resampled to 16000 Hz.
    """
    raw_bytes = tf.io.read_file(filename)
    # desired_channels=1 yields one channel, stored on the trailing axis
    audio, native_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    mono = tf.squeeze(audio, axis=-1)
    # the resampler is given the file's own rate as int64
    return tfio.audio.resample(
        mono,
        rate_in=tf.cast(native_rate, dtype=tf.int64),
        rate_out=16000,
    )

In [49]:
# Load the padding wav file from the _noise folder (<path>/_noise/dude_miaowing.wav).
# The location is derived from `path` so it follows the dataset root when that changes.
padding_file = os.path.join(path, '_noise', 'dude_miaowing.wav')
padding_contents = tf.io.read_file(padding_file)
# NOTE(review): the file is not resampled here; presumably it is already 16 kHz like
# the output of load_wav_16k_mono - confirm via padding_sample_rate.
padding_waveform, padding_sample_rate = tf.audio.decode_wav(padding_contents, desired_channels=1)
# Drop the single channel axis so the waveform is 1-D
padding_waveform = tf.squeeze(padding_waveform, axis=-1)

In [50]:
def preprocess_multy(file_path, label):
    """Turn one (file path, label) pair into a (spectrogram, one-hot label) pair.

    The audio is loaded at 16 kHz and cut to at most 16000 samples. Shorter clips are
    extended at the end with a randomly positioned slice of `padding_waveform`, so
    every waveform is exactly 16000 samples long before the STFT.

    Args:
        file_path: path of the WAV file.
        label: integer class index (position of the word in `classes`).

    Returns:
        A tuple (spectrogram, one_hot_label): the STFT magnitude with a trailing
        channel axis, shape (491, 257, 1), and the label one-hot encoded with
        depth len(classes).
    """
    target_length = 16000

    wav = load_wav_16k_mono(file_path)
    wav = wav[:target_length]

    # Number of samples missing to reach the target length (0 for full-length clips)
    current_length = tf.shape(wav)[0]
    pad_length = target_length - current_length

    # Pad waveform if it is shorter than the target length
    if pad_length > 0:
        # Random start such that the slice stays inside the padding waveform
        padding_start = tf.random.uniform(shape=[], minval=0, maxval=padding_waveform.shape[0] - pad_length, dtype=tf.int32)
        padding_slice = padding_waveform[padding_start : padding_start + pad_length]

        # Pad the waveform with the selected slice
        wav = tf.concat([wav, padding_slice], axis=0)

    # The data-dependent branch above erases the static length; restore it so the
    # spectrogram (and everything built on it) has a fully defined shape.
    wav = tf.ensure_shape(wav, [target_length])

    spectrogram = tf.signal.stft(wav, frame_length=320, frame_step=32)
    spectrogram = tf.abs(spectrogram)
    # Add a channel axis
    spectrogram = tf.expand_dims(spectrogram, axis=2)
    # Depth follows the class list instead of a hard-coded 13
    return spectrogram, tf.one_hot(label, len(classes))

In [51]:
# Train data: preprocess -> cache -> shuffle -> batches of 16 -> prefetch
train_data = (
    datasets_train
    .map(preprocess_multy)
    .cache()
    .shuffle(buffer_size=1000)
    .batch(16)
    .prefetch(8)
)



In [52]:
# Validation data: preprocess -> cache -> batches of 16 -> prefetch.
# No shuffle here: it has no effect on validation metrics, and without it the
# order of examples stays the same between passes once one complete pass has
# filled the cache.
val_data = datasets_val.map(preprocess_multy)
val_data = val_data.cache()
val_data = val_data.batch(16)
val_data = val_data.prefetch(8)



In [53]:
# Test data: preprocess -> cache -> batches of 16 -> prefetch.
# No shuffle here: it has no effect on test metrics, and a reshuffle on every
# pass would make predictions from one pass impossible to line up with labels
# read in another. The order is stable once one complete pass has filled the cache.
test_data = datasets_test.map(preprocess_multy)
test_data = test_data.cache()
test_data = test_data.batch(16)
test_data = test_data.prefetch(8)



In [54]:
# Number of batches (batch size 16) in the train / validation / test pipelines
len(train_data), len(val_data), len(test_data)

(324, 41, 41)

In [55]:
# Pull one batch from the training pipeline and show the shapes of its two parts
samples, labels = next(train_data.as_numpy_iterator())
(samples.shape, labels.shape)

((16, 491, 257, 1), (16, 13))