In [1]:
!pip install tensorflow_io



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
import wave
import librosa
import random
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import IPython.display as ipd
import tensorflow as tf
import tensorflow_io as tfio
from functools import reduce

# Speech Recognition

In [4]:
path = '/content/drive/MyDrive/Kaggle/speech_recognition/'

#path = '/content/gdrive/MyDrive/speech_recognition/'
classes = ['yes', 'no', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'up', 'down']

# Building dataset

In [5]:
# Define Paths to labeled data and Split into Train, Validation and Test using Kaggle split
tf_dic_train = {}
tf_dic_val = {}
tf_dic_test = {}
FILES_TO_LOAD = 500

with open(os.path.join(path + "/testing_list.txt"), 'r') as file:
    test_names = file.read().splitlines()
with open(os.path.join(path + "/validation_list.txt"), 'r') as file:
    val_names = file.read().splitlines()

for i, label in enumerate(classes):
  train_tmp = []
  val_tmp = []
  test_tmp = []
  for filename in os.listdir(os.path.join(path + label)):
    if "/".join([label, filename]) in test_names:
      if len(test_tmp) >= FILES_TO_LOAD*0.1:
        pass
      else:
        test_tmp.append(os.path.join(path + label + "/" + filename))
    elif "/".join([label, filename]) in val_names:
      if len(val_tmp) >= FILES_TO_LOAD*0.1:
        pass
      else:
        val_tmp.append(os.path.join(path + label + "/" + filename))
    else:
      if len(train_tmp) >= FILES_TO_LOAD*0.8:
        pass
      else:
        train_tmp.append(os.path.join(path + label + "/" + filename))

  train_tmp_tf = tf.data.Dataset.list_files(train_tmp)
  val_tmp_tf = tf.data.Dataset.list_files(val_tmp)
  test_tmp_tf = tf.data.Dataset.list_files(test_tmp)

  tf_dic_train[label] = tf.data.Dataset.zip((train_tmp_tf, tf.data.Dataset.from_tensor_slices(tf.fill((len(train_tmp_tf),), i))))
  tf_dic_val[label] = tf.data.Dataset.zip((val_tmp_tf, tf.data.Dataset.from_tensor_slices(tf.fill((len(val_tmp_tf),), i))))
  tf_dic_test[label] = tf.data.Dataset.zip((test_tmp_tf, tf.data.Dataset.from_tensor_slices(tf.fill((len(test_tmp_tf),), i))))

datasets_train = list(tf_dic_train.values())
datasets_val = list(tf_dic_val.values())
datasets_test = list(tf_dic_test.values())

In [6]:
def dataset_reduce(datasets):
  merged_dataset_reduce = reduce(lambda d1, d2: d1.concatenate(d2), datasets)
  return merged_dataset_reduce.shuffle(buffer_size=1000)

In [7]:
datasets_train = dataset_reduce(datasets_train)
datasets_val = dataset_reduce(datasets_val)
datasets_test = dataset_reduce(datasets_test)

In [8]:
len(datasets_train), len(datasets_val), len(datasets_test)

(3120, 466, 486)

# Preprocessing

In [9]:
def load_wav_16k_mono(filename):
    # Load encoded wav file
    file_contents = tf.io.read_file(filename)
    # Decode wav (tensors by channels)
    wav, sample_rate = tf.audio.decode_wav(file_contents, desired_channels=1)
    # Removes trailing axis
    wav = tf.squeeze(wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)
    return wav

In [10]:
# Load padding wav file from _noise folder (/content/gdrive/MyDrive/speech_recognition/_noise/dude_miaowing.wav)
padding_file = '/content/drive/MyDrive/Kaggle/speech_recognition/_noise/dude_miaowing.wav'
padding_contents = tf.io.read_file(padding_file)
padding_waveform, _ = tf.audio.decode_wav(padding_contents, desired_channels=1)
padding_waveform = tf.squeeze(padding_waveform, axis=-1)

In [11]:
def preprocess_multy(file_path, label, style="mfcc"):
    wav = load_wav_16k_mono(file_path)
    wav = wav[:16000]
    # print(type(wav))

    # Calculate difference in length
    target_length = 16000
    current_length = tf.shape(wav)[0]
    pad_length = target_length - current_length

    # Pad waveform if it is shorter than the target length
    if pad_length > 0:
        padding_start = tf.random.uniform(shape=[], minval=0, maxval=padding_waveform.shape[0] - pad_length, dtype=tf.int32)
        padding_slice = padding_waveform[padding_start : padding_start + pad_length]

        # Pad the waveform with the selected slice
        wav = tf.concat([wav, padding_slice], axis=0)

    # print(type(wav))

    # Build the output, i.e. spectrogram or mfcc

    spc = tf.signal.stft(wav, frame_length=320, frame_step=32)
    spc = tf.abs(spc)

    # Define shape for the spectrogram tensor
    spectrogram_shape = tf.shape(spc)

    if style=="mfcc":
      # Compute MFCCs
      mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
      num_mel_bins=40,
      num_spectrogram_bins=spectrogram_shape[-1],
      sample_rate=target_length
      )
      mel_spectrogram = tf.matmul(spc, mel_filterbank)
      log_mel_spectrogram = tf.math.log(mel_spectrogram + 1e-6)
      mfccs = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrogram)[:, :20]  # Keep only the first 20 coefficients
      img = tf.expand_dims(mfccs, axis=2)
    else:
      img = tf.expand_dims(spc, axis=2)

    return img, tf.one_hot(label, 13)

In [12]:
# Train data:
train_data = datasets_train.map(preprocess_multy)
train_data = train_data.cache()
train_data = train_data.shuffle(buffer_size=1000)
train_data = train_data.batch(16)
train_data = train_data.prefetch(8)



In [13]:
# Validation data:
val_data = datasets_val.map(preprocess_multy)
val_data = val_data.cache()
val_data = val_data.shuffle(buffer_size=1000)
val_data = val_data.batch(16)
val_data = val_data.prefetch(8)



In [14]:
# Test data:
test_data = datasets_test.map(preprocess_multy)
test_data = test_data.cache()
test_data = test_data.shuffle(buffer_size=1000)
test_data = test_data.batch(16)
test_data = test_data.prefetch(8)



In [15]:
len(train_data), len(val_data), len(test_data)

(195, 30, 31)

In [16]:
# samples, labels = train_data.as_numpy_iterator().next()
# samples.shape, labels.shape

# Base line model

In [17]:
# Build the baseline model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, Flatten

base_model = Sequential()
base_model.add(Conv2D(16, (3,3), activation='relu', input_shape=(491, 20, 1)))
base_model.add(Flatten())
base_model.add(Dense(len(classes), activation='softmax'))

In [18]:
base_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 489, 18, 16)       160       
                                                                 
 flatten (Flatten)           (None, 140832)            0         
                                                                 
 dense (Dense)               (None, 13)                1830829   
                                                                 
Total params: 1,830,989
Trainable params: 1,830,989
Non-trainable params: 0
_________________________________________________________________


In [19]:
base_model.compile('Adam', loss='CategoricalCrossentropy', metrics=['accuracy', tf.keras.metrics.Recall(),tf.keras.metrics.Precision()])

In [20]:
base_model.fit(train_data, validation_data=val_data, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4d5a510070>

# AlexNet inspiration Model

In [21]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Define the CNN model
model = tf.keras.Sequential([
    layers.Conv2D(64, (3, 3), activation='relu', input_shape=(491, 20, 1)),
    layers.Dropout(0.5),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.5),
    layers.Conv2D(256, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.5),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(13, activation='softmax')
])

# Compile the model
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=['accuracy', tf.keras.metrics.Recall(),tf.keras.metrics.Precision()])

# Print model summary
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_1 (Conv2D)           (None, 489, 18, 64)       640       
                                                                 
 dropout (Dropout)           (None, 489, 18, 64)       0         
                                                                 
 conv2d_2 (Conv2D)           (None, 487, 16, 128)      73856     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 243, 8, 128)      0         
 )                                                               
                                                                 
 dropout_1 (Dropout)         (None, 243, 8, 128)       0         
                                                                 
 conv2d_3 (Conv2D)           (None, 241, 6, 256)       295168    
                                                      

In [22]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
model_checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True)

# Train the model with callbacks
history = model.fit(train_data,
                    epochs=25,
                    validation_data=val_data,
                    callbacks=[early_stopping, model_checkpoint])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25


In [25]:
# Evaluate the model on the test data
test_loss, test_accuracy, test_recall, test_precision = model.evaluate(test_data)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_accuracy)
print('Test Recall:', test_recall)
print('Test Precision:', test_precision)

Test Loss: 1.639186978340149
Test Accuracy: 0.6152263283729553
Test Recall: 0.5205761194229126
Test Precision: 0.7047353982925415


# ResNET (Future work)

In [29]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.applications import ResNet50

# Load pre-trained ResNet50 model without the top classification layer
resnet = ResNet50(weights='imagenet', include_top=False, input_shape=(32, 32, 1))

# Freeze the layers in the base model
resnet.trainable = False

# Create a new model by adding a global average pooling layer and a dense output layer
resnet_model = tf.keras.Sequential([
    tf.keras.layers.Resizing(32,32),
    resnet,
    layers.GlobalAveragePooling2D(),
    layers.Dense(13, activation='softmax')
])

# Compile the model
resnet_model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=['accuracy', tf.keras.metrics.Recall(), tf.keras.metrics.Precision()])

# Print model summary
resnet_model.summary()

ValueError: ignored

In [None]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
model_checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True)

# Train the model with callbacks
history = model.fit(train_data,
                    epochs=10,
                    validation_data=val_data,
                    callbacks=[early_stopping, model_checkpoint])

In [None]:
# Evaluate the model on the test data
test_loss, test_accuracy, test_recall, test_precision = resnet_model.evaluate(test_data)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_accuracy)
print('Test Recall:', test_recall)
print('Test Precision:', test_precision)