In [None]:
!pip install tensorflow_io

Collecting tensorflow_io
  Downloading tensorflow_io-0.32.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (28.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.0/28.0 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow_io
Successfully installed tensorflow_io-0.32.0


In [None]:
import os
import wave
import librosa
import random
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import IPython.display as ipd
import tensorflow as tf
import tensorflow_io as tfio
from functools import reduce

# Speech Recognition

In [None]:
# path = 'drive/MyDrive/Kaggle/speech_recognition/'

path = '/content/gdrive/MyDrive/Final_Project/speech_recognition/'
classes = ['yes', 'no', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'up', 'down']

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Building dataset

In [None]:
# Define Paths to labeled data and Split into Train, Validation and Test using Kaggle split
tf_dic_train = {}
tf_dic_val = {}
tf_dic_test = {}
FILES_TO_LOAD = 500

with open(os.path.join(path + "/testing_list.txt"), 'r') as file:
    test_names = file.read().splitlines()
with open(os.path.join(path + "/validation_list.txt"), 'r') as file:
    val_names = file.read().splitlines()

for i, label in enumerate(classes):
  train_tmp = []
  val_tmp = []
  test_tmp = []
  for filename in os.listdir(os.path.join(path + label)):
    if "/".join([label, filename]) in test_names:
      if len(test_tmp) >= FILES_TO_LOAD*0.1:
        pass
      else:
        test_tmp.append(os.path.join(path + label + "/" + filename))
    elif "/".join([label, filename]) in val_names:
      if len(val_tmp) >= FILES_TO_LOAD*0.1:
        pass
      else:
        val_tmp.append(os.path.join(path + label + "/" + filename))
    else:
      if len(train_tmp) >= FILES_TO_LOAD*0.8:
        pass
      else:
        train_tmp.append(os.path.join(path + label + "/" + filename))

  train_tmp_tf = tf.data.Dataset.list_files(train_tmp)
  val_tmp_tf = tf.data.Dataset.list_files(val_tmp)
  test_tmp_tf = tf.data.Dataset.list_files(test_tmp)

  tf_dic_train[label] = tf.data.Dataset.zip((train_tmp_tf, tf.data.Dataset.from_tensor_slices(tf.fill((len(train_tmp_tf),), i))))
  tf_dic_val[label] = tf.data.Dataset.zip((val_tmp_tf, tf.data.Dataset.from_tensor_slices(tf.fill((len(val_tmp_tf),), i))))
  tf_dic_test[label] = tf.data.Dataset.zip((test_tmp_tf, tf.data.Dataset.from_tensor_slices(tf.fill((len(test_tmp_tf),), i))))

datasets_train = list(tf_dic_train.values())
datasets_val = list(tf_dic_val.values())
datasets_test = list(tf_dic_test.values())

In [None]:
def dataset_reduce(datasets):
  merged_dataset_reduce = reduce(lambda d1, d2: d1.concatenate(d2), datasets)
  return merged_dataset_reduce.shuffle(buffer_size=1000)

In [None]:
datasets_train = dataset_reduce(datasets_train)
datasets_val = dataset_reduce(datasets_val)
datasets_test = dataset_reduce(datasets_test)

In [None]:
len(datasets_train), len(datasets_val), len(datasets_test)

(5171, 650, 650)

# Preprocessing

In [None]:
def load_wav_16k_mono(filename):
    # Load encoded wav file
    file_contents = tf.io.read_file(filename)
    # Decode wav (tensors by channels)
    wav, sample_rate = tf.audio.decode_wav(file_contents, desired_channels=1)
    # Removes trailing axis
    wav = tf.squeeze(wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)
    return wav

In [None]:
# Load padding wav file from _noice folder (/content/gdrive/MyDrive/speech_recognition/_noise/dude_miaowing.wav)
padding_file = '/content/gdrive/MyDrive/Final_Project/speech_recognition/_noise/dude_miaowing.wav'
padding_contents = tf.io.read_file(padding_file)
padding_waveform, _ = tf.audio.decode_wav(padding_contents, desired_channels=1)
padding_waveform = tf.squeeze(padding_waveform, axis=-1)

In [None]:
def preprocess_multy(file_path, label):
    wav = load_wav_16k_mono(file_path)
    wav = wav[:16000]

    # Calculate difference in length
    target_length = 16000
    current_length = tf.shape(wav)[0]
    pad_length = target_length - current_length

    # Pad waveform if it is shorter than the target length
    if pad_length > 0:
        padding_start = tf.random.uniform(shape=[], minval=0, maxval=padding_waveform.shape[0] - pad_length, dtype=tf.int32)
        padding_slice = padding_waveform[padding_start : padding_start + pad_length]

        # Pad the waveform with the selected slice
        wav = tf.concat([wav, padding_slice], axis=0)

    spectrogram = tf.signal.stft(wav, frame_length=320, frame_step=32)
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.expand_dims(spectrogram, axis=2)
    return spectrogram, tf.one_hot(label, 13)

In [None]:
# Train data:
train_data = datasets_train.map(preprocess_multy)
train_data = train_data.cache()
train_data = train_data.shuffle(buffer_size=1000)
train_data = train_data.batch(16)
train_data = train_data.prefetch(8)



In [None]:
# Validation data:
val_data = datasets_val.map(preprocess_multy)
val_data = val_data.cache()
val_data = val_data.shuffle(buffer_size=1000)
val_data = val_data.batch(16)
val_data = val_data.prefetch(8)



In [None]:
# Test data:
test_data = datasets_test.map(preprocess_multy)
test_data = test_data.cache()
test_data = test_data.shuffle(buffer_size=1000)
test_data = test_data.batch(16)
test_data = test_data.prefetch(8)



In [None]:
len(train_data), len(val_data), len(test_data)

(324, 41, 41)

In [None]:
samples, labels = train_data.as_numpy_iterator().next()
samples.shape, labels.shape

((16, 491, 257, 1), (16, 13))

In [None]:
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import precision_score, make_scorer


In [None]:
base_model = Sequential()
base_model.add(Conv2D(16, (3, 3), activation='relu', input_shape=(491, 257, 1)))
base_model.add(Conv2D(16, (3, 3), activation='relu'))
base_model.add(Flatten())
base_model.add(Dense(128, activation='relu'))
base_model.add(Dense(13, activation='sigmoid'))

In [None]:
base_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.Recall(), tf.keras.metrics.Precision()])


In [None]:
base_model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_2 (Conv2D)           (None, 489, 255, 16)      160       
                                                                 
 conv2d_3 (Conv2D)           (None, 487, 253, 16)      2320      
                                                                 
 flatten_1 (Flatten)         (None, 1971376)           0         
                                                                 
 dense_2 (Dense)             (None, 128)               252336256 
                                                                 
 dense_3 (Dense)             (None, 13)                1677      
                                                                 
Total params: 252,340,413
Trainable params: 252,340,413
Non-trainable params: 0
_________________________________________________________________


In [None]:
hist = base_model.fit(train_data, validation_data=val_data, epochs=4)


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
base_model.save('/content/gdrive/MyDrive/Final_Project/speech_recognition/baseline_4')

NameError: ignored

In [None]:
from keras.models import load_model

base_model_loaded = load_model('/content/gdrive/MyDrive/Final_Project/speech_recognition/baseline_4')

In [None]:
X_test, y_test = test_data.as_numpy_iterator().next()

base_model_loaded.evaluate(X_test, y_test)



[0.31772035360336304, 0.1875, 0.375]

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Create empty arrays/lists for true labels and predicted labels
y_true = []
y_pred = []

# Iterate over the test dataset and make predictions
for x, y in test_data:
    # Predict the labels using the trained model
    predictions = base_model_loaded.predict(x)
    predicted_labels = np.argmax(predictions, axis=1)

    # Store the true labels and predicted labels
    y_true.extend(np.argmax(y, axis=1))
    y_pred.extend(predicted_labels)

# Convert the true labels and predicted labels to numpy arrays
y_true = np.array(y_true)
y_pred = np.array(y_pred)

# Generate the classification report
report = classification_report(y_true, y_pred)

print(report)

              precision    recall  f1-score   support

           0       0.90      0.56      0.69        50
           1       0.50      0.20      0.29        50
           2       0.48      0.30      0.37        50
           3       0.69      0.48      0.56        50
           4       0.78      0.42      0.55        50
           5       0.75      0.60      0.67        50
           6       0.70      0.38      0.49        50
           7       0.89      0.68      0.77        50
           8       0.65      0.82      0.73        50
           9       0.50      0.62      0.55        50
          10       0.38      0.90      0.53        50
          11       0.58      0.70      0.64        50
          12       0.37      0.72      0.49        50

    accuracy                           0.57       650
   macro avg       0.63      0.57      0.56       650
weighted avg       0.63      0.57      0.56       650



**Model with dropout added**

In [None]:
base_model_new = Sequential()
base_model_new.add(Conv2D(16, (3, 3), activation='relu', input_shape=(491, 257, 1)))
base_model_new.add(Dropout(0.25))
base_model_new.add(Conv2D(16, (3, 3), activation='relu'))
base_model_new.add(Flatten())
base_model_new.add(Dense(128, activation='relu'))
base_model_new.add(Dropout(0.5))
base_model_new.add(Dense(13, activation='sigmoid'))

In [None]:
base_model_new.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.Recall(), tf.keras.metrics.Precision(),tf.keras.metrics.Accuracy() ])

In [None]:
hist = base_model_new.fit(train_data, validation_data=val_data, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
base_model_new.save('/content/gdrive/MyDrive/Final_Project/speech_recognition/base_model_new_15')



In [None]:
from keras.models import load_model

base_model_new_15_loaded = load_model('/content/gdrive/MyDrive/Final_Project/speech_recognition/base_model_new_15')

In [None]:
X_test, y_test = test_data.as_numpy_iterator().next()

base_model_new_15_loaded.evaluate(X_test, y_test)



[0.32944822311401367, 0.1875, 0.75, 0.0]

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Create empty arrays/lists for true labels and predicted labels
y_true = []
y_pred = []

# Iterate over the test dataset and make predictions
for x, y in test_data:
    # Predict the labels using the trained model
    predictions = base_model_new_15_loaded.predict(x)
    predicted_labels = np.argmax(predictions, axis=1)

    # Store the true labels and predicted labels
    y_true.extend(np.argmax(y, axis=1))
    y_pred.extend(predicted_labels)

# Convert the true labels and predicted labels to numpy arrays
y_true = np.array(y_true)
y_pred = np.array(y_pred)

# Generate the classification report
report = classification_report(y_true, y_pred)

print(report)

              precision    recall  f1-score   support

           0       0.75      0.60      0.67        50
           1       0.42      0.26      0.32        50
           2       0.64      0.14      0.23        50
           3       0.50      0.26      0.34        50
           4       0.47      0.36      0.41        50
           5       0.61      0.46      0.52        50
           6       0.55      0.46      0.50        50
           7       0.83      0.58      0.68        50
           8       0.62      0.72      0.67        50
           9       0.51      0.72      0.60        50
          10       0.31      0.70      0.43        50
          11       0.47      0.68      0.56        50
          12       0.30      0.46      0.37        50

    accuracy                           0.49       650
   macro avg       0.54      0.49      0.48       650
weighted avg       0.54      0.49      0.48       650



**AlexNet Format**

In [None]:
base_model_alex_net = Sequential()

# Layer 1
base_model_alex_net.add(Conv2D(96, (11, 11), strides=(4, 4), activation='relu', input_shape=(491, 257, 1)))
base_model_alex_net.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))

# Layer 2
base_model_alex_net.add(Conv2D(256, (5, 5), padding='same', activation='relu'))
base_model_alex_net.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))

# Layer 3
base_model_alex_net.add(Conv2D(384, (3, 3), padding='same', activation='relu'))

# Layer 4
base_model_alex_net.add(Conv2D(384, (3, 3), padding='same', activation='relu'))

# Layer 5
base_model_alex_net.add(Conv2D(256, (3, 3), padding='same', activation='relu'))
base_model_alex_net.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))

# Flatten layer
base_model_alex_net.add(Flatten())

# Fully connected layers
base_model_alex_net.add(Dense(4096, activation='relu'))
base_model_alex_net.add(Dropout(0.5))

base_model_alex_net.add(Dense(4096, activation='relu'))
base_model_alex_net.add(Dropout(0.5))

# Output layer
base_model_alex_net.add(Dense(13, activation='softmax'))

In [None]:
base_model_alex_net.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.Recall(), tf.keras.metrics.Precision(), tf.keras.metrics.Accuracy()])


In [None]:
base_model_alex_net.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_5 (Conv2D)           (None, 121, 62, 96)       11712     
                                                                 
 max_pooling2d_3 (MaxPooling  (None, 60, 30, 96)       0         
 2D)                                                             
                                                                 
 conv2d_6 (Conv2D)           (None, 60, 30, 256)       614656    
                                                                 
 max_pooling2d_4 (MaxPooling  (None, 29, 14, 256)      0         
 2D)                                                             
                                                                 
 conv2d_7 (Conv2D)           (None, 29, 14, 384)       885120    
                                                                 
 conv2d_8 (Conv2D)           (None, 29, 14, 384)      

In [None]:
hist = base_model_alex_net.fit(train_data, validation_data=val_data, epochs=6)


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [None]:
base_model_alex_net.save('/content/gdrive/MyDrive/Final_Project/speech_recognition/alex_net_6')

