In [1]:
!pip install tensorflow_io

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import os
from matplotlib import pyplot as plt
import tensorflow as tf
import tensorflow_io as tfio

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
%cd ..
%cd content
%cd gdrive
%cd MyDrive
%cd speech_recognition

/
/content
/content/gdrive
/content/gdrive/MyDrive
/content/gdrive/MyDrive/speech_recognition


In [5]:
def load_wav_16k_mono(filename):
    # Load encoded wav file
    file_contents = tf.io.read_file(filename)
    # Decode wav (tensors by channels)
    wav, sample_rate = tf.audio.decode_wav(file_contents, desired_channels=1)
    # Removes trailing axis
    wav = tf.squeeze(wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    # Goes from 44100Hz to 16000hz - amplitude of the audio signal
    wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)
    return wav

## Creating dataset

In [6]:
def preprocess_multy(file_path, label):
    wav = load_wav_16k_mono(file_path)
    wav = wav[:16000]
    zero_padding = tf.zeros([16000] - tf.shape(wav), dtype=tf.float32)
    wav = tf.concat([zero_padding, wav],0)
    spectrogram = tf.signal.stft(wav, frame_length=320, frame_step=32)
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.expand_dims(spectrogram, axis=2)
    return spectrogram, tf.one_hot(label, 13)

In [7]:
tf_yes = tf.data.Dataset.list_files(os.path.join('yes', '*.wav')).take(500)
tf_no = tf.data.Dataset.list_files(os.path.join('no', '*.wav')).take(500)
tf_one = tf.data.Dataset.list_files(os.path.join('one', '*.wav')).take(500)
tf_two = tf.data.Dataset.list_files(os.path.join('two', '*.wav')).take(500)
tf_three = tf.data.Dataset.list_files(os.path.join('three', '*.wav')).take(500)
tf_four = tf.data.Dataset.list_files(os.path.join('four', '*.wav')).take(500)
tf_five = tf.data.Dataset.list_files(os.path.join('five', '*.wav')).take(500)
tf_six = tf.data.Dataset.list_files(os.path.join('six', '*.wav')).take(500)
tf_seven = tf.data.Dataset.list_files(os.path.join('seven', '*.wav')).take(500)
tf_eight = tf.data.Dataset.list_files(os.path.join('eight', '*.wav')).take(500)
tf_nine = tf.data.Dataset.list_files(os.path.join('nine', '*.wav')).take(500)
tf_up = tf.data.Dataset.list_files(os.path.join('up', '*.wav')).take(500)
tf_down = tf.data.Dataset.list_files(os.path.join('down', '*.wav')).take(500)

In [8]:
yes = tf.data.Dataset.zip((tf_yes, tf.data.Dataset.from_tensor_slices(tf.fill((len(tf_yes),), 0))))
no = tf.data.Dataset.zip((tf_no, tf.data.Dataset.from_tensor_slices(tf.fill((len(tf_no),), 1))))
one = tf.data.Dataset.zip((tf_one, tf.data.Dataset.from_tensor_slices(tf.fill((len(tf_one),), 2))))
two = tf.data.Dataset.zip((tf_two, tf.data.Dataset.from_tensor_slices(tf.fill((len(tf_two),), 3))))
three = tf.data.Dataset.zip((tf_three, tf.data.Dataset.from_tensor_slices(tf.fill((len(tf_three),), 4))))
four = tf.data.Dataset.zip((tf_four, tf.data.Dataset.from_tensor_slices(tf.fill((len(tf_four),), 5))))
five = tf.data.Dataset.zip((tf_five, tf.data.Dataset.from_tensor_slices(tf.fill((len(tf_five),), 6))))
six = tf.data.Dataset.zip((tf_six, tf.data.Dataset.from_tensor_slices(tf.fill((len(tf_six),), 7))))
seven = tf.data.Dataset.zip((tf_seven, tf.data.Dataset.from_tensor_slices(tf.fill((len(tf_seven),), 8))))
eight = tf.data.Dataset.zip((tf_eight, tf.data.Dataset.from_tensor_slices(tf.fill((len(tf_eight),), 9))))
nine = tf.data.Dataset.zip((tf_nine, tf.data.Dataset.from_tensor_slices(tf.fill((len(tf_nine),), 10))))
up = tf.data.Dataset.zip((tf_up, tf.data.Dataset.from_tensor_slices(tf.fill((len(tf_up),), 11))))
down = tf.data.Dataset.zip((tf_down, tf.data.Dataset.from_tensor_slices(tf.fill((len(tf_down),), 12))))

In [9]:
datasets = [yes, no, one, two, three, four, five, six, seven, eight, nine, up, down]

In [10]:
from functools import reduce

In [11]:
merged_dataset_loop = None
for dataset in datasets:
    if merged_dataset_loop is None:
        merged_dataset_loop = dataset
    else:
        merged_dataset_loop = merged_dataset_loop.concatenate(dataset)

In [12]:
merged_dataset_reduce = reduce(lambda d1, d2: d1.concatenate(d2), datasets)

In [13]:
dataset = merged_dataset_reduce.shuffle(buffer_size=1000)

In [14]:
type(dataset)

tensorflow.python.data.ops.shuffle_op._ShuffleDataset

## Building a baseline model (precision+recall)

In [None]:
data = dataset.map(preprocess_multy)
data = data.cache()
data = data.shuffle(buffer_size=1000)
data = data.batch(16)
data = data.prefetch(8)



In [None]:
len(data), type(data)

(407, tensorflow.python.data.ops.prefetch_op._PrefetchDataset)

In [None]:
train = data.take(245)
val = data.skip(245).take(81)
test = data.skip(326).take(81)

In [None]:
samples, labels = train.as_numpy_iterator().next()
samples.shape, labels.shape

((16, 491, 257, 1), (16, 13))

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, Flatten

In [None]:
base_model = Sequential()
base_model.add(Conv2D(16, (3,3), activation='relu', input_shape=(491, 257,1)))
base_model.add(Flatten())
base_model.add(Dense(13, activation='softmax'))

In [None]:
base_model.compile('Adam', loss='BinaryCrossentropy', metrics=[tf.keras.metrics.Recall(),tf.keras.metrics.Precision()])

In [None]:
base_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_2 (Conv2D)           (None, 489, 255, 16)      160       
                                                                 
 flatten_2 (Flatten)         (None, 1995120)           0         
                                                                 
 dense_2 (Dense)             (None, 13)                25936573  
                                                                 
Total params: 25,936,733
Trainable params: 25,936,733
Non-trainable params: 0
_________________________________________________________________


In [None]:
hist = base_model.fit(train, validation_data=val, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
base_model.save('/content/gdrive/MyDrive/speech_recognition/baseline_10')



In [None]:
from keras.models import load_model

base_model_loaded = load_model('/content/gdrive/MyDrive/speech_recognition/baseline_10')

In [None]:
X_test, y_test = test.as_numpy_iterator().next()

base_model_loaded.evaluate(X_test, y_test)



[0.14569172263145447, 0.75, 0.800000011920929]

Computing classification report on loaded model

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Create empty arrays/lists for true labels and predicted labels
y_true = []
y_pred = []

# Iterate over the test dataset and make predictions
for x, y in test:
    # Predict the labels using the trained model
    predictions = base_model_loaded.predict(x)
    predicted_labels = np.argmax(predictions, axis=1)

    # Store the true labels and predicted labels
    y_true.extend(np.argmax(y, axis=1))
    y_pred.extend(predicted_labels)

# Convert the true labels and predicted labels to numpy arrays
y_true = np.array(y_true)
y_pred = np.array(y_pred)

# Generate the classification report
report = classification_report(y_true, y_pred)

print(report)

              precision    recall  f1-score   support

           0       0.84      0.54      0.66       103
           1       0.60      0.41      0.49        92
           2       0.63      0.81      0.71       101
           3       0.79      0.82      0.81        99
           4       0.80      0.82      0.81        95
           5       0.60      0.91      0.73        93
           6       0.51      0.76      0.61        92
           7       0.81      0.82      0.81        99
           8       0.91      0.82      0.87       114
           9       0.76      0.81      0.78        98
          10       0.68      0.68      0.68       105
          11       0.84      0.53      0.65       116
          12       0.87      0.70      0.78        77

    accuracy                           0.72      1284
   macro avg       0.74      0.73      0.72      1284
weighted avg       0.75      0.72      0.72      1284



In [None]:
base_model.evaluate(X_test, y_test)



[0.21875151991844177, 0.5, 0.7272727489471436]

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Create empty arrays/lists for true labels and predicted labels
y_true = []
y_pred = []

# Iterate over the test dataset and make predictions
for x, y in test:
    # Predict the labels using the trained model
    predictions = base_model.predict(x)
    predicted_labels = np.argmax(predictions, axis=1)

    # Store the true labels and predicted labels
    y_true.extend(np.argmax(y, axis=1))
    y_pred.extend(predicted_labels)

# Convert the true labels and predicted labels to numpy arrays
y_true = np.array(y_true)
y_pred = np.array(y_pred)

# Generate the classification report
report = classification_report(y_true, y_pred)

print(report)

              precision    recall  f1-score   support

           0       0.65      0.54      0.59        97
           1       0.44      0.48      0.46       105
           2       0.41      0.41      0.41        95
           3       0.64      0.60      0.62       122
           4       0.65      0.56      0.60        99
           5       0.53      0.76      0.62        95
           6       0.36      0.61      0.45        97
           7       0.68      0.63      0.66       103
           8       0.65      0.54      0.59        95
           9       0.63      0.66      0.64        93
          10       0.50      0.59      0.54        88
          11       0.64      0.51      0.57       102
          12       0.56      0.24      0.33        93

    accuracy                           0.55      1284
   macro avg       0.56      0.55      0.54      1284
weighted avg       0.57      0.55      0.55      1284



## Building a baseline model - now with accuracy for evaluation

In [15]:
data1 = dataset.map(preprocess_multy)
data1 = data1.cache()
data1 = data1.shuffle(buffer_size=1000)
data1 = data1.batch(16)
data1 = data1.prefetch(8)



In [17]:
len(data1), type(data1)

(407, tensorflow.python.data.ops.prefetch_op._PrefetchDataset)

In [16]:
train1 = data1.take(205)
val1 = data1.skip(205).take(101)
test1 = data1.skip(306).take(101)

In [18]:
samples, labels = train1.as_numpy_iterator().next()
samples.shape, labels.shape

((16, 491, 257, 1), (16, 13))

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, Flatten

base_model_acc = Sequential()
base_model_acc.add(Conv2D(8, (3,3), activation='relu', input_shape=(491, 257,1)))
base_model_acc.add(Flatten())
base_model_acc.add(Dense(13, activation='softmax'))

In [20]:
base_model_acc.compile('Adam', loss='BinaryCrossentropy', metrics=['accuracy'])

In [21]:
base_model_acc.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 489, 255, 8)       80        
                                                                 
 flatten (Flatten)           (None, 997560)            0         
                                                                 
 dense (Dense)               (None, 13)                12968293  
                                                                 
Total params: 12,968,373
Trainable params: 12,968,373
Non-trainable params: 0
_________________________________________________________________


In [17]:
from keras.models import load_model

base_model_acc = load_model('/content/gdrive/MyDrive/speech_recognition/baseline_acc_10')

In [None]:
for i in range(5):
  hist1 = base_model_acc.fit(train1, validation_data=val1, epochs=1)
  base_model_acc.save('/content/gdrive/MyDrive/speech_recognition/baseline_acc_10')








In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Create empty arrays/lists for true labels and predicted labels
y_true = []
y_pred = []

# Iterate over the test dataset and make predictions
for x, y in test1:
    # Predict the labels using the trained model
    predictions = base_model_acc.predict(x)
    predicted_labels = np.argmax(predictions, axis=1)

    # Store the true labels and predicted labels
    y_true.extend(np.argmax(y, axis=1))
    y_pred.extend(predicted_labels)

# Convert the true labels and predicted labels to numpy arrays
y_true = np.array(y_true)
y_pred = np.array(y_pred)

y_true.shape, y_pred.shape

# Generate the classification report
# report = classification_report(y_true, y_pred)

# print(report)

((0,), (0,))