### Import necessary packages

In [102]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import librosa
import scipy.io.wavfile as wav
import scipy

from sklearn.model_selection import train_test_split

import os

##### Install dependencies and unzip dataset 

In [2]:
!pip install py7zr
!py7zr x '../input/tensorflow-speech-recognition-challenge/train.7z'
!py7zr x '../input/tensorflow-speech-recognition-challenge/test.7z'

Collecting py7zr
  Downloading py7zr-0.15.1-py3-none-any.whl (66 kB)
[K     |████████████████████████████████| 66 kB 2.5 MB/s eta 0:00:011
[?25hCollecting multivolumefile<0.3.0,>=0.2.0
  Downloading multivolumefile-0.2.2-py3-none-any.whl (16 kB)
Collecting ppmd-cffi<0.5.0,>=0.4.1
  Downloading ppmd_cffi-0.4.1-cp37-cp37m-manylinux2014_x86_64.whl (124 kB)
[K     |████████████████████████████████| 124 kB 7.3 MB/s eta 0:00:01
Collecting bcj-cffi<0.6.0,>=0.5.1
  Downloading bcj_cffi-0.5.1-cp37-cp37m-manylinux2014_x86_64.whl (36 kB)
Collecting pyzstd<0.15.0,>=0.14.4
  Downloading pyzstd-0.14.4-cp37-cp37m-manylinux2014_x86_64.whl (2.2 MB)
[K     |███████████████████████████████▉| 2.2 MB 7.4 MB/s eta 0:00:01     |████████████████████████████████| 2.2 MB 7.4 MB/s 
[?25hCollecting pycryptodome
  Downloading pycryptodome-3.10.1-cp35-abi3-manylinux2010_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 2.9 MB/s eta 0:00:01
Installing collected packages: pyzstd, pycryptodome

### Create train-dataset


In [3]:
TRAIN_DIR = './train/audio'
labels = os.listdir(TRAIN_DIR)

Dataset of all labels except silence

In [174]:
X_mfcc = []
X_chroma = []
Y = []
for label in labels:
    print(label, end=", ")
    path = os.path.join(TRAIN_DIR, label)
    filenames = os.listdir(path)
    for filename in filenames:
        if ".wav" not in filename or label == "_background_noise_":
            continue
        sr, audio = wav.read(os.path.join(path, filename))
        audio = audio.astype('float')
        a = librosa.feature.mfcc(y = audio, sr=sr, n_mfcc=32) # Generate MFCC of the audio sample
        a = np.pad(a, ((0, 0), (0, 32-a.shape[1])), mode="constant", constant_values=0)
        X_mfcc.append(a)
        a = librosa.feature.chroma_stft(y = audio, sr=sr, n_chroma=32) # Generate Chroma STFT of the audio sample
        a = np.pad(a, ((0, 0), (0, 32-a.shape[1])), mode="constant", constant_values=0)
        X_chroma.append(a)
        Y.append(labels.index(label))

five, 



dog, _background_noise_, marvin, happy, seven, go, three, four, bird, bed, cat, tree, yes, zero, on, eight, right, stop, wow, sheila, off, nine, left, down, up, one, no, house, two, six, 

Dataset creation of silence label

In [176]:
label = "_background_noise_"
path = os.path.join(TRAIN_DIR, label)
filenames = os.listdir(path)
for filename in filenames:
    if ".wav" not in filename:
        continue
    sr, audio = wav.read(os.path.join(path, filename))
    audio = audio.astype('float')
    for i in range(0, len(audio), sr):
        aa = audio[i:i+sr:10]
        a = librosa.feature.mfcc(y = aa, sr=sr, n_mfcc=32)
        a = np.pad(a, ((0, 0), (0, 32-a.shape[1])), mode="constant", constant_values=0)
        X_mfcc.append(a)
        a = librosa.feature.chroma_stft(y = aa, sr=sr, n_chroma=32)
        a = np.pad(a, ((0, 0), (0, 32-a.shape[1])), mode="constant", constant_values=0)
        X_chroma.append(a)
        Y.append(labels.index(label))

  n_fft, y.shape[-1]
  
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]


To balance the dataset add more samples of audio samples for <strong><i>silence</i></strong>. This is done by reversing each audio sample of 1 sec adn generating the said features

In [177]:
label = "_background_noise_"
path = os.path.join(TRAIN_DIR, label)
filenames = os.listdir(path)
for filename in filenames:
    if ".wav" not in filename:
        continue
    sr, audio = wav.read(os.path.join(path, filename))
    audio = audio.astype('float')
    for i in range(0, len(audio), sr):
        aa = audio[i:i+sr:-10]
        a = librosa.feature.mfcc(y = aa, sr=sr, n_mfcc=32)
        a = np.pad(a, ((0, 0), (0, 32-a.shape[1])), mode="constant", constant_values=0)
        X_mfcc.append(a)
        a = librosa.feature.chroma_stft(y = aa, sr=sr, n_chroma=32)
        a = np.pad(a, ((0, 0), (0, 32-a.shape[1])), mode="constant", constant_values=0)
        X_chroma.append(a)
        Y.append(labels.index(label))

  n_fft, y.shape[-1]


ValueError: can't extend empty axis 0 using modes other than 'constant' or 'empty'

In [181]:
X_mfcc = np.asarray(X_mfcc)
X_mfcc = np.expand_dims(X_mfcc, -1)
X_chroma = np.asarray(X_chroma)
X_chroma = np.expand_dims(X_chroma, -1)
Y = np.asarray(Y)
X_mfcc.shape, X_chroma.shape, Y.shape

((65123, 32, 32, 1), (65123, 32, 32, 1), (65123,))

To visualize the number of samples for each label

In [182]:
i = np.bincount(Y)
ii = np.nonzero(i)[0]
np.vstack([ii, i[ii]]).T

array([[   0, 2357],
       [   1, 1746],
       [   2,  402],
       [   3, 1746],
       [   4, 1742],
       [   5, 2377],
       [   6, 2372],
       [   7, 2356],
       [   8, 2372],
       [   9, 1731],
       [  10, 1713],
       [  11, 1733],
       [  12, 1733],
       [  13, 2377],
       [  14, 2376],
       [  15, 2367],
       [  16, 2352],
       [  17, 2367],
       [  18, 2380],
       [  19, 1745],
       [  20, 1734],
       [  21, 2357],
       [  22, 2364],
       [  23, 2353],
       [  24, 2359],
       [  25, 2375],
       [  26, 2370],
       [  27, 2375],
       [  28, 1750],
       [  29, 2373],
       [  30, 2369]])

### Train-Test Split

In [183]:
m_train, m_test, c_train, c_test, y_train, y_test = train_test_split(X_mfcc, X_chroma, Y, test_size=0.25, random_state=42)

### Model Creation & Training

In [184]:
def Model(X1, X2, dropout=0.4):
    _input1 = tf.keras.layers.Input(X1)
    x = tf.keras.layers.Conv2D(64, 3, padding="same")(_input1)
    x = tf.keras.layers.Activation(tf.nn.leaky_relu)(x)
    x = tf.keras.layers.BatchNormalization()(x)    
    x = tf.keras.layers.MaxPool2D()(x)
    x = tf.keras.layers.Dropout(dropout)(x)
    
    x = tf.keras.layers.Conv2D(128, 3, padding="same")(x)
    x = tf.keras.layers.Activation(tf.nn.leaky_relu)(x)
    x = tf.keras.layers.BatchNormalization()(x)    
    x = tf.keras.layers.MaxPool2D()(x)
    x = tf.keras.layers.Dropout(dropout)(x)
    
    x = tf.keras.layers.Conv2D(256, 3, padding="same")(x)
    x = tf.keras.layers.Activation(tf.nn.leaky_relu)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.MaxPool2D()(x)
    
    x1 = tf.keras.layers.Flatten()(x)
    
    _input2 = tf.keras.layers.Input(X2)
    x = tf.keras.layers.Conv2D(64, 3, padding="same")(_input2)
    x = tf.keras.layers.Activation(tf.nn.leaky_relu)(x)
    x = tf.keras.layers.BatchNormalization()(x)    
    x = tf.keras.layers.MaxPool2D()(x)
    x = tf.keras.layers.Dropout(dropout)(x)
    
    x = tf.keras.layers.Conv2D(128, 3, padding="same")(x)
    x = tf.keras.layers.Activation(tf.nn.leaky_relu)(x)
    x = tf.keras.layers.BatchNormalization()(x)    
    x = tf.keras.layers.MaxPool2D()(x)
    x = tf.keras.layers.Dropout(dropout)(x)
    
    x = tf.keras.layers.Conv2D(256, 3, padding="same")(x)
    x = tf.keras.layers.Activation(tf.nn.leaky_relu)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.MaxPool2D()(x)
    
    x2 = tf.keras.layers.Flatten()(x)
    
    x = tf.keras.layers.Concatenate()([x1, x2])
    
    x = tf.keras.layers.Dense(128)(x)
    x = tf.keras.layers.Activation(tf.nn.leaky_relu)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dropout(dropout)(x)
    
    x = tf.keras.layers.Dense(64)(x)
    x = tf.keras.layers.Activation(tf.nn.leaky_relu)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dropout(dropout)(x)
    
    x = tf.keras.layers.Dense(len(labels))(x)
    x = tf.keras.layers.Activation(tf.nn.softmax)(x)
    return tf.keras.Model([_input1, _input2], x)

In [185]:
model = Model(m_train[0].shape, c_train[0].shape)
model.compile(loss="sparse_categorical_crossentropy", metrics=["accuracy"])
checkpoint_filepath = "weights_mfcc_Conv2D/"
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

Summary of the output shape and parameters of the model

In [186]:
model.summary()

Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           [(None, 32, 32, 1)]  0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           [(None, 32, 32, 1)]  0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 32, 32, 64)   640         input_10[0][0]                   
__________________________________________________________________________________________________
conv2d_3 (Conv2D)               (None, 32, 32, 64)   640         input_11[0][0]                   
____________________________________________________________________________________________

In [187]:
H = model.fit([m_train, c_train], y_train, batch_size=128, epochs=500, validation_split=0.25, shuffle=True, callbacks=[model_checkpoint_callback])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

KeyboardInterrupt: 

Plot of the train history (X-axis: <i>EPOCHS</i>, Y-axis: <i>ACCURACY</i>)

In [None]:
plt.plot(H.history["accuracy"])
plt.plot(H.history["val_accuracy"])

Load the best weights and evaluate the model

In [188]:
model.load_weights("weights_mfcc_Conv2D/")
print("Train Accuracy", model.evaluate([m_train, c_train], y_train)[1])
print("Test Accuracy", model.evaluate([m_test, c_test], y_test)[1])

Train Accuracy 0.9852790832519531
Test Accuracy 0.9383944272994995


Save the model weights in a zip file (for downloading from server)

In [190]:
!zip -r 'weights_mfcc_Conv2D.zip' 'weights_mfcc_Conv2D/'

updating: weights_mfcc_Conv2D/ (stored 0%)
updating: weights_mfcc_Conv2D/.index (deflated 76%)
updating: weights_mfcc_Conv2D/checkpoint (deflated 34%)
updating: weights_mfcc_Conv2D/.data-00000-of-00001 (deflated 8%)


### Create Test-Dataset


In [191]:
TEST_DIR = './test/audio'

In [196]:
filenames = os.listdir(TEST_DIR)
M_Test = []
C_Test = []
for filename in filenames:
    if ".wav" not in filename:
        continue
    sr, audio = wav.read(os.path.join(TEST_DIR, filename))
    audio = audio.astype('float')
    a = librosa.feature.mfcc(y = audio, sr=sr, n_mfcc=32)
    a = np.pad(a, ((0, 0), (0, 32-a.shape[1])), mode="constant", constant_values=0)
    M_Test.append(a)
    a = librosa.feature.chroma_stft(y = audio, sr=sr, n_chroma=32)
    a = np.pad(a, ((0, 0), (0, 32-a.shape[1])), mode="constant", constant_values=0)
    C_Test.append(a)

In [197]:
M_Test = np.asarray(M_Test)
M_Test = np.expand_dims(M_Test, -1)

C_Test = np.asarray(C_Test)
C_Test = np.expand_dims(C_Test, -1)
M_Test.shape, C_Test.shape

((158538, 32, 32, 1), (158538, 32, 32, 1))

### Load weights and predict class labels for test set provided for the challenge

In [None]:
checkpoint_filepath = "weights_mfcc_Conv2D/"
model = Model(X_Test[0].shape)
model.compile(loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.load_weights(checkpoint_filepath)

In [198]:
pred = model.predict([M_Test, C_Test])

##### Create csv file for the predicted labels in the format specified for submission

In [200]:
_fname = []
_labels = []
ans_labels = ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go", "silence", "unknown"]
for i in range(len(M_Test)):
    y = np.argmax(pred[i])
    y = labels[y]
    if y not in ans_labels:
        if y == "_background_noise_":
            y = "silence"
        else:
            y = "unknown"
    _fname.append(filenames[i])
    _labels.append(y)

In [201]:
df = pd.DataFrame.from_dict({'fname':_fname, 'label':_labels})
df.head()

Unnamed: 0,fname,label
0,clip_6ffc4eb63.wav,unknown
1,clip_6da59a15b.wav,up
2,clip_c12a2e8fc.wav,yes
3,clip_05b92927e.wav,go
4,clip_e605c025d.wav,off


Save dataframe as csv

In [202]:
df.to_csv('submission.csv', index=False)