<h1>Imports</h1>

In [1]:
import os
import librosa
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from keras import backend as K
from keras.utils import np_utils
from keras.models import Model
from keras.models import load_model
from keras.layers import Dense, Dropout, Flatten, Conv1D, Input, MaxPooling1D
from keras.callbacks import EarlyStopping, ModelCheckpoint

<h1>Data path and classes</h1>
<p>Let's specify the data path and see how many classes the data set contains.</p>

In [27]:
# Root directory of the data set: one sub-directory per spoken word.
train_audio_path = "../../data"

# Keep only directories (a plain file in the data root is not a class and
# cannot be listed for .wav files) and sort them, because os.listdir()
# returns entries in arbitrary order.
labels = sorted(
    entry for entry in os.listdir(train_audio_path)
    if os.path.isdir(os.path.join(train_audio_path, entry))
)

# Encode the class names as integers, then as one-hot rows.
le = LabelEncoder()
y = le.fit_transform(labels)
classes = list(le.classes_)
y = np_utils.to_categorical(y, num_classes = len(labels))

print(classes)

['bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'four', 'go', 'happy', 'house', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', 'wow', 'yes', 'zero']


<h1>Reading data and resizing</h1>
<p>We read all the data and resample every clip to one common size. If a clip fails to load, or does not come out at exactly 8000 samples, we simply ignore it. Side by side, we print the number of data points per class that were resized successfully.</p>

In [None]:
# Load every clip at 16 kHz, downsample it to 8 kHz and keep only the clips
# that come out at exactly 8000 samples, so that all inputs share one shape.
all_wave = []
all_label = []
for label in labels:
    label_dir = os.path.join(train_audio_path, label)
    if not os.path.isdir(label_dir):
        # The data root may also contain plain files; they are not classes.
        continue
    waves = [f for f in os.listdir(label_dir) if f.endswith('.wav')]
    r = 0                # clips kept for this label
    failed = 0           # clips that could not be read or resampled
    first_error = None   # first failure, reported once per label
    for wav in waves:
        try:
            samples, sample_rate = librosa.load(os.path.join(label_dir, wav), sr = 16000)
            # Keyword arguments: newer librosa releases no longer accept the
            # sample rates positionally.
            samples = librosa.resample(samples, orig_sr = sample_rate, target_sr = 8000)
        except Exception as exc:
            # Best effort: an unreadable file is skipped, but it is counted and
            # reported instead of being swallowed silently.
            failed += 1
            if first_error is None:
                first_error = exc
            continue
        if len(samples) == 8000:
            r += 1
            all_wave.append(samples)
            all_label.append(label)
    print(f'{label}:{r}')
    if failed:
        print(f'{label}: skipped {failed} unreadable file(s), first error: {first_error!r}')

bed:1484
bird:1521
cat:1515
dog:1547
down:2152
eight:2111
five:2161
four:2158


In [29]:
# Encode the targets from the label of every collected clip: the encoder maps
# each label to an integer id, which is then expanded into a one-hot row.
le = LabelEncoder()
le.fit(all_label)
classes = list(le.classes_)
y = np_utils.to_categorical(le.transform(all_label), num_classes = len(classes))

print(classes)
print(y)

['bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'four', 'go', 'happy', 'house', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', 'wow', 'yes', 'zero']
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]


<h1>Splitting data into test and train</h1>

In [30]:
# Sanity check: how many labelled clips were collected.
print(np.shape(all_label))

(58250,)


In [None]:
# Add a trailing channel axis so each clip has shape (8000, 1), matching the
# model's Input(shape=(8000, 1)).
all_wave = np.asarray(all_wave).reshape(-1, 8000, 1)

# all_wave is already an array here, so it is passed as-is instead of being
# copied again with np.array(); np.asarray() does not copy an existing array.
x_train, x_valid, y_train, y_valid = train_test_split(
    all_wave,
    np.asarray(y),
    stratify = y,
    test_size = 0.2,
    random_state = 777,
    shuffle = True,
)

<h1>Defining the CNN Model</h1>
<p>We are using a <a href="https://en.wikipedia.org/wiki/Convolutional_neural_network" target="_blank">CNN</a> model, trained on the data, to predict labels.</p>

In [17]:
# Reset Keras' global state before a new model is built.
K.clear_session()

def create_model(num_classes = None, input_shape = (8000, 1)):
    """Build the (uncompiled) 1D convolutional classifier.

    Architecture: four Conv1D -> MaxPooling1D(3) -> Dropout(0.3) stages with
    8/16/32/64 filters and kernel sizes 13/11/9/7, followed by Flatten, two
    Dense+Dropout stages (256 and 128 units) and a softmax output layer.

    Parameters
    ----------
    num_classes : int, optional
        Width of the softmax output. Defaults to ``len(classes)``, i.e. the
        number of encoded classes, so the output always matches the width of
        the one-hot targets.
    input_shape : tuple, optional
        Shape of one input example; defaults to ``(8000, 1)``.

    Returns
    -------
    keras.models.Model
        The assembled model; the caller is responsible for compiling it.
    """
    if num_classes is None:
        # Size the output by the encoded classes rather than by the raw
        # directory listing: a directory that contributed no clips has no
        # column in the one-hot targets.
        num_classes = len(classes)

    inputs = Input(shape=input_shape)

    # Convolutional feature extractor: (filters, kernel size) per stage.
    conv = inputs
    for filters, kernel_size in ((8, 13), (16, 11), (32, 9), (64, 7)):
        conv = Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1)(conv)
        conv = MaxPooling1D(3)(conv)
        conv = Dropout(0.3)(conv)

    # Flatten layer
    conv = Flatten()(conv)

    # Fully connected head.
    for units in (256, 128):
        conv = Dense(units, activation='relu')(conv)
        conv = Dropout(0.3)(conv)

    outputs = Dense(num_classes, activation='softmax')(conv)

    return Model(inputs, outputs)

<h1>Train Model</h1>
<p>This function trains and returns the model. It has an optional argument <b>from_scratch</b>: if set to true, the model is trained from scratch; otherwise the already available saved model is loaded and retrained. By default it is set to false.</p>

In [14]:
def train_model(from_scratch = False):
    """Train the classifier on the notebook's train/validation split.

    Reads the module-level ``x_train``, ``y_train``, ``x_valid`` and
    ``y_valid`` arrays.

    Parameters
    ----------
    from_scratch : bool, optional
        If true, or if 'best_model.hdf5' does not exist, a fresh model is
        built with ``create_model()``; otherwise the saved model is loaded
        and training continues from it.

    Returns
    -------
    The trained model. When early stopping ends the run, its weights are
    those of the epoch with the lowest validation loss.
    """
    if from_scratch or not os.path.exists('best_model.hdf5'):
        model = create_model()
        print('Model created')
    else:
        model = load_model('best_model.hdf5')
        print('Model loaded')

    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

    # Stop once val_loss has not improved for 10 epochs. restore_best_weights
    # rolls the model back to the best epoch; without it the returned model
    # would carry the weights of the last (already degraded) epoch.
    es = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 10,
                       min_delta = 0.0001, restore_best_weights = True)
    # Independently keep the best-by-validation-accuracy model on disk.
    mc = ModelCheckpoint('best_model.hdf5', monitor = 'val_accuracy', verbose = 1, save_best_only = True, mode = 'max')

    model.fit(x_train, y_train, epochs = 100, callbacks = [es, mc], batch_size = 32, validation_data = (x_valid, y_valid))
    return model

<h1>Load Model</h1>
<p>This method simply returns the already trained model. <small style="color:red">(Throws error if no model found)</small></p>

In [15]:
def load_trainded_model():
    """Return the model stored in 'best_model.hdf5'.

    Any error raised by ``load_model`` (for example when the file is missing)
    is propagated to the caller.
    """
    return load_model('best_model.hdf5')

<h1>Get Model</h1>
<p>This method is responsible for generating the model and is exposed as an API in the .py file of the Docker container. It accepts the parameter <b>already_trained</b>: if set to true, it uses the already trained model; otherwise it starts training a new one.</p>

In [19]:
def get_model(already_trained = True):
    """Populate the module-level ``model``.

    Loads the saved model when ``already_trained`` is true, otherwise runs
    ``train_model()``. Nothing is returned; the result is stored in the
    global ``model``.
    """
    global model
    if already_trained:
        model = load_trainded_model()
    else:
        model = train_model()

# Pass False to train the model instead of loading the saved one:
# get_model(False)

# Shapes of the training split, the first training target, then the shapes
# of the validation split -- one item per line.
print(x_train.shape, y_train.shape, sep="\n")

print(y_train[0], x_valid.shape, y_valid.shape, sep="\n")

(46600, 8000, 1)
(46600,)
two
(11650, 8000, 1)
(11650,)


<h1>Predict Function</h1>
<p>This function takes the audio as an array of floats together with its sample rate and returns the most likely label for it. It is also exposed as an API in Docker.</p>

In [6]:
def predict(audio, sample_rate):
    """Return the most likely class name for one audio clip.

    Parameters
    ----------
    audio : array-like of float
        Raw waveform samples.
    sample_rate : int
        Sample rate of ``audio``; the clip is resampled to 8000 Hz.

    Returns
    -------
    The entry of the module-level ``classes`` with the highest predicted
    probability.

    Raises
    ------
    ValueError
        If the resampled clip does not contain exactly 8000 samples, which is
        the input size the model is fed with.

    Uses the module-level ``model``, which must have been set beforehand.
    """
    audio = np.asarray(audio, dtype=np.float32)
    # Keyword arguments: newer librosa releases no longer accept the sample
    # rates positionally.
    audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=8000)
    if audio.size != 8000:
        # The reshape below would fail anyway; fail with a message that says why.
        raise ValueError(
            f'expected 8000 samples after resampling to 8000 Hz, got {audio.size}'
        )
    prob = model.predict(audio.reshape(1, 8000, 1))
    return classes[np.argmax(prob[0])]

<h1>Let's predict!</h1>
<p>Here we simply try to predict the label of one input clip.</p>

In [9]:
# predict() reads the module-level `model`. Load the saved model if nothing
# has created it yet, so that this cell does not depend on leftover kernel state.
if 'model' not in globals():
    get_model()

# NOTE(review): this path is relative to './data', whereas train_audio_path
# points at '../../data' -- confirm which location is intended.
audio, sample_rate = librosa.load('./data/bed/00f0204f_nohash_0.wav', sr = 16000)
print("Predicted Label:", predict(audio, sample_rate))

Predicted Label: bed


<h1>End 🐍</h1>

<hr><hr>