In [4]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('../scripts')))
# sys.path.append(os.path.abspath(os.path.join('./scripts')))
import pandas as pd
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile #for audio processing
import os
import pickle
import pandas as pd
from collections import Counter
import librosa
import librosa.display
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import Callback, ModelCheckpoint
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras import backend as K
from jiwer import wer
import random
import mlflow
import warnings
warnings.filterwarnings('ignore')
import helper
from data_generator import DataGenerator
from tokenizer import Tokenizer
from logspectrogram import LogMelSpectrogram
from ctc_loss import CTC_loss
from model2 import simple_rnn_model, CNN_net, BidirectionalRNN2, cnn_rnn_model, preprocessin_model

In [5]:
# Shared CTC helper, constructed from the frame step. `train` calls
# `ctc.add_ctc_loss` on this object, so this cell must run before training.
# `frame_step` is assigned the same value again in the hyperparameter cell.
frame_step = 256
ctc = CTC_loss(frame_step)

### Model trainer function

In [6]:
def train(model_builder,
          data_gen,
          batch_size = 32,
          epochs=20,
          verbose=1,
          save_path="../models/model.h5",
          optimizer=None,
          ):
    """Attach the CTC loss to a model, train it on a generator and checkpoint it.

    Args:
        model_builder: Keras model handed to ``ctc.add_ctc_loss``.
        data_gen: Batch generator passed to ``Model.fit``.
        batch_size: Not read by this function; kept so existing callers that
            pass it keep working. Batches come from ``data_gen`` as-is.
        epochs: Number of training epochs.
        verbose: Keras verbosity level forwarded to ``fit``.
        save_path: File the ``ModelCheckpoint`` callback writes to.
        optimizer: Keras optimizer to compile with. When ``None`` a fresh
            Nesterov SGD (lr=0.01, decay=1e-6, momentum=0.9, clipnorm=5) is
            created for this call.

    Returns:
        The compiled, trained model returned by ``ctc.add_ctc_loss``.
    """
    # Build the default optimizer per call. A default created in the signature
    # is evaluated once at definition time and would be shared (together with
    # its accumulated state) by every model trained through this function.
    if optimizer is None:
        optimizer = SGD(learning_rate=0.01, decay=1e-6, momentum=0.9,
                        nesterov=True, clipnorm=5)

    model = ctc.add_ctc_loss(model_builder)

    checkpointer = ModelCheckpoint(filepath=save_path, verbose=0)
    # The loss for the 'ctc' output just returns y_pred — presumably that
    # output already holds the CTC loss computed inside add_ctc_loss; confirm
    # against ctc_loss.py.
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()

    # Model.fit accepts generators directly; fit_generator is deprecated.
    model.fit(data_gen,
              callbacks=[checkpointer],
              epochs=epochs,
              verbose=verbose)
    return model

In [9]:
# Load the two lookup objects and the training metadata through the project
# `helper` module. Later cells index both lookup objects by clip label.
# NOTE(review): read_obj presumably unpickles the file — only load pickles
# from a trusted source.
translation_obj = helper.read_obj("../models/translation_dict.pkl")
audio_obj = helper.read_obj("../models/audio_dict.pkl")
meta_data = helper.read_csv("../data/am_train_fin.csv")

file read as csv


In [11]:
# Order the clips by duration and collect their labels in that order.
# The last recorded run of this cell failed with a bare KeyError: 'label';
# fail early with a message that lists the columns actually present so the
# CSV header / column name can be corrected.
if 'label' not in meta_data.columns:
    raise KeyError(
        "meta_data has no 'label' column; "
        f"available columns: {list(meta_data.columns)}"
    )

sorted_metadata = meta_data.sort_values(by="duration")
labels = sorted_metadata['label'].to_list()
# Show a preview rather than dumping the whole frame into the notebook.
sorted_metadata.head()

KeyError: 'label'

In [None]:
# Gather, in `labels` order, the first element stored for each label in
# audio_obj and the matching entry of translation_obj.
audios = [audio_obj[clip_label][0] for clip_label in labels]

translations = [translation_obj[clip_label] for clip_label in labels]

In [None]:
def build_model(output_dim, custom_model, preprocess_model, mfcc=False, calc=None):
    """Chain the preprocessing model and an acoustic model into one Keras model.

    Args:
        output_dim: Not read by this function.
        custom_model: Callable applied to the squeezed preprocessing output;
            its result becomes the model output.
        preprocess_model: Callable applied to the raw input tensor. Its output
            must have a size-1 axis at index 3, which is squeezed away.
        mfcc: Not read by this function.
        calc: Stored unchanged on the returned model as ``output_length``.

    Returns:
        A Keras ``Model`` named "model_builder" whose input is 'the_input'.
    """
    waveform_in = Input(shape=(None,), name='the_input')

    # Drop axis 3 of the preprocessing output before the acoustic model.
    features = tf.squeeze(preprocess_model(waveform_in), axis=[3])

    speech_model = Model(inputs=waveform_in,
                         outputs=custom_model(features),
                         name="model_builder")
    speech_model.output_length = calc
    return speech_model

In [None]:
def predict(model, audio, tokenizer, int_to_char, actual=None):
    """Transcribe one audio clip and optionally score it against a reference.

    Args:
        model: Keras model whose ``predict`` output is fed to CTC decoding.
        audio: A single clip; it is wrapped into a batch of one.
        tokenizer: Object providing ``decode_text(ids, int_to_char)``.
        int_to_char: Mapping passed through to ``tokenizer.decode_text``.
        actual: Optional reference transcript. When given, the word error
            rate between it and the prediction is computed with ``jiwer.wer``.

    Returns:
        ``(predicted_text, error)`` where ``error`` is the WER, or ``None``
        when no reference was supplied.
    """
    pred_audios = tf.convert_to_tensor([audio])

    y_pred = model.predict(pred_audios)

    # Every item in the batch is decoded over the full time axis of y_pred.
    input_shape = tf.keras.backend.shape(y_pred)
    input_length = tf.ones(shape=input_shape[0]) * tf.keras.backend.cast(input_shape[1], 'float32')
    # greedy=False selects beam search; [0][0] takes the first decoded path.
    prediction = tf.keras.backend.ctc_decode(y_pred, input_length, greedy=False)[0][0]

    # ctc_decode pads decoded sequences with -1; drop the padding.
    pred = K.eval(prediction).flatten().tolist()
    pred = [i for i in pred if i != -1]

    predicted_text = tokenizer.decode_text(pred, int_to_char)

    error = None
    # Identity check: `!= None` goes through __ne__ and is not a reliable
    # test for "no reference supplied".
    if actual is not None:
        error = wer(actual, predicted_text)

    return predicted_text, error

In [None]:
# Build the character dictionaries from all transcripts, then round-trip the
# first transcript through encode/decode as a quick sanity check.
tokenizer = Tokenizer(translations)
int_to_char, char_to_int = tokenizer.build_dict()
sample = translations[0]
encoded = tokenizer.encode(sample, char_to_int)
decoded = tokenizer.decode_text(encoded, int_to_char)

# `decoded` should read the same as `sample` if the round trip is lossless.
print(f"sample snt: {sample}")
print(f"encoded snt: {encoded}")
print(f"decoed snt: {decoded}")

In [None]:
# Persist both dictionaries via the project helper.
# NOTE(review): these are written to the parent directory, while the other
# pickles in this notebook are read from ../models/ — confirm the intended
# location.
helper.write_obj("../int_to_char.pkl", int_to_char)
helper.write_obj("../char_to_int.pkl", char_to_int)

In [None]:
# Settings shared by the preprocessing model, the data generator and the
# acoustic models built below.
sample_rate = 8000
fft_size = 512
frame_step = 256  # same value as the one used to construct `ctc` above
n_mels = 128
batch_size = 100  # re-bound to 32 later, before the bidirectional model
epochs = 20  # NOTE(review): the train(...) calls below pass epochs=20 literally, not this name
data_len = len(translations)
# NOTE(review): the +2 presumably reserves extra output classes (e.g. the CTC
# blank) beyond the character vocabulary — confirm against the tokenizer.
output_dim = len(char_to_int) + 2

In [None]:
# Batch generator over (transcript, audio) pairs with shuffling enabled, plus
# the preprocessing model that every acoustic model below is chained onto.
dg = DataGenerator(translations, audios, batch_size, shuffle=True)
preprocess_model = preprocessin_model(sample_rate, fft_size, frame_step, n_mels)
preprocess_model.summary()

In [None]:
# Preview the preprocessing output for the first item of batch 0.
# Fetch the batch once: the generator shuffles, so indexing it twice could
# pair the audio of one draw with the label of another (and does the batch
# work twice).
first_batch_inputs = dg[0][0]
sample_audio = first_batch_inputs['the_input'][0].numpy()
sample_lbl = first_batch_inputs['the_labels'][0].numpy()

# Wrap the clip into a batch of one for the preprocessing model.
a = np.zeros((1, len(sample_audio)))
a[0, ] = sample_audio
print(a.shape)
pred = preprocess_model.predict(a)
fig, ax = plt.subplots(figsize=(16, 4))
display(pred.shape)
# Keep the single item and drop the trailing size-1 axis.
pred = pred[0, :, :, 0]
# Draw on the axes created above and use the configured sample rate.
# NOTE(review): hop_length=128 differs from frame_step (256) — confirm which
# hop the preprocessing model actually uses.
librosa.display.specshow(pred.T, sr=sample_rate, hop_length=128, cmap="jet", ax=ax)
print("char_len", len(sample_lbl))

In [None]:

# Preview the preprocessing output for the last item of batch 49.
# NOTE(review): index 49 is hard-coded and requires the generator to hold at
# least 50 batches.
# Fetch the batch once: the generator shuffles, so indexing it twice could
# pair the audio of one draw with the label of another (and does the batch
# work twice).
late_batch_inputs = dg[49][0]
sample_audio = late_batch_inputs['the_input'][-1].numpy()
sample_lbl = late_batch_inputs['the_labels'][-1].numpy()

# Wrap the clip into a batch of one for the preprocessing model.
a = np.zeros((1, len(sample_audio)))
a[0, ] = sample_audio
print(a.shape)
pred = preprocess_model.predict(a)
fig, ax = plt.subplots(figsize=(16, 4))
display(pred.shape)
# Keep the single item and drop the trailing size-1 axis.
pred = pred[0, :, :, 0]
# Draw on the axes created above and use the configured sample rate.
# NOTE(review): hop_length=128 differs from frame_step (256) — confirm which
# hop the preprocessing model actually uses.
librosa.display.specshow(pred.T, sr=sample_rate, hop_length=128, cmap="jet", ax=ax)
print("char_len", len(sample_lbl))

## Simple RNN

In [None]:
# Acoustic model only (no preprocessing), built by model2.simple_rnn_model
# from n_mels and output_dim.
speech_simple_rnn = simple_rnn_model(n_mels, output_dim)
speech_simple_rnn.summary()

In [None]:
# End-to-end model: raw audio -> preprocess_model -> simple RNN.
simple_rnn_speech_model = build_model(output_dim, speech_simple_rnn, preprocess_model)
simple_rnn_speech_model.summary()

#### Train and Export Model

In [None]:
# Train on the generator and checkpoint to save_path.
# Note: train() never reads batch_size; batches are whatever `dg` yields.
train(simple_rnn_speech_model, dg, epochs=20, save_path="../models/simple_rnn_model.h5",  batch_size=batch_size)

#### Predicting Using Simple RNN

In [None]:
# Reload the checkpoint written by the training cell and transcribe one clip.
simple_rnn_speech_model.load_weights("../models/simple_rnn_model.h5")

# Use ONE index for both the audio and its reference transcript. The audio
# index (0) is kept; the reference previously came from index 10, so the WER
# compared the prediction against an unrelated sentence.
sample_idx = 0
actual_translation = translations[sample_idx]
sample_test_audio = audios[sample_idx]
predicted, error = predict(simple_rnn_speech_model, sample_test_audio , tokenizer, int_to_char, actual=actual_translation)

print("actual", actual_translation)
print("predicted", predicted)
print("WER: ", error)

## CNN + Simple RNN

In [None]:
# Acoustic model built by model2.cnn_rnn_model.
# NOTE(review): the positional arguments 250, 4, 1, 'same', 400 are presumably
# conv filters, kernel size, stride, padding mode and RNN units — confirm
# against the cnn_rnn_model signature and consider passing them by keyword.
speech_cnn_rnn = cnn_rnn_model(n_mels, 250, 4, 1, 'same', 400, output_dim)
speech_cnn_rnn.summary()

In [None]:
# End-to-end model: raw audio -> preprocess_model -> CNN + RNN.
speech_cnn_rnn_model = build_model(output_dim, speech_cnn_rnn, preprocess_model)
speech_cnn_rnn_model.summary()

#### Train and Export Model

In [None]:
# Train on the generator and checkpoint to save_path.
# Note: train() never reads batch_size; batches are whatever `dg` yields.
train(speech_cnn_rnn_model, dg, epochs=20, save_path="../models/cnn_rnn_model.h5",  batch_size=batch_size)

#### Predicting Using CNN + Simple RNN

In [None]:
# Reload the checkpoint written by the training cell and transcribe ten
# randomly chosen clips. These clips come from the same lists the generator
# trains on, so the WER printed here is a training-set figure.
speech_cnn_rnn_model.load_weights("../models/cnn_rnn_model.h5")

# random.randint is inclusive on both ends; cap the upper bound at the last
# valid index so a dataset with fewer than 3001 clips cannot raise IndexError.
max_index = min(3000, len(translations) - 1)
for k in range(10):
    i = random.randint(0, max_index)
    actual_translation = translations[i]
    sample_test_audio = audios[i]
    predicted, error = predict(speech_cnn_rnn_model, sample_test_audio,
                               tokenizer, int_to_char, actual=actual_translation)

    print("actual", actual_translation)
    print("predicted", predicted)
    print(f"WER: {error:.2f}")
    print()

## CNN and Bi-directional RNN

In [None]:
# Rebuild the generator with a smaller batch for the bidirectional model.
# Note: this re-binds the notebook-wide `batch_size` and `dg`; re-running
# earlier cells after this one will pick up these values.
batch_size = 32
dg = DataGenerator(translations, audios, batch_size, shuffle=True)

#### Build, Train and Export Model

In [None]:
# CNN_net returns the CNN front-end together with a second value bound to
# cnn_shape (presumably its output shape — confirm in model2).
cnn_model, cnn_shape = CNN_net(n_mels)
# summary() prints and returns None, so the displayed tuple is (None, cnn_shape).
cnn_model.summary(), cnn_shape

In [None]:
# Bidirectional RNN head built by model2.BidirectionalRNN2.
# NOTE(review): the meaning of the positional 1024 is not visible here
# (input feature size vs. number of units) — confirm against model2.
BI_RNN_2 = BidirectionalRNN2(1024, batch_size=batch_size, output_dim=output_dim)
BI_RNN_2.summary()

In [None]:
def build_model2(output_dim, cnn_model, custom_model, preprocess_model, mfcc=False, calc=None):
    """Chain preprocessing, a CNN front-end and an acoustic model into one model.

    Same wiring as ``build_model`` with ``cnn_model`` inserted between the
    squeezed preprocessing output and ``custom_model``. The shared steps are
    delegated to ``build_model`` instead of being duplicated here.

    Args:
        output_dim: Forwarded to ``build_model``.
        cnn_model: Callable applied to the squeezed preprocessing output.
        custom_model: Callable applied to the CNN output; its result becomes
            the model output.
        preprocess_model: Callable applied to the raw input tensor.
        mfcc: Forwarded to ``build_model``.
        calc: Stored unchanged on the returned model as ``output_length``.

    Returns:
        A Keras ``Model`` named "model_builder" whose input is 'the_input'.
    """
    def cnn_then_custom(features):
        # Compose the two stages so build_model sees a single callable.
        return custom_model(cnn_model(features))

    return build_model(output_dim, cnn_then_custom, preprocess_model,
                       mfcc=mfcc, calc=calc)

In [None]:
# End-to-end model: raw audio -> preprocess_model -> CNN -> bidirectional RNN.
cnn_bi_rnn_model = build_model2(output_dim, cnn_model, BI_RNN_2, preprocess_model)
cnn_bi_rnn_model.summary()

In [None]:
# Train on the generator and checkpoint to save_path.
# Note: train() never reads batch_size; batches are whatever `dg` yields.
train(cnn_bi_rnn_model, dg, epochs=20, save_path="../models/cnn_bi_rnn_model.h5",  batch_size=batch_size)

#### Predicting Using CNN and Bi-directional RNN

In [None]:
# Reload the checkpoint written by the training cell and transcribe ten
# randomly chosen clips. The path now matches the save_path passed to
# train(); the previous "../models/cnn-bi-rnn.h5" is not written anywhere in
# this notebook.
cnn_bi_rnn_model.load_weights("../models/cnn_bi_rnn_model.h5")

# random.randint is inclusive on both ends; cap the upper bound at the last
# valid index so a dataset with fewer than 1001 clips cannot raise IndexError.
max_index = min(1000, len(translations) - 1)
for k in range(10):
    i = random.randint(0, max_index)
    actual_translation = translations[i]
    sample_test_audio = audios[i]
    predicted, error = predict(cnn_bi_rnn_model, sample_test_audio,
                               tokenizer, int_to_char, actual=actual_translation)

    print("actual", actual_translation)
    print("predicted", predicted)
    print(f"WER: {error:.2f}")
    print()