In [15]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('../scripts')))

import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile #for audio processing
import os
import pickle
import pandas as pd
from collections import Counter

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import * 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras import backend as K
import mlflow

In [16]:
import helper


In [None]:
from data_augmentation import Data_Augmentation
from data_loader import DataLoader

In [17]:
# NOTE(review): `sample_rate` is reassigned to 22000 in the later hyperparameter
# cell before it is passed to any model builder shown in this notebook, so this
# value does not appear to be consumed — confirm which rate the audio really has.
sample_rate = 8000

In [18]:
class Tokenizer:
    """Character-level tokenizer built from a corpus of transcriptions.

    Ids are assigned by descending character frequency and start at 1, so
    id 0 never refers to a character. Characters that are missing from the
    vocabulary are encoded as ``self.unk`` (-1).
    """

    def __init__(self, translations):
        """Store the corpus the vocabulary is derived from.

        Parameters
        ----------
        translations : iterable of str
            Sentences whose characters define the vocabulary.
        """
        self.translations = translations
        self.unk = -1

    def build_dict(self):
        """Build the id <-> character lookup tables.

        Returns
        -------
        int_to_char : dict
            Maps each id (1..vocab_size) to its character.
        char_to_int : dict
            Inverse mapping, character to id.
        """
        # Join once instead of growing a string inside a loop (quadratic).
        char_counts = Counter(''.join(self.translations))

        # Most frequent character first; the sort is stable, so characters
        # with equal counts keep their first-seen order.
        sorted_vocab = sorted(char_counts, key=char_counts.get, reverse=True)
        int_to_char = dict(enumerate(sorted_vocab, 1))
        char_to_int = {char: idx for idx, char in int_to_char.items()}

        return int_to_char, char_to_int

    def encode(self, sent, char_to_int):
        """Convert a sentence to a list of ids.

        Parameters
        ----------
        sent : str
            Sentence to encode, one id per character.
        char_to_int : dict
            Character-to-id table, as returned by ``build_dict``.

        Returns
        -------
        list of int
            Ids in sentence order; unknown characters become ``self.unk``.
        """
        return [char_to_int.get(char, self.unk) for char in sent]

    def decode_text(self, encoded_chars, int_to_char):
        """Convert a sequence of ids back to text.

        Parameters
        ----------
        encoded_chars : iterable of int
            Ids to decode, e.g. the output of ``encode``.
        int_to_char : dict
            Id-to-character table, as returned by ``build_dict``.

        Returns
        -------
        str
            The decoded text. Entries that cannot be looked up in
            ``int_to_char`` (such as ``self.unk``) contribute nothing.
        """
        decoded = []
        for code in encoded_chars:
            try:
                # The ids must actually be mapped through the table; simply
                # concatenating the inputs would echo text and fail on ints.
                decoded.append(int_to_char[code])
            except (KeyError, IndexError, TypeError):
                # Best-effort decoding: skip anything that is not a valid key.
                continue

        return ''.join(decoded)
        
    
        
        
         
        

        

In [19]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves padded (audio, transcript) batches for CTC training.

    Each item is an ``(inputs, outputs)`` pair. ``inputs`` holds the
    zero-padded waveforms (``the_input``), the zero-padded encoded transcripts
    (``the_labels``) and the unpadded lengths of both (``input_length`` in
    samples, ``label_length`` in characters). ``outputs`` is a dummy ``ctc``
    target of zeros with one entry per batch item.

    Parameters
    ----------
    translations : sequence of str
        Transcripts; also used to build the character vocabulary.
    audios : sequence of 1-D array-like
        Waveforms aligned index-by-index with ``translations``.
    batch_size : int
        Number of samples per batch. Only full batches are served, so a
        trailing partial batch is dropped.
    shuffle : bool
        When truthy, the batch order and the order inside each batch are
        re-shuffled at the end of every epoch. Batch membership (which
        consecutive samples share a batch) does not change.
    """

    def __init__(self,  translations, audios, batch_size=32, shuffle=True):
        super().__init__()
        self.audios = audios
        self.labels = translations
        self.batch_size = batch_size
        self.len = int(np.floor(len(self.labels) / self.batch_size))
        self.shuffle = shuffle

        # Build the vocabulary from this instance's own tokenizer rather than
        # from a notebook-level global that may not exist on a fresh kernel.
        self.tokenizer = Tokenizer(translations)
        self.int_to_char, self.char_to_int = self.tokenizer.build_dict()

        # Offset of the most recently served batch. Kept for backward
        # compatibility only; batches are selected through ``self.indexes``.
        self.cur_index = 0

        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch."""
        return self.len

    def encode_text(self, translations):
        """Encode every transcript to a list of character ids."""
        return [self.tokenizer.encode(t, self.char_to_int) for t in translations]

    def get_max_len(self, items):
        """Return the length of the longest element of ``items`` (0 if empty)."""
        return max((len(item) for item in items), default=0)

    def __data_generation(self, batch_translations, batch_audios):
        """Pad one batch and package it in the dict layout the CTC model expects."""
        encoded_trans = self.encode_text(batch_translations)
        n_items = len(batch_audios)

        maximum_trans_len = self.get_max_len(encoded_trans)
        maximum_audio_len = self.get_max_len(batch_audios)

        # Zero-initialised buffers double as the padding.
        encoded_trans_np = np.zeros((n_items, maximum_trans_len), dtype="int64")
        padded_audios_np = np.zeros((n_items, maximum_audio_len), dtype="float32")

        label_length = np.zeros(n_items, dtype="int64")
        input_length = np.zeros(n_items, dtype="int64")

        for row, (trans, audio) in enumerate(zip(encoded_trans, batch_audios)):
            encoded_trans_np[row, :len(trans)] = trans
            label_length[row] = len(trans)

            padded_audios_np[row, :len(audio)] = audio
            input_length[row] = len(audio)

        # The real loss is computed inside the model; this target is a placeholder.
        outputs = {'ctc': np.zeros([n_items])}
        inputs = {'the_input':   tf.convert_to_tensor(padded_audios_np),
                  'the_labels':   tf.convert_to_tensor(encoded_trans_np),
                  'input_length':   tf.convert_to_tensor(input_length),
                  'label_length':   tf.convert_to_tensor(label_length)
                 }

        return (inputs, outputs)

    def on_epoch_end(self):
        """Rebuild the sample order, shuffling it when ``self.shuffle`` is set."""
        n_used = self.len * self.batch_size
        self.indexes = np.arange(n_used)

        if self.shuffle:
            # View the indices as (n_batches, batch_size): shuffle the rows to
            # change batch order, then shuffle inside each row.
            grid = self.indexes.reshape(int(self.len), int(self.batch_size))
            np.random.shuffle(grid)

            for row in grid:
                np.random.shuffle(row)

            self.indexes = grid.reshape(int(n_used))

    def __getitem__(self, index):
        """Return batch number ``index`` of the current epoch."""
        start = int(index * self.batch_size)
        batch_ids = self.indexes[start:start + self.batch_size]
        self.cur_index = start

        # Select strictly through ``self.indexes`` so every batch is distinct
        # and the epoch shuffle actually takes effect.
        batch_labels = [self.labels[int(k)] for k in batch_ids]
        batch_audios = [self.audios[int(k)] for k in batch_ids]

        return self.__data_generation(batch_labels, batch_audios)

In [20]:
class LogMelSpectrogram(tf.keras.layers.Layer):
    """Keras layer that converts raw waveforms into log-scaled mel spectrograms."""

    def __init__(self, sample_rate, fft_size, hop_size, n_mels,
                 f_min=0.0, f_max=None, **kwargs):
        super().__init__(**kwargs)
        self.sample_rate = sample_rate
        self.fft_size = fft_size
        self.hop_size = hop_size
        self.n_mels = n_mels
        self.f_min = f_min
        # A falsy f_max (None or 0) falls back to half the sample rate.
        self.f_max = f_max or sample_rate / 2
        # Projection from linear STFT bins to mel bins.
        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.n_mels,
            num_spectrogram_bins=fft_size // 2 + 1,
            sample_rate=self.sample_rate,
            lower_edge_hertz=self.f_min,
            upper_edge_hertz=self.f_max)

    def build(self, input_shape):
        self.non_trainable_weights.append(self.mel_filterbank)
        super().build(input_shape)

    @staticmethod
    def _log10(values):
        """Base-10 logarithm expressed through the natural logarithm."""
        natural_log = tf.math.log(values)
        log_of_ten = tf.math.log(tf.constant(10, dtype=natural_log.dtype))
        return natural_log / log_of_ten

    @staticmethod
    def _power_to_db(power, amin=1e-16, top_db=80.0):
        """Convert a power spectrogram to decibels relative to its maximum.

        https://librosa.github.io/librosa/generated/librosa.core.power_to_db.html
        """
        peak = tf.reduce_max(power)
        decibels = 10.0 * LogMelSpectrogram._log10(tf.maximum(amin, power))
        decibels = decibels - 10.0 * LogMelSpectrogram._log10(tf.maximum(amin, peak))
        # Clip everything more than `top_db` below the loudest value.
        floor = tf.reduce_max(decibels) - top_db
        return tf.maximum(decibels, floor)

    def call(self, waveforms):
        """Forward pass.

        Parameters
        ----------
        waveforms : tf.Tensor, shape = (None, n_samples)
            A batch of mono waveforms.

        Returns
        -------
        log_mel_spectrograms : tf.Tensor, shape = (None, time, freq, ch)
            The corresponding batch of log-mel spectrograms.
        """
        stft = tf.signal.stft(waveforms,
                              frame_length=self.fft_size,
                              frame_step=self.hop_size,
                              pad_end=False)

        power_spectrograms = tf.square(tf.abs(stft))
        mel_power = tf.matmul(power_spectrograms, self.mel_filterbank)

        # Trailing axis is the channel dimension.
        return tf.expand_dims(self._power_to_db(mel_power), 3)

    def get_config(self):
        own_settings = {
            'fft_size': self.fft_size,
            'hop_size': self.hop_size,
            'n_mels': self.n_mels,
            'sample_rate': self.sample_rate,
            'f_min': self.f_min,
            'f_max': self.f_max,
        }
        # Base-layer entries win on a key clash.
        return {**own_settings, **super().get_config()}

In [21]:
def preprocessin_model(sample_rate, fft_size, frame_step, n_mels, mfcc=False):
    """Build the feature-extraction model: waveform -> batch-normalised log-mel.

    Parameters
    ----------
    sample_rate : number
        Sample rate handed to the mel filterbank; half of it (truncated to
        int) is used as the upper mel edge.
    fft_size : int
        STFT frame length.
    frame_step : int
        STFT hop size.
    n_mels : int
        Number of mel bins.
    mfcc : bool
        Accepted but not used by this function.

    Returns
    -------
    Model
        Keras model named "preprocessin_model".
    """
    waveform_in = Input(name='input', shape=(None,), dtype="float32")

    log_mel_layer = LogMelSpectrogram(
        sample_rate=sample_rate,
        fft_size=fft_size,
        hop_size=frame_step,
        n_mels=n_mels,
        f_min=0.0,
        f_max=int(sample_rate / 2),
    )
    features = log_mel_layer(waveform_in)
    normalised = BatchNormalization()(features)

    return Model(inputs=waveform_in, outputs=normalised, name="preprocessin_model")

In [22]:
def BidirectionalRNN(input_dim, batch_size=None, sample_rate=22000,
                     rnn_layers=2, units=400, drop_out=0.5, act='tanh', output_dim=224):
    """Build a stacked bidirectional-LSTM acoustic model with a softmax head.

    Parameters
    ----------
    input_dim : int
        Size of the feature axis of the input sequence.
    batch_size : int or None
        Fixed batch size for the input layer; ``None`` leaves it dynamic.
    sample_rate : int
        Accepted but not used by this function.
    rnn_layers : int
        Number of bidirectional LSTM blocks; must be at least 1. Each block
        is followed by batch normalisation and dropout.
    units : int
        Hidden units per LSTM direction.
    drop_out : float
        Dropout rate applied after every block.
    act : str
        LSTM activation.
    output_dim : int
        Number of output classes per time step.

    Returns
    -------
    Model
        Keras model named "BidirectionalRNN".

    Raises
    ------
    ValueError
        If ``rnn_layers`` is smaller than 1.
    """
    if rnn_layers < 1:
        raise ValueError(f"rnn_layers must be >= 1, got {rnn_layers}")

    input_data = Input(name='the_input', shape=(
        None, input_dim), batch_size=batch_size)

    # One loop builds exactly `rnn_layers` blocks, so a request for a single
    # layer is honoured instead of silently producing two.
    x = input_data
    for _ in range(rnn_layers):
        x = Bidirectional(LSTM(units, activation=act,
                          return_sequences=True, implementation=2))(x)
        x = BatchNormalization()(x)
        x = Dropout(drop_out)(x)

    time_dense = TimeDistributed(Dense(output_dim))(x)

    y_pred = Activation('softmax', name='softmax')(time_dense)

    model = Model(inputs=input_data, outputs=y_pred, name="BidirectionalRNN")

    return model

In [23]:
def simple_rnn_model(input_dim, output_dim=224):
    """Build a single-GRU baseline that emits per-frame class probabilities.

    Parameters
    ----------
    input_dim : int
        Size of the feature axis of the input sequence.
    output_dim : int
        Number of GRU units, which is also the number of softmax classes.

    Returns
    -------
    Model
        Keras model named "simple_rnn_model" whose ``output_length``
        attribute is the identity function.
    """
    features = Input(name='the_input', shape=(None, input_dim))

    recurrent = GRU(output_dim, return_sequences=True,
                    implementation=2, name='rnn')
    hidden = recurrent(features)
    probabilities = Activation('softmax', name='softmax')(hidden)

    rnn_model = Model(inputs=features, outputs=probabilities, name="simple_rnn_model")
    # The GRU keeps the time axis unchanged, so lengths pass through as-is.
    rnn_model.output_length = lambda x: x
    return rnn_model

In [24]:
def ctc_lambda_func(args):
    """Compute the batched CTC cost inside a ``Lambda`` layer.

    ``args`` must unpack to ``(y_pred, labels, input_length, label_length)``;
    note that the prediction comes first here while the backend call expects
    the labels first.
    """
    y_pred, labels, input_length, label_length = args
    return K.ctc_batch_cost(
        y_true=labels,
        y_pred=y_pred,
        input_length=input_length,
        label_length=label_length,
    )

In [25]:
def input_lengths_lambda_func(args):
    """Convert waveform lengths (in samples) to STFT frame counts.

    Reads the notebook-level ``fft_size`` and ``frame_step`` settings, which
    are the frame length and hop the spectrogram layer is built with.

    The spectrogram is computed with ``pad_end=False``, so a signal of ``n``
    samples yields ``1 + (n - fft_size) // frame_step`` frames. The previous
    ``ceil(n / frame_step) - 1`` over-counts by one frame whenever ``n`` is
    not a multiple of the hop, which can make the CTC input length exceed the
    number of time steps the model actually produces.

    Parameters
    ----------
    args : tf.Tensor
        Waveform lengths in samples.

    Returns
    -------
    tf.Tensor
        float32 frame counts, clamped at 0 for signals shorter than one frame.
    """
    frame_length = fft_size
    hop_size = frame_step
    n_samples = tf.cast(args, dtype="float32")
    n_frames = tf.math.floor((n_samples - frame_length) / hop_size) + 1
    return tf.cast(tf.maximum(n_frames, 0.0), dtype="float32")

In [26]:
def add_ctc_loss(model_builder):
    """Wrap an acoustic model so that its single output is the CTC loss.

    Three extra inputs are added: ``the_labels``, ``input_length`` and
    ``label_length``. Waveform lengths are first converted by
    ``input_lengths_lambda_func``; when ``model_builder.output_length`` is
    truthy it is applied on top and 1 is subtracted from the result.

    Parameters
    ----------
    model_builder : Model
        Model exposing ``input``, ``output`` and an ``output_length`` attribute.

    Returns
    -------
    Model
        Model taking ``[audio, labels, input_length, label_length]`` and
        returning the output of the ``ctc`` layer.
    """
    the_labels = Input(name='the_labels', shape=(None,), dtype='float32')
    input_lengths = Input(name='input_length', shape=(1,), dtype='float32')
    label_lengths = Input(name='label_length', shape=(1,), dtype='float32')

    frame_lengths = Lambda(input_lengths_lambda_func)(input_lengths)

    length_fn = model_builder.output_length
    if length_fn:
        output_lengths = Lambda(length_fn)(frame_lengths) - 1
    else:
        output_lengths = frame_lengths

    # The CTC loss itself lives in a Lambda layer named 'ctc'.
    ctc_inputs = [model_builder.output, the_labels, output_lengths, label_lengths]
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')(ctc_inputs)

    return Model(
        inputs=[model_builder.input, the_labels, input_lengths, label_lengths],
        outputs=loss_out,
    )

In [27]:
def train(model_builder,
          data_len,
          data_gen,
          batch_size = 25,
          epochs=20,
          verbose=1,
          optimizer=None,
          ):
    """Attach the CTC loss to ``model_builder`` and fit it on ``data_gen``.

    Parameters
    ----------
    model_builder : Model
        Acoustic model to train; it is wrapped by ``add_ctc_loss``.
    data_len : int
        Accepted for backward compatibility; not used.
    data_gen : Sequence or generator
        Source of ``(inputs, outputs)`` batches.
    batch_size : int
        Accepted for backward compatibility; not used (the batch size is
        determined by ``data_gen``).
    epochs : int
        Number of training epochs.
    verbose : int
        Keras verbosity level.
    optimizer : optimizer or None
        Optimizer to compile with. ``None`` creates a fresh Nesterov SGD
        (lr 0.002, decay 1e-6, momentum 0.9, clipnorm 5) for this call.

    Returns
    -------
    History
        The object returned by ``Model.fit``.
    """
    if optimizer is None:
        # Create the optimizer per call: an instance used as a default
        # argument is built once and then shared by every model trained.
        optimizer = SGD(learning_rate=0.002, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)

    model = add_ctc_loss(model_builder)

    # The 'ctc' layer already outputs the loss, so the Keras loss just passes it through.
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)
    # summary() prints by itself and returns None; wrapping it in print() adds a stray "None".
    model.summary()

    # Model.fit accepts Sequence objects directly; fit_generator is deprecated.
    hist = model.fit(data_gen,
                     epochs=epochs,
                     verbose=verbose)

    return hist

In [29]:
# Load the transcript and audio lookups produced earlier in the project.
# NOTE(review): the .pkl suffix suggests helper.read_obj unpickles these files —
# confirm, and only load pickles that come from a trusted source.
translation_obj = helper.read_obj("../data/translation_dict.pkl")
audio_obj = helper.read_obj("../data/audio_dict.pkl")
# meta_data = data_loader.create_meta_data(translation_obj, audio_obj)

In [30]:
# Build index-aligned lists: position i of `audios` and `translations`
# belongs to the same key of `audio_obj`.
audios, translations = [], []
for label, entry in audio_obj.items():
    # Only the first element of each audio entry is kept
    # (presumably the waveform samples — confirm against the pickle writer).
    audios.append(entry[0])
    translations.append(translation_obj[label])

In [33]:
tokenizer = Tokenizer(translations)
int_to_char, char_to_int = tokenizer.build_dict()
sample = translations[0]
encoded = tokenizer.encode(sample, char_to_int)

# Decode by mapping every id back through the vocabulary, so the printout
# verifies a real encode -> decode round trip instead of echoing the input.
decoded = "".join(int_to_char.get(code, "") for code in encoded)
assert decoded == sample, "tokenizer round trip changed the sample"

print(f"sample snt: {sample}")
print(f"encoded snt: {encoded}")
print(f"decoed snt: {decoded}")

sample snt: የተለያዩ የ ትግራይ አውራጃ ተወላጆች ገንዘባቸው ን አዋጥ ተው የ ልማት ተቋማትን እንዲ መሰርቱ ትልማ አይ ፈቅድ ም
encoded snt: [7, 8, 11, 6, 131, 1, 7, 1, 3, 28, 27, 24, 1, 10, 4, 27, 115, 1, 8, 37, 29, 149, 18, 1, 21, 2, 65, 23, 26, 4, 1, 2, 1, 10, 41, 43, 1, 8, 4, 1, 7, 1, 12, 22, 3, 1, 8, 88, 22, 3, 2, 1, 13, 2, 49, 1, 15, 31, 14, 69, 1, 3, 12, 22, 1, 10, 24, 1, 61, 45, 32, 1, 16]
decoed snt: የተለያዩ የ ትግራይ አውራጃ ተወላጆች ገንዘባቸው ን አዋጥ ተው የ ልማት ተቋማትን እንዲ መሰርቱ ትልማ አይ ፈቅድ ም


In [35]:

# Feature-extraction settings handed to the preprocessing model.
sample_rate = 22000
fft_size = 1024    # STFT frame length in samples
frame_step = 512   # STFT hop in samples; also read as a global by input_lengths_lambda_func
n_mels = 128       # number of mel bins

# Training settings.
batch_size = 100
epochs = 20
data_len = len(translations)
# Vocabulary size + 2: tokenizer ids start at 1 and the generator pads labels
# with 0, which accounts for one extra class.
# NOTE(review): the second extra class is presumably the CTC blank — confirm.
output_dim = len(char_to_int) + 2


In [36]:
# Batch source for training.
dg = DataGenerator(
    translations=translations,
    audios=audios,
    batch_size=batch_size,
)

# Waveform -> normalised log-mel front end.
preprocess_model = preprocessin_model(
    sample_rate=sample_rate,
    fft_size=fft_size,
    frame_step=frame_step,
    n_mels=n_mels,
)
preprocess_model.summary()

Model: "preprocessin_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, None)]            0         
_________________________________________________________________
log_mel_spectrogram (LogMelS (None, None, 128, 1)      0         
_________________________________________________________________
batch_normalization (BatchNo (None, None, 128, 1)      4         
Total params: 4
Trainable params: 2
Non-trainable params: 2
_________________________________________________________________


In [37]:
# Baseline acoustic model: one GRU over the mel features.
# (BidirectionalRNN is the alternative architecture defined above.)
speech_model = simple_rnn_model(input_dim=n_mels, output_dim=output_dim)
speech_model.summary()

Model: "simple_rnn_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       [(None, None, 128)]       0         
_________________________________________________________________
rnn (GRU)                    (None, None, 223)         236157    
_________________________________________________________________
softmax (Activation)         (None, None, 223)         0         
Total params: 236,157
Trainable params: 236,157
Non-trainable params: 0
_________________________________________________________________


In [38]:
def build_model(output_dim, custom_model, preprocess_model, mfcc=False, calc=None):
    """Chain the preprocessing model and an acoustic model into one network.

    Parameters
    ----------
    output_dim : int
        Accepted but not used by this function.
    custom_model : Model
        Acoustic model applied to the extracted features.
    preprocess_model : Model
        Feature extractor; axis 3 of its output is squeezed away before the
        features are passed on.
    mfcc : bool
        Accepted but not used by this function.
    calc : callable or None
        Stored on the returned model as ``output_length``.

    Returns
    -------
    Model
        Keras model named "model_builder" mapping raw audio to predictions.
    """
    raw_audio = Input(name='the_input', shape=(None,))

    features = preprocess_model(raw_audio)
    # Drop the channel axis so the acoustic model receives (batch, time, freq).
    features = tf.squeeze(features, [3])

    predictions = custom_model(features)

    combined = Model(inputs=raw_audio, outputs=predictions, name="model_builder")
    combined.output_length = calc

    return combined

In [39]:
# Full pipeline: raw audio -> features -> per-frame character probabilities.
model = build_model(
    output_dim=output_dim,
    custom_model=speech_model,
    preprocess_model=preprocess_model,
)
model.summary()

Model: "model_builder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       [(None, None)]            0         
_________________________________________________________________
preprocessin_model (Function (None, None, 128, 1)      4         
_________________________________________________________________
tf.compat.v1.squeeze (TFOpLa (None, None, 128)         0         
_________________________________________________________________
simple_rnn_model (Functional (None, None, 223)         236157    
Total params: 236,161
Trainable params: 236,159
Non-trainable params: 2
_________________________________________________________________


In [40]:
# mlflow.set_experiment('Speech Model-RNN-baseline')
# mlflow.tensorflow.autolog()
# Use the hyperparameters defined in the configuration cell instead of
# repeating their values as literals, so the two cannot drift apart.
train(model, data_len, dg, epochs=epochs, batch_size=batch_size)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
the_input (InputLayer)          [(None, None)]       0                                            
__________________________________________________________________________________________________
preprocessin_model (Functional) (None, None, 128, 1) 4           the_input[0][0]                  
__________________________________________________________________________________________________
tf.compat.v1.squeeze (TFOpLambd (None, None, 128)    0           preprocessin_model[0][0]         
__________________________________________________________________________________________________
input_length (InputLayer)       [(None, 1)]          0                                            
______________________________________________________________________________________________



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
