In [16]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('../scripts')))
sys.path.append(os.path.abspath(os.path.join('./scripts')))

import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile #for audio processing
import os
import pickle
import pandas as pd
from collections import Counter

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import * 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras import backend as K
# import mlflow

In [5]:
!ln -s ./drive/MyDrive/SST/data/ ./
!ln -s ./drive/MyDrive/SST/scripts/* ./

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import helper


In [2]:
from data_generator import DataGenerator
from data_loader import DataLoader
from tokenizer import Tokenizer
from logspectrorgam import LogMelSpectrogram
from bidirectionalRNN import BidirectionalRNN
from simpleRNN import simple_rnn_model
from ctc_loss import add_ctc_loss

In [7]:
sample_rate = 8000

In [12]:
def preprocessin_model(sample_rate, fft_size, frame_step, n_mels, mfcc=False):

    input_data = Input(name='input', shape=(None,), dtype="float32")
    featLayer = LogMelSpectrogram(
        fft_size=fft_size,
        hop_size=frame_step,
        n_mels=n_mels,
        
        sample_rate=sample_rate,
        f_min=0.0,
        
        f_max=int(sample_rate / 2)
    )(input_data)
    
    x = BatchNormalization()(featLayer)
    model = Model(inputs=input_data, outputs=x, name="preprocessin_model")

    return model

In [13]:
def simple_rnn_model(input_dim, output_dim=224):

    input_data = Input(name='the_input', shape=(None, input_dim))
    simp_rnn = GRU(output_dim, return_sequences=True,
                   implementation=2, name='rnn')(input_data)
    y_pred = Activation('softmax', name='softmax')(simp_rnn)
    model = Model(inputs=input_data, outputs=y_pred, name="simple_rnn_model")
    model.output_length = lambda x: x
    return model

In [29]:
def BidirectionalRNN(input_dim, batch_size, sample_rate=22000,
                     rnn_layers=2, units=400, drop_out=0.5, act='tanh', output_dim=224):

    input_data = Input(name='the_input', shape=(
        None, input_dim))
    


    
    x = Bidirectional(LSTM(units,  activation=act,
                      return_sequences=True, implementation=2))(input_data)
    
    x = BatchNormalization()(x)
    x = Dropout(drop_out)(x)

    for i in range(rnn_layers - 2):
        x = Bidirectional(
            LSTM(units, activation=act, return_sequences=True))(x)
        x = BatchNormalization()(x)
        x = Dropout(drop_out)(x)

    x = Bidirectional(LSTM(units,  activation=act,
                      return_sequences=True, implementation=2))(x)
    x = BatchNormalization()(x)
    x = Dropout(drop_out)(x)

    time_dense = TimeDistributed(Dense(output_dim))(x)

    y_pred = Activation('softmax', name='softmax')(time_dense)

    model = Model(inputs=input_data, outputs=y_pred, name="BidirectionalRNN")

    return model

In [17]:
def train(model_builder, 
          data_len,
          data_gen,
          batch_size = 25,
          epochs=20, 
          verbose=1,
          optimizer=SGD(learning_rate=0.002, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5),
          ):    
              
    model = add_ctc_loss(model_builder)

    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)
    print(model.summary())


    hist = model.fit_generator(generator=data_gen,
                               epochs=epochs,
                               verbose=verbose, 
                               use_multiprocessing=False)
    return model

In [3]:
translation_obj = helper.read_obj("../data/translation_dict.pkl")
audio_obj = helper.read_obj("../data/audio_dict.pkl")

# translation_obj = helper.read_obj("./data/translation_dict.pkl")
# audio_obj = helper.read_obj("./data/audio_dict.pkl")
# meta_data = data_loader.create_meta_data(translation_obj, audio_obj)

In [4]:
audios = []
for label in audio_obj:
    audios.append(audio_obj[label][0])
    
translations = []
for label in audio_obj:
    translations.append(translation_obj[label])

In [6]:
def build_model(output_dim, custom_model, preprocess_model, mfcc=False, calc=None):

    input_audios = Input(name='the_input', shape=(None,))
    pre = preprocess_model(input_audios)
    pre = tf.squeeze(pre, [3])

    y_pred = custom_model(pre)
    model = Model(inputs=input_audios, outputs=y_pred, name="model_builder")
    model.output_length = calc

    return model

In [35]:
def predict(model, audio, tokenizer, int_to_char):
    
    pred_audios = tf.convert_to_tensor([audio])
    
    y_pred = model.predict(pred_audios)

    input_shape = tf.keras.backend.shape(y_pred)
    input_length = tf.ones(shape=input_shape[0]) * tf.keras.backend.cast(input_shape[1], 'float32')
    prediction = tf.keras.backend.ctc_decode(y_pred, input_length, greedy=False)[0][0]
        
    pred = K.eval(prediction).flatten().tolist()
    pred = [i for i in pred if i != -1]
    
    predicted_text = tokenizer.decode_text(pred, int_to_char)
   
    return predicted_text

In [9]:
tokenizer = Tokenizer(translations)
int_to_char, char_to_int = tokenizer.build_dict()
sample = translations[0]
encoded = tokenizer.encode(sample, char_to_int)
decoded = tokenizer.decode_text(encoded, int_to_char)

print(f"sample snt: {sample}")
print(f"encoded snt: {encoded}")
print(f"decoed snt: {decoded}")

sample snt: የተለያዩ የ ትግራይ አውራጃ ተወላጆች ገንዘባቸው ን አዋጥ ተው የ ልማት ተቋማትን እንዲ መሰርቱ ትልማ አይ ፈቅድ ም
encoded snt: [7, 8, 11, 6, 131, 1, 7, 1, 3, 28, 27, 24, 1, 10, 4, 27, 115, 1, 8, 37, 29, 149, 18, 1, 21, 2, 65, 23, 26, 4, 1, 2, 1, 10, 41, 43, 1, 8, 4, 1, 7, 1, 12, 22, 3, 1, 8, 88, 22, 3, 2, 1, 13, 2, 49, 1, 15, 31, 14, 69, 1, 3, 12, 22, 1, 10, 24, 1, 61, 45, 32, 1, 16]
decoed snt: የተለያዩ የ ትግራይ አውራጃ ተወላጆች ገንዘባቸው ን አዋጥ ተው የ ልማት ተቋማትን እንዲ መሰርቱ ትልማ አይ ፈቅድ ም


In [18]:

sample_rate = 22000
fft_size = 1024
frame_step = 512
n_mels = 128

batch_size = 100
epochs = 20
data_len = len(translations)
output_dim = len(char_to_int) + 2


In [29]:
len(char_to_int)

221

In [19]:
dg = DataGenerator(translations, audios, batch_size)
preprocess_model = preprocessin_model(sample_rate, fft_size, frame_step, n_mels)
preprocess_model.summary()

Model: "preprocessin_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, None)]            0         
_________________________________________________________________
log_mel_spectrogram (LogMelS (None, None, 128, 1)      0         
_________________________________________________________________
batch_normalization (BatchNo (None, None, 128, 1)      4         
Total params: 4
Trainable params: 2
Non-trainable params: 2
_________________________________________________________________


In [None]:
## Train using simple RNN model

In [20]:
speech_simple_rnn = simple_rnn_model(n_mels, output_dim)
speech_simple_rnn.summary()

Model: "simple_rnn_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       [(None, None, 128)]       0         
_________________________________________________________________
rnn (GRU)                    (None, None, 223)         236157    
_________________________________________________________________
softmax (Activation)         (None, None, 223)         0         
Total params: 236,157
Trainable params: 236,157
Non-trainable params: 0
_________________________________________________________________


In [22]:
model = build_model(output_dim, speech_simple_rnn, preprocess_model)
model.summary()

Model: "model_builder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       [(None, None)]            0         
_________________________________________________________________
preprocessin_model (Function (None, None, 128, 1)      4         
_________________________________________________________________
tf.compat.v1.squeeze_1 (TFOp (None, None, 128)         0         
_________________________________________________________________
simple_rnn_model (Functional (None, None, 223)         236157    
Total params: 236,161
Trainable params: 236,159
Non-trainable params: 2
_________________________________________________________________


In [23]:
# mlflow.set_experiment('Speech Model-RNN-baseline')
# mlflow.tensorflow.autolog()
train(model, 100, dg, epochs=20,  batch_size=100)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
the_input (InputLayer)          [(None, None)]       0                                            
__________________________________________________________________________________________________
preprocessin_model (Functional) (None, None, 128, 1) 4           the_input[0][0]                  
__________________________________________________________________________________________________
tf.compat.v1.squeeze_1 (TFOpLam (None, None, 128)    0           preprocessin_model[1][0]         
__________________________________________________________________________________________________
input_length (InputLayer)       [(None, 1)]          0                                            
______________________________________________________________________________________________



Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.engine.functional.Functional at 0x7f3c80515e10>

In [26]:
print("predicted", predict(audios[0], tokenizer, int_to_char))
print("actual", translations[0])


predicted የትየ ት ንየንየንትየትንትየንየ ንት ትየ የንትንየ ት ት ንትየት ን ን ንትየን የት
actual የተለያዩ የ ትግራይ አውራጃ ተወላጆች ገንዘባቸው ን አዋጥ ተው የ ልማት ተቋማትን እንዲ መሰርቱ ትልማ አይ ፈቅድ ም


### `Using Bi-rnn`

In [30]:
speech_BI_RNN_model = BidirectionalRNN(n_mels, batch_size=batch_size, output_dim=output_dim)
speech_BI_RNN_model.summary()

Model: "BidirectionalRNN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       [(None, None, 128)]       0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 800)         1692800   
_________________________________________________________________
batch_normalization_3 (Batch (None, None, 800)         3200      
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 800)         0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, None, 800)         3843200   
_________________________________________________________________
batch_normalization_4 (Batch (None, None, 800)         3200      
_________________________________________________________________
dropout_3 (Dropout)          (None, None, 800)    

In [31]:
model = build_model(output_dim, speech_BI_RNN_model, preprocess_model)
model.summary()

Model: "model_builder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       [(None, None)]            0         
_________________________________________________________________
preprocessin_model (Function (None, None, 128, 1)      4         
_________________________________________________________________
tf.compat.v1.squeeze_3 (TFOp (None, None, 128)         0         
_________________________________________________________________
BidirectionalRNN (Functional (None, None, 223)         5721023   
Total params: 5,721,027
Trainable params: 5,717,825
Non-trainable params: 3,202
_________________________________________________________________


In [32]:
train(model, 100, dg, epochs=20,  batch_size=100)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
the_input (InputLayer)          [(None, None)]       0                                            
__________________________________________________________________________________________________
preprocessin_model (Functional) (None, None, 128, 1) 4           the_input[0][0]                  
__________________________________________________________________________________________________
tf.compat.v1.squeeze_3 (TFOpLam (None, None, 128)    0           preprocessin_model[3][0]         
__________________________________________________________________________________________________
input_length (InputLayer)       [(None, 1)]          0                                            
____________________________________________________________________________________________



Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.engine.functional.Functional at 0x7f3c6e647a90>

In [34]:
model.summary()

Model: "model_builder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       [(None, None)]            0         
_________________________________________________________________
preprocessin_model (Function (None, None, 128, 1)      4         
_________________________________________________________________
tf.compat.v1.squeeze_3 (TFOp (None, None, 128)         0         
_________________________________________________________________
BidirectionalRNN (Functional (None, None, 223)         5721023   
Total params: 5,721,027
Trainable params: 5,717,825
Non-trainable params: 3,202
_________________________________________________________________


In [39]:
print("predicted", predict(model, audios[10], tokenizer, int_to_char))
print("actual", translations[10])

predicted  ስ ሻ ሻና ሻ ረሻ ሻ ሻረሻ ሻ ሻ ሻ ሻር ስትስትስት ስት 
actual ያ ኮምፒ ተር ለ ተጠቃሚው በ ትክክል የሚ ፈለገው ን ነገር እንዲ ያሟላ ማድረግ ነው


In [41]:
preprocess_model.summary()

Model: "preprocessin_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, None)]            0         
_________________________________________________________________
log_mel_spectrogram (LogMelS (None, None, 128, 1)      0         
_________________________________________________________________
batch_normalization (BatchNo (None, None, 128, 1)      4         
Total params: 4
Trainable params: 2
Non-trainable params: 2
_________________________________________________________________


In [None]:
### Using CNN and BI-RNN

In [40]:
from tensorflow.keras.layers import * 

def CNN_net(n_mels,feat_depth, batch_size):
    input_data = Input(name='the_input', shape=(
          None, n_mels, 1))
    y = Conv2D(128, (3, 3), padding='same')(input_data)  # was 32
    y = Activation('relu')(y)
    y = BatchNormalization()(y)
    y = MaxPooling2D((1, 2))(y)

    y = Conv2D(64, (3, 3), padding='same')(y)  # was 32
    y = Activation('relu')(y)
    y = BatchNormalization()(y)
    y = MaxPooling2D((1, 2))(y)

    y = Conv2D(64, (3, 3), padding='same')(y)  # was 32
    y = Activation('relu')(y)
    y = BatchNormalization()(y)
    y = MaxPooling2D((1, 2))(y)

    y = Reshape((-1, y.shape[-1] * y.shape[-2]))(y)




    model = Model(inputs=input_data, outputs=y, name="cnn")
    return model, model.output.shape

In [42]:
cnn_model, cnn_shape = CNN_net(128,227, 100)
cnn_model.summary(), cnn_shape

Model: "cnn"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       [(None, None, 128, 1)]    0         
_________________________________________________________________
conv2d (Conv2D)              (None, None, 128, 128)    1280      
_________________________________________________________________
activation (Activation)      (None, None, 128, 128)    0         
_________________________________________________________________
batch_normalization_5 (Batch (None, None, 128, 128)    512       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, None, 64, 128)     0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, None, 64, 64)      73792     
_________________________________________________________________
activation_1 (Activation)    (None, None, 64, 64)      0       

(None, TensorShape([None, None, 1024]))

In [59]:
def BidirectionalRNN2(input_dim, batch_size, sample_rate=22000,
                     rnn_layers=2, units=400, drop_out=0.5, act='tanh', output_dim=224):

    input_data = Input(name='the_input', shape=(
        None, input_dim))
    


    
    x = Bidirectional(LSTM(units,  activation=act,
                      return_sequences=True, implementation=2))(input_data)
    
    x = BatchNormalization()(x)
    x = Dropout(drop_out)(x)

    for i in range(rnn_layers - 2):
        x = Bidirectional(
            LSTM(units, activation=act, return_sequences=True))(x)
        x = BatchNormalization()(x)
        x = Dropout(drop_out)(x)

    x = Bidirectional(LSTM(units,  activation=act,
                      return_sequences=True, implementation=2))(x)
    x = BatchNormalization()(x)
    x = Dropout(drop_out)(x)

    time_dense = TimeDistributed(Dense(output_dim))(x)

    y_pred = Activation('softmax', name='softmax')(time_dense)

    model = Model(inputs=input_data, outputs=y_pred, name="BidirectionalRNN")

    return model

In [60]:
speech_model2 = BidirectionalRNN2(1024, batch_size=batch_size, output_dim=output_dim)
speech_model2.summary()

Model: "BidirectionalRNN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       [(None, None, 1024)]      0         
_________________________________________________________________
bidirectional_10 (Bidirectio (None, None, 800)         4560000   
_________________________________________________________________
batch_normalization_14 (Batc (None, None, 800)         3200      
_________________________________________________________________
dropout_10 (Dropout)         (None, None, 800)         0         
_________________________________________________________________
bidirectional_11 (Bidirectio (None, None, 800)         3843200   
_________________________________________________________________
batch_normalization_15 (Batc (None, None, 800)         3200      
_________________________________________________________________
dropout_11 (Dropout)         (None, None, 800)    

In [61]:
def build_model2(output_dim, cnn_model, custom_model, preprocess_model, mfcc=False, calc=None):

    input_audios = Input(name='the_input', shape=(None,))
    pre = preprocess_model(input_audios)
    pre = tf.squeeze(pre, [3])

    cnn_output = cnn_model(pre)

    y_pred = custom_model(cnn_output)
    model = Model(inputs=input_audios, outputs=y_pred, name="model_builder")
    model.output_length = calc

    return model

In [62]:
model3 = build_model2(output_dim, cnn_model, speech_model2, preprocess_model)
model3.summary()

Model: "model_builder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       [(None, None)]            0         
_________________________________________________________________
preprocessin_model (Function (None, None, 128, 1)      4         
_________________________________________________________________
tf.compat.v1.squeeze_5 (TFOp (None, None, 128)         0         
_________________________________________________________________
cnn (Functional)             (None, None, 1024)        113024    
_________________________________________________________________
BidirectionalRNN (Functional (None, None, 223)         8588223   
Total params: 8,701,251
Trainable params: 8,697,537
Non-trainable params: 3,714
_________________________________________________________________


In [65]:
train(model3, 100, dg, epochs=40,  batch_size=100)

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
the_input (InputLayer)          [(None, None)]       0                                            
__________________________________________________________________________________________________
preprocessin_model (Functional) (None, None, 128, 1) 4           the_input[0][0]                  
__________________________________________________________________________________________________
tf.compat.v1.squeeze_5 (TFOpLam (None, None, 128)    0           preprocessin_model[5][0]         
__________________________________________________________________________________________________
cnn (Functional)                (None, None, 1024)   113024      tf.compat.v1.squeeze_5[0][0]     
____________________________________________________________________________________________



Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.engine.functional.Functional at 0x7f3c7df748d0>

In [76]:
import random
for i in range(20):

  ind = random.randint(0, 3000)

  print("predicted", predict(model3, audios[ind], tokenizer, int_to_char))
  print("actual", translations[ind])
  print()

predicted በአዚህ ና ን ዋ ዋ ያ ተ ና ያናያ ያ ተ መ ያ ያ ያና ና ና ና ና ና ናን ተና ና ንና ህ ናት ህ ህስ ስ ህስህትስት ስምን ን ማ ለን ህናህት ትገ ዮች 
actual ከ ግዛታቸው ዋ ና ከተማ ጋሪ ስ ሆነው በ ስልክ ሚስተር ሞሪስ ከ ስደተኞቹ ከ ኬንያ የ መጡ ናቸው ብላ ኢትዮጵያ መግለጿ እንዳስ ገረማቸው ገልጸዋል

predicted ዚህማ ካሰራስራ ማ ማህማ ስ ስ ስማስ ስያ ሰራ ራ ራ ራዋ ሲራ ራ ሰራ ራስራ አራ ማ ስራስራ ኤ ስያጭ በራ ስሪስራ ስ ስ ስራ ራዋ በ በ ስራዋ ግስ ማ በሳማአራ ራአ ኲኰህኰጃህጃህ
actual ከ እረፍት መልስ ከ ስድስት ደቂቃዎች በኋላ የ ባንኩ አስር ቁጥር ዋቅጅራ አን በሴ ሶስተኛው ን ግብ ማስቆጠሩ ቡና ዎችን ተስፋ አስቆርጧ ቸዋል

predicted ጥህ ተናእንት ለ ጭንባ ወ አቃው 
actual የ ሰርጉ ወጥ ኩ ችም ተደርጐ ነው የ ተሰራው

predicted ሶጃጥተቃጥ ናቶ ዋ ን የ ለቃች ተች ና በ ናዳያ ዳነ ነዳቁዳ ንጭን በ ቀጃች ቀች ሽን ቀት ን ቀጥችት ጃ ቃቀች ች ጃ ዳጃ ያ የነውት ን ነ ን ዳ ጭ ተ ትዳች ተዊ ቀሎፍጹፍንም
actual አለቃ የጻፏቸው መጽሀፍት ውድ ና ጣፋጭ ከ መሆናቸው የተነሳ በ ህትመታቸው ወቅት ገዝተው ከሚ ጠቀሙ ት ብልህ ዎች በስተቀር ዘግይተው የ ሚፈልጓቸው ሰዎች አ ያገኟቸው ም

predicted ይህ ጃት ና መ የ ት ጥ ረ መለ ስና ለ ን ን ች ናስት ቀናት ናለ አ አ ረ ያው ንጠአጠ ጃቻቀ
actual የኤርትራ መንግስት ከ ደረሰበት የ ኢኮኖሚ ቀውስ ለ ማገገም የ ቦንድ ሽያጭ እንቅስቃሴ የ መገናኛ ብዙሀን አደባባይ እያ ወጡ ናቸው

predicted አህምት ና ን ሚት ሰ ራየ ዳና ጠንሚያናን ናዳ ች ና ነረና ና ሀላን የትየን ድር ትን ያትዳ ቀት ት ት ት ወ ጃቀን ንቀ ለጡ ች
actual

In [78]:
model3.save("../models/cnn_rnn_model.h5")

