In [1]:
%load_ext tensorboard
import warnings
warnings.filterwarnings("ignore")
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense,RNN,Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau
import numpy as np
import datetime

import pickle
import shutil
from IPython.display import Image
import io
from nltk.translate.gleu_score import sentence_gleu
from nltk.translate.bleu_score import sentence_bleu

In [2]:
#os.chdir('')
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Deep Text Corrector')

Mounted at /content/drive


In [3]:
os.getcwd()

'/content/drive/MyDrive/Deep Text Corrector'

In [4]:
f = open('./data/perturbated_text_embed_matrix.pkl','rb')
perturbated_text_embed_matrix = pickle.load(f)
f.close()

f = open('./data/text_embed_matrix.pkl','rb')
text_embed_matrix = pickle.load(f)
f.close()

f = open('./data/perturbated_text_train.pkl','rb')
perturbated_text_train = pickle.load(f)
f.close()

f = open('./data/text_inp_train.pkl','rb')
text_inp_train = pickle.load(f)
f.close()

f = open('./data/text_out_train.pkl','rb')
text_out_train = pickle.load(f)
f.close()

f = open('./data/perturbated_text_tokernizer_index.pkl','rb')
perturbated_text_tokernizer_index = pickle.load(f)
f.close()

f = open('./data/text_inp_tokernizer_word_index.pkl','rb')
text_inp_tokernizer_word_index = pickle.load(f)
f.close()

f = open('./data/text_inp_tokernizer.pkl','rb')
text_inp_tokernizer = pickle.load(f)
f.close()

f = open('./data/train_data.pkl','rb')
train_data = pickle.load(f)
f.close()


In [5]:
print(len(perturbated_text_train))
print(len(text_inp_train))
print(len(text_out_train))

print(perturbated_text_train.shape)
print(text_inp_train.shape)
print(text_out_train.shape)

print(len(text_embed_matrix))
print(len(perturbated_text_embed_matrix))

163668
163668
163668
(163668, 20)
(163668, 20)
(163668, 20)
34727
34726


In [6]:
#https://edumunozsala.github.io/BlogEms/fastpages/jupyter/encoder-decoder/lstm/attention/tensorflow%202/2020/10/07/
#Intro-seq2seq-Encoder-Decoder-ENG-SPA-translator-tf2.html

#############################    Encoder    #######################################

class Encoder(tf.keras.layers.Layer):
    def __init__(self, input_vocab_size, embedding_dim, inp_len, lstm_units):
        super().__init__()
        self.input_vocab_size = input_vocab_size
        self.embedding_dim = embedding_dim
        self.inp_len = inp_len
        self.lstm_units= lstm_units
        #self.lstm_output = 0
        self.lstm_hidden_state=0
        self.lstm_cell_state=0

        
    def build(self, input_shape):
        self.embedding = Embedding(input_dim=self.input_vocab_size, output_dim=self.embedding_dim, input_length=self.inp_len,
                           mask_zero=True, name="embedding_layer_encoder", input_shape=(self.input_vocab_size,))
        self.lstm = LSTM(self.lstm_units, return_state=True, return_sequences=True, name="Encoder")

        
    def call(self, input_sentances, training=True):
        input_embedd = self.embedding(input_sentances)
        self.lstm_output, self.lstm_hidden_state,self.lstm_cell_state = self.lstm(input_embedd)
        return self.lstm_output, self.lstm_hidden_state,self.lstm_cell_state
    
    
    def get_states(self):
        return self.lstm_hidden_state,self.lstm_cell_state

#############################    Decoder    #######################################

class Decoder(tf.keras.layers.Layer):
    def __init__(self, input_vocab_size, embedding_dim, inp_len, lstm_units):
        super().__init__()
        self.input_vocab_size = input_vocab_size
        self.embedding_dim = embedding_dim
        self.lstm_units = lstm_units
        self.inp_len = inp_len

        
    def build(self, input_shape):
   
        self.embedding = Embedding(input_dim=self.input_vocab_size, output_dim=self.embedding_dim, input_length=self.inp_len,
                                   mask_zero=True, name="embedding_layer_decoder", weights=[text_embed_matrix],
                                   input_shape=(self.input_vocab_size,))
        self.lstm = LSTM(self.lstm_units, return_sequences=True, return_state=True, name="Decoder")

        
    def call(self, target_sentances, state_h, state_c):
        target_embedd = self.embedding(target_sentances)
        lstm_output, _,_ = self.lstm(target_embedd, initial_state=[state_h, state_c])
        return lstm_output

#############################    EncoderDecoder    #######################################
    
class EncoderDecoder(Model):
    def __init__(self, enc_inp_len, dec_inp_length, output_vocab_size, embedding_dim, lstm_units):
        super().__init__() 
        self.embedding_dim = embedding_dim
        self.lstm_units = lstm_units
        self.encoder = Encoder(input_vocab_size=len(perturbated_text_embed_matrix) , embedding_dim=embedding_dim, 
                               inp_len=enc_inp_len, lstm_units=lstm_units)
        self.decoder = Decoder(input_vocab_size=len(text_embed_matrix) , embedding_dim=embedding_dim, 
                               inp_len=dec_inp_length, lstm_units=lstm_units)
        self.dense   = Dense(output_vocab_size, activation='softmax')


    def call(self, data):
        input_ ,output = data[0], data[1]
        encoder_output, encoder_h, encoder_c = self.encoder(input_)
        decoder_output = self.decoder(output, encoder_h, encoder_c)
        final_output = self.dense(decoder_output)
        return final_output

In [7]:
#https://edumunozsala.github.io/BlogEms/fastpages/jupyter/encoder-decoder/lstm/attention/tensorflow%202/2020/10/07/Intro-seq2seq-Encoder-Decoder-ENG-SPA-translator-tf2.ht
loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_func(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_value = loss_obj(real, pred)

    mask = tf.cast(mask, dtype=loss_value.dtype)
    loss_value *= mask

    return tf.reduce_mean(loss_value)


In [8]:
#from tensorflow.python.ops.clip_ops import clip_by_norm
inp_vocab_size = len(perturbated_text_embed_matrix) 
out_vocab_size = len(text_embed_matrix) 
embedding_dim=300
inp_length=20
lstm_size=64
batch_size=256

optimizer = tf.keras.optimizers.Adam(clipnorm=1.0)
log_directory = os.getcwd()+"/logs/seq2seq/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [9]:
checkpoint_model = ModelCheckpoint("seq2seq_model_checkpoint.h5", monitor='loss', save_best_only=True, save_weights_only=True, verbose=0, mode='min')
earlystopping_model = EarlyStopping(monitor='loss', patience=5, verbose=1)
tensorboard_model = TensorBoard(log_dir=log_directory)
callbacks_model = [checkpoint_model, tensorboard_model, earlystopping_model]
#callbacks_model=[tensorboard_model]

In [10]:
model = EncoderDecoder(enc_inp_len=inp_length,dec_inp_length=inp_length,output_vocab_size=len(text_embed_matrix), 
                       embedding_dim=embedding_dim, lstm_units=lstm_size )
model.compile(optimizer=optimizer,loss=loss_func, metrics=['accuracy'])
model.train_on_batch([perturbated_text_train[:batch_size],text_inp_train[:batch_size]],text_out_train[:batch_size])
model.save_weights('seq2seq_model_weights', save_format='tf')
#model.load_weights('seq2seq_model_weights')

In [11]:
model.summary()

Model: "encoder_decoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder (Encoder)           multiple                  10511240  
                                                                 
 decoder (Decoder)           multiple                  10511540  
                                                                 
 dense (Dense)               multiple                  2257255   
                                                                 
Total params: 23,280,035
Trainable params: 23,280,035
Non-trainable params: 0
_________________________________________________________________


In [12]:
model.fit(x=[perturbated_text_train,text_inp_train],y=text_out_train, epochs=10,batch_size=batch_size, callbacks=[callbacks_model])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4f77ef7040>

In [19]:
model.fit(x=[perturbated_text_train,text_inp_train],y=text_out_train, epochs=20,batch_size=batch_size, callbacks=[callbacks_model])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f4f77ef7ca0>

In [29]:
model.fit(x=[perturbated_text_train,text_inp_train],y=text_out_train, epochs=20,batch_size=batch_size, callbacks=[callbacks_model])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f4fa5edf790>

In [13]:
#tensorboard --logdir logs

In [30]:
def EncoderOutput(encoder_input):
    enc_input=list()
    for word in encoder_input.split():
        if perturbated_text_tokernizer_index.get(word) != None:
            enc_input.append(perturbated_text_tokernizer_index.get(word))
        else:
            enc_input.append(0)
            
    enc_output, enc_hidden_state, enc_cell_state = model.layers[0](np.array([enc_input], dtype='int32'))
    
    return enc_output, enc_hidden_state, enc_cell_state

def PredictOutput(encoder_input, decoder_input):

    dec_input=list()
    for word in decoder_input.split():
        if text_inp_tokernizer_word_index.get(word) != None:
            dec_input.append(text_inp_tokernizer_word_index.get(word))
        else:
            dec_input.append(0)
    
    enc_output, enc_hidden_state, enc_cell_state=EncoderOutput(encoder_input)
    
    pred = model.layers[2](model.layers[1](np.array([dec_input], dtype='int32'),
                                                                  enc_hidden_state, enc_cell_state))
    transalated_output=""
    for word in pred[0]:
        word = text_inp_tokernizer.index_word[tf.argmax(word).numpy()]
        transalated_output += word + " "
    
    return transalated_output

def InferResults(data):
    output = []

    for enc_inp,dec_inp, dec_out in data.values:
        pred = PredictOutput(enc_inp,dec_inp)
        output.append(pred)

    data['correct_output'] = data['dec_out']
    data['predicted_output'] = output

    data = data.drop(['dec_inp', 'dec_out'], axis=1)
    
    return data

In [53]:
sample_data_train=train_data.sample(100)
results=InferResults(sample_data_train)
results.tail(10)

Unnamed: 0,enc_inp,correct_output,predicted_output
106784,was dying and i cannot deal with it,he is dying and i cannot deal with it eos,is is and and i cannot deal with it eos
21949,we are still venting trace gasses gimme twenty...,we are still venting trace gasses gimme twenty...,we are still venting trace gasses gimme twenty...
82855,what about luca sollozzo did not seem worried ...,what about luca sollozzo did not seem worried ...,what about luca sollozzo did not seem worried ...
1378,no i never did either,no i never did either eos,no i never did either eos
119530,i am thinking of your man,i am thinking of your man eos,i am thinking of your man eos
86473,yes there i would like to go up,yes there is i would like to go up eos,yes i i to would like to go up eos
70201,get hot mug of chocolate first thing i gonna do,get a hot mug of chocolate first thing i am go...,get a hot couple of working first time i am go...
215591,no i saw it on his face,no i saw it on his face eos,no i saw it on his face eos
30683,i want to be alone,i want to be alone eos,i want to be alone eos
104140,whatever you need whatever jamie needs i am he...,whatever you need whatever jamie needs i am he...,whatever you need whatever i gets i was i they...


In [54]:
results.values[2]

array(['oh come on', 'oh come on eos', 'oh come on eos '], dtype=object)

In [55]:
train_output_bleu_score = []
for encoder_input_data, correct_output , predicted_output in results.values:
    correct_output = correct_output.split()
    predicted_output = predicted_output.rstrip().split()
    if len(correct_output) == len(predicted_output):
        train_output_bleu_score.append(sentence_bleu([correct_output],predicted_output))
print("BLEU Score of train dataset is",sum(train_output_bleu_score)/len(train_output_bleu_score))

BLEU Score of train dataset is 0.7996630746957749
