In [75]:
%load_ext tensorboard
import warnings
warnings.filterwarnings("ignore")
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense,RNN,Flatten
from tensorflow.keras import layers

from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau
from tensorflow.keras import initializers, regularizers, constraints
import numpy as np
import datetime

import pickle
import shutil
from IPython.display import Image
import io
from nltk.translate.gleu_score import sentence_gleu
from nltk.translate.bleu_score import sentence_bleu

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [4]:
#from google.colab import drive
#drive.mount('/content/drive')
#os.chdir('/content/drive/MyDrive/Deep Text Corrector')

In [5]:
os.getcwd()

'C:\\Users\\Bazinga\\AAIC\\Deep Text Corrector'

In [6]:
f = open('./data/perturbated_text_embed_matrix.pkl','rb')
perturbated_text_embed_matrix = pickle.load(f)
f.close()

f = open('./data/text_embed_matrix.pkl','rb')
text_embed_matrix = pickle.load(f)
f.close()

f = open('./data/perturbated_text_train.pkl','rb')
perturbated_text_train = pickle.load(f)
f.close()

f = open('./data/text_inp_train.pkl','rb')
text_inp_train = pickle.load(f)
f.close()

f = open('./data/text_out_train.pkl','rb')
text_out_train = pickle.load(f)
f.close()

f = open('./data/perturbated_text_tokernizer_index.pkl','rb')
perturbated_text_tokernizer_index = pickle.load(f)
f.close()

f = open('./data/text_inp_tokernizer_word_index.pkl','rb')
text_inp_tokernizer_word_index = pickle.load(f)
f.close()

f = open('./data/text_inp_tokernizer.pkl','rb')
text_inp_tokernizer = pickle.load(f)
f.close()

f = open('./data/train_data.pkl','rb')
train_data = pickle.load(f)
f.close()

f = open('./data/test_data.pkl','rb')
test_data = pickle.load(f)
f.close()

f = open('./data/validation_data.pkl','rb')
validation_data = pickle.load(f)
f.close()


In [7]:
print(len(perturbated_text_train))
print(len(text_inp_train))
print(len(text_out_train))

print(perturbated_text_train.shape)
print(text_inp_train.shape)
print(text_out_train.shape)

print(len(text_embed_matrix))
print(len(perturbated_text_embed_matrix))

163668
163668
163668
(163668, 20)
(163668, 20)
(163668, 20)
34594
34593


In [161]:
class Encoder(tf.keras.layers.Layer):
    '''
    Encoder model -- That takes a input sequence and returns encoder-outputs,encoder_final_state_h,encoder_final_state_c
    '''

    def __init__(self,inp_vocab_size,embedding_size,lstm_size,input_length):
        
        super().__init__()
        self.lstm_size = lstm_size
        self.embedding = Embedding(input_dim=inp_vocab_size, output_dim=300, input_length=input_length,
                           mask_zero=True,name="embedding_layer_encoder", weights=[perturbated_text_embed_matrix], trainable=False)
        self.lstmcell = tf.keras.layers.LSTMCell(lstm_size)
        self.encoder_lstm = RNN(self.lstmcell,return_sequences=True, return_state=True)


    def call(self,input_sequence,states):

        output1 = self.embedding(input_sequence)
        mask = self.embedding.compute_mask(input_sequence)
        enco_output, enco_state_h, enco_state_c = self.encoder_lstm(output1, initial_state=states,mask=mask)
        return enco_output, enco_state_h, enco_state_c

    
    def initialize_states(self,batch_size):

        initial_hidden_state = tf.zeros([batch_size,self.lstm_size])
        initial_cell_state = tf.zeros([batch_size,self.lstm_size])
        
        return [initial_hidden_state,initial_cell_state]
    
############################## Decoder class #############################################################
    
# code reference for concat scoing scoring function from https://www.tensorflow.org/tutorials/text/nmt_with_attention
from tensorflow.keras.layers import Input, Softmax, RNN, Dense, Embedding, LSTM
class Attention(tf.keras.layers.Layer):
  '''
    Class the calculates score based on the scoring_function using Bahdanu attention mechanism.
  '''
  def __init__(self,scoring_function,att_units):


    # Please go through the reference notebook and research paper to complete the scoring functions
    super().__init__()
    self.scoring_function = scoring_function
    
    if self.scoring_function=='dot':
      # Intialize variables needed for Dot score function here
        #self.similarity = []
        self.softmax = Softmax(axis=1)
        #self.similarity = [j for j in range(att_units)]
        pass
  
  def call(self,decoder_hidden_state,encoder_output):
    
    if self.scoring_function == 'dot':
        # Implement Dot score function here
        #print(decoder_hidden_state.shape,encoder_output.shape)
        attention_weight = tf.matmul(encoder_output,tf.expand_dims(decoder_hidden_state,axis=2))
        #print(attention_weight.shape)
        context = tf.matmul(tf.transpose(encoder_output, perm=[0,2,1]),attention_weight)
        context = tf.squeeze(context,axis=2)
        output = self.softmax(attention_weight)
        return context,output
    
class One_Step_Decoder(tf.keras.Model):
    def __init__(self,tar_vocab_size, embedding_dim, input_length, dec_units ,score_fun ,att_units):

      # Initialize decoder embedding layer, LSTM and any other objects needed
        super().__init__()
        self.tar_vocab_size = tar_vocab_size
        self.lstm_size = dec_units
        self.att_units = att_units
        self.score_fun = score_fun
        #print("output vocan size ",tar_vocab_size)
        self.embedding = Embedding(input_dim=tar_vocab_size, output_dim=300, input_length=input_length,
                           mask_zero=True,name="embedding_layer_encoder",weights=[text_embed_matrix], trainable=False)
        self.lstmcell = tf.keras.layers.LSTMCell(dec_units)
        self.decoder_lstm = RNN(self.lstmcell,return_sequences=True, return_state=True)
        self.dense   = Dense(tar_vocab_size)
        #self.decoder_lstm = LSTM(lstm_size, return_state=True, return_sequences=True, name="decoder_LSTM")
        self.attention=Attention(self.score_fun,self.att_units)


    def call(self,input_to_decoder, encoder_output, state_h,state_c):

        output2 = self.embedding(input_to_decoder)
        mask = self.embedding.compute_mask(input_to_decoder)
        #print("one step decoder SHAPE after embedding:",output2.shape)
        output2 = tf.squeeze(output2,axis=1)
        #print("one step decoder SHAPE after embedding and sqeezing:",output2.shape)

        # step b
    #         attention=Attention(self.score_fun,self.att_units)
        context_vector,attention_weights=self.attention(state_h,encoder_output)
        # step c
        output3 = tf.concat([context_vector,output2],1)
        #print("shape after concating ",output3.shape)
        output3 = tf.expand_dims(output3,1)
        deco_output, deco_state_h, deco_state_c = self.decoder_lstm(output3,initial_state=[state_h,state_c],mask=mask)
        # step e
        output4 = self.dense(deco_output)
        output4 = tf.squeeze(output4,axis=1)
        #print("shape afyer dense layer and softmax ",output4.shape)
        return output4,deco_state_h, deco_state_c,attention_weights,context_vector
    
class Decoder(tf.keras.Model):
    def __init__(self,out_vocab_size, embedding_dim, input_length, dec_units ,score_fun ,att_units):
      #Intialize necessary variables and create an object from the class onestepdecoder
        super().__init__()
        self.out_vocab_size = out_vocab_size
        self.embedding_dim = embedding_dim
        self.dec_units = dec_units
        self.att_units = att_units
        self.input_length = input_length
        self.score_fun = score_fun
        self.onestepdecoder = One_Step_Decoder(self.out_vocab_size,self.embedding_dim,self.input_length,self.dec_units,self.score_fun,self.att_units)
        
    @tf.function    
    def call(self, input_to_decoder,encoder_output,decoder_hidden_state,decoder_cell_state):


        all_outputs = tf.TensorArray(tf.float32,size=input_to_decoder.shape[1])
        for timestep in range(input_to_decoder.shape[1]):
            output,decoder_hidden_state,decoder_cell_state,attention_weights,context_vector=self.onestepdecoder(input_to_decoder[:,timestep:timestep+1],encoder_output,decoder_hidden_state,decoder_cell_state)
            all_outputs = all_outputs.write(timestep,output)
        # Return the tensor array
        all_outputs = tf.transpose(all_outputs.stack(),[1,0,2])
        #print("all outpt shape is ",all_outputs.shape)
        return all_outputs
    
class EncoderDecoder(tf.keras.Model):
    def __init__(self,inp_vocab_size,out_vocab_size,embedding_size,lstm_size,input_length,batch_size,score_fun,att_units,*args):
        #Intialize objects from encoder decoder
        super().__init__() # https://stackoverflow.com/a/27134600/4084039
        #print("input vocab size in encoder decoder class",inp_vocab_size)
        self.encoder = Encoder(inp_vocab_size,embedding_size,lstm_size,input_length)
        #print("output vocab size in encoder decoder class",out_vocab_size)
        self.decoder = Decoder(out_vocab_size,embedding_size,input_length,lstm_size,score_fun,att_units)
        self.dense   = Dense(out_vocab_size, activation='softmax')
        self.flatten = Flatten()
        self.batch = batch_size

    def call(self,data):
        input,output = data[0], data[1]
        #Intialize encoder states, Pass the encoder_sequence to the embedding layer
        l = self.encoder.initialize_states(self.batch)
        #print("WE ARE INITIALIZING encoder WITH initial STATES as zeroes :",l[0].shape, l[1].shape)
        encoder_output,encoder_final_state_h,encoder_final_state_c = self.encoder(input,l)
        decoder_output = self.decoder(output,encoder_output,encoder_final_state_h,encoder_final_state_c)
        decoder_output  = self.dense(decoder_output, activation='softmax')
        decoder_output=self.flatten()

        return decoder_output

In [151]:
loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_func(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_value = loss_obj(real, pred)

    mask = tf.cast(mask, dtype=loss_value.dtype)
    loss_value *= mask

    return tf.reduce_mean(loss_value)

In [152]:
inp_vocab_size = len(perturbated_text_embed_matrix) 
out_vocab_size = len(text_embed_matrix) 
embedding_dim=300
inp_length=20
lstm_size=64
batch_size=256
score_fun = "dot"
att_units = 256

In [153]:
optimizer = tf.keras.optimizers.Adam(clipnorm=1.0)
log_directory = os.getcwd() + '/attention_model/logs/fit/' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [158]:
model = EncoderDecoder(inp_vocab_size,out_vocab_size,embedding_dim,lstm_size,input_length,batch_size,score_fun,att_units)
model.compile(optimizer=optimizer,loss=loss_func, metrics=['accuracy'])
model.train_on_batch([perturbated_text_train[:batch_size],text_inp_train[:batch_size]],text_out_train[:batch_size])
model.save_weights('attention_model_weights', save_format='tf')

In [160]:
checkpoint_model = ModelCheckpoint("seq2seq_model_checkpoint.h5", monitor='loss', save_best_only=True, save_weights_only=True, verbose=0, mode='min')
earlystopping_model = EarlyStopping(monitor='loss', patience=5, verbose=1)
tensorboard = TensorBoard(log_dir=log_directory, histogram_freq=1)
callbacks_model = [checkpoint_model, tensorboard, earlystopping_model]

In [156]:
model.summary()

Model: "encoder_decoder_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_18 (Encoder)         multiple                  10471340  
_________________________________________________________________
decoder_14 (Decoder)         multiple                  12736634  
Total params: 23,207,974
Trainable params: 2,451,874
Non-trainable params: 20,756,100
_________________________________________________________________


In [157]:
model.fit(x=[perturbated_text_train,text_inp_train],y=text_out_train, epochs=60,batch_size=batch_size, callbacks=[callbacks_model])

Epoch 1/60
 15/640 [..............................] - ETA: 28:02 - loss: 4.6563 - accuracy: 0.0492

KeyboardInterrupt: 

In [None]:
#%tensorboard --log_dir logs/fit

In [None]:
def EncoderOutput(encoder_input):
    enc_input=list()
    for word in encoder_input.split():
        if perturbated_text_tokernizer_index.get(word) != None:
            enc_input.append(perturbated_text_tokernizer_index.get(word))
        else:
            enc_input.append(0)
            
    enc_output, enc_hidden_state, enc_cell_state = model.layers[0](np.array([enc_input], dtype='int32'))
    
    return enc_output, enc_hidden_state, enc_cell_state

def PredictOutput(encoder_input, decoder_input):

    dec_input=list()
    for word in decoder_input.split():
        if text_inp_tokernizer_word_index.get(word) != None:
            dec_input.append(text_inp_tokernizer_word_index.get(word))
        else:
            dec_input.append(0)
    
    enc_output, enc_hidden_state, enc_cell_state=EncoderOutput(encoder_input)
    
    pred = model.layers[2](model.layers[1](np.array([dec_input], dtype='int32'),
                                                                  enc_hidden_state, enc_cell_state))
    transalated_output=""
    for word in pred[0]:
        word = text_inp_tokernizer.index_word[tf.argmax(word).numpy()]
        transalated_output += word + " "
    
    return transalated_output

def InferResults(data):
    output = []

    for enc_inp,dec_inp, dec_out in data.values:
        pred = PredictOutput(enc_inp,dec_inp)
        output.append(pred)

    data['correct_output'] = data['dec_out']
    data['predicted_output'] = output

    data = data.drop(['dec_inp', 'dec_out'], axis=1)
    
    return data

def compute_blue_score(results):
    data_output_bleu_score = []
    for encoder_input_data, correct_output , predicted_output in results.values:
        correct_output = correct_output.split()
        predicted_output = predicted_output.rstrip().split()
        if len(correct_output) == len(predicted_output):
            data_output_bleu_score.append(sentence_bleu([correct_output],predicted_output))
          
    blue_score=sum(data_output_bleu_score)/len(data_output_bleu_score)
    return blue_score

In [None]:
sample_data_train=train_data.sample(1000)
results=InferResults(sample_data_train)
results.head(10)

In [None]:
results.values[-7]

In [None]:
blue_score=compute_blue_score(results)
print('BLUE score for Training data is: {}'.format(blue_score))

In [None]:
sample_data_test=test_data.sample(1000)
results=InferResults(sample_data_test)
results.head(10)

In [None]:
blue_score=compute_blue_score(results)
print('BLUE score for Test data is: {}'.format(blue_score))

In [None]:
sample_data_validation=validation_data.sample(1000)
results=InferResults(sample_data_validation)
results.head(10)

In [None]:
blue_score=compute_blue_score(results)
print('BLUE score for Validation data is: {}'.format(blue_score))

## Observations
- The basic seq2seq model or basic encoder decoder model is proven to be ok.
- The model gave good BLUE score of 0.75 on Train data, 0.437 on Test data and 0.44 on validation data 
- We need to build bit complex models by introducing attention mechanism in the base model to improve the accuracy drastically