# PRELIMINARY

In [None]:
# Install the pinned TensorFlow version. After this cell finishes, restart the runtime and then run all cells.
!pip install tensorflow==2.5.2

# LOAD MODULES AND YOUR DATASET SECTION

In [2]:
import re
import pandas as pd
import os
import json
import io
from collections import Counter
from tensorflow.python.keras.models import load_model
from keras_preprocessing.text import tokenizer_from_json
from sklearn.model_selection import train_test_split
from string import punctuation
import tensorflow.keras as keras
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [3]:
# Path to the parallel corpus CSV; change this one constant to point at your copy of the file.
CORPUS_PATH = '/content/researchParallelCorpus.csv'
data = pd.read_csv(CORPUS_PATH)

# Fail fast with a readable message if the two columns read by get_data() are absent.
_missing_columns = {'Standard', 'Non-Standard'} - set(data.columns)
if _missing_columns:
    raise KeyError("Corpus is missing required column(s): {}".format(sorted(_missing_columns)))

# CLASS AND FUNCTIONS SECTION

In [4]:
import tensorflow as tf
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K



class AttentionLayer(Layer):
    """
    Additive attention layer in the style of Bahdanau et al.
    (https://arxiv.org/pdf/1409.0473.pdf).

    The layer is called on ``[encoder_output_sequence, decoder_output_sequence]``
    and returns ``(context_vectors, attention_weights)``, one entry per decoder
    time step. Three trainable weights are created in ``build``: W_a, U_a, V_a.
    """

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """
        Create the attention weights.

        input_shape: list ``[encoder_shape, decoder_shape]``; index 2 of each
        entry is read as that sequence's feature size.
        """
        assert isinstance(input_shape, list)

        en_hidden = input_shape[0][2]
        de_hidden = input_shape[1][2]

        # Weight names and creation order are kept as-is so that previously
        # saved weights still line up.
        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((en_hidden, en_hidden)),
                                   initializer='uniform',
                                   trainable=True)
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((de_hidden, en_hidden)),
                                   initializer='uniform',
                                   trainable=True)
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((en_hidden, 1)),
                                   initializer='uniform',
                                   trainable=True)

        super(AttentionLayer, self).build(input_shape)

    def call(self, inputs, verbose=False):
        """
        Compute attention weights and context vectors.

        inputs: [encoder_output_sequence, decoder_output_sequence]
        verbose: when True, print the shapes of intermediate tensors.

        Returns ``(c_outputs, e_outputs)``: the context vectors and the
        softmax-normalised attention weights over encoder positions.
        """
        assert isinstance(inputs, (list, tuple))
        encoder_out_seq, decoder_out_seq = inputs
        if verbose:
            print('encoder_out_seq>', encoder_out_seq.shape)
            print('decoder_out_seq>', decoder_out_seq.shape)

        def energy_step(inputs, states):
            """ Step function for computing energy for a single decoder state """

            assert_msg = "States must be a list. However states {} is of type {}".format(states, type(states))
            assert isinstance(states, (list, tuple)), assert_msg

            # Sizes needed to flatten / restore the encoder sequence.
            en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]

            # S.Wa where S=[s0, s1, ..., si]: flatten to 2-D for the dot
            # product, then restore the (batch, en_seq_len, en_hidden) layout.
            reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden))
            W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a), (-1, en_seq_len, en_hidden))
            if verbose:
                print('wa.s>',W_a_dot_s.shape)

            # hj.Ua, with an extra axis so it broadcasts over encoder positions.
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)
            if verbose:
                print('Ua.h>',U_a_dot_h.shape)

            # tanh(S.Wa + hj.Ua), flattened again for the dot product with V_a.
            reshaped_Ws_plus_Uh = K.tanh(K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))
            if verbose:
                print('Ws+Uh>', reshaped_Ws_plus_Uh.shape)

            # softmax(va.tanh(S.Wa + hj.Ua)): one weight per encoder position.
            e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a), (-1, en_seq_len))
            e_i = K.softmax(e_i)

            if verbose:
                print('ei>', e_i.shape)

            return e_i, [e_i]

        def context_step(inputs, states):
            """ Step function for computing ci using ei """

            # Weighted sum of the encoder outputs over the encoder time axis.
            c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
            if verbose:
                print('ci>', c_i.shape)
            return c_i, [c_i]

        def create_initial_state(inputs, hidden_size):
            """Build an all-zero (batch, hidden_size) tensor derived from `inputs`."""
            fake_state = K.zeros_like(inputs)
            fake_state = K.sum(fake_state, axis=[1, 2])  # collapse axes 1 and 2
            fake_state = K.expand_dims(fake_state)  # add a trailing axis
            fake_state = K.tile(fake_state, [1, hidden_size])  # repeat along it
            return fake_state

        # K.rnn requires initial states; neither step function reads them.
        fake_state_c = create_initial_state(encoder_out_seq, encoder_out_seq.shape[-1])
        fake_state_e = create_initial_state(encoder_out_seq, encoder_out_seq.shape[1])

        # Attention weights for every decoder step.
        last_out, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e],
        )

        # Context vectors for every decoder step.
        last_out, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c],
        )

        return c_outputs, e_outputs

    def compute_output_shape(self, input_shape):
        """ Outputs produced by the layer: [context vectors, attention weights] """
        return [
            # Context vectors are sums of encoder outputs (see context_step),
            # so their last dimension is the encoder feature size.
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]

In [5]:
def save_model(dir_hash,model_dict,full_model,encoder_model,decoder_model,source_tokenizer,target_tokenizer):
    """
    Persist the model parameters, both tokenizers and the three models.

    Everything is written below ``h5.models/<dir_hash>/``:
    ``model_params.json``, ``source_tokenizer.json``, ``target_tokenizer.json``,
    ``full_model.h5``, ``encoder_model.h5`` and ``decoder_model.h5``.

    dir_hash: name of the sub-directory to write into (created if missing).
    model_dict: JSON-serialisable dict of hyper-parameters.
    full_model, encoder_model, decoder_model: objects exposing ``save(path)``.
    source_tokenizer, target_tokenizer: objects exposing ``to_json()``.
    """
    model_dir = 'h5.models/' + dir_hash
    # exist_ok avoids the check-then-create race and makes re-saving idempotent.
    os.makedirs(model_dir, exist_ok=True)

    with open(model_dir + "/model_params.json", 'w', encoding='utf-8') as f:
        f.write(json.dumps(model_dict))

    tokenizers = (
        ('/source_tokenizer.json', source_tokenizer),
        ('/target_tokenizer.json', target_tokenizer),
    )
    for file_name, tokenizer in tokenizers:
        tokenizer_json = tokenizer.to_json()
        with io.open(model_dir + file_name, 'w', encoding='utf-8') as f:
            # The to_json() result is JSON-encoded once more on purpose:
            # load_saved_model() json.load()s the file and hands the resulting
            # value to tokenizer_from_json().
            f.write(json.dumps(tokenizer_json, ensure_ascii=False))

    full_model.save(model_dir + '/full_model.h5')
    encoder_model.save(model_dir + '/encoder_model.h5')
    decoder_model.save(model_dir + '/decoder_model.h5')

def load_saved_model(dir_hash):
    """
    Load everything written by save_model() from ``h5.models/<dir_hash>/``.

    dir_hash: name of the sub-directory to read from.

    Returns a tuple ``(model_params, source_tokenizer, target_tokenizer,
    full_model, encoder_model, decoder_model)``.
    """
    model_dir = 'h5.models/' + dir_hash

    # Parse the file as a single JSON document. Reading it line by line left
    # the result undefined for an empty file and failed on multi-line JSON.
    with open(model_dir + '/model_params.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    with open(model_dir + '/source_tokenizer.json', encoding='utf-8') as f:
        source_tokenizer = tokenizer_from_json(json.load(f))
    with open(model_dir + '/target_tokenizer.json', encoding='utf-8') as f:
        target_tokenizer = tokenizer_from_json(json.load(f))

    # The custom attention layer has to be registered for each model load.
    full_model = load_model(model_dir + '/full_model.h5', custom_objects={'AttentionLayer': AttentionLayer})
    encoder_model = load_model(model_dir + '/encoder_model.h5', custom_objects={'AttentionLayer': AttentionLayer})
    decoder_model = load_model(model_dir + '/decoder_model.h5', custom_objects={'AttentionLayer': AttentionLayer})
    return data, source_tokenizer, target_tokenizer, full_model, encoder_model, decoder_model

In [6]:
def get_data(data_file):
    """
    Extract cleaned, aligned source/target sentence lists from the corpus.

    data_file: DataFrame with text columns 'Standard' and 'Non-Standard'.

    Both columns are lower-cased and every non-word character is replaced by a
    space. A row is kept only if BOTH columns hold a value, so source and
    target stay paired. Targets are wrapped as
    ``'sentencestart <text> sentenceend .'``.

    Returns ``(source_lang_text_data, target_lang_text_data)``, two lists of
    str with equal length. Also prints the number of sentence pairs.
    """
    def _normalise(column):
        # Raw string plus explicit regex=True: the pattern is a regular
        # expression and must not depend on the pandas default for `regex`.
        return column.str.lower().str.replace(r'\W', ' ', regex=True)

    std = _normalise(data_file['Standard'])
    n_std = _normalise(data_file['Non-Standard'])

    # Drop incomplete rows jointly. Dropping NaNs per column would shift one
    # list relative to the other and pair sentences with the wrong partner.
    complete_rows = std.notna() & n_std.notna()
    std = std[complete_rows]
    n_std = n_std[complete_rows]

    # Swap here to alter model training
    source_lang_text_data = n_std.values
    target_lang_text_data = std.values

    source_lang_text_data = [item.rstrip() for item in source_lang_text_data]

    wrapped_targets = []
    for sent in target_lang_text_data:
        sent = sent.rstrip()
        if sent.endswith('.'):
            # Avoid a doubled full stop before the end marker.
            sent = sent[:-1]
        wrapped_targets.append('sentencestart ' + sent + ' sentenceend .')
    target_lang_text_data = wrapped_targets

    print("Length of text {}".format(len(source_lang_text_data)))
    return source_lang_text_data,target_lang_text_data

In [7]:
def build_tokenizer_and_split_text(data_file,src_min_words=1,tgt_min_words=1,random_state=None):
    """
    Fit source/target tokenizers and split the corpus into train/cv/test.

    data_file: DataFrame passed on to get_data().
    src_min_words, tgt_min_words: minimum frequency a word needs in order to
        be counted towards the source / target vocabulary size.
    random_state: optional seed forwarded to both train_test_split calls so
        the split can be reproduced; None keeps the split random.

    The data is split 90/10 into (train+cv)/test, then the first part is split
    90/10 again into train/cv.

    Returns ``(src_train, src_cv, src_test, tgt_train, tgt_cv, tgt_test,
    source_tokenizer, target_tokenizer)``.
    """
    source_lang_text_data,target_lang_text_data=get_data(data_file)

    split_condition = re.compile(r"\w+|[^\w\s]", re.UNICODE)

    def _count_words(sentences):
        # Frequency of every token that is not a punctuation mark.
        counts = Counter()
        for sentence in sentences:
            counts.update(
                word for word in split_condition.findall(sentence.lower())
                if word not in punctuation
            )
        return counts

    source_words = _count_words(source_lang_text_data)
    target_words = _count_words(target_lang_text_data)
    print("Total unique words in source lang: "+str(len(source_words)))
    print("Total unique words in target lang: "+str(len(target_words)))

    source_vocab_count = sum(1 for count in source_words.values() if count >= src_min_words)
    target_vocab_count = sum(1 for count in target_words.values() if count >= tgt_min_words)
    print("Total unique source words with min count "+str(src_min_words)+': '+str(source_vocab_count))
    print("Total unique target words with min count "+str(tgt_min_words)+': '+str(target_vocab_count))

    # The Keras Tokenizer only emits word indices strictly below num_words.
    # Index 0 is reserved and index 1 is taken by the OOV token, so keeping
    # `count` real words requires num_words = count + 2. With count + 1 the
    # least frequent counted word was silently mapped to 'UNK'.
    source_tokenizer=keras.preprocessing.text.Tokenizer(num_words=source_vocab_count+2,oov_token='UNK')
    source_tokenizer.fit_on_texts(source_lang_text_data)
    target_tokenizer=keras.preprocessing.text.Tokenizer(num_words=target_vocab_count+2,oov_token='UNK')
    target_tokenizer.fit_on_texts(target_lang_text_data)

    src_train, src_test, tgt_train, tgt_test = train_test_split(
        source_lang_text_data, target_lang_text_data, test_size=0.1, random_state=random_state)
    src_train, src_cv, tgt_train, tgt_cv = train_test_split(
        src_train, tgt_train, test_size=0.1, random_state=random_state)
    return src_train,src_cv,src_test,tgt_train,tgt_cv,tgt_test,source_tokenizer,target_tokenizer

In [8]:
from tensorflow.python.keras.layers import Input,Embedding,GRU,Dense,TimeDistributed,Bidirectional,Concatenate
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import numpy as np

def define_nmt(hidden_size,embedding_dim,source_lang_timesteps,source_lang_vocab_size,target_lang_timesteps,target_lang_vocab_size,dropout):
    """
    Build the training model plus separate encoder/decoder inference models.

    hidden_size: every GRU is created with ``2 * hidden_size`` units.
    embedding_dim: output size of both embedding layers.
    source_lang_timesteps: length of the encoder input sequence.
    source_lang_vocab_size: input_dim of the encoder embedding.
    target_lang_timesteps: the training decoder input has length
        ``target_lang_timesteps - 1``.
    target_lang_vocab_size: input_dim of the decoder embedding and size of the
        softmax output.
    dropout: accepted but not referenced anywhere in this function.

    Returns ``(full_model, encoder_model, decoder_model)``. The inference
    models call the same layer objects as the training model, so they use the
    same weights. A summary of each model is printed as it is built.
    """

    # Training-time inputs.
    encoder_inputs=Input(shape=(source_lang_timesteps,),name='encoder_inputs')
    decoder_inputs=Input(shape=(target_lang_timesteps-1,),name='decoder_inputs')

    # Separate embeddings for the source and target vocabularies.
    encoder_embedding_layer = Embedding(input_dim=source_lang_vocab_size, output_dim=embedding_dim)
    encoder_embedded = encoder_embedding_layer(encoder_inputs)
    decoder_embedding_layer = Embedding(input_dim=target_lang_vocab_size, output_dim=embedding_dim)
    decoder_embedded = decoder_embedding_layer(decoder_inputs)

    # Two stacked encoder GRUs; both the sequences and the final states are kept.
    encoder_gru1 = GRU(2 * hidden_size, return_sequences=True, return_state=True, name='encoder_gru1')
    encoder_out1, encoder_state1 = encoder_gru1(encoder_embedded)
    encoder_gru2 = GRU(2 * hidden_size, return_sequences=True, return_state=True, name='encoder_gru2')
    encoder_out2, encoder_state2 = encoder_gru2(encoder_out1)
    encoder_states = [encoder_state1, encoder_state2]

    # Two stacked decoder GRUs, each started from the matching encoder state.
    decoder_gru1 = GRU(2 * hidden_size, return_sequences=True, return_state=True, name='decoder_gru1')
    decoder_out1, decoder_state1 = decoder_gru1(decoder_embedded, initial_state=encoder_state1)
    decoder_gru2 = GRU(2 * hidden_size, return_sequences=True, return_state=True, name='decoder_gru2')
    decoder_out2, decoder_state2 = decoder_gru2(decoder_out1, initial_state=encoder_state2)

    # Attention over the top encoder sequence, driven by the top decoder sequence.
    attn_layer = AttentionLayer(name='attention_layer')
    attn_out, attn_states = attn_layer([encoder_out2, decoder_out2])

    # Decoder output and attention context are concatenated on the feature axis.
    decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_out2, attn_out])

    # Softmax over the target vocabulary, applied at every time step.
    dense=Dense(target_lang_vocab_size,activation='softmax',name='softmax_layer')
    dense_time=TimeDistributed(dense,name='time_distributed_layer')
    decoder_pred=dense_time(decoder_concat_input)

    # Only the full model is compiled; it is the one used for training.
    full_model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_pred)
    full_model.compile(optimizer='adam', loss='categorical_crossentropy')
    full_model.summary(line_length=225)

    # Inference encoder: outputs the top encoder sequence and both final states.
    encoder_model = Model(encoder_inputs, [encoder_out2]+encoder_states)
    encoder_model.summary(line_length=225)

    # Inference decoder inputs: both GRU states, a single token, and the
    # encoder output sequence.
    inf_decoder_state1 = Input(shape=(2*hidden_size,))
    inf_decoder_state2 = Input(shape=(2*hidden_size,))
    inf_decoder_inputs = Input(shape=(1,), name='decoder_inputs')
    inf_encoder_outputs=Input(shape=(source_lang_timesteps,2*hidden_size,))
    inf_decoder_embedded = decoder_embedding_layer(inf_decoder_inputs)

    # Same layer objects as above, re-applied to the inference inputs.
    inf_decoder_out1, inf_decoder_state_out1= decoder_gru1(inf_decoder_embedded,initial_state=inf_decoder_state1)
    inf_decoder_out2, inf_decoder_state_out2 = decoder_gru2(inf_decoder_out1, initial_state=inf_decoder_state2)
    inf_attn_out, inf_attn_states = attn_layer([inf_encoder_outputs, inf_decoder_out2])
    inf_decoder_concat = Concatenate(axis=-1, name='concat')([inf_decoder_out2, inf_attn_out])

    # A new TimeDistributed wrapper around the same `dense` layer.
    inf_decoder_pred = TimeDistributed(dense)(inf_decoder_concat)
    decoder_model = Model(inputs=[inf_encoder_outputs,inf_decoder_inputs,inf_decoder_state1,inf_decoder_state2],outputs=[inf_decoder_pred,inf_attn_states,inf_decoder_state_out1,inf_decoder_state_out2])
    decoder_model.summary(line_length=225)
    return full_model, encoder_model, decoder_model

In [9]:
from tensorflow.python.keras.utils.data_utils import Sequence
from sklearn.utils import shuffle
import numpy as np
from tensorflow.keras.utils import to_categorical

class DataGenerator(Sequence):
    """
    Batch generator that tokenises, pads and one-hot encodes text on demand.

    Each item is ``([source_ids, target_ids[:, :-1]], one_hot_targets[:, 1:, :])``,
    i.e. the decoder input drops the last position and the decoder target
    drops the first.
    """

    def __init__(self,source_text,target_text,source_tokenizer,target_tokenizer,target_vocab_size,source_timesteps,target_timesteps,batch_size=32,shuffle=True):
        # Raw sentences and the tokenizers used to turn them into ids.
        self.source_text, self.target_text = source_text, target_text
        self.source_tokenizer, self.target_tokenizer = source_tokenizer, target_tokenizer
        # Shapes of the arrays produced per batch.
        self.target_vocab_size = target_vocab_size
        self.source_timesteps, self.target_timesteps = source_timesteps, target_timesteps
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Shuffle once up front (when enabled), exactly as at every epoch end.
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch; the last batch may be smaller."""
        sample_count = len(self.source_text)
        return int(np.ceil(sample_count / float(self.batch_size)))

    def on_epoch_end(self):
        """Re-shuffle source and target together when shuffling is enabled."""
        if not (self.shuffle == True):
            return
        reordered = shuffle(self.source_text, self.target_text)
        self.source_text, self.target_text = reordered

    def __getitem__(self,idx):
        """Build batch number `idx`."""
        window = slice(idx * self.batch_size, (idx + 1) * self.batch_size)

        source_ids = self.source_tokenizer.texts_to_sequences(self.source_text[window])
        target_ids = self.target_tokenizer.texts_to_sequences(self.target_text[window])

        # NOTE(review): pad_sequences is called without `truncating`, so its
        # default applies to over-long sequences - confirm that is intended.
        encoder_input = pad_sequences(source_ids, padding='post', maxlen=self.source_timesteps)
        target_padded = pad_sequences(target_ids, padding='post', maxlen=self.target_timesteps)
        target_one_hot = to_categorical(target_padded, num_classes=self.target_vocab_size)

        decoder_input = target_padded[:, :-1]
        decoder_target = target_one_hot[:, 1:, :]
        return [encoder_input, decoder_input], decoder_target


In [10]:
def translate(sentence,encoder_model,decoder_model,source_tokenizer,target_tokenizer,src_vsize,tgt_vsize,source_timesteps,target_timesteps):
    """
    Greedily decode `sentence` one token at a time.

    Decoding starts from the 'sentencestart' token and stops after the token
    'sentenceend' has been produced or after `target_timesteps` tokens,
    whichever comes first. `src_vsize` and `tgt_vsize` are accepted but not
    used.

    Returns the produced tokens, each followed by a single space (the
    'sentenceend' token, if produced, is included).
    """
    encoder_input = pad_sequences(source_tokenizer.texts_to_sequences([sentence]),
                                  padding='post', maxlen=source_timesteps)
    decoder_input = pad_sequences(target_tokenizer.texts_to_sequences(["sentencestart"]),
                                  padding='post', maxlen=1)

    # The encoder's final states seed the two decoder GRU states.
    encoder_out, state1, state2 = encoder_model.predict(encoder_input)

    produced = []
    while True:
        step_pred, _attn, state1, state2 = decoder_model.predict(
            [encoder_out, decoder_input, state1, state2])
        token_id = np.argmax(step_pred, axis=-1)[0, 0]
        token = target_tokenizer.index_word.get(token_id, 'UNK')
        produced.append(token)
        # Feed the chosen token back in as the next decoder input.
        decoder_input[0, 0] = token_id
        if len(produced) >= target_timesteps or token == 'sentenceend':
            break

    return ''.join(token + ' ' for token in produced)

# TRAIN SECTION

In [None]:
# Clean the corpus, fit both tokenizers and create the train / cv / test splits.
(
    src_train, src_cv, src_test,
    tgt_train, tgt_cv, tgt_test,
    source_tokenizer, target_tokenizer,
) = build_tokenizer_and_split_text(data, src_min_words=1, tgt_min_words=1)

In [12]:
def _effective_vocab_size(tokenizer):
    """Return (highest word index + 1), capped at tokenizer.num_words when that is set."""
    highest_index_plus_one = max(tokenizer.index_word.keys()) + 1
    if tokenizer.num_words is None:
        return highest_index_plus_one
    return min(highest_index_plus_one, tokenizer.num_words)


src_vsize = _effective_vocab_size(source_tokenizer)
tgt_vsize = _effective_vocab_size(target_tokenizer)

In [13]:
# Sequence lengths used when padding source and target sentences.
SOURCE_TIMESTEPS = 20
TARGET_TIMESTEPS = 20

# Model size: define_nmt() creates its GRUs with 2 * HIDDEN_SIZE units.
HIDDEN_SIZE = 256
EMBEDDING_DIM = 100

# Training schedule.
NUM_EPOCHS = 1
BATCH_SIZE = 64

# Forwarded to define_nmt(dropout=...), which does not reference the value.
DROPOUT = 1.0

In [None]:
# Build the training model and the two inference models from the settings above.
full_model, encoder_model, decoder_model = define_nmt(
    hidden_size=HIDDEN_SIZE,
    embedding_dim=EMBEDDING_DIM,
    source_lang_timesteps=SOURCE_TIMESTEPS,
    source_lang_vocab_size=src_vsize,
    target_lang_timesteps=TARGET_TIMESTEPS,
    target_lang_vocab_size=tgt_vsize,
    dropout=DROPOUT,
)

In [None]:
# Print the layer summary of each model in turn: full, encoder, decoder.
for summarised_model in (full_model, encoder_model, decoder_model):
    summarised_model.summary(line_length=225)

In [None]:
# Batches for training, drawn from the training split and reshuffled each epoch.
training_generator = DataGenerator(
    source_text=src_train,
    target_text=tgt_train,
    source_tokenizer=source_tokenizer,
    target_tokenizer=target_tokenizer,
    target_vocab_size=tgt_vsize,
    source_timesteps=SOURCE_TIMESTEPS,
    target_timesteps=TARGET_TIMESTEPS,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [None]:
# Batches for validation, drawn from the cv split with the same settings.
validation_generator = DataGenerator(
    source_text=src_cv,
    target_text=tgt_cv,
    source_tokenizer=source_tokenizer,
    target_tokenizer=target_tokenizer,
    target_vocab_size=tgt_vsize,
    source_timesteps=SOURCE_TIMESTEPS,
    target_timesteps=TARGET_TIMESTEPS,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [None]:
# Model.fit accepts Sequence generators directly; fit_generator is deprecated
# in TensorFlow 2.x and only forwards to fit.
history = full_model.fit(
    training_generator,
    validation_data=validation_generator,
    epochs=NUM_EPOCHS,
    workers=6,
    use_multiprocessing=True,
)

# GRAPH SECTION

In [None]:
import matplotlib.pyplot as plt

# Training and validation loss per epoch, drawn on one set of axes.
fig, ax = plt.subplots()
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
for series_name in ('loss', 'val_loss'):
    ax.plot(history.history[series_name], label=series_name)

# Legend is placed outside the plot area, to the right.
ax.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
fig.tight_layout()
plt.show()

# SAVE AND LOAD SECTION

In [None]:
dir_hash='GRU_Attention'

# Record the settings the model was actually built with. The time-step values
# come from the constants rather than literals so they cannot drift apart.
model_dict = {
          'HiddenSize': HIDDEN_SIZE,
          'EmbeddingDim': EMBEDDING_DIM,
          'SourceTimeSteps': SOURCE_TIMESTEPS,
          'TargetTimeSteps': TARGET_TIMESTEPS,
          'SourceVocab': src_vsize,
          'TargetVocab': tgt_vsize,
      }

save_model(dir_hash, model_dict, full_model, encoder_model, decoder_model, source_tokenizer, target_tokenizer)

In [None]:

# Reload the saved artefacts; the full training model is not needed from here on.
(
    model_dict,
    source_tokenizer,
    target_tokenizer,
    _,
    encoder_model,
    decoder_model,
) = load_saved_model(dir_hash)

# Restore the settings that were stored alongside the model.
HIDDEN_SIZE, EMBEDDING_DIM = model_dict['HiddenSize'], model_dict['EmbeddingDim']
SOURCE_TIMESTEPS, TARGET_TIMESTEPS = model_dict['SourceTimeSteps'], model_dict['TargetTimeSteps']
src_vsize, tgt_vsize = model_dict['SourceVocab'], model_dict['TargetVocab']


# BLEU SECTION

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate import bleu
from nltk.translate.bleu_score import SmoothingFunction

smoothie = SmoothingFunction().method4

# Control tokens that get_data() wraps around every target sentence
# ('sentencestart ... sentenceend .'). They are removed from BOTH the
# reference and the hypothesis; leaving them only in the reference made it
# longer than any hypothesis and lowered every score via the brevity penalty.
_MARKER_TOKENS = ('sentencestart', 'sentenceend', '.')


def _strip_markers(text):
    """Remove the control tokens and collapse whitespace runs to single spaces."""
    return ' '.join(token for token in text.split() if token not in _MARKER_TOKENS)


total_bleu=0.0

bleu_1=0.0
bleu_2=0.0
bleu_3=0.0
bleu_4=0.0

total_bleu_1=0.0
total_bleu_2=0.0
total_bleu_3=0.0
total_bleu_4=0.0


for i,sentence in enumerate(src_test):
  translation=translate(sentence, encoder_model, decoder_model, source_tokenizer, target_tokenizer, src_vsize,tgt_vsize, SOURCE_TIMESTEPS, TARGET_TIMESTEPS)
  translation=_strip_markers(translation)
  expected=[_strip_markers(tgt_test[i])]
  # Both sides are plain strings, so NLTK forms n-grams over characters.
  # Pass `.split()` lists instead if word-level BLEU is wanted.
  # The per-sentence score uses its own name so the imported `bleu` function
  # is no longer overwritten by a float.
  sentence_score=sentence_bleu(expected,translation, smoothing_function=smoothie)
  bleu_1=sentence_bleu(expected,translation, smoothing_function=smoothie, weights=(1, 0, 0, 0))
  bleu_2=sentence_bleu(expected,translation, smoothing_function=smoothie, weights=(0.5, 0.5, 0, 0))
  # Exact thirds so the weights sum to 1 (0.33 * 3 does not).
  bleu_3=sentence_bleu(expected,translation, smoothing_function=smoothie, weights=(1/3, 1/3, 1/3, 0))
  bleu_4=sentence_bleu(expected,translation, smoothing_function=smoothie, weights=(0.25, 0.25, 0.25, 0.25))
  total_bleu_1+=bleu_1
  total_bleu_2+=bleu_2
  total_bleu_3+=bleu_3
  total_bleu_4+=bleu_4
  total_bleu+=sentence_score

# Average over the test set; an empty test set keeps the totals at 0.0
# instead of raising ZeroDivisionError.
if len(src_test) > 0:
  total_bleu/=len(src_test)
  total_bleu_1/=len(src_test)
  total_bleu_2/=len(src_test)
  total_bleu_3/=len(src_test)
  total_bleu_4/=len(src_test)

print("Average bleu score for "+str(len(src_test))+" items: "+str(total_bleu))
print("Average BLEU-1 for "+str(len(src_test))+" items: "+str(total_bleu_1))
print("Average BLEU-2 for "+str(len(src_test))+" items: "+str(total_bleu_2))
print("Average BLEU-3 for "+str(len(src_test))+" items: "+str(total_bleu_3))
print("Average BLEU-4 for "+str(len(src_test))+" items: "+str(total_bleu_4))

# TEST SECTION

In [None]:
# Ask for five inputs and print the model's output for each; `i` ends at 5.
for i in range(1, 6):
  sentence = input("Please enter a source word: ")
  translation = translate(
      sentence,
      encoder_model,
      decoder_model,
      source_tokenizer,
      target_tokenizer,
      src_vsize,
      tgt_vsize,
      SOURCE_TIMESTEPS,
      TARGET_TIMESTEPS,
  )
  print("Translation to target word " + translation.replace(" sentenceend",""))