In [1]:
import numpy as np
import pandas as pd
import os
import re
import string
import random

In [2]:
# opening the train_source file in read mode
my_file = open("train-source.txt", "r", encoding='UTF-8')

# reading the file
data = my_file.read()

train_source_list = data.replace('\n'," ").split('</s>')

my_file.close()

In [None]:
train_source_list

In [3]:
# opening the train-target file in read mode
my_file = open("train-target.txt", "r", encoding='UTF-8')

# reading the file
data = my_file.read()

train_target_list = data.replace('\n'," ").split('</s>')

my_file.close()

In [4]:
# opening the test-target file in read mode
my_file = open("test-source.txt", "r", encoding='UTF-8')

# reading the file
data = my_file.read()

test_source_list = data.replace('\n'," ").split('</s>')

my_file.close()

# opening the test-target file in read mode
my_file = open("test-target.txt", "r", encoding='UTF-8')

# reading the file
data = my_file.read()

test_target_list = data.replace('\n'," ").split('</s>')

my_file.close()

In [5]:
print("length of training sets Source: ", len(train_source_list), " Target :", len(train_target_list))
print("length of test sets Source: ", len(test_source_list), " Target :", len(test_target_list))

length of training sets Source:  45172  Target : 45172
length of test sets Source:  1001  Target : 1001


Finding unique words in both sets

In [6]:
input_token_index = dict([(char, i) for i, char in enumerate(train_source_list)])
target_token_index = dict([(char, i) for i, char in enumerate(train_target_list)])

print(len(input_token_index))
print(len(target_token_index))

44981
44930


In [7]:
input_unique_words = set()
for sentence in train_source_list:
    input_unique_words.update(sentence)

output_unique_words = set()
for sentence in train_target_list:
    output_unique_words.update(sentence)

In [8]:
print(len(input_unique_words))
print(len(output_unique_words))

107
95


In [9]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

In [10]:
lang_eng = []
lang_ita = []

raw_data_en, raw_data_ita = train_source_list, train_target_list

def tokenize(lang):
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
      filters='')
  lang_tokenizer.fit_on_texts(lang)

  tensor = lang_tokenizer.texts_to_sequences(lang)

  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                         padding='post')

  return tensor, lang_tokenizer

input_tensor, inp_lang = tokenize(train_source_list)
target_tensor, targ_lang = tokenize(train_target_list)

max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [11]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

def convert(lang, tensor):
  for t in tensor:
    if t!=0:
      print ("%d ----> %s" % (t, lang.index_word[t]))

print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

36137 36137 9035 9035
Input Language; index to word mapping
1 ----> <s>
8 ----> "
29 ----> tá
59 ----> tú
15 ----> i
683 ----> n-am
12 ----> go
367 ----> leor
5 ----> ,
8 ----> "
20 ----> arsa
296 ----> nóra
14 ----> '
288 ----> á
6246 ----> comóradh
14 ----> '
155 ----> un
4 ----> an
541 ----> dorais
2 ----> .

Target Language; index to word mapping
1 ----> <s>
40 ----> `
33 ----> tá
55 ----> tú
66 ----> in
127 ----> am
12 ----> go
186 ----> leor
5 ----> ,
14 ----> '
22 ----> arsa
283 ----> nóra
5 ----> ,
128 ----> á
4224 ----> comóradh
62 ----> chun
4 ----> an
542 ----> dorais
2 ----> .


In [12]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 1000
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE

vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

dataset

<BatchDataset element_spec=(TensorSpec(shape=(1000, 232), dtype=tf.int32, name=None), TensorSpec(shape=(1000, 222), dtype=tf.int32, name=None))>

In [13]:
class Encoder(tf.keras.Model):

    def __init__(self, inp_vocab_size, embedding_size, lstm_size, input_length):
        super(Encoder, self).__init__()
        
        #Initialize Embedding layer
        #Intialize Encoder LSTM layer
        
        self.lstm_size = lstm_size
        self.embedding = tf.keras.layers.Embedding(inp_vocab_size, embedding_size)
        self.lstm = tf.keras.layers.LSTM(lstm_size, return_sequences=True, return_state=True)

    def call(self, input_sequence, states):
      
        embed = self.embedding(input_sequence)
        output, state_h, state_c = self.lstm(embed, initial_state=states)

        return output, state_h, state_c
    
    def initialize_states(self,batch_size):
    
        return (tf.zeros([batch_size, self.lstm_size]),
                tf.zeros([batch_size, self.lstm_size]))

In [14]:
class Attention(tf.keras.layers.Layer):
    def __init__(self,scoring_function, att_units):
        super(Attention, self).__init__()
        
        self.scoring_function = scoring_function
        self.att_units = att_units

        if self.scoring_function=='dot':
            pass
            # For general, it would be self.wa = tf.keras.layers.Dense(att_units)


    def call(self,decoder_hidden_state,encoder_output):

        if self.scoring_function == 'dot':
            
            new_state = tf.expand_dims(decoder_hidden_state, -1)
            score = tf.matmul(encoder_output, new_state)
            weights = tf.nn.softmax(score, axis=1)
            context = weights * encoder_output
            context_vector = tf.reduce_sum(context, axis=1)
                                
            return context_vector, weights

In [15]:
class One_Step_Decoder(tf.keras.Model):
    def __init__(self, tar_vocab_size, embedding_dim, input_length, dec_units, score_fun, att_units):
        super(One_Step_Decoder, self).__init__()
        # Initialize decoder embedding layer, LSTM and any other objects needed
        self.tar_vocab_size = tar_vocab_size
        self.embedding_dim = embedding_dim
        self.input_length = input_length
        self.dec_units = dec_units
        self.score_fun = score_fun
        self.att_units = att_units
        self.embedding = tf.keras.layers.Embedding(self.tar_vocab_size, self.embedding_dim, 
                                                   input_length=self.input_length)
        
        self.lstm = tf.keras.layers.LSTM(self.dec_units, return_sequences=True, 
                                         return_state=True)
        
        self.output_layer = tf.keras.layers.Dense(self.tar_vocab_size)
        
        self.attention = Attention(self.score_fun, self.att_units)

    def call(self, input_to_decoder, encoder_output, state_h, state_c):
        
        result = self.embedding(input_to_decoder)
        
        context_vector, weights = self.attention(state_h, encoder_output)
        
        concat = tf.concat([tf.expand_dims(context_vector, 1), result], axis=-1)
        
        decoder_output, hidden_state, cell_state = self.lstm(concat, initial_state=[state_h, state_c])
        
        final_output = tf.reshape(decoder_output, (-1, decoder_output.shape[2]))
        final_output = self.output_layer(final_output)
        
        return final_output, hidden_state, cell_state, weights, context_vector

In [16]:
class Decoder(tf.keras.Model):
    def __init__(self, out_vocab_size, embedding_dim, output_length, dec_units ,score_fun ,att_units):
        #Intialize necessary variables and create an object from the class onestepdecoder
        super(Decoder, self).__init__()
        self.out_vocab_size = out_vocab_size
        self.embedding_dim = embedding_dim
        self.output_length = output_length
        self.dec_units = dec_units
        self.score_fun = score_fun
        self.att_units = att_units
        self.onestepdecoder = One_Step_Decoder(self.out_vocab_size, self.embedding_dim, self.output_length,
                                               self.dec_units, self.score_fun, self.att_units)
        
    def call(self, input_to_decoder,encoder_output,decoder_hidden_state,decoder_cell_state):
        
        all_outputs= tf.TensorArray(tf.float32, size=input_to_decoder.shape[1], name="output_arrays")
        
        
        for timestep in range(input_to_decoder.shape[1]):
            output, decoder_hidden_state, decoder_cell_state, weights, context_vector = self.onestepdecoder(
                                                                                    input_to_decoder[:,timestep:timestep+1], 
                                                                                    encoder_output, 
                                                                                    decoder_hidden_state,
                                                                                    decoder_cell_state)
            
            all_outputs = all_outputs.write(timestep, output)
        
        all_outputs = tf.transpose(all_outputs.stack(), (1, 0, 2)) 

        return all_outputs

In [17]:
class encoder_decoder(tf.keras.Model):
    def __init__(self, inp_vocab_size, out_vocab_size, embedding_size, lstm_size, 
                 input_length, output_length, dec_units ,score_fun ,att_units, batch_size):
        
        super(encoder_decoder, self).__init__()
        
        self.encoder = Encoder(inp_vocab_size, embedding_size, lstm_size, input_length)
        self.decoder = Decoder(out_vocab_size, embedding_size, output_length, 
                               dec_units, score_fun, att_units)
    
    def call(self, data):
        
        input_sequence, input_to_decoder = data[0],data[1]
        initial_state = self.encoder.initialize_states(batch_size=1000)
        encoder_output, state_h, state_c = self.encoder(input_sequence, initial_state)
        decoder_hidden_state = state_h
        decoder_cell_state = state_c
        decoder_output = self.decoder(input_to_decoder, encoder_output, decoder_hidden_state, decoder_cell_state)
        
        return decoder_output

In [18]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

optimizer = tf.keras.optimizers.Adam()

In [None]:
!mkdir logs

In [19]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import TensorBoard

checkpoint = ModelCheckpoint("dot.h5", monitor='val_loss', verbose=1, save_weights_only=True)

logdir='logs'
tensorboard_Visualization = TensorBoard(log_dir=logdir)

input_vocab_size = len(inp_lang.word_index)+1
output_vocab_size = len(targ_lang.word_index)+1

input_len = max_length_inp
output_len = max_length_targ

lstm_size = 200
att_units = 300
dec_units = 200
embedding_size = 300
embedding_dim = 300
score_fun = 'dot'
steps = len(input_tensor)//1000
batch_size=64

model = encoder_decoder(input_vocab_size,output_vocab_size,embedding_size,lstm_size,input_len,output_len,dec_units,score_fun,att_units, batch_size)

checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=model.layers[0],
                                 decoder=model.layers[1])

In [20]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden,enc_state = model.layers[0](inp, enc_hidden)


    dec_input = tf.expand_dims([targ_lang.word_index['<s>']] * BATCH_SIZE, 1)

    for t in range(1, targ.shape[1]):
      predictions = model.layers[1](dec_input,enc_output,enc_hidden,enc_state)

      loss += loss_function(targ[:, t], predictions)

      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = model.layers[0].trainable_variables + model.layers[1].trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [21]:
EPOCHS = 3

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = model.layers[0].initialize_states(1000)
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 1000 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
      
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

: 

: 

In [46]:
def preprocess_sentence(s):
    s = re.sub(r'([!.?])', r' \1', s)
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
    s = re.sub(r'\s+', r' ', s)

    s = s.strip()
    s = s +' '+' <end>'
    return s

In [28]:
def preprocess_sentence(input):
    return('<s>' + input)

In [57]:
def predict(input_sentence):

  attention_plot = np.zeros((output_len, input_len))

  input_sentence = preprocess_sentence(input_sentence)

  inputs = [inp_lang.word_index[i] for i in input_sentence]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=input_len,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''
  
  encoder_output,state_h,state_c = model.layers[0](inputs,[tf.zeros((1, lstm_size)),tf.zeros((1, lstm_size))])

  dec_input = tf.expand_dims([targ_lang.word_index['<s>']], 0)

  for t in range(output_len):
   predictions,state_h,state_c,attention_weights,context_vector = model.layers[1].onestepdecoder(dec_input,
                                                                                                 encoder_output,
                                                                                                 state_h,
                                                                                                 state_c)

   attention_weights = tf.reshape(attention_weights, (-1, ))
   attention_plot[t] = attention_weights.numpy()

   predicted_id = tf.argmax(predictions[0]).numpy()

   result += targ_lang.index_word[predicted_id] + ' '

   if targ_lang.index_word[predicted_id] == '<end>':
     return result, input_sentence, attention_plot

   dec_input = tf.expand_dims([predicted_id], 0)

  return result, input_sentence, attention_plot

In [58]:
def translate(sentence):
  result, sent, attention_plot = predict(sentence)

  print('Input: %s' % (sent))
  print('Predicted translation: {}'.format(result))

In [59]:
test = test_source_list[1]

print("input sentnce: ", test)
print("predicted sentence: ", translate(test))
print("Actual target: ", test_target_list[1])

input sentnce:   <s> MÍ Iúil a bhí ann i mbliadhain a 1854 , nuair a bhain an taisme seo dúinn . 


NameError: name 'i' is not defined

In [9]:
#Anything goes ref: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
from nltk.translate.bleu_score import sentence_bleu,corpus_bleu

In [11]:
type(train_target_list)

list

In [13]:
print(random.choice(train_target_list))

 Ní dheachaigh ceathrú uaire thart go bhfuair an ghirseach bheag bás .  


In [10]:
#For Train
reference_train = train_source_list
candidate_train = train_target_list[1]

print('Cumulative 1-gram: %f' % sentence_bleu(train_source_list, train_target_list[1], weights=(1, 0, 0, 0)))
print('Cumulative 2-gram: %f' % sentence_bleu(train_source_list, train_target_list[1], weights=(0.5, 0.5, 0, 0)))
print('Cumulative 3-gram: %f' % sentence_bleu(train_source_list, train_target_list[1], weights=(0.33, 0.33, 0.33, 0)))
print('Cumulative 4-gram: %f' % sentence_bleu(train_source_list, train_target_list[1], weights=(0.25, 0.25, 0.25, 0.25)))

Cumulative 1-gram: 1.000000
Cumulative 2-gram: 1.000000
Cumulative 3-gram: 1.000000
Cumulative 4-gram: 1.000000


In [12]:
#For Test
reference_test = test_source_list
candidate_test = test_target_list[1]

print('Cumulative 1-gram: %f' % sentence_bleu(reference_test, candidate_test, weights=(1, 0, 0, 0)))
print('Cumulative 2-gram: %f' % sentence_bleu(reference_test, candidate_test, weights=(0.5, 0.5, 0, 0)))
print('Cumulative 3-gram: %f' % sentence_bleu(reference_test, candidate_test, weights=(0.33, 0.33, 0.33, 0)))
print('Cumulative 4-gram: %f' % sentence_bleu(reference_test, candidate_test, weights=(0.25, 0.25, 0.25, 0.25)))

Cumulative 1-gram: 1.000000
Cumulative 2-gram: 1.000000
Cumulative 3-gram: 0.989740
Cumulative 4-gram: 0.963897


In [32]:
#From scratch: https://stackoverflow.com/questions/56968434/bleu-score-in-python-from-scratch
import numpy as np
from collections import Counter
import math

In [33]:
def n_gram_generator(sentence,n= 2,n_gram= False):
    '''
    N-Gram generator with parameters sentence
    n is for number of n_grams
    The n_gram parameter removes repeating n_grams
    '''
    sentence = sentence.lower()  # converting to lower case
    sent_arr = np.array(sentence.split())  # split to string arrays
    length = len(sent_arr)

    word_list = []
    for i in range(length+1):
        if i < n:
            continue
        word_range = list(range(i-n,i))
        s_list = sent_arr[word_range]
        string = ' '.join(s_list)  # converting list to strings
        word_list.append(string) # append to word_list
        if n_gram:
            word_list = list(set(word_list))
    return word_list

In [35]:
def from_scratch_bleu_score(original, reference):
    '''
    Bleu score function given a orginal and a machine translated sentences
    '''
    rf_length = len(reference.split())
    o_length  = len(original.split())

    # Brevity Penalty
    if rf_length > o_length:
        BP=1
    else:
        penality=1-(rf_length/o_length)
        BP = np.exp(penality)

    # Clipped precision
    clipped_precision_score = []
    for ngram_level in range(1, 4):  # 1-gram to 4-gram
        
        
        original_ngram_list = n_gram_generator(original, ngram_level)
        original_n_gram = Counter(original_ngram_list)
        
        reference_ngram_list = n_gram_generator(reference, ngram_level)
        reference_n_gram = Counter(reference_ngram_list)
        
        
        num_ngrams_in_translation = sum(reference_n_gram.values())  # number of ngrams in translation
        
        # iterate the unique ngrams in translation (candidate)
        for j in reference_n_gram:
            
            if j in original_n_gram:  # if found in reference
                
                if reference_n_gram[j] > original_n_gram[j]:  # CLIPPING - if found in translation more than in source, clip
                    reference_n_gram[j] = original_n_gram[j]
                    
            else:
                reference_n_gram[j] = 0

        #print (sum(machine_n_gram.values()), c)
        clipped_precision_score.append(float(sum(reference_n_gram.values())) / num_ngrams_in_translation)

    #print (clipped_precision_score)

    weights = [0.25]*4

    s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, clipped_precision_score))
    s = BP * math.exp(math.fsum(s))
    return s

In [36]:
#For Train Files
original_train = train_source_list[1]
reference_train = train_target_list[1]

print ("BLEU Score from scratch for Train files : ", from_scratch_bleu_score(original_train, reference_train))

BLEU Score from scratch for Train files :  0.36787632499277756


In [37]:
candidate = random.randint(0,len(train_target_list))
original_train = train_source_list[candidate]
reference_train = train_target_list[candidate]
From_Scratch_Score = from_scratch_bleu_score(original_train, reference_train)
print ("BLEU Score from scratch for Train files : ", From_Scratch_Score)
    

BLEU Score from scratch for Train files :  0.7539352394200599


In [31]:
print(random.randint(0,len(train_target_list)))

25376


In [None]:
original = random.choice(train_source_list)
print(original.index)

In [27]:
#For Train Files
original_test = test_source_list[1]
reference_test = test_target_list[1]

print ("BLEU Score from scratch for Test files : ", from_scratch_bleu_score(original_test, reference_test))

BLEU Score from scratch for Test files :  0.47287080450158786


Approach - Seq to Seq encoder