# Neural Machine Translation with Attention mechanism

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### What is Attention?

Attention is an interface between the encoder and decoder that provides the decoder with information from every encoder hidden state. With this setting, the model is able to selectively focus on useful parts of the input sequence and hence, learn the alignment between them. This helps the model to cope effectively with long input sentences .

In [2]:
!pip install chart-studio

Collecting chart-studio
[?25l  Downloading https://files.pythonhosted.org/packages/ca/ce/330794a6b6ca4b9182c38fc69dd2a9cbff60fd49421cb8648ee5fee352dc/chart_studio-1.1.0-py3-none-any.whl (64kB)
[K     |████████████████████████████████| 71kB 2.1MB/s 
Installing collected packages: chart-studio
Successfully installed chart-studio-1.1.0


In [3]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf

#tf.enable_eager_execution()

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import time
import string

import chart_studio.plotly
import chart_studio.plotly as py
from plotly.offline import init_notebook_mode, iplot
#%plotly.offline.init_notebook_mode(connected=True)
import plotly.graph_objs as go

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


### As in case of any NLP task, after reading the input file, we perform the basic cleaning and preprocessing as follows:

**The Dataset :** We need a dataset that contains English sentences and their Portuguese translations which can be freely downloaded from this [link](http://www.manythings.org/anki/). Download the file fra-eng.zip and extract it. On each line, the text file contains an English sentence and its French translation, separated by a tab.

In [4]:
file_path = './drive/MyDrive/pol.txt' # please set the path according to your system

In [5]:
lines = open(file_path, encoding='UTF-8').read().strip().split('\n')
lines[5000:5010]

['Will you join us?\tCzy dołączysz do nas?\tCC-BY 2.0 (France) Attribution: tatoeba.org #237669 (CK) & #580598 (Bilberry)',
 'Will you join us?\tPrzyłączycie się do nas?\tCC-BY 2.0 (France) Attribution: tatoeba.org #237669 (CK) & #4719046 (Ceresnya)',
 'Will you take it?\tWeźmiesz to?\tCC-BY 2.0 (France) Attribution: tatoeba.org #3738699 (CK) & #3749525 (gin)',
 'You abandoned me.\tOpuściłeś mnie.\tCC-BY 2.0 (France) Attribution: tatoeba.org #3374731 (CK) & #3817518 (liori)',
 'You already paid.\tJuż zapłaciłeś.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2254931 (CK) & #4550568 (jeedrek)',
 'You already paid.\tJuż zapłaciłaś.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2254931 (CK) & #5868591 (BeataB)',
 'You always cheat.\tZawsze oszukujesz.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2254932 (CK) & #4550579 (jeedrek)',
 'You are a genius.\tJesteś geniuszem.\tCC-BY 2.0 (France) Attribution: tatoeba.org #1895825 (CK) & #3698112 (jeedrek)',
 'You are too late.\tJesteś za późno.\t

In [6]:
print("total number of records: ",len(lines))

total number of records:  40465


In [7]:
exclude = set(string.punctuation) # Set of all special characters
remove_digits = str.maketrans('', '', string.digits) # Set of all digits

### Function to preprocess English sentence

In [8]:
def preprocess_eng_sentence(sent):
    '''Function to preprocess English sentence'''
    sent = sent.lower() # lower casing
    sent = re.sub("'", '', sent) # remove the quotation marks if any
    sent = ''.join(ch for ch in sent if ch not in exclude)
    sent = sent.translate(remove_digits) # remove the digits
    sent = sent.strip()
    sent = re.sub(" +", " ", sent) # remove extra spaces
    sent = '<start> ' + sent + ' <end>' # add <start> and <end> tokens
    return sent

### Function to preprocess Polish sentence

In [9]:
def preprocess_port_sentence(sent):
    '''Function to preprocess Portuguese sentence'''
    sent = re.sub("'", '', sent) # remove the quotation marks if any
    sent = ''.join(ch for ch in sent if ch not in exclude)
    #sent = re.sub("[२३०८१५७९४६]", "", sent) # remove the digits
    sent = sent.strip()
    sent = re.sub(" +", " ", sent) # remove extra spaces
    sent = '<start> ' + sent + ' <end>' # add <start> and <end> tokens
    return sent

### Generate pairs of cleaned English and Polish sentences with start and end tokens added.

In [10]:
# Generate pairs of cleaned English and Portuguese sentences
sent_pairs = []
for line in lines:
    sent_pair = []
    eng = line.rstrip().split('\t')[1]
    port = line.rstrip().split('\t')[0]
    eng = preprocess_eng_sentence(eng)
    sent_pair.append(eng)
    port = preprocess_port_sentence(port)
    sent_pair.append(port)
    sent_pairs.append(sent_pair)
sent_pairs[5000:5010]

[['<start> czy dołączysz do nas <end>', '<start> Will you join us <end>'],
 ['<start> przyłączycie się do nas <end>', '<start> Will you join us <end>'],
 ['<start> weźmiesz to <end>', '<start> Will you take it <end>'],
 ['<start> opuściłeś mnie <end>', '<start> You abandoned me <end>'],
 ['<start> już zapłaciłeś <end>', '<start> You already paid <end>'],
 ['<start> już zapłaciłaś <end>', '<start> You already paid <end>'],
 ['<start> zawsze oszukujesz <end>', '<start> You always cheat <end>'],
 ['<start> jesteś geniuszem <end>', '<start> You are a genius <end>'],
 ['<start> jesteś za późno <end>', '<start> You are too late <end>'],
 ['<start> możesz mnie obwinić <end>', '<start> You can blame me <end>']]

### Create a class to map every word to an index and vice-versa for any given vocabulary.

In [11]:
# This class creates a word -> index mapping (e.g,. "dad" -> 5) and vice-versa 
# (e.g., 5 -> "dad") for each language,
class LanguageIndex():
    def __init__(self, lang):
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()

        self.create_index()

    def create_index(self):
        for phrase in self.lang:
            self.vocab.update(phrase.split(' '))

        self.vocab = sorted(self.vocab)

        self.word2idx['<pad>'] = 0
        for index, word in enumerate(self.vocab):
            self.word2idx[word] = index + 1

        for word, index in self.word2idx.items():
            self.idx2word[index] = word

In [12]:
def max_length(tensor):
    return max(len(t) for t in tensor)

### Tokenization and Padding

In [13]:
def load_dataset(pairs, num_examples):
    # pairs => already created cleaned input, output pairs

    # index language using the class defined above    
    inp_lang = LanguageIndex(en for en, ma in pairs)
    targ_lang = LanguageIndex(ma for en, ma in pairs)
    
    # Vectorize the input and target languages
    
    # English sentences
    input_tensor = [[inp_lang.word2idx[s] for s in en.split(' ')] for en, ma in pairs]
    
    # Marathi sentences
    target_tensor = [[targ_lang.word2idx[s] for s in ma.split(' ')] for en, ma in pairs]
    
    # Calculate max_length of input and output tensor
    # Here, we'll set those to the longest sentence in the dataset
    max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)
    
    # Padding the input and output tensor to the maximum length
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, 
                                                                 maxlen=max_length_inp,
                                                                 padding='post')
    
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, 
                                                                  maxlen=max_length_tar, 
                                                                  padding='post')
    
    return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar

In [14]:
input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(sent_pairs, len(lines))

### Creating training and validation sets using an 99-01 split

In [15]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.01, random_state = 101)

# Show length
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

(40060, 40060, 405, 405)

In [16]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
N_BATCH = BUFFER_SIZE//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word2idx)
vocab_tar_size = len(targ_lang.word2idx)

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

We'll be using GRUs instead of LSTMs as we only have to create one state and implementation would be easier.

### Create GRU units

In [17]:
def gru(units):

    return tf.keras.layers.GRU(units, 
                                   return_sequences=True, 
                                   return_state=True, 
                                   recurrent_activation='sigmoid', 
                                   recurrent_initializer='glorot_uniform')


### The next step is to define the encoder and decoder network.

The input to the encoder will be the sentence in English and the output will be the hidden state and cell state of the GRU.

In [18]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.enc_units)
        
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)        
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

The next step is to define the decoder. The decoder will have two inputs: the hidden state and cell state from the encoder and the input sentence, which actually will be the output sentence with a token appended at the beginning.

In [19]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.dec_units)
        self.fc = tf.keras.layers.Dense(vocab_size)
        
        # used for attention
        self.W1 = tf.keras.layers.Dense(self.dec_units)
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self, x, hidden, enc_output):

        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        
        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying tanh(FC(EO) + FC(H)) to self.V
        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)))
        
        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)
        
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)
        
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        
        # passing the concatenated vector to the GRU
        output, state = self.gru(x)
        
        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))
        
        # output shape == (batch_size * 1, vocab)
        x = self.fc(output)
        
        return x, state, attention_weights
        
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.dec_units))

Create encoder and decoder objects from their respective classes.

In [20]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

### Define the optimizer and the loss function.

In [21]:
optimizer = tf.optimizers.Adam()

def loss_function(real, pred):
    mask = 1 - np.equal(real, 0)
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask
    return tf.reduce_mean(loss_)

### Training the Model
To train the model copy training_attention.py here from the drive


### Restoring the latest checkpoint

In [22]:

checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)
checkpoint.restore(tf.train.latest_checkpoint("./drive/MyDrive/attention_checkpoints"))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f4a72460450>

### Inference setup and testing:

In [23]:
def evaluate(inputs, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):
    
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    sentence = ''
    for i in inputs[0]:
        if i == 0:
            break
        sentence = sentence + inp_lang.idx2word[i] + ' '
    sentence = sentence[:-1]
    
    inputs = tf.convert_to_tensor(inputs)
    
    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word2idx['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        
        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += targ_lang.idx2word[predicted_id] + ' '

        if targ_lang.idx2word[predicted_id] == '<end>':
            return result, sentence, attention_plot
        
        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot


In [24]:
!pip install nltk==3.4

Collecting nltk==3.4
[?25l  Downloading https://files.pythonhosted.org/packages/6f/ed/9c755d357d33bc1931e157f537721efb5b88d2c583fe593cc09603076cc3/nltk-3.4.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 3.1MB/s 
Collecting singledispatch
  Downloading https://files.pythonhosted.org/packages/cd/d1/6a9e922826e03f5af7bf348cfb75bcb0bc4c67e19c36805c2545f34427e5/singledispatch-3.6.2-py2.py3-none-any.whl
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.4-cp37-none-any.whl size=1436383 sha256=4175b3de2dee9341b798a7e53279eb3d84fdce39503e1dae4e530819587bc895
  Stored in directory: /root/.cache/pip/wheels/4b/c8/24/b2343664bcceb7147efeb21c0b23703a05b23fcfeaceaa2a1e
Successfully built nltk
Installing collected packages: singledispatch, nltk
  Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.4 single

In [25]:
from nltk.translate.nist_score import sentence_nist

In [26]:
from nltk.translate.bleu_score import sentence_bleu

from nltk.translate.gleu_score import sentence_gleu
# gleu_score = []
bleu_score = []
# nist_score = []
sent_len = []

### Function to predict (translate) a randomly selected test point


In [33]:

def predict_random_val_sentence():
    actual_sent = ''
    for k in range(0,30):
      actual_sent=" "
      random_input = input_tensor_val[k]
      random_output = target_tensor_val[k]
      random_input = np.expand_dims(random_input,0)
      result, sentence, attention_plot = evaluate(random_input, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)
      # print(result,sentence)
      print('Input: {}'.format(sentence[8:-6]))
      print('Predicted translation: {}'.format(result[:-6]))
      for i in random_output:
          if i == 0:
              break
          actual_sent = actual_sent + targ_lang.idx2word[i] + ' '
      actual_sent = actual_sent[8:-7]
      print('Actual translation: {}'.format(actual_sent))
      attention_plot = attention_plot[:len(result.split(' '))-2, 1:len(sentence.split(' '))-1]
      sentence, result = sentence.split(' '), result.split(' ')
      sentence = sentence[1:-1]
      result = result[:-2]
      print("Result is",result)
      reference = [actual_sent.split()]
      sent_len.append(len(reference[0]))
      candidate = result
      print(reference,candidate)
      bleu_score_i = sentence_bleu(reference,candidate)
      print('BLEU score -> {}'.format(bleu_score_i))
      bleu_score.append(bleu_score_i)


    # use plotly to generate the heat map
    # trace = go.Heatmap(z = attention_plot, x = sentence, y = result, colorscale='greens')
    # data=[trace]
    # iplot(data)


In [34]:
predict_random_val_sentence()

Input: oto nasza szkoła
Predicted translation: This is our school 
Actual translation:  That is our school
Result is ['This', 'is', 'our', 'school']
[['That', 'is', 'our', 'school']] ['This', 'is', 'our', 'school']
BLEU score -> 8.636168555094496e-78
Input: ona się odchudza
Predicted translation: She is dieting 
Actual translation:  Shes dieting
Result is ['She', 'is', 'dieting']
[['Shes', 'dieting']] ['She', 'is', 'dieting']
BLEU score -> 1.384292958842266e-231
Input: chcę żebyście poszli z nami
Predicted translation: I want the two of us with us 
Actual translation:  I want you to come with us
Result is ['I', 'want', 'the', 'two', 'of', 'us', 'with', 'us']
[['I', 'want', 'you', 'to', 'come', 'with', 'us']] ['I', 'want', 'the', 'two', 'of', 'us', 'with', 'us']
BLEU score -> 9.170599044431425e-155




The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()



The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()



The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()



Input: wyglądają na wyczerpanych
Predicted translation: Does I look at all 
Actual translation:  They look exhausted
Result is ['Does', 'I', 'look', 'at', 'all']
[['They', 'look', 'exhausted']] ['Does', 'I', 'look', 'at', 'all']
BLEU score -> 1.2183324802375697e-231
Input: nie umiem otworzyć tej walizki
Predicted translation: I cant open this suitcase 
Actual translation:  I cant figure out how to open this suitcase
Result is ['I', 'cant', 'open', 'this', 'suitcase']
[['I', 'cant', 'figure', 'out', 'how', 'to', 'open', 'this', 'suitcase']] ['I', 'cant', 'open', 'this', 'suitcase']
BLEU score -> 3.8804806708023324e-78
Input: on codziennie ogląda telewizję
Predicted translation: He watches TV every day 
Actual translation:  He watches TV every day
Result is ['He', 'watches', 'TV', 'every', 'day']
[['He', 'watches', 'TV', 'every', 'day']] ['He', 'watches', 'TV', 'every', 'day']
BLEU score -> 1.0
Input: nigdy nie byłem nieobecny w szkole
Predicted translation: Ive never been absent from sc

In [None]:
sum(bleu_score) / len(bleu_score)

2.8787228516981653e-78

In [None]:
len(sent_len),len(bleu_score)

(3, 3)

In [None]:
with open('/content/drive/MyDrive/sent_len.txt', 'w') as testwritefile:
    testwritefile.write(str(sent_len))

In [None]:
with open('/content/drive/MyDrive/bleu_Score.txt', 'w') as testwritefile:
    testwritefile.write(str(bleu_score))

In [None]:
def predict_sentence(s):
    # s = preprocess_eng_sentence(s)
    # input = np.array([inp_lang.word2idx[s] for s in s.split(' ')] , dtype="int32")
    input = s
    random_input = np.pad(input, (0, 40 - len(input)))
    random_input = np.expand_dims(random_input,0)
    result, sentence, attention_plot = evaluate(random_input, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)
    # print('Predicted translation: {}'.format(result[:-6]))
    return result[:-7]

In [None]:
target_tensor_val.shape

(405, 49)

In [None]:
predict_sentence(input_tensor_val[0])

Input: oto nasza szkoła
Predicted translation: This is our school 


In [None]:
corpus_gleu('here is our school' , 'This is our school')

0.7777777777777778

In [None]:
corpus_nist('here is our school' , 'This is our school', n=1)

2.4927319453258576

In [None]:
from nltk.translate.bleu_score import SmoothingFunction
smoothie = SmoothingFunction().method4

corpus_bleu('That is our school', 'This is our school' , smoothing_function=smoothie)

0.36858666333480733

In [None]:
from nltk.translate.bleu_score import SmoothingFunction
smoothie = SmoothingFunction().method4
bleu_score = 0
nist = 0
gleu = 0
for i in range(0,len(input_tensor_val)):
  predicted = predict_sentence(input_tensor_val[i])
  actual_sent = ""
  for i in target_tensor_val[i]:
          if i == 0:
              break
          actual_sent = actual_sent + targ_lang.idx2word[i] + ' '
  actual_sent = actual_sent[8:-7]
  # print("Required : "+actual_sent)
  bleu_score +=  sentence_bleu(actual_sent , predicted , smoothing_function=smoothie)
  nist += sentence_nist(actual_sent , predicted , n =5)
  gleu += corpus_gleu([actual_sent] , [predicted])

In [None]:
actual_sent = ""
for i in target_tensor_val[0]:
        if i == 0:
            break
        actual_sent = actual_sent + targ_lang.idx2word[i] + ' '
actual_sent = actual_sent[8:-7]

In [None]:
actual_sent

'That is our school'

In [None]:
predicted = predict_sentence(input_tensor_val[0])

Input: oto nasza szkoła
Predicted translation: This is our school 


In [None]:
predicted

'This is our school'

In [None]:
bleu_score / 405


0.32362446102383025

In [None]:
gleu / 405

0.012180688452118156

In [None]:
nist / 405

0.19613899235090604

In [None]:
sentence_nist("Hello World I am human" , "Hello World I am human")

0.20270143721078623