<a href="https://colab.research.google.com/github/Aishwarya138/Synapse-Learning-period/blob/NLP-week-4/MachineTranslation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import re
import numpy as np

import unicodedata
import io
import time

In [2]:
import pathlib

# Download the Spanish-English sentence-pair archive (get_file caches it
# locally) and ask Keras to extract it.
path_to_zip = tf.keras.utils.get_file(
    fname='spa-eng.zip',
    origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True,
)

# The corpus file is looked up relative to the directory holding the archive.
path_to_file = pathlib.Path(path_to_zip).parent / 'spa-eng' / 'spa.txt'

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [3]:
def unicode_to_ascii(s):
    """Return ``s`` with combining accent marks removed.

    The string is decomposed with Unicode NFD normalisation, then every
    character whose Unicode category is ``Mn`` (nonspacing mark) is dropped.
    Characters that have no decomposition (for example ``¿``) pass through
    unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def preprocess_sentence(w):
  """Normalise one raw sentence and wrap it in ``<start>`` / ``<end>`` markers.

  Steps, in order: lower-case and strip the input, remove accents via
  ``unicode_to_ascii``, apply the regex substitutions below, strip the
  result, and add the start/end tokens.
  """
  text = unicode_to_ascii(w.lower().strip())

  # (pattern, replacement) pairs, applied in this order.
  substitutions = (
      # Put a space on each side of ? . ! , ¿ so each becomes its own token.
      (r"([?.!,¿])", r" \1 "),
      # Collapse every run of spaces and/or double-quote characters to one space.
      (r'[" "]+', " "),
      # Replace anything that is not an ASCII letter or ? . ! , ¿ with a space.
      (r"[^a-zA-Z?.!,¿]+", " "),
  )
  for pattern, replacement in substitutions:
    text = re.sub(pattern, replacement, text)

  # Remove leading/trailing spaces left over from the substitutions.
  text = text.strip()

  return '<start> ' + text + ' <end>'

In [4]:
# Show the effect of preprocess_sentence on one English and one Spanish sample.
eng_sentence = u"May I borrow this @ book?"
spa_sentence = u"¿Puedo tomar prestado este libro?"

# Label/sentence pairs, printed once raw and once preprocessed.
labelled_samples = (
    ("English sentence : ", eng_sentence),
    ("Spanish sentence", spa_sentence),
)

print("Before preprocessing")
for label, raw_sentence in labelled_samples:
  print(label, raw_sentence)
print(" ")
print("After preprocessing")
for label, raw_sentence in labelled_samples:
  print(label, preprocess_sentence(raw_sentence))

Before preprocessing
English sentence :  May I borrow this @ book?
Spanish sentence ¿Puedo tomar prestado este libro?
 
After preprocessing
English sentence :  <start> may i borrow this book ? <end>
Spanish sentence <start> ¿ puedo tomar prestado este libro ? <end>


In [5]:
# Returns word pairs in the format: [ENGLISH, SPANISH]
def create_dataset(path, num_examples):
  """Read a tab-separated parallel corpus and preprocess every sentence.

  Args:
    path: location of the UTF-8 text file; each line holds tab-separated
      sentences.
    num_examples: number of leading lines to use; ``None`` uses every line
      (it is used directly as a slice bound).

  Returns:
    A ``zip`` iterator that yields one tuple per tab-separated column, each
    tuple holding that column's preprocessed sentences. Being an iterator it
    can be consumed only once; callers normally unpack it immediately.
  """
  # Use a context manager so the file handle is closed as soon as the text
  # has been read, instead of being left open until garbage collection.
  with io.open(path, encoding='UTF-8') as corpus_file:
    lines = corpus_file.read().strip().split('\n')

  word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]

  # Transpose [[col0, col1], ...] into (all col0 sentences), (all col1 sentences).
  return zip(*word_pairs)

# Load the whole corpus (num_examples=None) and show the last sentence pair.
eng, spa = create_dataset(path_to_file, None)
print(eng[-1], spa[-1], sep='\n')

<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>
<start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>


In [6]:
def tokenize(lang):
  """Fit a Keras tokenizer on ``lang`` and convert it to a padded id matrix.

  Args:
    lang: iterable of already preprocessed sentences.

  Returns:
    ``(tensor, tokenizer)``: the integer id sequences padded at the end
    (``padding='post'``), and the fitted tokenizer.
  """
  # filters='' switches off the tokenizer's own character filtering, so the
  # text is tokenised exactly as preprocess_sentence left it.
  tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  tokenizer.fit_on_texts(lang)

  sequences = tokenizer.texts_to_sequences(lang)
  padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

  return padded, tokenizer

In [7]:
def load_dataset(path, num_examples=None):
  """Build padded id tensors and tokenizers for the input and target languages.

  The first column returned by ``create_dataset`` is used as the target
  language and the second as the input language.

  Returns:
    ``(input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer)``.
  """
  # Cleaned sentence columns: first column -> target, second column -> input.
  targ_sentences, inp_sentences = create_dataset(path, num_examples)

  # Each language gets its own independently fitted tokenizer.
  target_tensor, targ_lang_tokenizer = tokenize(targ_sentences)
  input_tensor, inp_lang_tokenizer = tokenize(inp_sentences)

  return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [8]:
# Limit the corpus to its first 30,000 sentence pairs.
num_examples = 30_000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

# The second dimension of each padded tensor is that language's sequence length.
max_length_targ = target_tensor.shape[1]
max_length_inp = input_tensor.shape[1]
print(max_length_targ, max_length_inp)

11 16


In [9]:
# Training hyper-parameters.
BATCH_SIZE = 64
BUFFER_SIZE = len(input_tensor)    # shuffle buffer spans the whole dataset
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE
embedding_dim = 256    # for word embedding
units = 1024    # dimensionality of the output space of RNN

# One extra slot on top of the tokenizer's word index (id 0 is the padding value).
vocab_inp_size = 1 + len(inp_lang.word_index)
vocab_tar_size = 1 + len(targ_lang.word_index)

# Shuffle, then cut into fixed-size batches; the final partial batch is dropped.
dataset = (
    tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

# Pull one batch to check the shapes.
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 16]), TensorShape([64, 11]))

In [10]:
class Encoder(tf.keras.Model):
  """Embedding + GRU encoder.

  ``call`` embeds the token ids and runs them through a GRU that returns both
  the per-step outputs and the final state.
  """

  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super().__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences -> one output per time step;
    # return_state     -> the final hidden state is returned as well.
    self.gru = tf.keras.layers.GRU(
        self.enc_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )

  def call(self, x, hidden):
    """Encode token ids ``x`` starting from the initial state ``hidden``.

    Returns ``(output, state)`` as produced by the GRU.
    """
    embedded = self.embedding(x)
    sequence_output, final_state = self.gru(embedded, initial_state=hidden)
    return sequence_output, final_state

  def initialize_hidden_state(self):
    """Return an all-zero state of shape ``(batch_sz, enc_units)``."""
    state_shape = (self.batch_sz, self.enc_units)
    return tf.zeros(state_shape)

In [11]:
class BahdanauAttention(tf.keras.layers.Layer):
  """Additive attention: ``score = V(tanh(W1(values) + W2(query)))``."""

  def __init__(self, units):
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    """Compute the context vector and attention weights.

    As called by ``DecoderWithAttention``, ``query`` is the decoder hidden
    state and ``values`` are the encoder outputs.

    Returns:
      ``(context_vector, attention_weights)``: the weighted sum of ``values``
      over axis 1, and the softmax weights used for that sum.
    """
    # Add a length-1 axis at position 1 so the projected query broadcasts
    # against the projected values.
    expanded_query = tf.expand_dims(query, 1)

    projected = self.W1(values) + self.W2(expanded_query)
    score = self.V(tf.nn.tanh(projected))

    # Normalise the scores along axis 1.
    attention_weights = tf.nn.softmax(score, axis=1)

    # Weighted sum of the values along the same axis.
    context_vector = tf.reduce_sum(attention_weights * values, axis=1)

    return context_vector, attention_weights

In [12]:
class DecoderWithAttention(tf.keras.Model):
  """GRU decoder that can optionally attend over the encoder outputs.

  Args:
    vocab_size: size of the target vocabulary; also the width of the final
      dense layer.
    embedding_dim: width of the token embedding.
    dec_units: number of GRU units.
    batch_sz: batch size (stored on the instance).
    attention_layer: optional attention layer called as
      ``attention_layer(hidden, enc_output)`` and returning
      ``(context_vector, attention_weights)``. ``None`` disables attention.
  """

  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, attention_layer = None):
    super(DecoderWithAttention, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    self.attention = attention_layer

  def call(self, x, hidden, enc_output):
    """Run one decoding step.

    Args:
      x: token ids fed to the decoder for this step.
      hidden: decoder state, used both as the attention query and as the
        GRU's initial state.
      enc_output: encoder outputs; only used when an attention layer is set.

    Returns:
      ``(logits, state, attention_weights)``; ``attention_weights`` is
      ``None`` when the decoder was built without attention.
    """
    x = self.embedding(x)
    attention_weights = None

    # Explicit None check: the question is "was an attention layer supplied?",
    # which should not depend on how the layer object evaluates as a boolean.
    if self.attention is not None:
      context_vector, attention_weights = self.attention(hidden, enc_output)
      # Prepend the context vector to the embedded input along the feature axis.
      x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    output, state = self.gru(x, initial_state = hidden)
    # Merge the leading axes so the dense layer sees one row per position.
    output = tf.reshape(output, (-1, output.shape[2]))
    x = self.fc(output)

    return x, state, attention_weights

In [13]:
# Per-element cross-entropy on raw logits; reduction is left to loss_function
# so that padded positions can be zeroed out first.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  """Cross-entropy with positions whose label is 0 zeroed out.

  Args:
    real: integer target ids; id 0 is treated as padding.
    pred: logits for those targets.

  Returns:
    The mean of the masked per-element losses. The mean is taken over all
    positions, so masked positions still count in the denominator.
  """
  per_position_loss = loss_object(real, pred)

  # 1.0 where the target is a real token, 0.0 where it is padding (id 0).
  keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_position_loss.dtype)

  return tf.reduce_mean(per_position_loss * keep)

In [14]:
# Module-level optimizer retained for any code that refers to `optimizer`.
# Train steps created by get_train_step_func() no longer share it by default
# (see the note inside that function).
optimizer = tf.keras.optimizers.Adam()

def get_train_step_func(optimizer=None):
  """Create a fresh ``tf.function``-compiled training step.

  Args:
    optimizer: optimizer used by the returned step. When ``None`` (default)
      a new ``Adam`` instance is created for this step function.

  Returns:
    ``train_step(inp, targ, enc_hidden, encoder, decoder)``, which runs one
    teacher-forced forward/backward pass, applies the gradients to the
    encoder and decoder variables, and returns the batch loss (summed step
    loss divided by the target sequence length).
  """
  # Each step function gets its own optimizer. The notebook trains two
  # independent models (with and without attention); sharing one Adam
  # instance would carry its state from the first model into the second, and
  # Keras optimizers that bind to the first variable set they see reject a
  # second, different set of variables.
  if optimizer is None:
    optimizer = tf.keras.optimizers.Adam()

  @tf.function
  def train_step(inp, targ, enc_hidden, encoder, decoder):
    loss = 0

    with tf.GradientTape() as tape:
      enc_output, enc_hidden = encoder(inp, enc_hidden)
      # The decoder starts from the encoder's final state.
      dec_hidden = enc_hidden
      # First decoder input: one '<start>' id per example in the batch.
      dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

      # Teacher forcing: the ground-truth token at step t is both the
      # prediction target and the decoder input for the next step.
      for t in range(1, targ.shape[1]):
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
        loss += loss_function(targ[:, t], predictions)
        dec_input = tf.expand_dims(targ[:, t], 1)

    # Reported loss is normalised by the target length; gradients are taken
    # from the un-normalised sum.
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

  return train_step

In [15]:
def training_seq2seq(epochs, attention):
  """Train a new encoder/decoder pair and return them with the loss history.

  Args:
    epochs: number of passes over ``dataset``.
    attention: attention layer handed to ``DecoderWithAttention``, or
      ``None`` for a decoder without attention.

  Returns:
    ``(encoder, decoder, training_loss)`` where ``training_loss`` holds the
    mean batch loss of every epoch.
  """
  encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
  decoder = DecoderWithAttention(vocab_tar_size, embedding_dim, units, BATCH_SIZE, attention)
  train_step_func = get_train_step_func()
  training_loss = []

  for epoch_index in range(epochs):
    epoch_number = epoch_index + 1    # 1-based, for display only
    started_at = time.time()
    enc_hidden = encoder.initialize_hidden_state()
    running_loss = 0

    for step, (inp, targ) in enumerate(dataset.take(steps_per_epoch)):
      step_loss = train_step_func(inp, targ, enc_hidden, encoder, decoder)
      running_loss += step_loss

      # Progress line every 100 batches.
      if step % 100 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch_number, step, step_loss))

    epoch_loss = running_loss / steps_per_epoch
    training_loss.append(epoch_loss)

    print('Epoch {} Loss {:.4f}'.format(epoch_number, epoch_loss))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - started_at))

  return encoder, decoder, training_loss

In [16]:
# Baseline run: the decoder is built without an attention layer.
print("Running seq2seq model without attention")
epochs = 10
attention = None
encoder, decoder, training_loss = training_seq2seq(epochs=epochs, attention=attention)

# Keep the baseline loss history for the later comparison.
tloss = training_loss

Running seq2seq model without attention
Epoch 1 Batch 0 Loss 4.5898
Epoch 1 Batch 100 Loss 2.1447
Epoch 1 Batch 200 Loss 1.9102
Epoch 1 Batch 300 Loss 1.6986
Epoch 1 Batch 400 Loss 1.6464
Epoch 1 Loss 1.8993
Time taken for 1 epoch 64.80535435676575 sec

Epoch 2 Batch 0 Loss 1.4226
Epoch 2 Batch 100 Loss 1.4372
Epoch 2 Batch 200 Loss 1.3939
Epoch 2 Batch 300 Loss 1.3053
Epoch 2 Batch 400 Loss 1.3179
Epoch 2 Loss 1.3690
Time taken for 1 epoch 44.46726202964783 sec

Epoch 3 Batch 0 Loss 1.1187
Epoch 3 Batch 100 Loss 1.1175
Epoch 3 Batch 200 Loss 1.1739
Epoch 3 Batch 300 Loss 1.0470
Epoch 3 Batch 400 Loss 1.0358
Epoch 3 Loss 1.1076
Time taken for 1 epoch 44.40150785446167 sec

Epoch 4 Batch 0 Loss 0.8900
Epoch 4 Batch 100 Loss 0.9261
Epoch 4 Batch 200 Loss 0.9644
Epoch 4 Batch 300 Loss 0.8785
Epoch 4 Batch 400 Loss 0.9130
Epoch 4 Loss 0.9019
Time taken for 1 epoch 44.621570110321045 sec

Epoch 5 Batch 0 Loss 0.7430
Epoch 5 Batch 100 Loss 0.6954
Epoch 5 Batch 200 Loss 0.7278
Epoch 5 Batch 3

In [17]:
# Second run: same training set-up, with a Bahdanau attention layer in the decoder.
print("Running seq2seq model with Bahdanau attention")
epochs = 10
attention = BahdanauAttention(units)
encoder_bah, decoder_bah, training_loss = training_seq2seq(epochs=epochs, attention=attention)

# Stack this run's loss history under the baseline's (one row per run).
tloss = np.vstack([tloss, training_loss])

Running seq2seq model with Bahdanau attention
Epoch 1 Batch 0 Loss 4.6507
Epoch 1 Batch 100 Loss 2.0514
Epoch 1 Batch 200 Loss 1.5292
Epoch 1 Batch 300 Loss 1.4601
Epoch 1 Batch 400 Loss 1.2816
Epoch 1 Loss 1.6856
Time taken for 1 epoch 98.92546105384827 sec

Epoch 2 Batch 0 Loss 0.9885
Epoch 2 Batch 100 Loss 0.8711
Epoch 2 Batch 200 Loss 0.7430
Epoch 2 Batch 300 Loss 0.7838
Epoch 2 Batch 400 Loss 0.8050
Epoch 2 Loss 0.8118
Time taken for 1 epoch 81.99607419967651 sec

Epoch 3 Batch 0 Loss 0.5456
Epoch 3 Batch 100 Loss 0.4241
Epoch 3 Batch 200 Loss 0.4263
Epoch 3 Batch 300 Loss 0.5504
Epoch 3 Batch 400 Loss 0.4425
Epoch 3 Loss 0.4657
Time taken for 1 epoch 82.03529095649719 sec

Epoch 4 Batch 0 Loss 0.2344
Epoch 4 Batch 100 Loss 0.2591
Epoch 4 Batch 200 Loss 0.2809
Epoch 4 Batch 300 Loss 0.2318
Epoch 4 Batch 400 Loss 0.2812
Epoch 4 Loss 0.2800
Time taken for 1 epoch 81.97464537620544 sec

Epoch 5 Batch 0 Loss 0.1923
Epoch 5 Batch 100 Loss 0.2037
Epoch 5 Batch 200 Loss 0.1662
Epoch 5 Ba

In [18]:
def translate(sentence, encoder, decoder):
  """Greedily translate one sentence with a trained encoder/decoder pair.

  Args:
    sentence: raw input-language sentence.
    encoder: trained ``Encoder``.
    decoder: trained ``DecoderWithAttention``.

  Returns:
    ``(result, sentence)``: the predicted target tokens joined by spaces
    (each token followed by one space, ``<end>`` included when it is
    produced) and the preprocessed input sentence.

  Raises:
    KeyError: if a token of the preprocessed sentence is not in the input
      tokenizer's vocabulary.
  """
  sentence = preprocess_sentence(sentence)

  # split() without an argument ignores repeated spaces, so a sentence that
  # preprocesses to nothing ('<start>  <end>') does not yield an empty token.
  tokens = sentence.split()
  try:
    inputs = [inp_lang.word_index[token] for token in tokens]
  except KeyError as err:
    raise KeyError(
        'word {!r} is not in the input vocabulary'.format(err.args[0])) from err

  # NOTE(review): with maxlen set, pad_sequences also truncates longer inputs
  # (from the front by default) -- confirm this is acceptable for long sentences.
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  # Batch of one, zero initial encoder state.
  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)
  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

  predicted_words = []
  for _ in range(max_length_targ):
    predictions, dec_hidden, _attention_weights = decoder(dec_input,
                                                          dec_hidden,
                                                          enc_out)

    # Greedy decoding: take the highest-scoring token.
    predicted_id = tf.argmax(predictions[0]).numpy()
    word = targ_lang.index_word[predicted_id]
    predicted_words.append(word)

    if word == '<end>':
      break

    # The prediction becomes the next decoder input.
    dec_input = tf.expand_dims([predicted_id], 0)

  # Every token is followed by a single space, matching the original output format.
  result = ''.join(word + ' ' for word in predicted_words)
  return result, sentence

In [19]:
# Translate one Spanish sentence with the attention model and show both sides.
result, sentence = translate(
    u'esta es mi vida.',
    encoder=encoder_bah,
    decoder=decoder_bah,
)
print('Input: %s' % sentence)
print('Predicted translation: {}'.format(result))

Input: <start> esta es mi vida . <end>
Predicted translation: this is my life . <end> 


Here I have performed machine translation from Spanish to English using an encoder-decoder model with Bahdanau attention, i.e. additive attention. The encoder generates a set of annotations from the input sentence, and these annotations are fed to an alignment model together with the previous hidden decoder state. The alignment model uses this information to generate the attention scores. A softmax function is applied to the attention scores, normalizing them into weights between 0 and 1 that sum to 1. These weights, together with the previously computed annotations, are used to generate a context vector through a weighted sum of the annotations. The context vector is fed to the decoder together with the previous hidden decoder state and the previous output, to compute the output for the current time step. These steps are repeated until the end of the sequence. 
I have used a GRU layer in the encoder to annotate the input sentences.
I have used teacher forcing for training, where the actual (ground-truth) word is passed to the decoder as its input at each time step. The gradients of the loss are then computed by backpropagation and applied to the model's weights by the optimizer.