In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time
import h5py

In [None]:
!wget http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip

--2021-11-29 13:40:22--  http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 209.85.147.128, 142.250.125.128, 142.250.136.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|209.85.147.128|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2638744 (2.5M) [application/zip]
Saving to: ‘spa-eng.zip’


2021-11-29 13:40:22 (158 MB/s) - ‘spa-eng.zip’ saved [2638744/2638744]



In [None]:
ls

[0m[01;34mgdrive[0m/  [01;34msample_data[0m/  spa-eng.zip


In [None]:
!unzip spa-eng.zip

Archive:  spa-eng.zip
   creating: spa-eng/
  inflating: spa-eng/_about.txt      
  inflating: spa-eng/spa.txt         


In [None]:
cd spa-eng 

/content/spa-eng


In [None]:
!cat spa.txt

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



rimonio.
They were sitting on the sofa in our living room.	Ellos estaban sentados en el sofá de nuestro salón.
They won the Japan Cup three years in succession.	Ellos ganaron la Copa de Japón durante tres años consecutivos.
They won't believe me even if I swear it is true.	Ellos no me creerán aunque les jure que es verdad.
They won't believe me even if I swear it is true.	No me creerán aunque les jure que es cierto.
This afternoon the weather is going to get worse.	Esta tarde el tiempo va a empeorar.
This antique clock is worth one thousand dollars.	Este reloj antiguo cuesta mil dólares.
This antique clock is worth one thousand dollars.	Este antiguo reloj vale mil dólares.
This cottage reminds me of the one I was born in.	Esta casa de campo me recuerda a una donde yo nací.
This desk, which I bought yesterday, is very big.	Este escritorio que compré ayer es muy grande.
This is by far the tallest building in this city.	Este es por lejos el edificio más alto en esta ciudad.
This is easy. 

In [None]:
# Converts the unicode file to ascii
def preprocess_sentence(w):
  w = w.lower().strip()
  
  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [None]:
en_sentence = "I go home right after work.	"
sp_sentence = "Voy a casa inmediatamente después del trabajo."
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence))

<start> i go home right after work. <end>
<start> voy a casa inmediatamente después del trabajo. <end>


In [None]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENGLISH, SPANISH]
def create_dataset(path, num_examples):
  lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

  word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]

  return zip(*word_pairs)

In [None]:
path_to_file = '/content/gdrive/My Drive/spa-eng/spa.txt'
en, sp = create_dataset(path_to_file, 3000)
#print(en[-1])
#print(sp[-1])

In [None]:
def tokenize(lang):
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)

  tensor = lang_tokenizer.texts_to_sequences(lang)

  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

  return tensor, lang_tokenizer

In [None]:
def load_dataset(path, num_examples=None):
  # creating cleaned input, output pairs
  targ_lang, inp_lang = create_dataset(path, num_examples)

  input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
  target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

  return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [None]:
num_examples = 3000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [None]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

2400 2400 600 600


In [None]:
def convert(lang, tensor):
  for t in tensor:
    if t!=0:
      print ("%d ----> %s" % (t, lang.index_word[t]))

In [None]:
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[1])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[1])

Input Language; index to word mapping
1 ----> <start>
95 ----> para
43 ----> eso.
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
56 ----> stop
58 ----> that.
2 ----> <end>


In [None]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1  
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 9]), TensorShape([64, 6]))

In [None]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units, return_sequences=True, return_state=True)

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [None]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 9, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


# **Bahdanau Attention**

In [118]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    # we are doing this to broadcast addition along the time axis to calculate the score
    query_with_time_axis = tf.expand_dims(query, 1)
    print('query_with_time_axis:(batch_size, hidden size, max_len) ', query_with_time_axis.shape)

    print('values_transposed:(batch_size, hidden size, max_len) ', values.shape)

    # score shape == (batch_size, max_length, 1)
    # we get 1 at the last axis because we are applying score to self.V
    # the shape of the tensor before applying self.V is (batch_size, max_length, units)
    score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))

    # attention_weights shape == (batch_size, max_length, 1)
    attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [119]:
attention_layer_b = BahdanauAttention(1024)
attention_result_b, attention_weights_b = attention_layer_b(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result_b.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights_b.shape))

query_with_time_axis:(batch_size, hidden size, max_len)  (64, 1, 1024)
values_transposed:(batch_size, hidden size, max_len)  (64, 9, 1024)
Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 9, 1)


In [120]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units, return_sequences=True, return_state=True)
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    # enc_output shape == (batch_size, max_length, hidden_size)
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state, attention_weights

In [121]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)), sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

query_with_time_axis:(batch_size, hidden size, max_len)  (64, 1, 1024)
values_transposed:(batch_size, hidden size, max_len)  (64, 9, 1024)
Decoder output shape: (batch_size, vocab size) (64, 1232)


In [122]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [123]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [124]:
EPOCHS = 2

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs

  print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

query_with_time_axis:(batch_size, hidden size, max_len)  (64, 1, 1024)
values_transposed:(batch_size, hidden size, max_len)  (64, 9, 1024)
query_with_time_axis:(batch_size, hidden size, max_len)  (64, 1, 1024)
values_transposed:(batch_size, hidden size, max_len)  (64, 9, 1024)
query_with_time_axis:(batch_size, hidden size, max_len)  (64, 1, 1024)
values_transposed:(batch_size, hidden size, max_len)  (64, 9, 1024)
query_with_time_axis:(batch_size, hidden size, max_len)  (64, 1, 1024)
values_transposed:(batch_size, hidden size, max_len)  (64, 9, 1024)
query_with_time_axis:(batch_size, hidden size, max_len)  (64, 1, 1024)
values_transposed:(batch_size, hidden size, max_len)  (64, 9, 1024)
query_with_time_axis:(batch_size, hidden size, max_len)  (64, 1, 1024)
values_transposed:(batch_size, hidden size, max_len)  (64, 9, 1024)
query_with_time_axis:(batch_size, hidden size, max_len)  (64, 1, 1024)
values_transposed:(batch_size, hidden size, max_len)  (64, 9, 1024)
query_with_time_axis:(batch

# **Dot Product Attention**

In [111]:
class DotProduct(tf.keras.layers.Layer):
  def __init__(self, units):
    super(DotProduct, self).__init__()
    self.W = tf.keras.layers.Dense(units)
    #self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    # we are doing this to broadcast addition along the time axis to calculate the score
    query_with_time_axis = tf.expand_dims(query, 1)

    values_transposed = tf.transpose(values, perm=[0, 2, 1])
    print('values_transposed:(batch_size, hidden size, max_len) ', values_transposed.shape)

    # score shape == (batch_size, max_length, 1)
    # we get 1 at the last axis because we are applying score to self.V
    # the shape of the tensor before applying self.V is (batch_size, max_length, units)
    score = tf.transpose(tf.matmul(query_with_time_axis, values_transposed) , perm=[0, 2, 1])

    # attention_weights shape == (batch_size, max_length, 1)
    attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [112]:
attention_layer_l = DotProduct(1024)
attention_result_l, attention_weights_l = attention_layer_l(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result_l.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights_l.shape))

values_transposed:(batch_size, hidden size, max_len)  (64, 1024, 9)
Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 9, 1)


In [113]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units, return_sequences=True, return_state=True)
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = DotProduct(self.dec_units)

  def call(self, x, hidden, enc_output):
    # enc_output shape == (batch_size, max_length, hidden_size)
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state, attention_weights

In [114]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)), sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

values_transposed:(batch_size, hidden size, max_len)  (64, 1024, 9)
Decoder output shape: (batch_size, vocab size) (64, 1232)


In [115]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [116]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [117]:
EPOCHS = 2

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs

  print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

values_transposed:(batch_size, hidden size, max_len)  (64, 1024, 9)
values_transposed:(batch_size, hidden size, max_len)  (64, 1024, 9)
values_transposed:(batch_size, hidden size, max_len)  (64, 1024, 9)
values_transposed:(batch_size, hidden size, max_len)  (64, 1024, 9)
values_transposed:(batch_size, hidden size, max_len)  (64, 1024, 9)
values_transposed:(batch_size, hidden size, max_len)  (64, 1024, 9)
values_transposed:(batch_size, hidden size, max_len)  (64, 1024, 9)
values_transposed:(batch_size, hidden size, max_len)  (64, 1024, 9)
values_transposed:(batch_size, hidden size, max_len)  (64, 1024, 9)
values_transposed:(batch_size, hidden size, max_len)  (64, 1024, 9)
Epoch 1 Batch 0 Loss 4.1312
Epoch 1 Loss 2.8976
Time taken for 1 epoch 16.66552209854126 sec

Epoch 2 Batch 0 Loss 2.4497
Epoch 2 Loss 2.2371
Time taken for 1 epoch 6.658436298370361 sec



# **Loung's Attention**

In [109]:
class LoungsAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(LoungsAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    # we are doing this to broadcast addition along the time axis to calculate the score
    query_with_time_axis = tf.expand_dims(query,1)
    print('query_with_time_axis:(batch_size, hidden size, max_len) ', query_with_time_axis.shape)

    values_transposed =values
    print('values_transposed:(batch_size, hidden size, max_len) ', values_transposed.shape)

    # score shape == (batch_size, max_length, 1)
    # we get 1 at the last axis because we are applying score to self.V
    # the shape of the tensor before applying self.V is (batch_size, max_length, units)
    #score = (tf.transpose(query_with_time_axis)self.W (values_transposed))
    score = (query_with_time_axis)* self.W(values_transposed)

    # attention_weights shape == (batch_size, max_length, 1)
    attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)
    context_vector = attention_weights * values_transposed
    context_vector = tf.reduce_sum(context_vector, axis=1)
    #context_vector = tf.matmul(attention_weights, values_transposed)

    return context_vector, attention_weights

In [110]:
attention_layer_d = LoungsAttention(1024)
attention_result_d, attention_weights_d = attention_layer_d(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result_d.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights_d.shape))

query_with_time_axis:(batch_size, hidden size, max_len)  (64, 1, 1024)
values_transposed:(batch_size, hidden size, max_len)  (64, 9, 1024)
Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 9, 1024)


In [102]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units, return_sequences=True, return_state=True)
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    # enc_output shape == (batch_size, max_length, hidden_size)
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state, attention_weights

In [103]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE,1)), sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

query_with_time_axis:(batch_size, hidden size, max_len)  (64, 1, 1024)
values_transposed:(batch_size, hidden size, max_len)  (64, 9, 1024)
Decoder output shape: (batch_size, vocab size) (64, 1232)


In [104]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [105]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [106]:
EPOCHS = 2

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs

  print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

query_with_time_axis:(batch_size, hidden size, max_len)  (64, 1, 1024)
values_transposed:(batch_size, hidden size, max_len)  (64, 9, 1024)
query_with_time_axis:(batch_size, hidden size, max_len)  (64, 1, 1024)
values_transposed:(batch_size, hidden size, max_len)  (64, 9, 1024)
query_with_time_axis:(batch_size, hidden size, max_len)  (64, 1, 1024)
values_transposed:(batch_size, hidden size, max_len)  (64, 9, 1024)
query_with_time_axis:(batch_size, hidden size, max_len)  (64, 1, 1024)
values_transposed:(batch_size, hidden size, max_len)  (64, 9, 1024)
query_with_time_axis:(batch_size, hidden size, max_len)  (64, 1, 1024)
values_transposed:(batch_size, hidden size, max_len)  (64, 9, 1024)
query_with_time_axis:(batch_size, hidden size, max_len)  (64, 1, 1024)
values_transposed:(batch_size, hidden size, max_len)  (64, 9, 1024)
query_with_time_axis:(batch_size, hidden size, max_len)  (64, 1, 1024)
values_transposed:(batch_size, hidden size, max_len)  (64, 9, 1024)
query_with_time_axis:(batch