# <span style="color:orange"><b>Run on PC</b></span>
# Code Attention

Show how to code an encoder decoder model with attention mechanism.
Use generated data (the same generated data we used to demonstrate the encoder decoder project) 


## Prelude


In [1]:
import tensorflow as tf 
import os
import time

from random import randint
from numpy import array


k_Vocab_Size_In   = 100
k_In_Seq_Length   = 10
k_Out_Seq_Length  = 5
k_Batch_Size      = 128

k_N_Word_Embed    = 32 # number of units needed for the embedding layers
k_N_GRU           = 32 # number of units needed for the GRU 
k_N_W1_W2         =  8 # number of neurons in W1 and W2

k_Train_Samples   = 10_000 
k_Val_Samples     =  5_000 

k_Epochs          = 10


## Generate data

In [2]:
# generate a sequence of n random integers from 2 to val_max-1 included
# TODO : pourquoi (2, val_max-1) et pas (1, val_max)
# Qu'est ce qui empêche d'avoir 1 et val_max ?
        
def generate_sequence(n, val_max):
	return [randint(2, val_max-1) for _ in range(n)]

In [3]:
generate_sequence(k_In_Seq_Length, k_Vocab_Size_In)

[32, 38, 3, 25, 91, 51, 72, 8, 87, 21]

In [4]:
# The data we are generating consists in a random sequence of numbers 
# they could represent encoded letters, words, sentences or anything you could think of.
# The target is built using the first elements of the input in reversed order. 
# We add a special token at the beginning of every target sequence for teacher.
# Since words are represented by number and since the translated sentence use the same number k_Vocab_Size_In and k_Vocab_Size_Out are the same  

def create_Input_and_Target(n_in, n_out, voc_size, how_many, printing = False):
  
  X = list()
  y  = list()
  
  for _ in range(how_many):
    # generate source sequence
    source = generate_sequence(n_in, voc_size)
    source_pad = source
    if printing:
      print("source : ", source_pad)
    
    # add <start> token (0) at the beginning of each sequence
    target = source[:n_out]
    target.reverse()
    target = [0] + target
    if printing:
      print("target : ", target)

    X.append(source_pad)
    y.append(target)
  
  return array(X), array(y)

In [5]:
# Testing purpose
# Generate 2 source and 2 target
# Sources are "strings" of length k_In_Seq_Length words
# Targets are "strings" of length k_Out_Seq_Length words
input, target =  create_Input_and_Target(k_In_Seq_Length, k_Out_Seq_Length, k_Vocab_Size_In, 2, True)

source :  [48, 55, 66, 89, 35, 61, 44, 23, 66, 19]
target :  [0, 35, 89, 66, 55, 48]
source :  [9, 96, 31, 62, 83, 95, 71, 78, 17, 15]
target :  [0, 83, 62, 31, 96, 9]


In [6]:
# Create the training data and validation dataset

X_train, y_train = create_Input_and_Target(k_In_Seq_Length, k_Out_Seq_Length, k_Vocab_Size_In, how_many = k_Train_Samples)
X_val,   y_val   = create_Input_and_Target(k_In_Seq_Length, k_Out_Seq_Length, k_Vocab_Size_In, how_many = k_Val_Samples)

In [7]:
# Transform the train sets into batches 
train_batch   = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(len(X_train)).batch(k_Batch_Size)

## Create the encoder decoder with attention

![bahdanau](https://full-stack-bigdata-datasets.s3.eu-west-3.amazonaws.com/Deep+Learning/attention/Attention-encoder-decoder.drawio.png)

### Create encoder model

* The goal of the encoder is to create a representation of the input data
* This repreentation extract information from the input data which will then be interpreted by the decoder model
* The encoder receives sequence inputs 
* It will output sequences with a given depth of representation (we called that dimension channels before)

In [8]:
# Hier on faisait 
# encoder = tf.keras.Model(inputs = encoder_input, outputs = encoder_output)
# plot_model(encoder)
# Là on peut plus

class encoder_factory(tf.keras.Model):
  
  def __init__(self, in_vocab_size, embed_dim, n_units):
    super().__init__()
    self.n_units = n_units
    # WE layer
    self.embed = tf.keras.layers.Embedding(input_dim = in_vocab_size, output_dim = embed_dim)
    # GRU layer
    self.gru = tf.keras.layers.GRU(units = n_units, return_sequences = True, return_state = True)
  
  
  def __call__(self, input_batch):
    # each output is saved as a class attribute 
    # doing so we can access them to control the shapes throughout the demo
    self.embed_out               = self.embed(input_batch)
    self.gru_out, self.gru_state = self.gru(self.embed_out)    
    return self.gru_out, self.gru_state


In [9]:
# On fait un test 
encoder = encoder_factory(k_Vocab_Size_In, k_N_Word_Embed, k_N_GRU)

In [10]:
# We already generated X_train (size k_Train_Samples)
# Realise we can't pass X[train] to the ecoder which expect a tensor 
print(X_train[0])
print(X_train.shape)
print(tf.expand_dims(X_train[0],0))

[31 61 47 85 24 24 26  6  9 34]
(10000, 10)
tf.Tensor([[31 61 47 85 24 24 26  6  9 34]], shape=(1, 10), dtype=int32)


In [11]:
# On passe X après l'avoir transfromé en tenseur
encoder_output, encoder_state = encoder(tf.expand_dims(X_train[0],0))

In [12]:
# Remember : k_n_seq = 10 & k_n_gru = 32
# The first output as a shape of (1,10,32) which is normal because we applied the encoder to 1 input sequence of 10 elements 
# (we chose return_sequences = True for the gru layer) and 32 channels since we have 32 units on the gru layer.

encoder_output

<tf.Tensor: shape=(1, 10, 32), dtype=float32, numpy=
array([[[ 0.0018117 , -0.01219099, -0.00447212,  0.01740521,
          0.00206368,  0.00503232, -0.00234216,  0.00155127,
          0.00764946, -0.00272848, -0.0032842 , -0.00414186,
         -0.00387689,  0.00940679, -0.00597036, -0.00511418,
          0.01037782,  0.00578577, -0.00524591,  0.00619613,
         -0.00250725, -0.00741306, -0.02181177, -0.01128406,
         -0.00030009,  0.00436511,  0.01109104, -0.00397885,
         -0.00591367, -0.00156584, -0.00851023, -0.01285357],
        [ 0.00086671, -0.00839399,  0.00393489,  0.00616573,
          0.00420294,  0.01341602,  0.00573099,  0.01197856,
          0.02816976,  0.01873257,  0.0022498 , -0.01051817,
          0.00577455,  0.01576327,  0.0078271 , -0.01179696,
          0.00314449,  0.02000825, -0.00499912,  0.00045766,
          0.02114684, -0.01677737,  0.00156429,  0.00826093,
         -0.0198865 ,  0.01031852, -0.01919936, -0.00163858,
         -0.00792624,  0.002370

In [13]:
# The second output is the gru state which has shape (1,32) for one input sequence and 32 units on the gru layer.
encoder_state

<tf.Tensor: shape=(1, 32), dtype=float32, numpy=
array([[ 0.0166283 ,  0.01137532, -0.00853125,  0.00529761,  0.00379842,
         0.01369195, -0.00229246, -0.00550406,  0.00634844, -0.00936119,
        -0.00909596, -0.02524787,  0.00035473, -0.00532388,  0.01445548,
         0.00428359,  0.01009566,  0.00593811, -0.00879329, -0.00981067,
        -0.02220128, -0.01617278, -0.00140102, -0.00135519, -0.01227896,
         0.02162432, -0.00843613, -0.01270058, -0.00608672,  0.00822962,
        -0.01058873,  0.00363486]], dtype=float32)>

### Create the Attention layer

![bahdanau](https://full-stack-bigdata-datasets.s3.eu-west-3.amazonaws.com/Deep+Learning/attention/Attention-encoder-decoder.drawio.png)

In [14]:
class Bahdanau_attention_factory(tf.keras.layers.Layer):
  def __init__(self, attention_units):
    super().__init__()

    # The attention layer contains three dense layers
    self.W1 = tf.keras.layers.Dense(units=attention_units)
    self.W2 = tf.keras.layers.Dense(units=attention_units)
    self.V  = tf.keras.layers.Dense(units=1)                 # ! obligatoirement 1 seul neurone

  def __call__(self, enc_out, state):
    # enc_out represents the encoder output which will be used to create the attention weights and then used to create the context vector once we
    # apply the attention weights 
    # the state will be a hidden state from a recurrent unit coming either from the encoder at first, and from the decoder as we make further predictions
    self.W1_out = self.W1(enc_out)                                              # shape (1, 10, attention_units)

    # we are going to sum the outputs from W1 and W2, though the shapes are incompatible
    # the enc_out is (batch_size, 10, 32) -> W1 -> (batch_size,10,attention_units)
    # the state is   (batch_size, 32)     -> W2 -> (batch_size, attention_units)
    # thus we need to artificially add a dimension to the state along axis 1
    self.state  = tf.expand_dims(state, axis = 1) 
    self.W2_out = self.W2(self.state)                                           # shape (batch_size, 1, attention_units)
    self.sum        = self.W1_out + self.W2_out                                 # shape (batch_size, 10, attention_units)
    
    # tanh because we have positive and negatives values and we want to scale beetween -1 and 1                              
    self.sum_scale  = tf.nn.tanh(self.sum)                                      # shape (batch_size, 10, attention_units)

    self.score = self.V(self.sum_scale)                                         # shape (batch_size, 10, 1)
    self.attention_weights = tf.nn.softmax(self.score, axis=1)                  # shape (batch_size, 10, 1)
    self.weighted_enc_out = enc_out * self.attention_weights                    # shape (batch_size, 10, 32)
    self.context_vector = tf.reduce_sum(self.weighted_enc_out, axis=1)          # shape (batch_size, 32)

    return self.context_vector, self.attention_weights

In [15]:
attention_layer = Bahdanau_attention_factory(k_N_W1_W2)                         # k_N_W1_W2 neurones dans les couches denses W1 et W2
attention_layer(encoder_output, encoder_state)                                  # on regarde ce qui se passe quand on lui passe 

# the first output is context vector
# the second is weight attention vectore (32 de long because 32 in the GRU)

(<tf.Tensor: shape=(1, 32), dtype=float32, numpy=
 array([[ 2.7340678e-03,  5.4913631e-04,  3.5740056e-03, -1.9579534e-03,
         -2.9608137e-03,  3.8503141e-03,  3.3869778e-03,  2.3422490e-03,
          9.1212038e-03,  3.3197063e-03, -3.1294660e-03, -1.3630067e-03,
          1.1034927e-03, -2.0108197e-03,  7.9029230e-03, -4.8542027e-03,
         -6.2369509e-05,  3.4917251e-03, -5.6915828e-03, -7.2614109e-04,
          4.6738074e-03, -3.0998839e-03, -2.8672973e-03,  2.1285904e-03,
         -4.4008513e-04,  2.5750333e-03, -1.1274226e-02,  1.4739938e-04,
         -3.1003163e-03, -3.8168224e-04, -5.2784132e-03,  5.9474581e-03]],
       dtype=float32)>,
 <tf.Tensor: shape=(1, 10, 1), dtype=float32, numpy=
 array([[[0.1004601 ],
         [0.1019185 ],
         [0.10097729],
         [0.10205767],
         [0.10023692],
         [0.0992083 ],
         [0.09869408],
         [0.09910867],
         [0.09838147],
         [0.09895701]]], dtype=float32)>)

### Create decoder

* The decoder use the encoder output and the previous target element to predict the next target element
* Its output is a sequence with as many elements as the target 
* This is where the padded target comes in
* It will serve as input and must have a number of channels equals to the number of possible values for target elements
* 
* Two versions of the same model (with the same weights) have to be prepared
    * One for training
    * One for inference 

<img src="./decoder.png"  />

In [16]:
class decoder_factory(tf.keras.Model):
  
  def __init__(self, target_vocab_size, embed_dim, n_units):
    super().__init__()
    # The decoder contains 
    #     WE layer used with teacher forcing
    #     GRU layer
    #     Dense layer to make the predictions
    #     Attention layer

    # ! NOTE that the Embedding layer has an input size of target vocab size (10 000 wordds foar example)
    # This is because in inference mode, at the end of the first iteration, pred(X) is connected to the word embedding layer
    # pred(x) is a vector of size vocab_size whose most values are null exept the one which is the index of the translated word in the target vocabulary
    self.embed = tf.keras.layers.Embedding(input_dim=target_vocab_size, output_dim=embed_dim)
    
    # ! return_state=True is important since we want to connect state to W2 at the next iteration 
    self.gru = tf.keras.layers.GRU(units=n_units, return_sequences=True, return_state=True)     
    self.pred = tf.keras.layers.Dense(units = target_vocab_size, activation="softmax")
    self.attention = Bahdanau_attention_factory(attention_units=n_units)

  def __call__(self, dec_in, enc_out, state):
    # The Attention layer provides context_vector and  attention weights
    self.context_vector, self.attention_weights = self.attention(enc_out,state)

    # the decoder ingest one sequence element from the teacher forcing whose shape is (bacth_size, 1)
    self.embed_out = self.embed(dec_in)                                                        # shape (batch_size,1,embed_dim)

    # concatenate the embedding output and the context vector
    # ! their shapes are incompatible
    # embed out is of size      : (batch_size, 1, embed_dim)
    # context vector is of size : (batch_size, n_units)                                         n_units is defined in the encoder
    # => need to add one dimension along axis 1
    self.context_vector_expanded = tf.expand_dims(self.context_vector, axis=1)                  # shape (batch_size, 1, n_units)
    self.concat = tf.keras.layers.concatenate([self.embed_out, self.context_vector_expanded])   # shape (bacth_size, 1, embed_dim + n_units)
    
    self.gru_out, self.gru_state = self.gru(self.concat)                                        # shapes (batch_size, 1, n_units) and (batch_size, n_units)

    self.gru_out_reshape = tf.reshape(self.gru_out, shape=(-1, self.gru_out.shape[2]))          # pourquoi un reshape ici ??? On est (1, 1, 32) on passe en (1, 32)
                                                                                                # On met en (1,32) pour pouvoir le réutiliser ensuite dans la boucle
                                                                                                # où on fait un concatenate avec context vector

    self.pred_out = self.pred(self.gru_out_reshape)                                             # shape (batch_size, 1, tar_vocab_size)

    return self.pred_out, self.gru_state, self.attention_weights

In [17]:
# Testing : use the decoder using the encoder output, the encoder state and the first element of the teacher forcing
# ! On force target vocab size à la même taille que le vocab size input 
decoder = decoder_factory(target_vocab_size=k_Vocab_Size_In, embed_dim=k_N_Word_Embed, n_units=k_N_GRU)

In [18]:
# the teacher forcing is the first element of the target sequence 
# use expand_dims twice to feed the decoder with a tensor with right dims
decoder_input = tf.expand_dims(tf.expand_dims(y_train[0][0], axis=0), axis=0) 

In [19]:
decoder_input

<tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[0]])>

In [20]:
# The first tensor is pred(X) whose size is vocab_size
# The second tensor is the hidden state of the decoder whowse size is n_units
decoder(decoder_input, encoder_output, encoder_state)

(<tf.Tensor: shape=(1, 100), dtype=float32, numpy=
 array([[0.01000052, 0.00996335, 0.00990824, 0.01012561, 0.00995792,
         0.00996186, 0.01009192, 0.00998276, 0.00996807, 0.00998539,
         0.01001669, 0.00996236, 0.01003362, 0.00994417, 0.0099698 ,
         0.01001042, 0.0100463 , 0.01002792, 0.01006363, 0.01009929,
         0.00991508, 0.00996633, 0.01000384, 0.01000245, 0.01000163,
         0.00998842, 0.00997958, 0.0100095 , 0.00993689, 0.00991234,
         0.00994213, 0.00998204, 0.00993922, 0.00998854, 0.01003938,
         0.01002309, 0.00999049, 0.01002568, 0.01005118, 0.00991194,
         0.01006036, 0.00982559, 0.0098395 , 0.01000199, 0.01000774,
         0.01001138, 0.01005291, 0.01006296, 0.01008629, 0.01007326,
         0.01001926, 0.01003062, 0.0100176 , 0.00993883, 0.01002304,
         0.0099733 , 0.01011093, 0.00997716, 0.01002331, 0.01004564,
         0.01000636, 0.01005011, 0.01011498, 0.00998618, 0.00998816,
         0.00994612, 0.00995747, 0.00995007, 0.01009

## Training the encoder decoder model

The encoder output is used for each prediction once weighted by the attention weights.

In [21]:
optimizer     = tf.keras.optimizers.Adam()

# SparseCategoricalCrossentropy because 
# CrossEntropy since it is multi categorieq
# Sparce beacause the label are integers (not one hot encoded)
# https://stats.stackexchange.com/questions/326065/cross-entropy-vs-sparse-cross-entropy-when-to-use-one-over-the-other
loss_function = tf.keras.losses.SparseCategoricalCrossentropy()

In [22]:

checkpoint_dir    = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint        = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

In [23]:
def train_step(inp, targ):
  loss = 0

  # gradient tape to track the operations happening in the network in order to be able to compute the gradients later
  with tf.GradientTape() as tape: 
     # the input sequence is fed to the encoder to produce the encoder output and the encoder state
    enc_output, enc_state = encoder(inp)

    # the initial state is the encoder state
    dec_state = enc_state 

    # the first decoder input is the first sequence element of the target batch
    # the <start> token for each sequence in the batch. 
    # This is the teacher forcing
    dec_input = tf.expand_dims(targ[:,0], axis=1) 

    # we loop over the teacher forcing sequence to produce the predictions
    # we loop from 1 to targ.shape[1] which is the target sequence length
    
    # t comme token
    # targ c'est un batch de token (dim 32)
    # Dans une boucle on regarde tous les indice 0, tous les indices 1...
    # t = 2 on regarde en même temps 
    for t in range(1, targ.shape[1]):                                        # range 1... car on a dejà 0
      # passing dec_input, dec_state and enc_output to the decoder
      # in order to produce the prediction, the new state, and the attention weights which we will not need explicitely here
      pred, dec_state, _ = decoder(dec_input, enc_output, dec_state)

      # loss sur le token t du batch targ
      # compare the prediction produced by teacher forcing with the next element of the target and increment the loss
      loss += loss_function(targ[:, t], pred) 

      # The new decoder input becomes the next element of the target sequence which we just attempted to predict (teacher forcing)
      # a l'itération t change. A la dernière iteration on utilise ...
      dec_input = tf.expand_dims(targ[:, t], 1)                      

  # On est en training
  # On vient de faire une forward pass
  # faut calculer la loss (qui a été incrémenté à chaque tour de boucle)
  # rechercher la variable loss_function

  batch_loss = (loss / int(targ.shape[1])) # we divide the loss by the target
  # sequence's length to get the average loss across the sequence

  variables = encoder.trainable_variables + decoder.trainable_variables # here
  # we concatenate the lists of trainable variables for the encoder and the
  # decoder

  # compute the gradient based on the loss and the trainable variables
  gradients = tape.gradient(loss, variables) 

  # then update the model's  parameters
  optimizer.apply_gradients(zip(gradients, variables)) 

  return batch_loss

In [24]:


for epoch in range(k_Epochs):
  start = time.time()

  total_loss = 0

  for (batch, (inp, targ)) in enumerate(train_batch):
    batch_loss = train_step(inp, targ)
    total_loss += batch_loss

    if batch % 10 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))
  
  # saving (checkpoint) the model every epoch
  checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss))
  print('Time taken for 1 epoch {} sec'.format(time.time() - start))

  # classic encoder input
  enc_input = X_val

  # the first decoder input is the special token 0
  dec_input = tf.zeros(shape=(len(X_val), 1))

  
  # compute once and for all the encoder output and the encoder h state and c state
  enc_out, enc_state = encoder(enc_input)

  # The encoder h state and c state will serve as initial states for the decoder
  dec_state = enc_state

  # we'll store the predictions in here
  pred = []  

  # we loop over the expected length of the target, but actually the loop can run
  # for as many steps as we wish, which is the advantage of the encoder decoder
  # architecture
  
  # Là on fait une inference sur le val set
  # On pointe sur le start et après on boucle
  for i in range(y_val.shape[1]-1):
    dec_out, dec_state, attention_w = decoder(dec_input, enc_out, dec_state)
    # the decoder state is updated and we get the first prediction probability vector
    decoded_out = tf.expand_dims(tf.argmax(dec_out, axis=-1), axis=1)

    # decode the softmax vector into and index and update the prediction list
    pred.append(tf.expand_dims(dec_out, axis = 1)) 

    # the last pred is used as the new input
    dec_input = decoded_out 

  pred = tf.concat(pred, axis=1).numpy()
  print("\n val loss :", loss_function(y_val[:,1:], pred),"\n") # on peut alors afficher la loss sur le val set

Epoch 1 Batch 0 Loss 3.8376
Epoch 1 Batch 10 Loss 3.8367
Epoch 1 Batch 20 Loss 3.8363
Epoch 1 Batch 30 Loss 3.8349
Epoch 1 Batch 40 Loss 3.8319
Epoch 1 Batch 50 Loss 3.8277
Epoch 1 Batch 60 Loss 3.8216
Epoch 1 Batch 70 Loss 3.8002
Epoch 1 Loss 302.2227
Time taken for 1 epoch 53.01007556915283 sec

 val loss : tf.Tensor(4.526805, shape=(), dtype=float32) 

Epoch 2 Batch 0 Loss 3.7559
Epoch 2 Batch 10 Loss 3.7247
Epoch 2 Batch 20 Loss 3.6976
Epoch 2 Batch 30 Loss 3.6931
Epoch 2 Batch 40 Loss 3.6976
Epoch 2 Batch 50 Loss 3.6937
Epoch 2 Batch 60 Loss 3.6729
Epoch 2 Batch 70 Loss 3.6403
Epoch 2 Loss 291.5589
Time taken for 1 epoch 15.712606430053711 sec

 val loss : tf.Tensor(4.3574677, shape=(), dtype=float32) 

Epoch 3 Batch 0 Loss 3.6370
Epoch 3 Batch 10 Loss 3.5671
Epoch 3 Batch 20 Loss 3.6105
Epoch 3 Batch 30 Loss 3.5436
Epoch 3 Batch 40 Loss 3.5314
Epoch 3 Batch 50 Loss 3.5303
Epoch 3 Batch 60 Loss 3.4909
Epoch 3 Batch 70 Loss 3.4481
Epoch 3 Loss 279.2998
Time taken for 1 epoch 14.834

KeyboardInterrupt: 

Nice! The training is over, and it looks as though the model performs really well both on train and validation sets!

## Make predictions with the inference model

To make predictions on the validation set, we cannot use teacher forcing, the model has to base itself on its own predictions!

In [None]:
# le val set fait 5000
# on fait pareil qu'avant en fait


enc_input = X_val # 5000 seq en anglais
#classic encoder input

dec_input = tf.zeros(shape=(len(X_val),1))                 # 5000 token start
# the first decoder input is the special token 0

#initial_state = encoder.state_initializer(len(X_val))

enc_out, enc_state = encoder(enc_input)#, initial_state)
# we compute once and for all the encoder output and the encoder
# h state and c state

dec_state = enc_state
# The encoder h state and c state will serve as initial states for the
# decoder

pred = []  # we'll store the predictions in here

# we loop over the expected length of the target, but actually the loop can run
# for as many steps as we wish, which is the advantage of the encoder decoder
# architecture
for i in range(y_val.shape[1]-1):
  dec_out, dec_state, attention_w = decoder(dec_input, enc_out, dec_state)
  # the decoder state is updated and we get the first prediction probability 
  # vector
  decoded_out = tf.expand_dims(tf.argmax(dec_out, axis=-1), axis=1) # argmax pour trouver le mot prdit, on l'enregistre
  # we decode the softmax vector into and index
  pred.append(decoded_out) # update the prediction list
  dec_input = decoded_out # the previous pred will be used as the new input

pred = tf.concat(pred, axis=-1).numpy()
for i in range(10):
  print("pred:", pred[i,:].tolist())
  print("true:", y_val[i,:].tolist()[1:])
  print("\n")

The results do not look so bad, almost perfect actually! This is a clear improvement from the encoder decoder! Attention must be really powerful!

The fact that the model reuses the encoder output at each step with different weights is helping the model achieve better predictions in a shorter amount of time (understand epochs).

I hope you found this demonstration useful! Now it is time for you to apply what you have learned to a real world automatic translation problem!