In [None]:
# Download the Spanish-English sentence pairs into ./datasets (read later as 'datasets/spa.txt').
!wget https://raw.githubusercontent.com/PacktPublishing/The-Deep-Learning-Challenge/master/Section%205/source/spa-eng/spa.txt -P datasets
# faker is imported in the next cell; the version is not pinned here.
!pip install faker

In [None]:
from tensorflow.keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from tensorflow.keras.layers import RepeatVector, Dense, Activation, Lambda
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model, Model
import tensorflow.keras.backend as K
import numpy as np
import tensorflow as tf
from faker import Faker
import random
from tqdm import tqdm
from babel.dates import format_date
import matplotlib.pyplot as plt
from lib.nmt_utils import *

# 1) Luong Attention

## 1.1 Data Loading

In [None]:
# Number of (human-readable, machine-readable) date pairs to load.
m = 10000
# `load_dataset` is not defined in this notebook at this point; presumably it comes from the
# `lib.nmt_utils` star import. NOTE: section 2 later defines its own `load_dataset(path, ...)`,
# so re-running this cell after that one will call the wrong function.
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dataset(m)
# Peek at the first ten pairs.
dataset[:10]

100%|██████████| 10000/10000 [00:00<00:00, 18467.01it/s]


[('9 may 1998', '1998-05-09'),
 ('10.11.19', '2019-11-10'),
 ('9/10/70', '1970-09-10'),
 ('saturday april 28 1990', '1990-04-28'),
 ('thursday january 26 1995', '1995-01-26'),
 ('monday march 7 1983', '1983-03-07'),
 ('sunday may 22 1988', '1988-05-22'),
 ('08 jul 2008', '2008-07-08'),
 ('8 sep 1999', '1999-09-08'),
 ('thursday january 1 1981', '1981-01-01')]

In [None]:
machine_vocab

{'-': 0,
 '0': 1,
 '1': 2,
 '2': 3,
 '3': 4,
 '4': 5,
 '5': 6,
 '6': 7,
 '7': 8,
 '8': 9,
 '9': 10}

In [None]:
human_vocab

{' ': 0,
 '.': 1,
 '/': 2,
 '0': 3,
 '1': 4,
 '2': 5,
 '3': 6,
 '4': 7,
 '5': 8,
 '6': 9,
 '7': 10,
 '8': 11,
 '9': 12,
 '<pad>': 36,
 '<unk>': 35,
 'a': 13,
 'b': 14,
 'c': 15,
 'd': 16,
 'e': 17,
 'f': 18,
 'g': 19,
 'h': 20,
 'i': 21,
 'j': 22,
 'l': 23,
 'm': 24,
 'n': 25,
 'o': 26,
 'p': 27,
 'r': 28,
 's': 29,
 't': 30,
 'u': 31,
 'v': 32,
 'w': 33,
 'y': 34}

In [None]:
def preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty):
    """Turn (source, target) string pairs into index arrays and one-hot arrays.

    Args:
        dataset: iterable of (source_string, target_string) pairs.
        human_vocab: mapping used to encode the source strings.
        machine_vocab: mapping used to encode the target strings.
        Tx: length passed to `string_to_int` for every source string.
        Ty: length passed to `string_to_int` for every target string.

    Returns:
        Tuple (X, Y, Xoh, Yoh): the encoded sources and targets as numpy
        arrays, followed by their one-hot versions whose last axis has
        len(human_vocab) and len(machine_vocab) classes respectively.
    """
    sources, targets = zip(*dataset)
    n_human, n_machine = len(human_vocab), len(machine_vocab)

    # string_to_int is an external helper (presumably from lib.nmt_utils).
    X = np.array([string_to_int(text, Tx, human_vocab) for text in sources])
    Y = np.array([string_to_int(text, Ty, machine_vocab) for text in targets])

    Xoh = np.array([to_categorical(row, num_classes=n_human) for row in X])
    Yoh = np.array([to_categorical(row, num_classes=n_machine) for row in Y])
    return X, Y, Xoh, Yoh

In [None]:
# Sequence lengths handed to preprocess_data: Tx for the source strings, Ty for the targets
# (a 'YYYY-MM-DD' target is exactly 10 characters long).
Tx = 30
Ty = 10
X, Y, Xoh, Yoh = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty)

# Sanity-check the shapes of the index arrays and of their one-hot counterparts.
print("X.shape:", X.shape)
print("Y.shape:", Y.shape)
print("Xoh.shape:", Xoh.shape)
print("Yoh.shape:", Yoh.shape)

X.shape: (10000, 30)
Y.shape: (10000, 10)
Xoh.shape: (10000, 30, 37)
Yoh.shape: (10000, 10, 11)


In [None]:
# Show one training pair at every stage: raw strings, index sequences, one-hot arrays.
index = 0
print("Source date:", dataset[index][0])
print("Target date:", dataset[index][1])
print()
print("Source after preprocessing (indices):", X[index])
print("Target after preprocessing (indices):", Y[index])
print()
print("Source after preprocessing (one-hot):", Xoh[index])
print("Target after preprocessing (one-hot):", Yoh[index])

Source date: 9 may 1998
Target date: 1998-05-09

Source after preprocessing (indices): [12  0 24 13 34  0  4 12 12 11 36 36 36 36 36 36 36 36 36 36 36 36 36 36
 36 36 36 36 36 36]
Target after preprocessing (indices): [ 2 10 10  9  0  1  6  0  1 10]

Source after preprocessing (one-hot): [[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]
Target after preprocessing (one-hot): [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


## 1.2 Attention Mechanism

* If you had to translate a book's paragraph from French to English, you would not read the whole paragraph, then close the book and translate.
* Even during the translation process, you would read/re-read and focus on the parts of the French paragraph corresponding to the parts of the English you are writing down.
* The attention mechanism tells a Neural Machine Translation model where it should pay attention to at any step.

<table>
<td>
<img src="https://github.com/sebastianbirk/coursera-deep-learning-specialization/blob/master/05_sequence_models/06_neural_machine_translation_with_keras_lstm_attention/images/attn_model.png?raw=true" style="width:500;height:500px;"> <br>
</td>
<td>
<img src="https://github.com/sebastianbirk/coursera-deep-learning-specialization/blob/master/05_sequence_models/06_neural_machine_translation_with_keras_lstm_attention/images/attn_mechanism.png?raw=true" style="width:500;height:500px;"> <br>
</td>
</table>
<caption><center> Figure 1: Neural machine translation with attention</center></caption>


**Pre-attention and Post-attention LSTMs on both sides of the attention mechanism**

- There are two separate LSTMs in this model (see diagram on the left): pre-attention and post-attention LSTMs.
- *Pre-attention* Bi-LSTM: the one at the bottom of the picture is a bi-directional LSTM and comes *before* the attention mechanism.
    - The attention mechanism is shown in the middle of the left-hand diagram.
    - The pre-attention Bi-LSTM goes through $T_x$ time steps
- *Post-attention* LSTM: at the top of the diagram comes *after* the attention mechanism.
    - The post-attention LSTM goes through $T_y$ time steps.

- The post-attention LSTM passes the hidden state $s^{\langle t \rangle}$ and cell state $c^{\langle t \rangle}$ from one time step to the next.

**Each time step does not use predictions from the previous time step**

* The post-attention LSTM at time $t$ does not take the previous time step's prediction $y^{\langle t-1 \rangle}$ as input.
* The post-attention LSTM at time 't' only takes the hidden state $s^{\langle t\rangle}$ and cell state $c^{\langle t\rangle}$ as input.
* We have designed the model this way because unlike language generation (where adjacent characters are highly correlated) there isn't as strong a dependency between the previous character and the next character in a YYYY-MM-DD date.

Concatenation of hidden states from the forward and backward pre-attention LSTMs
- $\overrightarrow{a}^{\langle t \rangle}$: hidden state of the forward-direction, pre-attention LSTM.
- $\overleftarrow{a}^{\langle t \rangle}$: hidden state of the backward-direction, pre-attention LSTM.
- $a^{\langle t \rangle} = [\overrightarrow{a}^{\langle t \rangle}, \overleftarrow{a}^{\langle t \rangle}]$: the concatenation of the activations of both the forward-direction $\overrightarrow{a}^{\langle t \rangle}$ and backward-directions $\overleftarrow{a}^{\langle t \rangle}$ of the pre-attention Bi-LSTM.

**Computing "energies" $e^{\langle t, t' \rangle}$ as a function of $s^{\langle t-1 \rangle}$ and $a^{\langle t' \rangle}$**
- Recall in the lesson videos "Attention Model", at time 6:45 to 8:16, the definition of "e" as a function of $s^{\langle t-1 \rangle}$ and $a^{\langle t' \rangle}$.
    - "e" is called the "energies" variable.
    - $s^{\langle t-1 \rangle}$ is the hidden state of the post-attention LSTM
    - $a^{\langle t' \rangle}$ is the hidden state of the pre-attention LSTM.
    - $s^{\langle t-1 \rangle}$ and $a^{\langle t' \rangle}$ are fed into a simple neural network, which learns the function to output $e^{\langle t, t' \rangle}$.
    - $e^{\langle t, t' \rangle}$ is then used when computing the attention weight $\alpha^{\langle t, t' \rangle}$ that $y^{\langle t \rangle}$ should pay to $a^{\langle t' \rangle}$.

## 1.3 One Step Attention

**one_step_attention** computes:
- $[\alpha^{<t,1>},\alpha^{<t,2>}, ..., \alpha^{<t,T_x>}]$: the attention weights
- $context^{ \langle t \rangle }$: the context vector:
    
$$context^{<t>} = \sum_{t' = 1}^{T_x} \alpha^{<t,t'>}a^{<t'>}\tag{1}$$

* The function `model()` will call the layers in `one_step_attention()` $T_y$ times using a for-loop.
* It is important that all $T_y$ copies have the same weights.
    * It should not reinitialize the weights every time.
    * In other words, all $T_y$ steps should have shared weights.


In [None]:
class Attention(tf.keras.layers.Layer):
    """One step of concat-based attention over the encoder outputs.

    The previous decoder state is repeated along the time axis, concatenated
    with the encoder outputs, scored by a small two-layer network (tanh, then
    relu), normalised into attention weights and used to form a weighted sum
    of the encoder outputs.

    Args:
        x_len: number of encoder time steps the decoder state is repeated to.
            Defaults to the notebook-level `Tx`.
        name: Keras layer name.
    """

    def __init__(self, x_len=Tx, name='attention'):
        super(Attention, self).__init__(name=name)
        # Use the constructor argument; previously the global Tx was read here and
        # x_len was silently ignored.
        self.repeator = RepeatVector(x_len)
        self.densor1 = Dense(10, activation="tanh")
        self.densor2 = Dense(1, activation="relu")
        # `softmax` is not defined in this notebook; presumably it comes from the
        # lib.nmt_utils star import and normalises over the time axis - TODO confirm.
        self.activator = Activation(softmax)
        self.dotor = Dot(axes=1)

    def call(self, a, s_prev):
        """Compute the context vector for one decoder step.

        Args:
            a: encoder outputs, one vector per time step: (m, x_len, features).
            s_prev: previous decoder hidden state: (m, n_s).

        Returns:
            Context tensor of shape (m, 1, features).
        """
        # Repeat s_prev along a new time axis so it can be concatenated with a
        s_prev = self.repeator(s_prev)  # (m, x_len, n_s)

        # Concatenate encoder outputs and repeated decoder state on the last axis
        concat = tf.concat([a, s_prev], axis=-1)  # (m, x_len, features + n_s)

        # Intermediate energies
        score = self.densor1(concat)  # (m, x_len, 10)

        # Energies: one scalar per time step
        score = self.densor2(score)  # (m, x_len, 1)

        # Normalise the energies into attention weights
        attention_w = self.activator(score)  # (m, x_len, 1)

        # Weighted sum of the encoder outputs over the time axis
        context = self.dotor([attention_w, a])  # (m, 1, features)
        return context

## 1.4 Model Building

In [None]:
def lstm_attention_model(Tx, Ty, n_a, n_s, human_vocab_size, machine_vocab_size):
    """Build the Bi-LSTM encoder / attention / LSTM decoder model.

    Args:
        Tx: number of input time steps.
        Ty: number of output time steps.
        n_a: units of each direction of the pre-attention Bi-LSTM.
        n_s: units of the post-attention LSTM.
        human_vocab_size: size of the one-hot axis of the input.
        machine_vocab_size: number of output classes per time step.

    Returns:
        A Keras Model taking [X, s0, c0] with shapes (m, Tx, human_vocab_size),
        (m, n_s), (m, n_s) and returning a tensor of shape
        (m, Ty, machine_vocab_size).
    """
    # Model inputs: one-hot sequence plus initial hidden/cell state of the decoder LSTM
    X = Input(shape=(Tx, human_vocab_size))  # (m, Tx, human_vocab_size)
    s1 = Input(shape=(n_s,), name='s0')  # (m, n_s)
    c1 = Input(shape=(n_s,), name='c0')  # (m, n_s)
    s, c = s1, c1

    outputs = []

    # Layers are created once, outside the loop, so every decoding step shares weights
    attention = Attention(x_len=Tx)
    post_activation_LSTM_cell = LSTM(n_s, return_state=True)
    # Size the output layer from the argument rather than from the global machine_vocab.
    # `softmax` is an external name (presumably from lib.nmt_utils).
    dense_1 = Dense(machine_vocab_size, activation=softmax)

    # Pre-attention Bi-LSTM; its input shape is already fixed by X
    a = Bidirectional(LSTM(n_a, return_sequences=True))(X)  # (m, Tx, 2 * n_a)

    # Decode Ty steps. The previous prediction is NOT fed back: each step sees only the
    # attention context and the LSTM state carried over from the previous step.
    for _ in range(Ty):
        # One attention step -> context vector for this output position.
        # NOTE(review): `.call` is invoked directly, as in the original code, which bypasses
        # Layer.__call__ on the wrapper; the sub-layers are still shared across steps.
        context = attention.call(a, s)  # (m, 1, 2 * n_a)

        # With return_state=True the LSTM returns (output, h, c); output is kept as the new
        # hidden state and the middle value is discarded.
        s, _, c = post_activation_LSTM_cell(context, initial_state=[s, c])  # (m, n_s)

        out = dense_1(s)  # (m, machine_vocab_size)
        outputs.append(out)

    # Stack the Ty per-step outputs and move the batch axis first: (Ty, m, V) -> (m, Ty, V)
    outputs = tf.transpose(outputs, perm=[1, 0, 2])

    model = Model(inputs=[X, s1, c1], outputs=outputs)
    return model

In [None]:
# number of units for the pre-attention, bi-directional LSTM's hidden state 'a'
# number of units for the pre-attention, bi-directional LSTM's hidden state 'a'
n_a = 32
# number of units for the post-attention LSTM's hidden state "s"
n_s = 64
# Build the date-translation model from the sequence lengths and vocabulary sizes defined earlier.
model = lstm_attention_model(Tx, Ty, n_a, n_s, len(human_vocab), len(machine_vocab))

## 1.5 Model Compile

In [None]:
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
# NOTE(review): `decay` is kept as in the original; newer Keras optimizers may reject it - confirm
# against the installed TensorFlow version.
opt = Adam(learning_rate=0.005, beta_1=0.9, beta_2=0.999, decay=0.01)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

## 1.6 Initialization
The last step is to define all your inputs and outputs to fit the model:
- The model input is the one-hot array `Xoh` of shape $(m = 10000, T_x = 30, 37)$ containing the training examples.
- You need to create `s0` and `c0` to initialize your `post_activation_LSTM_cell` with zeros.
- Because `lstm_attention_model()` transposes its per-step predictions into a single tensor of shape $(m, T_y, 11)$, the targets are passed as the one-hot array `Yoh` of the same shape (rather than as a list of $T_y$ arrays).
    - `Yoh[i][0], ..., Yoh[i][T_y - 1]` represent the true labels (characters) corresponding to the $i^{th}$ training example (`Xoh[i]`).
    - `Yoh[i][j]` is the one-hot true label of the $j^{th}$ character in the $i^{th}$ training example.

In [None]:
# Zero initial hidden and cell states for the post-attention LSTM, one row per training example
# (`m` is the dataset size set in the data-loading cell).
s0 = np.zeros((m, n_s))
c0 = np.zeros((m, n_s))
# The targets are passed to fit() directly as the array Yoh, so no per-step list is built here.

In [None]:
# Train on the one-hot sources with zero initial decoder states; the targets are the one-hot dates.
model.fit([Xoh, s0, c0], Yoh, epochs=20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f0df334d590>

 <img src="https://github.com/sebastianbirk/coursera-deep-learning-specialization/raw/8a14375d1c66f21d01a46f508bbf670f8c6cf637/05_sequence_models/06_neural_machine_translation_with_keras_lstm_attention/images/table.png" style="width:700;height:200px;"> <br>
<caption><center>Thus, `dense_2_acc_8: 0.89` means that you are predicting the 7th character of the output correctly 89% of the time in the current batch of data. </center></caption>

In [None]:
EXAMPLES = ['3 May 1979', '5 April 09', '21th of August 2016', 'Tue 10 Jul 2007', 'Saturday May 9 2018', 'March 3 2001', 'March 3rd 2001', '1 March 2001']

# Encode the examples the same way as the training data: index sequences, then one-hot.
source = np.array([string_to_int(example, Tx, human_vocab) for example in EXAMPLES])
source = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), source)))

# Separate zero arrays for the initial hidden and cell state (the original assigned
# `s_init` twice and passed it for both inputs).
s_init = np.zeros((source.shape[0], n_s))
c_init = np.zeros((source.shape[0], n_s))
preds = model.predict([source, s_init, c_init])

# Decode each prediction: most likely class per output position, mapped back to characters.
predictions = []
for pred, ex in zip(preds, EXAMPLES):
    out = np.argmax(pred, axis=-1)
    out = [inv_machine_vocab[int(i)] for i in out]
    out = ''.join(out)
    predictions.append(out)
    print("source:", ex)
    print("output:", out,"\n")

source: 3 May 1979
output: 1979-05-03 

source: 5 April 09
output: 2009-04-05 

source: 21th of August 2016
output: 2016-08-21 

source: Tue 10 Jul 2007
output: 2007-07-10 

source: Saturday May 9 2018
output: 2018-05-09 

source: March 3 2001
output: 2001-03-03 

source: March 3rd 2001
output: 2001-03-03 

source: 1 March 2001
output: 2001-03-01 



**Here's what you should remember**

- Machine translation models can be used to map from one sequence to another. They are useful not just for translating human languages (like French->English) but also for tasks like date format translation.
- An attention mechanism allows a network to focus on the most relevant parts of the input when producing a specific part of the output.
- A network using an attention mechanism can translate from inputs of length $T_x$ to outputs of length $T_y$, where $T_x$ and $T_y$ can be different.
- You can visualize attention weights $\alpha^{\langle t,t' \rangle}$ to see what the network is paying attention to while generating each output.

# 2) BahdanauAttention

<img src="https://www.tensorflow.org/images/seq2seq/attention_mechanism.jpg" style="width:600;height:300px;"> <br>
<caption><center></center></caption>

The input is put through an encoder model which gives us the encoder output of shape (batch_size, max_length, hidden_size) and the encoder hidden state of shape (batch_size, hidden_size).

Here are the equations that are implemented:

<img src="https://www.tensorflow.org/images/seq2seq/attention_equation_0.jpg" style="width:600;height:300px;"> <br>
<caption><center></center></caption>

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import unicodedata
import re
import numpy as np
import os
import io
import time

## 2.1 Data Loading

In [None]:
# Removes accents from a unicode string
def unicode_to_ascii(s):
    """Return `s` NFD-decomposed with every combining mark (category 'Mn') dropped.

    Characters that are not combining marks are kept as they are, so the
    result is only ASCII if the remaining characters already were.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def create_dataset(path, num_examples):
    """Read a tab-separated sentence file and return its columns, preprocessed.

    Args:
        path: path of a UTF-8 text file with one tab-separated record per line.
        num_examples: number of leading lines to keep; None keeps every line.

    Returns:
        A zip iterator yielding one tuple per column; each tuple holds the
        `preprocess_sentence` result for that column across the kept lines.
    """
    # Context manager so the file handle is closed as soon as the text is read.
    with io.open(path, encoding='UTF-8') as f:
        lines = f.read().strip().split('\n')
    word_pairs = [[preprocess_sentence(w) for w in line.split('\t')] for line in lines[:num_examples]]
    # Transpose rows of columns into columns of rows.
    return zip(*word_pairs)

def preprocess_sentence(w):
    """Normalise one sentence and wrap it in start/end markers.

    The sentence is lower-cased, stripped and passed through
    `unicode_to_ascii`; the punctuation marks ? . ! , ¿ are padded with
    spaces; every run of characters other than letters and those marks is
    replaced by a single space; finally '<start> ' and ' <end>' are added.
    """
    text = unicode_to_ascii(w.lower().strip())

    # Applied in order: pad punctuation, collapse spaces/quotes, drop everything else.
    substitutions = (
        (r"([?.!,¿])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return '<start> ' + text.strip() + ' <end>'

def tokenize(lang):
    """Fit a Keras Tokenizer on `lang` and return the padded id sequences.

    Args:
        lang: iterable of sentences (strings).

    Returns:
        Tuple (tensor, tokenizer): the post-padded integer sequences and the
        fitted tokenizer. `filters=''` means no characters are stripped.
    """
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(lang)
    sequences = tokenizer.texts_to_sequences(lang)
    # Pad at the end of each sequence so all rows have equal length.
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')
    return padded, tokenizer

def load_dataset(path, num_examples=None):
    """Load a sentence-pair file and tokenize both sides.

    NOTE: this definition replaces any `load_dataset` already in the notebook
    namespace (the date-data cell in section 1 calls one with a different
    signature).

    Args:
        path: path of the tab-separated sentence file.
        num_examples: number of leading lines to use; None uses all of them.

    Returns:
        Tuple (input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer).
    """
    # The first column of the file becomes the target side, the second the input side.
    targ_sentences, inp_sentences = create_dataset(path, num_examples)
    (input_tensor, inp_lang_tokenizer), (target_tensor, targ_lang_tokenizer) = (
        tokenize(inp_sentences),
        tokenize(targ_sentences),
    )
    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [None]:
# Keep only the first 64 * 15 = 960 sentence pairs.
num_examples = 64 * 15
# Uses the load_dataset(path, num_examples) defined in the previous cell.
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset('datasets/spa.txt', num_examples)
print(input_tensor.shape, target_tensor.shape)

(960, 9) (960, 7)


## 2.2 Preparing Data

In [None]:
# Shuffle buffer covers the whole dataset.
BUFFER_SIZE = len(input_tensor)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor) // BATCH_SIZE
embedding_dim = 256
# Hidden size; passed as `n_h` to encoder_decoder further down.
units = 1024
# +1 because word_index ids start at 1 and id 0 is the padding value.
vocab_inp_size = len(inp_lang.word_index) + 1
vocab_tar_size = len(targ_lang.word_index) + 1

# NOTE: this rebinds `dataset`, which held the date pairs in section 1.
dataset = tf.data.Dataset.from_tensor_slices((
    {'inputs': input_tensor, 'dec_inputs': target_tensor},
    {'outputs': target_tensor}
))
dataset = dataset.shuffle(BUFFER_SIZE)
# drop_remainder=True keeps every batch at exactly BATCH_SIZE rows.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Pull a single batch and display the shapes of its input and target tensors.
full_set = next(iter(dataset))
example_input_batch, example_target_batch = full_set[0]['inputs'], full_set[1]['outputs']
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 9]), TensorShape([64, 7]))

## 2.3 Model Building

In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive (Bahdanau) attention: score = V(tanh(W1(query) + W2(values))).

    Args:
        unit: width of the two projection layers W1 and W2.
        name: Keras layer name.
    """

    def __init__(self, unit, name='BahdanauAttention'):
        super(BahdanauAttention, self).__init__(name=name)
        # Use the constructor argument; previously the notebook-global `units` was read
        # here and `unit` was silently ignored.
        self.W1 = tf.keras.layers.Dense(unit)
        self.W2 = tf.keras.layers.Dense(unit)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Return the context vector for one query.

        Args:
            query: (m, n_h) tensor, e.g. the current decoder hidden state.
            values: (m, seq_len, n_h) tensor, e.g. the encoder outputs.

        Returns:
            Context tensor of shape (m, 1, n_h).
        """
        # Add a time axis so the query broadcasts against values
        query_with_time_axis = tf.expand_dims(query, 1)  # (m, n_h) --> (m, 1, n_h)
        w1 = self.W1(query_with_time_axis)  # (m, 1, unit)
        w2 = self.W2(values)  # (m, seq_len, unit)

        # attention weights, normalised over the sequence axis
        score = tf.nn.tanh(w1 + w2)  # (m, seq_len, unit)
        score = self.V(score)  # (m, seq_len, 1)
        attention_weights = tf.nn.softmax(score, axis=1)  # (m, seq_len, 1)

        # context vector: attention-weighted sum of the values
        context_vector = attention_weights * values  # (m, seq_len, n_h)
        context_vector = tf.reduce_sum(context_vector, axis=1)  # (m, n_h)
        context_vector = tf.expand_dims(context_vector, 1)  # (m, 1, n_h)
        return context_vector

In [None]:
def encoder_decoder(vocab_size_enc, vocab_size_dec, embedding_dim, n_h, batch_sz, Ty, name='Attention'):
    """Build a GRU encoder / attention / GRU decoder model unrolled for Ty steps.

    Args:
        vocab_size_enc: size of the encoder embedding vocabulary.
        vocab_size_dec: number of output classes per decoding step.
        embedding_dim: width of the encoder embedding.
        n_h: number of units of the encoder and decoder GRUs and of the attention layer.
        batch_sz: leading dimension of the constant zero initial encoder state.
        Ty: number of decoding steps.
        name: Keras model name.

    Returns:
        A Keras Model taking [enc_inputs, dec_inputs] and returning softmax
        probabilities of shape (m, Ty, vocab_size_dec).

    Note:
        `dec_inputs` is part of the model signature but is not consumed: the
        decoder is driven only by the attention context. The original body
        embedded `dec_inputs` and sliced a "next input" from it, but never used
        either result, so those statements (and their dependence on the globals
        `targ_lang` and `BATCH_SIZE`) were removed without changing the model.
        TODO: implement teacher forcing (this also requires shifted targets).
    """
    # inputs
    enc_inputs = tf.keras.Input(shape=(None, ), name="enc_inputs")  # (m, seq_len_en)
    dec_inputs = tf.keras.Input(shape=(None, ), name="dec_inputs")

    # layers are created once so all decoding steps share weights
    embedding_enc = tf.keras.layers.Embedding(vocab_size_enc, embedding_dim)
    gru_enc = tf.keras.layers.GRU(n_h, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
    attention = BahdanauAttention(n_h)
    gru_dec = tf.keras.layers.GRU(n_h, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
    fc_dec = tf.keras.layers.Dense(vocab_size_dec)

    # encoder
    # NOTE(review): this constant has a fixed leading dimension of batch_sz, so inputs
    # presumably have to arrive in batches of exactly that size - confirm.
    initial_enc_hidden = tf.zeros((batch_sz, n_h))  # (batch_sz, n_h)
    enc_x = embedding_enc(enc_inputs)  # (m, seq_len_en, embedding_dim)
    enc_output, enc_hidden = gru_enc(enc_x, initial_state=initial_enc_hidden)  # (m, seq_len, n_h), (m, n_h)

    # decoder: the first attention query is the final encoder state, later ones are the
    # hidden state returned by the decoder GRU at the previous step
    dec_hidden = enc_hidden
    outputs = []

    for _ in range(Ty):
        context_vector = attention(dec_hidden, enc_output)  # (m, 1, n_h)

        # The GRU is called without initial_state, as in the original code: the previous
        # decoder state reaches this step only through the attention query.
        dec_output, dec_hidden = gru_dec(context_vector)  # (m, 1, n_h), (m, n_h)
        dec_output = tf.reshape(dec_output, (-1, dec_output.shape[2]))  # (m, n_h)
        pred = fc_dec(dec_output)  # (m, vocab_size_dec)
        outputs.append(pred)

    # (Ty, m, vocab) -> (m, Ty, vocab), then convert logits to probabilities
    outputs = tf.transpose(outputs, perm=[1, 0, 2])
    outputs = tf.nn.softmax(outputs, axis=-1)
    return tf.keras.Model(inputs=[enc_inputs, dec_inputs], outputs=outputs, name=name)

In [None]:
# unit test
model_test = encoder_decoder(vocab_inp_size, vocab_tar_size, embedding_dim, units, BATCH_SIZE, target_tensor.shape[1])
output = model_test([example_input_batch, example_target_batch])
output.shape

TensorShape([64, 7, 362])

In [None]:
# Reset Keras' global state before building the model that is trained below.
tf.keras.backend.clear_session()
# NOTE(review): `inp` and `out` are not used by the fit call further down, which passes the
# tensors directly.
inp = [input_tensor, target_tensor]
out = target_tensor
# NOTE: this rebinds `model`, which held the date-translation model in section 1.
model = encoder_decoder(vocab_inp_size, vocab_tar_size, embedding_dim, units, BATCH_SIZE, target_tensor.shape[1])

In [None]:
# Target sequence length plus one; the loss and metric below reshape labels to MAX_LEN - 1 columns.
MAX_LEN = target_tensor.shape[1] + 1
def loss_function(y_true, y_pred):
    """Padding-masked sparse categorical cross-entropy.

    Args:
        y_true: integer token ids; reshaped to (m, MAX_LEN - 1). Id 0 is treated as padding.
        y_pred: per-token class probabilities of shape (m, MAX_LEN - 1, vocab_size).

    Returns:
        Scalar: the per-token losses with padded positions zeroed, averaged
        over all positions (padded ones included in the denominator).
    """
    y_true = tf.reshape(y_true, shape=(-1, MAX_LEN - 1))
    # The model built by encoder_decoder already applies softmax to its outputs, so y_pred
    # holds probabilities and must not be treated as logits.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')(y_true, y_pred)
    # Zero out the loss at padded positions
    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    loss = tf.multiply(loss, mask)
    return tf.reduce_mean(loss)

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up learning-rate schedule.

    Computes rsqrt(d_model) * min(rsqrt(step), step * warmup_steps ** -1.5):
    the rate grows linearly for `warmup_steps` steps and then decays with the
    inverse square root of the step.

    Args:
        d_model: scale of the schedule (the notebook passes the embedding size).
        warmup_steps: number of linear warm-up steps.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        # Keep the raw value for get_config; the float tensor is what __call__ uses
        self._d_model_value = d_model
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # Optimizers may pass the step as an integer tensor; rsqrt needs a float
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {"d_model": self._d_model_value, "warmup_steps": self.warmup_steps}

In [None]:
# Warm-up schedule scaled by the embedding size, with the default 4000 warm-up steps.
learning_rate = CustomSchedule(embedding_dim)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

def accuracy(y_true, y_pred):
    """Per-token sparse categorical accuracy.

    The labels are first reshaped to (batch_size, MAX_LEN - 1) so that they
    line up with the leading axes of `y_pred`.
    """
    labels = tf.reshape(y_true, shape=(-1, MAX_LEN - 1))
    return tf.keras.metrics.sparse_categorical_accuracy(labels, y_pred)

# Compile with the padding-masked loss and the token-level accuracy metric defined in this notebook.
model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])

In [None]:
# 960 examples with a batch size of 64 give 15 steps per epoch.
model.fit([input_tensor, target_tensor], target_tensor, batch_size=BATCH_SIZE, epochs=10, steps_per_epoch=steps_per_epoch)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f0dd8fce0d0>

# 3) Scaled Dot-Product Attention

## 3.1 Self Attention

![](https://pbs.twimg.com/media/FzjhZk5X0AYAs_-?format=jpg&name=4096x4096)

In [None]:
import torch
from torch import nn, Tensor

In [None]:
class Attention(nn.Module):
    """Single-head scaled dot-product self-attention.

    NOTE: this class reuses the name of the Keras `Attention` layer defined in
    section 1 and replaces it in the notebook namespace.

    Args:
        ori_dim: size of the last axis of the input.
        embed_dim: size of the query/key/value projections (and of the output).
    """

    def __init__(self, ori_dim:int=512, embed_dim:int=512) -> None:
        super().__init__()
        self.embed_dim = embed_dim
        # Stored as a float tensor so torch.sqrt does not depend on integer promotion
        self.dim_K = torch.tensor(float(embed_dim))
        self.query = nn.Linear(in_features=ori_dim, out_features=embed_dim, bias=True)
        self.key  = nn.Linear(in_features=ori_dim, out_features=embed_dim, bias=True)
        self.value = nn.Linear(in_features=ori_dim, out_features=embed_dim, bias=True)

    def self_attention(self, Q:Tensor, K:Tensor, V:Tensor) -> Tensor:
        """Return softmax(Q K^T / sqrt(dim_K)) V.

        Q, K and V have shape (..., seq_len, dim); any leading batch axes are
        carried through unchanged.
        """
        # Transpose only the last two axes so batched input keeps its batch axis
        # (transposing axes 0 and 1 is only correct for unbatched 2-D input).
        K_T = torch.transpose(K, -2, -1)
        score = torch.matmul(Q, K_T)  / torch.sqrt(self.dim_K)
        score = torch.softmax(score, dim=-1)
        Z = torch.matmul(score, V)
        return Z

    def forward(self, x:Tensor) -> Tensor:
        """Project `x` (..., seq_len, ori_dim) to Q, K, V and attend; returns (..., seq_len, embed_dim)."""
        Q = self.query(x)
        K = self.key(x)
        V = self.value(x)
        Z = self.self_attention(Q, K, V)
        return Z

## 3.2 Multi-head Attention

In [None]:
class MultiheadAttention(nn.Module):
    """Multi-head self-attention built from independent heads (https://arxiv.org/abs/1706.03762).

    Each head is an `Attention` module projecting from `embed_dim` to
    `embed_dim // n_head`; the head outputs are concatenated on the feature
    axis. No output projection is applied.

    Args:
        embed_dim: size of the last axis of the input.
        n_head: number of heads.
    """

    def __init__(self, embed_dim:int=512, n_head:int=8) -> None:
        super().__init__()
        self.n_head = n_head
        self.embed_dim = embed_dim
        self.multihead = nn.ModuleList([
            Attention(embed_dim, embed_dim // n_head) for _ in range(n_head)
        ])

    def forward(self, x: Tensor) -> Tensor:
        """Run every head on `x` and concatenate the results on the last axis."""
        # dim=-1 is the feature axis for both unbatched (seq, dim) and batched
        # (m, seq, dim) input; dim=1 would be the sequence axis in the batched case.
        Z = torch.cat([head(x) for head in self.multihead], dim=-1)
        return Z

In [None]:
class MultiQueryAttention(Attention):
    """
    Multi-query attention: several query projections share one key and one
    value projection.

    https://arxiv.org/pdf/1911.02150.pdf

    The keys and values are shared across all of the different attention "heads",
    greatly reducing the size of these tensors and hence the memory bandwidth requirements
    of incremental decoding. We verify experimentally that the resulting models can indeed
    be much faster to decode, and incur only minor quality degradation from the baseline.

    Args:
        embed_dim: size of the last axis of the input.
        n_query: number of query projections; each projects to embed_dim // n_query.
    """

    def __init__(self, embed_dim:int=512, n_query:int=8) -> None:
        # The base class sets the scaling factor from the per-query width
        super().__init__(embed_dim, embed_dim // n_query)
        self.n_query = n_query
        # The single query projection of the base class is replaced by a list of them
        delattr(self, 'query')

        self.querys = nn.ModuleList([
            nn.Linear(in_features=embed_dim, out_features=embed_dim // n_query, bias=True)
            for _ in range(n_query)
        ])
        # Shared key/value projections
        self.key = nn.Linear(in_features=embed_dim, out_features=embed_dim // n_query, bias=True)
        self.value = nn.Linear(in_features=embed_dim, out_features=embed_dim // n_query, bias=True)

    def forward(self, x: Tensor) -> Tensor:
        """Attend once per query projection against the shared K/V and concatenate on the last axis."""
        K = self.key(x)
        V = self.value(x)

        # dim=-1 is the feature axis for both unbatched and batched input
        Z = torch.cat([self.self_attention(query(x), K, V) for query in self.querys], dim=-1)
        return Z

In [None]:
class  GroupedQueryAttention(Attention):
    """Grouped-query attention (https://arxiv.org/pdf/2305.13245.pdf).

    The query heads are split into `n_grouped` groups; the
    `n_query_each_group` query projections of a group share that group's key
    and value projection. Every projection maps from `embed_dim` to
    embed_dim // (n_grouped * n_query_each_group), so the concatenated output
    has `embed_dim` features.

    The original constructor referenced an undefined name (`n_query`) and
    built sub-modules whose input width did not match `x`, so the class could
    not be instantiated or run; the projections are now created directly.

    Args:
        embed_dim: size of the last axis of the input.
        n_grouped: number of key/value groups.
        n_query_each_group: number of query projections per group.

    Raises:
        ValueError: if the total number of query heads is not positive or does
            not divide `embed_dim`.
    """

    def __init__(self, embed_dim: int=512, n_grouped:int=4, n_query_each_group:int=2) -> None:
        n_head = n_grouped * n_query_each_group
        if n_head <= 0 or embed_dim % n_head != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by "
                f"n_grouped * n_query_each_group ({n_head})"
            )
        head_dim = embed_dim // n_head

        # The base class provides self_attention and sets the scaling factor from head_dim
        super().__init__(embed_dim, head_dim)
        delattr(self, 'query')
        delattr(self, 'key')
        delattr(self, 'value')

        self.n_grouped = n_grouped
        self.n_query_each_group = n_query_each_group
        # One entry per group: its query projections plus the shared key/value projection
        self.grouped = nn.ModuleList([
            nn.ModuleDict({
                'query_projs': nn.ModuleList([
                    nn.Linear(in_features=embed_dim, out_features=head_dim, bias=True)
                    for _ in range(n_query_each_group)
                ]),
                'key_proj': nn.Linear(in_features=embed_dim, out_features=head_dim, bias=True),
                'value_proj': nn.Linear(in_features=embed_dim, out_features=head_dim, bias=True),
            })
            for _ in range(n_grouped)
        ])

    def forward(self, x: Tensor) -> Tensor:
        """Attend with every query head against its group's K/V and concatenate on the last axis."""
        heads = []
        for group in self.grouped:
            # Keys and values are computed once per group and reused by its query heads
            K = group['key_proj'](x)
            V = group['value_proj'](x)
            heads.extend(self.self_attention(query(x), K, V) for query in group['query_projs'])
        Z = torch.cat(heads, dim=-1)
        return Z