In [150]:
# nlp c4w2

In [152]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.functional import log_softmax
import torchtext
torchtext.disable_torchtext_deprecation_warning()

import tensorflow as tf

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import time
import utils

import textwrap
wrapper = textwrap.TextWrapper(width=70)


In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
torch.cuda.is_available()


True

## Import the Dataset
You have the dataset saved in a .json file, which you can easily open with pandas. The loading function has already been taken care of in `utils.py`.

In [153]:
data_dir = "data/corpus"

train_data, test_data = utils.get_train_test_data(data_dir)

# Take one example from the dataset and print it
example_summary, example_dialogue = train_data.iloc[10]
print(f"Dialogue:\n{example_dialogue}")
print(f"\nSummary:\n{example_summary}")


Dialogue:
Lucas: Hey! How was your day?
Demi: Hey there! 
Demi: It was pretty fine, actually, thank you!
Demi: I just got promoted! :D
Lucas: Whoa! Great news!
Lucas: Congratulations!
Lucas: Such a success has to be celebrated.
Demi: I agree! :D
Demi: Tonight at Death & Co.?
Lucas: Sure!
Lucas: See you there at 10pm?
Demi: Yeah! See you there! :D

Summary:
Demi got promoted. She will celebrate that with Lucas at Death & Co at 10 pm.


## Preprocess the data

First you will do some preprocessing of the data and split it into inputs and outputs. Here you also remove some of the characters that are specific to this dataset and add the `[EOS]` (end of sentence) token to the end, like it was discussed in the lecture videos. You will also add a `[SOS]` (start of sentence) token to the beginning of the sentences.

In [154]:
document, summary           = utils.preprocess(train_data)
document_test, summary_test = utils.preprocess(test_data)


In [155]:
###########################################################################

In [156]:
document


0        [SOS] amanda: i baked  cookies. do you want so...
1        [SOS] olivia: who are you voting for in this e...
2        [SOS] tim: hi, what's up?  kim: bad mood tbh, ...
3        [SOS] edward: rachel, i think i'm in ove with ...
4        [SOS] sam: hey  overheard rick say something  ...
                               ...                        
14727    [SOS] romeo: you are on my ‘people you may kno...
14728    [SOS] theresa: <file_photo>  theresa: <file_ph...
14729    [SOS] john: every day some bad news. japan wil...
14730    [SOS] jennifer: dear celia! how are you doing?...
14731    [SOS] georgia: are you ready for hotel hunting...
Name: dialogue, Length: 14732, dtype: object

In [157]:
type(document)


pandas.core.series.Series

In [158]:
document[0]


"[SOS] amanda: i baked  cookies. do you want some?  jerry: sure!  amanda: i'll bring you tomorrow :-) [EOS]"

In [159]:
summary


0        [SOS] amanda baked cookies and will bring jerr...
1        [SOS] olivia and olivier are voting for libera...
2        [SOS] kim may try the pomodoro technique recom...
3        [SOS] edward thinks he is in love with bella. ...
4        [SOS] sam is confused, because he overheard ri...
                               ...                        
14727    [SOS] romeo is trying to get greta to add him ...
14728    [SOS] theresa is at work. she gets free food a...
14729    [SOS] japan is going to hunt whales again. isl...
14730    [SOS] celia couldn't make it to the afternoon ...
14731    [SOS] georgia and juliette are looking for a h...
Name: summary, Length: 14732, dtype: object

In [11]:
type(summary)


pandas.core.series.Series

In [161]:
summary[0]


'[SOS] amanda baked cookies and will bring jerry some tomorrow. [EOS]'

In [162]:
###########################################################################

Now perform the standard preprocessing with the tensorflow library. You will need to modify the filters, because you dont want the `[EOS]` tokens to be removed.

Then create the vocabulary by combining the data in the documents and the summaries and using `.fit_on_texts()`:

In [163]:
# The [ and ] from default tokens cannot be removed, because they mark the SOS and EOS token.
filters = '!"#$%&()*+,-./:;<=>?@\\^_`{|}~\t\n'
oov_token = '[UNK]'

tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=filters, oov_token=oov_token, lower=False)

documents_and_summary = pd.concat([document, summary], ignore_index=True)

tokenizer.fit_on_texts(documents_and_summary)


inputs  = tokenizer.texts_to_sequences(document)
targets = tokenizer.texts_to_sequences(summary)

vocab_size = len(tokenizer.word_index) + 1

print(f'Size of vocabulary: {vocab_size}')


Size of vocabulary: 34250


In [15]:
#################################################################

In [164]:
documents_and_summary

0        [SOS] amanda: i baked  cookies. do you want so...
1        [SOS] olivia: who are you voting for in this e...
2        [SOS] tim: hi, what's up?  kim: bad mood tbh, ...
3        [SOS] edward: rachel, i think i'm in ove with ...
4        [SOS] sam: hey  overheard rick say something  ...
                               ...                        
29459    [SOS] romeo is trying to get greta to add him ...
29460    [SOS] theresa is at work. she gets free food a...
29461    [SOS] japan is going to hunt whales again. isl...
29462    [SOS] celia couldn't make it to the afternoon ...
29463    [SOS] georgia and juliette are looking for a h...
Length: 29464, dtype: object

In [165]:
type(documents_and_summary)


pandas.core.series.Series

In [166]:
documents_and_summary[0]


"[SOS] amanda: i baked  cookies. do you want some?  jerry: sure!  amanda: i'll bring you tomorrow :-) [EOS]"

In [167]:
documents_and_summary[14732]


'[SOS] amanda baked cookies and will bring jerry some tomorrow. [EOS]'

In [168]:
len(documents_and_summary)


29464

In [169]:
type(inputs)


list

In [170]:
len(inputs)


14732

In [171]:
inputs[0]


[7, 454, 2, 3500, 1611, 30, 5, 81, 50, 617, 66, 454, 63, 220, 5, 98, 8]

In [172]:
type(targets)


list

In [173]:
targets[0]


[7, 454, 3500, 1611, 9, 15, 220, 617, 50, 98, 8]

In [None]:
tokenizer.word_index


In [None]:
tokenizer.index_word


In [177]:
type(tokenizer.word_index)


dict

In [183]:
# Limit the size of the input and output data for being able to run it in this environment.
encoder_maxlen = 150
decoder_maxlen = 50

# Pad the sequences.
inputs  = tf.keras.preprocessing.sequence.pad_sequences(inputs,  maxlen=encoder_maxlen,  padding='post', truncating='post')
targets = tf.keras.preprocessing.sequence.pad_sequences(targets, maxlen=decoder_maxlen,  padding='post', truncating='post')


In [184]:
len(inputs)


14732

In [185]:
# 0 ko as a padding use kar rahay hain
inputs


array([[   7,  454,    2, ...,    0,    0,    0],
       [   7,  339,  174, ...,    0,    0,    0],
       [   7,  238,  116, ...,    0,    0,    0],
       ...,
       [   7,  109,  462, ...,    0,    0,    0],
       [   7,  632,  668, ...,  107,  312,   29],
       [   7, 1667,   20, ...,  919,  108,    3]])

In [186]:
tokenizer.word_index['[UNK]'], tokenizer.word_index['[SOS]'], tokenizer.word_index['[EOS]']


(1, 7, 8)

In [188]:
# inputs  = torch.tensor(inputs,  dtype=torch.int32)
# targets = torch.tensor(targets, dtype=torch.int32)


  inputs  = torch.tensor(inputs,  dtype=torch.int32)
  targets = torch.tensor(targets, dtype=torch.int32)


In [189]:
# inputs

tensor([[   7,  454,    2,  ...,    0,    0,    0],
        [   7,  339,  174,  ...,    0,    0,    0],
        [   7,  238,  116,  ...,    0,    0,    0],
        ...,
        [   7,  109,  462,  ...,    0,    0,    0],
        [   7,  632,  668,  ...,  107,  312,   29],
        [   7, 1667,   20,  ...,  919,  108,    3]], dtype=torch.int32)

In [190]:
# inputs[0].shape

torch.Size([150])

In [191]:
# targets[0].shape


torch.Size([50])

In [37]:
# aap chaaho tu ab in ka dataset use na kro, yaheen say apna khud ka generator define kr lo

In [192]:
# Create the final training dataset.
BUFFER_SIZE = 10000
BATCH_SIZE  = 64

dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)


In [193]:
dataset
# notice the datatype, it is tf, so we'll have to convert it to torch before feeding it into our network


<BatchDataset element_spec=(TensorSpec(shape=(None, 150), dtype=tf.int32, name=None), TensorSpec(shape=(None, 50), dtype=tf.int32, name=None))>

In [194]:
len(dataset), 10000/64, (len(document))/64

(231, 156.25, 230.1875)

## Positional Encoding

In sequence to sequence tasks, the relative order of your data is extremely important to its meaning. When you were training sequential neural networks such as RNNs, you fed your inputs into the network in order. Information about the order of your data was automatically fed into your model. However, when you train a Transformer network using multi-head attention, you feed your data into the model all at once. While this dramatically reduces training time, there is no information about the order of your data. This is where positional encoding is useful.

You have learned how to implement the positional encoding in one of this week's labs. Here you will use the `positional_encoding` function to create positional encodings for your transformer. The function is already implemented for you.

In [236]:
def positional_encoding(positions, d_model):
    """
    Precomputes a matrix with all the positional encodings 
    
    Arguments:
        positions (int): Maximum number of positions to be encoded 
        d_model (int):   Encoding size 
    
    Returns:
        pos_encoding (torch.tensor): A matrix of shape (1, position, d_model) with the positional encodings
    """
    
    position = np.arange(positions)[:, np.newaxis]
    k = np.arange(d_model)[np.newaxis, :]
    i = k // 2
    
    # initialize a matrix angle_rads of all the angles 
    angle_rates = 1 / np.power(10000, (2 * i) / np.float32(d_model))
    angle_rads = position * angle_rates
  
    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
  
    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    
    pos_encoding = angle_rads[np.newaxis, ...]
    
    return torch.tensor(pos_encoding, dtype=torch.float32)


In [238]:
positional_encoding(15,128).shape


torch.Size([1, 15, 128])

## Masking

There are two types of masks that are useful when building your Transformer network: the *padding mask* and the *look-ahead mask*. Both help the softmax computation give the appropriate weights to the words in your input sentence. 


### Padding mask

In [42]:
def create_padding_mask(decoder_token_ids):
    """
    Creates a matrix mask for the padding cells
    
    Arguments:
        decoder_token_ids (matrix like): matrix of size (n, m)
    
    Returns:
        mask : binary tensor of size (n, m)
    """
    # if decoder_token_ids.shape[1] <= 1:
    #     mask = None
    # else:
    mask = decoder_token_ids == 0
    return mask


In [43]:
inputs[:2,:30]

tensor([[    7,   454,     2,  3500,  1611,    30,     5,    81,    50,   617,
            66,   454,    63,   220,     5,    98,     8,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [    7,   339,   174,    20,     5,  5566,    12,    13,    38,  3018,
           636, 12283,    59,   171,   339,    24,    69,   636,    99,     8,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       dtype=torch.int32)

In [195]:
create_padding_mask(inputs[:2,:30])
# perfect


tensor([[False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True],
        [False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True]])

In [196]:
a = torch.tensor([[7]])
a.shape, a, create_padding_mask(a)


(torch.Size([1, 1]), tensor([[7]]), tensor([[False]]))

# look ahead mask / attention mask

In [46]:
# tensorflow implementation
def tf_create_look_ahead_mask(sequence_length):
    """
    Returns a lower triangular matrix filled with ones
    
    Arguments:
        sequence_length (int): matrix size
    
    Returns:
        mask (tf.Tensor): binary tensor of size (sequence_length, sequence_length)
    """
    mask = tf.linalg.band_part(tf.ones((1, sequence_length, sequence_length)), -1, 0)
    return mask 



In [47]:
tf_create_look_ahead_mask(5)

<tf.Tensor: shape=(1, 5, 5), dtype=float32, numpy=
array([[[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]]], dtype=float32)>

In [208]:
# tensorflow implementation
def create_look_ahead_mask(matrix_of_sequences):
    """
    Returns a lower triangular matrix filled with ones
    
    Arguments:
        sequence_length (int): matrix size
    
    Returns:
        mask (torch.tensor): binary tensor of size (sequence_length, sequence_length)
    """

    # if matrix_of_sequences.shape[1] <= 1:
    #     mask = None
    # else:
    mask = torch.triu(torch.ones(matrix_of_sequences.shape[1], matrix_of_sequences.shape[1]), diagonal=1).to(torch.bool)
    return mask



In [209]:
b = torch.tensor([[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5]])


In [210]:
a = create_look_ahead_mask(b)
a.shape
# perfect


torch.Size([5, 5])

In [211]:
a


tensor([[False,  True,  True,  True,  True],
        [False, False,  True,  True,  True],
        [False, False, False,  True,  True],
        [False, False, False, False,  True],
        [False, False, False, False, False]])

In [212]:
a = torch.tensor([[7]])
a.shape, a, create_look_ahead_mask(a)


(torch.Size([1, 1]), tensor([[7]]), tensor([[False]]))

In [213]:
########################################################################

In [214]:
inputs[0,:].shape

torch.Size([150])

In [215]:
torch.ones(3, 15)

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [216]:
torch.triu(torch.ones(3, 15), diagonal=1)

tensor([[0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [217]:
########################################################################

Excellent work! You can now implement self-attention. With that, you can start building the encoder block! 

<a name='6'></a>
## 6 - Encoder

The Transformer Encoder layer pairs self-attention and convolutional neural network style of processing to improve the speed of training and passes K and V matrices to the Decoder, which you'll build later in the assignment. In this section of the assignment, you will implement the Encoder by pairing multi-head attention and a feed forward neural network (Figure 2a). 
<img src="images/encoder_layer.png" alt="Encoder" width="400"/>
<caption><center><font color='purple'><b>Figure 2a: Transformer encoder layer</font></center></caption>

* `MultiHeadAttention` you can think of as computing the self-attention several times to detect different features. 
* Feed forward neural network contains two Dense layers.

Your input sentence first passes through a *multi-head attention layer*, where the encoder looks at other words in the input sentence as it encodes a specific word. The outputs of the multi-head attention layer are then fed to a *feed forward neural network*. The exact same feed forward network is independently applied to each position.


## Encoder layer

In [218]:
class EncoderLayer(nn.Module):
    def __init__(self, embedding_dim_, num_heads_, fully_connected_dim_,  dropout_rate_=0.1, layernorm_eps_=1e-6):
        super(EncoderLayer, self).__init__()
        
        self.attention           = nn.MultiheadAttention(embed_dim=embedding_dim_, num_heads=num_heads_, dropout=dropout_rate_, batch_first=True)
        self.layernorm1          = nn.BatchNorm1d(num_features=embedding_dim_, eps=layernorm_eps_)

        self.fc1                 = nn.Linear(in_features=embedding_dim_, out_features=fully_connected_dim_)
        self.fc2                 = nn.Linear(in_features=fully_connected_dim_, out_features=embedding_dim_)
        self.layernorm2          = nn.BatchNorm1d(num_features=embedding_dim_, eps=layernorm_eps_)

        self.dropout = nn.Dropout(dropout_rate_)

    def forward(self, x, mask):
        # x: (batch_size, seq_length, embedding_dim)
        # self attention
        attention_output, attention_weights = self.attention(query=x, key=x, value=x, key_padding_mask=mask)  # Self attention (batch_size, input_seq_len, embedding_dim)

        # skip_connection
        skip_x_attention = x + attention_output                   # (batch_size, seq_length, embedding_dim)
        skip_x_attention = skip_x_attention.permute(0, 2, 1)      # (batch_size, embedding_dim, input_seq_len)
        skip_x_attention = self.layernorm1(skip_x_attention)      # (batch_size, embedding_dim, input_seq_len)
        skip_x_attention = skip_x_attention.permute(0, 2, 1)      # (batch_size, input_seq_len, embedding_dim)

        # Dense / Fully connected layers / Feed forward neural network
        fc1_output = self.fc1(skip_x_attention)
        fc1_output = torch.relu(fc1_output)

        fc2_output = self.fc2(fc1_output)
        fc2_output = torch.relu(fc2_output)        

        # dropout
        fc2_output = self.dropout(fc2_output)

        # 2nd skip connection
        skip_attention_fc = skip_x_attention + fc2_output
        skip_attention_fc = skip_attention_fc.permute(0, 2, 1)
        skip_attention_fc = self.layernorm2(skip_attention_fc)
        skip_attention_fc = skip_attention_fc.permute(0, 2, 1)

        return skip_attention_fc

enc_layer = EncoderLayer(
    128,
    2,
    150,
    dropout_rate_=0.1,
    layernorm_eps_=1e-06,
).to(device)
print(enc_layer)

EncoderLayer(
  (attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
  )
  (layernorm1): BatchNorm1d(128, eps=1e-06, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=128, out_features=150, bias=True)
  (fc2): Linear(in_features=150, out_features=128, bias=True)
  (layernorm2): BatchNorm1d(128, eps=1e-06, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


In [59]:
##########################################################################################################################################

### Checking Encoder's outputs layer by layer

In [219]:
# Example of using BatchNorm1d
batch_size   = 10
num_features = 15

# Initialize the BatchNorm1d layer
batch_norm = nn.BatchNorm1d(num_features).to(device)

# Example input tensor of shape (batch_size, num_features)
input_tensor = torch.randn(batch_size, num_features).to(device)

# Pass the input through the BatchNorm1d layer
output = batch_norm(input_tensor)

output.shape, output[0,:], torch.sum(output[0,:]), torch.sum(output[:,0])



(torch.Size([10, 15]),
 tensor([-1.4355, -1.6055, -1.2522,  0.7274,  1.3627,  1.0898,  1.8431,  1.2993,
          0.7818,  0.2476, -0.8770, -1.4970, -0.0961,  1.5455,  0.4793],
        device='cuda:0', grad_fn=<SliceBackward0>),
 tensor(2.6134, device='cuda:0', grad_fn=<SumBackward0>),
 tensor(4.7684e-07, device='cuda:0', grad_fn=<SumBackward0>))

In [220]:
# Example input tensor of shape (batch_size, num_features)
input_tensor = torch.randint(0, 600, (batch_size, num_features)).to(device)
print('input_tensor', input_tensor.shape)


input_tensor torch.Size([10, 15])


In [221]:
embed_l   = nn.Embedding(num_embeddings=600, embedding_dim=128, padding_idx=0).to(device)
embed_out = embed_l(input_tensor)
print('embed_out: ', embed_out.shape)


embed_out:  torch.Size([10, 15, 128])


In [222]:
attention_l  = nn.MultiheadAttention(embed_dim=128, num_heads=2, dropout=0.1, batch_first=True).to(device)
mask         = create_padding_mask(input_tensor)

# self attention
attention_output, attention_weights = attention_l(query=embed_out, key=embed_out, value=embed_out, key_padding_mask=mask)  # Self attention (batch_size, input_seq_len, fully_connected_dim)
print('attention_output: ', attention_output.shape)


attention_output:  torch.Size([10, 15, 128])


#### [torch.nn.BatchNorm1d](https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm1d.html)

The explanation comes from ofiicial pytorch website
Shape:

        Input: (N,C)(N,C) or (N,C,L)(N,C,L), where NN is the batch size, CC is the number of features or channels, and LL is the sequence length

        Output: (N,C)(N,C) or (N,C,L)(N,C,L) (same shape as input)



In [223]:
x_plus_attn = attention_output + embed_out
x_plus_attn = x_plus_attn.permute(0,2,1)
print('one time permuted  x_plus_attn: ', x_plus_attn.shape)


layernorm1_l  = nn.BatchNorm1d(num_features=128, eps=1e-6).to(device)
x_plus_attn   = layernorm1_l(x_plus_attn)
print("layernorm1_l_out:               ", x_plus_attn.shape)


x_plus_attn = x_plus_attn.permute(0,2,1)
print('two times permuted  x_plus_attn:', x_plus_attn.shape)


one time permuted  x_plus_attn:  torch.Size([10, 128, 15])
layernorm1_l_out:                torch.Size([10, 128, 15])
two times permuted  x_plus_attn: torch.Size([10, 15, 128])


In [228]:
x_plus_attn.shape, x_plus_attn[0,:,0], torch.mean(x_plus_attn[:,:,0]) # [:,:,0] : all the sequences, all the words of all the sequences, all the first feature's-values for all of the word's of all of the sequences


(torch.Size([10, 15, 128]),
 tensor([ 2.1386,  0.9761, -0.5938,  0.4367,  2.2274,  0.5757,  1.2709, -1.7238,
          1.2759, -0.2480,  1.6349, -0.5888,  0.5257, -1.4119, -0.6401],
        device='cuda:0', grad_fn=<SelectBackward0>),
 tensor(3.1789e-09, device='cuda:0', grad_fn=<MeanBackward0>))

In [229]:
fc1_l   = nn.Linear(in_features=128, out_features=150).to(device)
fc1_out = fc1_l(x_plus_attn)
fc1_out = torch.relu(fc1_out)

print('fc1_out: ', fc1_out.shape)


fc1_out:  torch.Size([10, 15, 150])


In [230]:
fc2_l   = nn.Linear(in_features=150, out_features=128).to(device)
fc2_out = fc2_l(fc1_out)
fc2_out = torch.relu(fc2_out)

print('fc2_out: ', fc2_out.shape)


fc2_out:  torch.Size([10, 15, 128])


In [231]:
dropout_l  = nn.Dropout(0.1).to(device)
droput_out = dropout_l(fc2_out)
print('droput_out: ', droput_out.shape)


droput_out:  torch.Size([10, 15, 128])


In [232]:
# & so on .... 2nd skip connection and second normalisation

#### Output Directly from EncoderLayer

In [234]:
enc_out = enc_layer(embed_out, create_padding_mask(input_tensor))
print('enc_out: ', enc_out.shape)
# perfect


enc_out:  torch.Size([10, 15, 128])


In [235]:
##########################################################################################################################################

<a name='6-2'></a>
### 6.2 - Full Encoder

Now you're ready to build the full Transformer Encoder (Figure 2b), where you will embed your input and add the positional encodings you calculated. You will then feed your encoded embeddings to a stack of Encoder layers. 

<img src="images/encoder.png" alt="Encoder" width="330"/>
<caption><center><font color='purple'><b>Figure 2b: Transformer Encoder</font></center></caption>

The Encoder class is implemented for you. It performs the following steps: 
1. Pass the input through the Embedding layer.
2. Scale the embedding by multiplying it by the square root of the embedding dimension. 
3. Add the position encoding: self.pos_encoding `[:, :seq_len, :]` to the embedding.
4. Pass the encoded embedding through a dropout layer
5. Pass the output of the dropout layer through the stack of encoding layers using a for loop.

In [239]:
class Encoder(nn.Module):
    """
    The entire Encoder starts by passing the input to an embedding layer 
    and using positional encoding to then pass the output through a stack of
    encoder Layers
        
    """  
    def __init__(self, num_layers_, embedding_dim_, num_heads_, fully_connected_dim_, input_vocab_size_, maximum_position_encoding_, dropout_rate_=0.1, layernorm_eps_=1e-6):
        super(Encoder, self).__init__()

        self.embedding_dim = embedding_dim_
        self.num_layers    = num_layers_

        self.embedding    = nn.Embedding(num_embeddings=input_vocab_size_, embedding_dim=self.embedding_dim, padding_idx=0)
        self.pos_encoding = positional_encoding(maximum_position_encoding_, self.embedding_dim)
        

        self.enc_layers = nn.ModuleList([EncoderLayer(embedding_dim_=self.embedding_dim,
                                        num_heads_=num_heads_,
                                        fully_connected_dim_=fully_connected_dim_,
                                        dropout_rate_=dropout_rate_,
                                        layernorm_eps_=layernorm_eps_) 
                           for _ in range(self.num_layers)])

        
        
        self.dropout = nn.Dropout(dropout_rate_)

    
    
    def forward(self, current_batch_of_sequences):
        """
        Forward pass for the Encoder
        
        Arguments:
            current_batch_of_sequences (torch.tensor):    Tensor of shape (batch_size, seq_len)
        Returns:
            x (torch.tensor): Tensor of shape (batch_size, seq_len, embedding_dim)
        """
        mask    = create_padding_mask(current_batch_of_sequences).to(current_batch_of_sequences.device)
        seq_len = current_batch_of_sequences.shape[1]
        
        # Pass input through the Embedding layer
        x = self.embedding(current_batch_of_sequences)  # (batch_size, input_seq_len, embedding_dim)

        
        # Scale embedding by multiplying it by the square root of the embedding dimension
        x = x * torch.sqrt( torch.tensor(self.embedding_dim, dtype=torch.float32).to(x.device) )
                                                                                            # x *= tf.math.sqrt(tf.cast(self.embedding_dim, tf.float32))
        
        # Add the position encoding to embedding
        x += self.pos_encoding[:, :seq_len, :].to(x.device)

        # Pass the encoded embedding through a dropout layer
        # use `training=training`
        x = self.dropout(x)
        
        # Pass the output through the stack of encoding layers 
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, mask)

        return x  # (batch_size, input_seq_len, embedding_dim)

encoder_pseudo = Encoder(
                        num_layers_                = 3,
                        embedding_dim_             = 128,
                        num_heads_                 = 2,
                        fully_connected_dim_       = 150,
                        input_vocab_size_          = 400,
                        maximum_position_encoding_ = 15,
                        dropout_rate_              = 0.1,
                        layernorm_eps_             = 1e-06,
).to(device)


In [73]:
##############################################################################################################################

### Checking Full-Encoder's output layer by layer


In [240]:
class ABC(nn.Module):
    def __init__(self):
        super(ABC, self).__init__()

    def forward(self, current_batch_of_sequences):
        mask = create_padding_mask(current_batch_of_sequences)
        return mask


In [241]:
abc_l = ABC()


In [242]:
create_padding_mask

<function __main__.create_padding_mask(decoder_token_ids)>

In [243]:
a = torch.tensor([[1,2,0],[1,0,0],[4,5,6]]).to(device)
abc_l(a)


tensor([[False, False,  True],
        [False,  True,  True],
        [False, False, False]], device='cuda:0')

In [244]:
torch.sqrt( torch.tensor(a.shape[1]) )


tensor(1.7321)

In [245]:
b = a * torch.sqrt( torch.tensor(a.shape[1], dtype=torch.float32).to(device))
b


tensor([[ 1.7321,  3.4641,  0.0000],
        [ 1.7321,  0.0000,  0.0000],
        [ 6.9282,  8.6603, 10.3923]], device='cuda:0')

In [246]:
b.dtype


torch.float32

In [247]:
embed_out.shape

torch.Size([10, 15, 128])

In [251]:
pos_encoding = positional_encoding(15, 128)
pos_encoding.shape


torch.Size([1, 15, 128])

In [252]:
# automatic broadcasting
n = embed_out + pos_encoding.to(device)
n.shape


torch.Size([10, 15, 128])

In [84]:
# enc_layers = nn.ModuleList[EncoderLayer(
#     128,
#     2,
#     150,
#     dropout_rate_=0.1,
#     layernorm_eps_=1e-06
# ).to(device)
#                    for _ in range(6)]


In [85]:
# enc_layers[0]


In [86]:
# enc_layers[5]


In [87]:
# enc_layers[5](embed_out, mask).shape
# # okayyy


#### Direct output from full-encoder

In [253]:
# Example input tensor of shape (batch_size, num_features)
input_tensor = torch.randint(0, 400, (batch_size, num_features)).to(device)
print('input_tensor', input_tensor.shape)


input_tensor torch.Size([10, 15])


In [254]:
encoder_pseudo(input_tensor).shape
# it is working


torch.Size([10, 15, 128])

In [90]:
##############################################################################################################################

<a name='7'></a>
## 7 - Decoder

Now it is time to implement the decoder. You have seen it in the videos and you can use some help by looking at the encoder implementation above. The Decoder layer takes the K and V matrices generated by the Encoder and computes the second multi-head attention layer with the Q matrix from the output (Figure 3a).

<img src="images/decoder_layer.png" alt="Decoder" width="250"/>
<caption><center><font color='purple'><b>Figure 3a: Transformer Decoder layer</font></center></caption>

<a name='7-1'></a>    
### 7.1 - Decoder Layer
Again, you'll pair multi-head attention with a feed forward neural network, but this time you'll implement two multi-head attention layers. You will also use residual connections and layer normalization to help speed up training (Figure 3a).

<a name='ex-2'></a>    
### Exercise 2 - DecoderLayer
    
Implement `DecoderLayer()` using the `forward()` method
    
1. Block 1 is a multi-head attention layer with a residual connection, and look-ahead mask. Like in the `EncoderLayer`, Dropout is defined within the multi-head attention layer.
2. Block 2 will take into account the output of the Encoder, so the multi-head attention layer will receive K and V from the encoder, and Q from the Block 1. You will then apply a normalization layer and a residual connection, just like you did before with the `EncoderLayer`.
3. Finally, Block 3 is a feed forward neural network with dropout and normalization layers and a residual connection.
    
**Additional Hints:**
* The first two blocks are fairly similar to the EncoderLayer except you will return `attention_scores` when computing self-attention

In [255]:
# GRADED FUNCTION: DecoderLayer
class DecoderLayer(nn.Module):
    """
    The decoder layer is composed by two multi-head attention blocks, 
    one that takes the new input and uses self-attention, and the other 
    one that combines it with the output of the encoder, followed by a
    fully connected block. 
    """
    def __init__(self, embedding_dim, num_heads, fully_connected_dim, dropout_rate=0.1, layernorm_eps=1e-6):
        super(DecoderLayer, self).__init__()

        self.mha1 = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads, dropout=dropout_rate, batch_first=True)        
        self.mha2 = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads, dropout=dropout_rate, batch_first=True) 

        self.fc1  = nn.Linear(in_features=embedding_dim, out_features=fully_connected_dim)
        self.fc2  = nn.Linear(in_features=fully_connected_dim, out_features=embedding_dim)

        self.layernorm1 = nn.BatchNorm1d(num_features=embedding_dim, eps=layernorm_eps)
        self.layernorm2 = nn.BatchNorm1d(num_features=embedding_dim, eps=layernorm_eps)
        self.layernorm3 = nn.BatchNorm1d(num_features=embedding_dim, eps=layernorm_eps)

        self.dropout_ffn = nn.Dropout(dropout_rate)


    
    def forward(self, x, enc_output, look_ahead_mask, padding_mask_dec_query, padding_mask_enc_key):
        """
        Forward pass for the Decoder Layer
        
        Arguments:
            x (torch.tensor):               Tensor of shape (batch_size, target_seq_len, embedding_dim)
            enc_output (torch.tensor):      Tensor of shape(batch_size, input_seq_len, embedding_dim)
            look_ahead_mask (torch.tensor): Boolean mask for the target_input
            padding_mask (torch.tensor):    Boolean mask for the second multihead attention layer
        Returns:
            out3 (torch.tensor):                Tensor of shape (batch_size, target_seq_len, embedding_dim)
            attn_weights_block1 (torch.tensor): Tensor of shape (batch_size, ..............................
            attn_weights_block2 (torch.tensor): Tensor of shape (batch_size, ..............................
        """
        
        ### START CODE HERE ###
        # enc_output.shape == (batch_size, input_seq_len, fully_connected_dim) embeddin_dim
        
        # BLOCK 1
        # calculate self-attention and return attention scores as attn_weights_block1.
        # Dropout will be applied during training (~1 line).
        # if look_ahead_mask != None:
        mult_attn_out1, attn_weights_block1 = self.mha1(query=x, key=x, value=x,
                                                        is_causal        = True,               ##################################################
                                                        attn_mask        = look_ahead_mask,
                                                        key_padding_mask = padding_mask_dec_query)
        # else:
        #     mult_attn_out1, attn_weights_block1 = self.mha1(query=x, key=x, value=x)


        # apply layer normalization (layernorm1) to the sum of the attention output and the input (~1 line)
        # skip_connection
        Q1 = x + mult_attn_out1
        Q1 = Q1.permute(0, 2, 1)      # (batch_size, embedding_dim, input_seq_len)
        Q1 = self.layernorm1(Q1)      # (batch_size, embedding_dim, input_seq_len)
        Q1 = Q1.permute(0, 2, 1)      # (batch_size, input_seq_len, embedding_dim)



        # BLOCK 2
        # calculate self-attention using the Q from the first block and K and V from the encoder output. 
        # Dropout will be applied during training
        # Return attention scores as attn_weights_block2 (~1 line) 
        mult_attn_out2, attn_weights_block2 = self.mha2(query=Q1, key=enc_output, value=enc_output, key_padding_mask=padding_mask_enc_key)

        
        # apply layer normalization (layernorm2) to the sum of the attention output and the output of the first block (~1 line)
        mult_attn_out2 = Q1 + mult_attn_out2
        mult_attn_out2 = mult_attn_out2.permute(0, 2, 1)      # (batch_size, embedding_dim, input_seq_len)
        mult_attn_out2 = self.layernorm2(mult_attn_out2)      # (batch_size, embedding_dim, input_seq_len)
        mult_attn_out2 = mult_attn_out2.permute(0, 2, 1)      # (batch_size, input_seq_len, embedding_dim)

        
        #BLOCK 3
        # pass the output of the multi-head attention layer through a ffn        
        # Dense / Fully connected layers / Feed forward neural network
        ffn_output = self.fc1(mult_attn_out2)
        ffn_output = torch.relu(ffn_output)

        ffn_output = self.fc2(ffn_output)
        ffn_output = torch.relu(ffn_output)        

        # dropout
        ffn_output = self.dropout_ffn(ffn_output)

        
        # apply layer normalization (layernorm3) to the sum of the ffn output and the output of the second block        
        out3 = ffn_output + mult_attn_out2
        out3 = out3.permute(0, 2, 1)      # (batch_size, embedding_dim, input_seq_len)
        out3 = self.layernorm3(out3)      # (batch_size, embedding_dim, input_seq_len)
        out3 = out3.permute(0, 2, 1)      # (batch_size, input_seq_len, embedding_dim)
        
        ### END CODE HERE ###

        return out3, attn_weights_block1, attn_weights_block2
    

In [256]:
# Test your function!
key_dim = 12
n_heads = 6

decoderLayer_test = DecoderLayer(embedding_dim=key_dim, num_heads=n_heads, fully_connected_dim=32).to(device)

q = torch.from_numpy(np.ones((1, 15, key_dim))).to(torch.float32).to(device)
encoder_test_output = torch.from_numpy(np.random.rand(1, 7, 12)).to(torch.float32).to(device)
look_ahead_mask = create_look_ahead_mask(q).to(device)

out, attn_w_b1, attn_w_b2 = decoderLayer_test(q, encoder_test_output, look_ahead_mask, None, None)

print(f"Using embedding_dim={key_dim} and num_heads={n_heads}:\n")
print(f"q has shape:{q.shape}")
print(f"Output of encoder has shape:{encoder_test_output.shape}\n")

print(f"Output of decoder layer has shape:{out.shape}")
print(f"Att Weights Block 1 has shape:{attn_w_b1.shape}")
print(f"Att Weights Block 2 has shape:{attn_w_b2.shape}")


Using embedding_dim=12 and num_heads=6:

q has shape:torch.Size([1, 15, 12])
Output of encoder has shape:torch.Size([1, 7, 12])

Output of decoder layer has shape:torch.Size([1, 15, 12])
Att Weights Block 1 has shape:torch.Size([1, 15, 15])
Att Weights Block 2 has shape:torch.Size([1, 15, 7])


In [93]:
##############################################################################################################

### Checking outputs of decoder-layer

In [257]:
input_tensor.shape, embed_out.shape


(torch.Size([10, 15]), torch.Size([10, 15, 128]))

In [258]:
mha1_l                 =  nn.MultiheadAttention(embed_dim=128, num_heads=2, dropout=0.1, batch_first=True).to('cpu')
mha1_out, mha1_weights =  mha1_l(query=embed_out.to('cpu'), key=embed_out.to('cpu'), value=embed_out.to('cpu'),
                                    is_causal        = True,
                                    attn_mask        = create_look_ahead_mask(input_tensor).to('cpu'),
                                    key_padding_mask = create_padding_mask(input_tensor).to('cpu'))

print('mha1_out: ', mha1_out.shape), print('mha1_weights: ', mha1_weights.shape)


mha1_out:  torch.Size([10, 15, 128])
mha1_weights:  torch.Size([10, 15, 15])


(None, None)

In [96]:
# create_look_ahead_mask(input_tensor)
# create_padding_mask(input_tensor)

#### Checking Decoder Layer's Output Directly

In [259]:
decoderLayer_test = DecoderLayer(
    embedding_dim=128,
    num_heads=2,
    fully_connected_dim=150,
    dropout_rate=0.1,
    layernorm_eps=1e-06,
)


In [260]:
out, attn_w_b1, attn_w_b2 = decoderLayer_test(embed_out.to('cpu'), embed_out.to('cpu'), 
                                              create_look_ahead_mask(input_tensor).to('cpu'),
                                              create_padding_mask(input_tensor).to('cpu'), None)

print('out: ', out.shape), print('attn_w_b1: ', attn_w_b1.shape), print('attn_w_b2: ', attn_w_b2.shape)


out:  torch.Size([10, 15, 128])
attn_w_b1:  torch.Size([10, 15, 15])
attn_w_b2:  torch.Size([10, 15, 15])


(None, None, None)

In [99]:
##############################################################################################################

<a name='7-2'></a> 
### 7.2 - Full Decoder
You're almost there! Time to use your Decoder layer to build a full Transformer Decoder (Figure 3b). You will embed your output and add positional encodings. You will then feed your encoded embeddings to a stack of Decoder layers. 


<img src="images/decoder.png" alt="Decoder" width="300"/>
<caption><center><font color='purple'><b>Figure 3b: Transformer Decoder</font></center></caption>

<a name='ex-3'></a>     
### Exercise 3 - Decoder

Implement `Decoder()` using the `forward()` method to embed your output, add positional encoding, and implement multiple decoder layers.
 
In this exercise, you will initialize your Decoder with an Embedding layer, positional encoding, and multiple DecoderLayers. Your `forward()` method will perform the following steps: 
1. Pass your generated output through the Embedding layer.
2. Scale your embedding by multiplying it by the square root of your embedding dimension. Remember to cast the embedding dimension to data type `torch.float32` before computing the square root.
3. Add the position encoding: self.pos_encoding `[:, :seq_len, :]` to your embedding.
4. Pass the encoded embedding through a dropout layer, remembering to use the `training` parameter to set the model training mode. 
5. Pass the output of the dropout layer through the stack of Decoding layers using a for loop.

In [261]:
# GRADED FUNCTION: Decoder
class Decoder(nn.Module):
    """
    The entire Encoder starts by passing the target input to an embedding layer 
    and using positional encoding to then pass the output through a stack of
    decoder Layers
        
    """ 
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, target_vocab_size,
               maximum_position_encoding, dropout_rate=0.1, layernorm_eps=1e-6):
        super(Decoder, self).__init__()

        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(num_embeddings=target_vocab_size, embedding_dim=self.embedding_dim, padding_idx=0)
        
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.embedding_dim)

        self.dec_layers = nn.ModuleList([DecoderLayer(embedding_dim=self.embedding_dim,
                                        num_heads=num_heads,
                                        fully_connected_dim=fully_connected_dim,
                                        dropout_rate=dropout_rate,
                                        layernorm_eps=layernorm_eps) 
                           for _ in range(self.num_layers)])
       
        self.dropout = nn.Dropout(dropout_rate)



    
    def forward(self, current_batch_of_sequences, enc_input, enc_output):
        """
        Forward  pass for the Decoder
        
        Arguments:
            x (torch.tensor):               Tensor of shape (batch_size, target_seq_len, embedding_dim)
            enc_output (torch.tensor):      Tensor of shape(batch_size, input_seq_len, embedding_dim)
            look_ahead_mask (torch.tensor): Boolean mask for the target_input
            padding_mask (torch.tensor):    Boolean mask for the second multihead attention layer
        Returns:
            x (torch.tensor): Tensor of shape (batch_size, target_seq_len, fully_connected_dim)
            attention_weights (dict[str: tf.Tensor]): Dictionary of tensors containing all the attention weights
                                each of shape Tensor of shape (batch_size, .....................................
        """
        padding_mask_dec_query = create_padding_mask(current_batch_of_sequences).to(current_batch_of_sequences.device)        
        padding_mask_enc_key   = create_padding_mask(enc_input).to(current_batch_of_sequences.device)
        look_ahead_mask        = create_look_ahead_mask(current_batch_of_sequences).to(current_batch_of_sequences.device)
        seq_len                = current_batch_of_sequences.shape[1]

        attention_weights = {}
        
        ### START CODE HERE ###
        # create word embeddings 
        x = self.embedding(current_batch_of_sequences)
        
        # scale embeddings by multiplying by the square root of their dimension
        x = x * torch.sqrt( torch.tensor(self.embedding_dim, dtype=torch.float32).to(x.device) )
        
        # add positional encodings to word embedding
        x += self.pos_encoding [:, :seq_len, :].to(x.device)

        # apply a dropout layer to x
        x = self.dropout(x)
        

        # use a for loop to pass x through a stack of decoder layers and update attention_weights (~4 lines total)
        for i in range(self.num_layers):
            # pass x and the encoder output through a stack of decoder layers and save the attention weights
            # of block 1 and 2 (~1 line)
            x, block1, block2 = self.dec_layers[i](x, enc_output, look_ahead_mask, padding_mask_dec_query, padding_mask_enc_key)

            #update attention_weights dictionary with the attention weights of block 1 and block 2
            attention_weights['decoder_layer{}_block1_self_att'.format(i+1)]   = block1
            attention_weights['decoder_layer{}_block2_decenc_att'.format(i+1)] = block2
        ### END CODE HERE ###
        
        # x.shape == (batch_size, target_seq_len, fully_connected_dim)
        return x, attention_weights



In [102]:
#######################################################################################################################

In [263]:
encoder_pseudo = Encoder(
                        num_layers_                = 5,
                        embedding_dim_             = 128,
                        num_heads_                 = 2,
                        fully_connected_dim_       = 150,
                        input_vocab_size_          = 600,
                        maximum_position_encoding_ = 7,
                        dropout_rate_              = 0.1,
                        layernorm_eps_             = 1e-06,
).to(device)

In [264]:
enc_in = torch.randint(0,100, (10, 7)).to(device)
enc_out = encoder_pseudo(enc_in)
enc_out.shape


torch.Size([10, 7, 128])

In [265]:
input_tensor.shape


torch.Size([10, 15])

In [266]:
decoder_test = Decoder(
    num_layers                = 5,
    embedding_dim             = 128,
    num_heads                 = 2,
    fully_connected_dim       = 150,
    target_vocab_size         = 600,
    maximum_position_encoding = 15,
    dropout_rate              = 0.1,
    layernorm_eps             = 1e-06
).to(device)


In [267]:
outd, att_weights = decoder_test(input_tensor, enc_in, enc_out)
print('outd: ', outd.shape), print('att_weights: ', att_weights['decoder_layer1_block1_self_att'].shape)


outd:  torch.Size([10, 15, 128])
att_weights:  torch.Size([10, 15, 15])


(None, None)

In [108]:
#######################################################################################################################

<a name='8'></a> 
## 8 - Transformer

Phew! This has been quite the assignment! Congratulations! You've done all the hard work, now it's time to put it all together.  

<img src="images/transformer.png" alt="Transformer" width="550"/>
<caption><center><font color='purple'><b>Figure 4: Transformer</font></center></caption>
    
The flow of data through the Transformer Architecture is as follows:
* First your input passes through an Encoder, which is just repeated Encoder layers that you implemented:
    - embedding and positional encoding of your input
    - multi-head attention on your input
    - feed forward neural network to help detect features
* Then the predicted output passes through a Decoder, consisting of the decoder layers that you implemented:
    - embedding and positional encoding of the output
    - multi-head attention on your generated output
    - multi-head attention with the Q from the first multi-head attention layer and the K and V from the Encoder
    - a feed forward neural network to help detect features
* Finally, after the Nth Decoder layer, one dense layer and a softmax are applied to generate prediction for the next output in your sequence.

<a name='ex-4'></a> 
### Exercise 4 - Transformer

Implement `Transformer()` using the `forward()` method
1. Pass the input through the Encoder with the appropiate mask.
2. Pass the encoder output and the target through the Decoder with the appropiate mask.
3. Apply a linear transformation and a softmax to get a prediction.

In [268]:
# GRADED FUNCTION: Transformer
class Transformer(nn.Module):
    """
    Complete transformer with an Encoder and a Decoder
    """
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, input_vocab_size, 
               target_vocab_size, max_positional_encoding_input,
               max_positional_encoding_target, dropout_rate=0.1, layernorm_eps=1e-6):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers_=num_layers,
                               embedding_dim_=embedding_dim,
                               num_heads_=num_heads,
                               fully_connected_dim_=fully_connected_dim,
                               input_vocab_size_=input_vocab_size,
                               maximum_position_encoding_=max_positional_encoding_input,
                               dropout_rate_=dropout_rate,
                               layernorm_eps_=layernorm_eps)

        

        self.decoder = Decoder(num_layers=num_layers, 
                               embedding_dim=embedding_dim,
                               num_heads=num_heads,
                               fully_connected_dim=fully_connected_dim,
                               target_vocab_size=target_vocab_size, 
                               maximum_position_encoding=max_positional_encoding_target,
                               dropout_rate=dropout_rate,
                               layernorm_eps=layernorm_eps)

        self.final_layer = nn.Linear(in_features=embedding_dim, out_features=target_vocab_size)

    
    def forward(self, input_sentence, output_sentence):
        """
        Forward pass for the entire Transformer
        Arguments:
            input_sentence (torch.tensor): Tensor of shape (batch_size, input_seq_len, embedding_dim)
                                          An array of the indexes of the words in the input sentence
            output_sentence (torch.tensor): Tensor of shape (batch_size, target_seq_len, embedding_dim)
                                          An array of the indexes of the words in the output sentence
        Returns:
            final_output (torch.tensor): The final output of the model
            attention_weights (dict[str: torch.tensor]): Dictionary of tensors containing all the attention weights for the decoder
                                each of shape Tensor of shape (batch_size, .....................................................
        """
        ### START CODE HERE ###
        # call self.encoder with the appropriate arguments to get the encoder output
        enc_output = self.encoder(input_sentence)
        
        # call self.decoder with the appropriate arguments to get the decoder output
        # dec_output.shape == (batch_size, tar_seq_len, embedding_dim)
        dec_output, attention_weights = self.decoder(output_sentence, input_sentence, enc_output)
        
        # pass decoder output through a linear layer and log_softmax (~1 line)
        final_output = self.final_layer(dec_output)
        final_output = torch.nn.functional.log_softmax(final_output, dim=-1)
        ### END CODE HERE ###

        return final_output, attention_weights


In [269]:
# Test your function!
n_layers                       = 3
emb_d                          = 34   
n_heads                        = 17
fully_connected_dim            = 8
input_vocab_size               = 300
target_vocab_size              = 350
max_positional_encoding_input  = 12
max_positional_encoding_target = 12

model = Transformer(n_layers, 
    emb_d, 
    n_heads, 
    fully_connected_dim, 
    input_vocab_size, 
    target_vocab_size, 
    max_positional_encoding_input,
    max_positional_encoding_target).to(device)

# 0 is the padding value
sentence_a = torch.from_numpy(np.array([[2, 3, 1, 3, 0, 0, 0]])).to(torch.int).to(device)
sentence_b = torch.from_numpy(np.array([[1, 3, 4, 0, 0, 0, 0]])).to(torch.int).to(device)


test_summary, att_weights = model(
    sentence_a,
    sentence_b
)

print(f"Using num_layers={n_layers}, target_vocab_size={target_vocab_size} and num_heads={n_heads}:\n")
print(f"sentence_a has shape:{sentence_a.shape}")
print(f"sentence_b has shape:{sentence_b.shape}")

print(f"\nOutput of transformer (summary) has shape:{test_summary.shape}\n")
print("Attention weights:")
for name, tensor in att_weights.items():
    print(f"{name} has shape:{tensor.shape}")

Using num_layers=3, target_vocab_size=350 and num_heads=17:

sentence_a has shape:torch.Size([1, 7])
sentence_b has shape:torch.Size([1, 7])

Output of transformer (summary) has shape:torch.Size([1, 7, 350])

Attention weights:
decoder_layer1_block1_self_att has shape:torch.Size([1, 7, 7])
decoder_layer1_block2_decenc_att has shape:torch.Size([1, 7, 7])
decoder_layer2_block1_self_att has shape:torch.Size([1, 7, 7])
decoder_layer2_block2_decenc_att has shape:torch.Size([1, 7, 7])
decoder_layer3_block1_self_att has shape:torch.Size([1, 7, 7])
decoder_layer3_block2_decenc_att has shape:torch.Size([1, 7, 7])


##### __Expected Output__

```
Using num_layers=3, target_vocab_size=350 and num_heads=17:

sentence_a has shape:(1, 7)
sentence_b has shape:(1, 7)

Output of transformer (summary) has shape:(1, 7, 350)

Attention weights:
decoder_layer1_block1_self_att has shape:(1, 17, 7, 7)
decoder_layer1_block2_decenc_att has shape:(1, 17, 7, 7)
decoder_layer2_block1_self_att has shape:(1, 17, 7, 7)
decoder_layer2_block2_decenc_att has shape:(1, 17, 7, 7)
decoder_layer3_block1_self_att has shape:(1, 17, 7, 7)
decoder_layer3_block2_decenc_att has shape:(1, 17, 7, 7)
```

<a name='9'></a>
## 9 - Initialize the Model
Now that you have defined the model, you can initialize and train it. First you can initialize the model with the parameters below. Note that generally these models are much larger and you are using a smaller version to fit this environment and to be able to train it in just a few minutes.

The base model described in the original Transformer paper used `num_layers=6`, `embedding_dim=512`, and `fully_connected_dim=2048`.

In [270]:
# Define the model parameters
num_layers                 = 2
embedding_dim              = 128
fully_connected_dim        = 128
num_heads                  = 2
positional_encoding_length = 256              ##################### okay tu yani k max seq len 256 tak hum rakhain gay

# Initialize the model
model = Transformer(
    num_layers, 
    embedding_dim, 
    num_heads, 
    fully_connected_dim,
    vocab_size, 
    vocab_size, 
    positional_encoding_length, 
    positional_encoding_length,
).to(device)

# Transformer(
#     num_layers,
#     embedding_dim,
#     num_heads,
#     fully_connected_dim,
#     input_vocab_size,
#     target_vocab_size,
#     max_positional_encoding_input,
#     max_positional_encoding_target,
#     dropout_rate=0.1,
#     layernorm_eps=1e-06,
# )


<a name='11'></a>
## 11 - Summarization

The last thing you will implement is inference. With this, you will be able to produce actual summaries of the documents. You will use a simple method called greedy decoding, which means you will predict one word at a time and append it to the output. You will start with an `[SOS]` token and repeat the word by word inference until the model returns you the `[EOS]` token or until you reach the maximum length of the sentence (you need to add this limit, otherwise a poorly trained model could give you infinite sentences without ever producing the `[EOS]` token.

<a name='ex-5'></a> 
### Exercise 5 - next_word
Write a helper function that predicts the next word, so you can use it to write the whole sentences. Hint: this is very similar to what happens in the train_step, but you have to set the training of the model to False.

In [280]:
model = torch.load('transformer_based_text_summarizer.pt')


In [272]:
# GRADED FUNCTION: next_word
def next_word(model, encoder_input, output):
    """
    Helper function for summarization that uses the model to predict just the next word.
    Arguments:
        encoder_input (torch.tensor): Input data to summarize
        output (torch.tensor): (incomplete) target (summary)
    Returns:
        predicted_id (tf.Tensor): The id of the predicted word
    """

    # Run the prediction of the next word with the transformer model
    predictions, attention_weights = model(encoder_input, output)
    ### END CODE HERE ###

    predictions  = predictions[: ,-1:, :]
    predicted_id = torch.argmax(predictions, axis=-1).to(torch.int32)
    
    return predicted_id
    

Check if your function works.


In [276]:
# Take a random sentence as an input
input_document = tokenizer.texts_to_sequences(["a random sentence"])
input_document = tf.keras.preprocessing.sequence.pad_sequences(input_document, maxlen=encoder_maxlen, padding='post', truncating='post')
encoder_input  = tf.expand_dims(input_document[0], 0).numpy()

encoder_input = torch.from_numpy(encoder_input).to(torch.int32).to('cpu')


# Take the start of sentence token as the only token in the output to predict the next word
output = tf.expand_dims([tokenizer.word_index["[SOS]"]], 0).numpy()
output = torch.from_numpy(output).to(torch.int32).to('cpu')

model.eval()
# predict the next word with your function
predicted_token = next_word(model.to('cpu'), encoder_input, output)
print(f"Predicted token: {predicted_token}")

predicted_word = tokenizer.sequences_to_texts(predicted_token.to('cpu').numpy())[0]
print(f"Predicted word: {predicted_word}")


Predicted token: tensor([[25543]], dtype=torch.int32)
Predicted word: clasess


In [278]:
model.to(device)

def summarize(model, input_document):
    """
    A function for summarization using the transformer model
    Arguments:
        input_document (tf.Tensor): Input data to summarize
    Returns:
        _ (str): The summary of the input_document
    """    
    input_document = tokenizer.texts_to_sequences([input_document])
    input_document = tf.keras.preprocessing.sequence.pad_sequences(input_document, maxlen=encoder_maxlen, padding='post', truncating='post')
    encoder_input  = tf.expand_dims(input_document[0], 0).numpy()
    encoder_input  = torch.tensor(encoder_input).to(torch.int32).to(device)
    
    output = tf.expand_dims([tokenizer.word_index["[SOS]"]], 0).numpy()
    output = torch.tensor(output).to(torch.int32).to(device)
    
    model.eval()
    for i in range(decoder_maxlen):

        predicted_id = next_word(model, encoder_input, output)
        output       = torch.cat((output, predicted_id), dim=-1)
        
        if predicted_id == tokenizer.word_index["[EOS]"]:
            break

    return tokenizer.sequences_to_texts(output.to('cpu').numpy())[0]  # since there is just one translated document


Now you can already summarize a sentence! But beware, since the model was not yet trained at all, it will just produce nonsense.

In [281]:
training_set_example = 0

# Check a summary of a document from the training set
print('Training set example:')
print(document[training_set_example])
print('\nHuman written summary:')
print(summary[training_set_example])
print('\nModel written summary:')
summarize(model, document[training_set_example])


Training set example:
[SOS] amanda: i baked  cookies. do you want some?  jerry: sure!  amanda: i'll bring you tomorrow :-) [EOS]

Human written summary:
[SOS] amanda baked cookies and will bring jerry some tomorrow. [EOS]

Model written summary:


'[SOS] amanda baked cookies and amanda will bring some cookies tomorrow [EOS]'

# Model Training

In [288]:
for (batch, (inp, tar)) in enumerate(dataset):
    if batch >=2:
        break


In [289]:
inp = torch.tensor(inp.numpy()).to(torch.int32).to(device)
tar = torch.tensor(tar.numpy()).to(torch.int32).to(device)
inp.shape, tar.shape


(torch.Size([64, 150]), torch.Size([64, 50]))

In [290]:
inp

tensor([[   7, 2395,    5,  ...,    0,    0,    0],
        [   7,  630,  307,  ...,    0,    0,    0],
        [   7, 1894,   30,  ...,    0,    0,    0],
        ...,
        [   7, 9076,   17,  ...,    0,    0,    0],
        [   7,  423, 1020,  ...,    0,    0,    0],
        [   7,  399,   20,  ...,  125,  274,   34]], device='cuda:0',
       dtype=torch.int32)

In [291]:
preds, _ = model(inp, tar)
print('preds: ', preds.shape)


preds:  torch.Size([64, 50, 34250])


In [292]:
criterion = nn.NLLLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Ensure outputs is of type Float
outputs = preds.float().clone()
outputs = outputs.reshape(-1, preds.shape[2])
print('outputs: ', outputs.shape)

# Ensure targets is of type Long
targets = tar.long().clone()
targets = targets.reshape(-1)
print('targets: ', targets.shape)

optimizer.zero_grad()

# Calculate loss
loss = criterion(outputs, targets)
loss.backward()

print(loss.item())


outputs:  torch.Size([3200, 34250])
targets:  torch.Size([3200])
10.202282905578613


In [293]:
64*50

3200

In [294]:
tar

tensor([[    7,   286,    70,  ...,     0,     0,     0],
        [    7,   307,    15,  ...,     0,     0,     0],
        [    7, 17173,    67,  ...,     0,     0,     0],
        ...,
        [    7,     3,  6944,  ...,     0,     0,     0],
        [    7,   423,  5817,  ...,     0,     0,     0],
        [    7,   399,    11,  ...,     0,     0,     0]], device='cuda:0',
       dtype=torch.int32)

In [295]:
tar[:, :-1] # tar_inp | to decoder i guess


tensor([[    7,   286,    70,  ...,     0,     0,     0],
        [    7,   307,    15,  ...,     0,     0,     0],
        [    7, 17173,    67,  ...,     0,     0,     0],
        ...,
        [    7,     3,  6944,  ...,     0,     0,     0],
        [    7,   423,  5817,  ...,     0,     0,     0],
        [    7,   399,    11,  ...,     0,     0,     0]], device='cuda:0',
       dtype=torch.int32)

In [296]:
tar[:, 1:] # tar_real | to loss fn

tensor([[  286,    70,  5217,  ...,     0,     0,     0],
        [  307,    15,    21,  ...,     0,     0,     0],
        [17173,    67,  3521,  ...,     0,     0,     0],
        ...,
        [    3,  6944,    33,  ...,     0,     0,     0],
        [  423,  5817,   649,  ...,     0,     0,     0],
        [  399,    11,  1387,  ...,     0,     0,     0]], device='cuda:0',
       dtype=torch.int32)

In [297]:
tokenizer.index_word[7]

'[SOS]'

In [129]:
# # error expected, explicitly hum nay hard code nahi kia tha, padding index
# tokenizer.index_word[0]

# Training

In [131]:
# Define the model parameters
num_layers                 = 2
embedding_dim              = 128
fully_connected_dim        = 128
num_heads                  = 2
positional_encoding_length = 256              ##################### okay tu yani k max seq len 256 tak hum rakhain gay

# Initialize the model
model = Transformer(
    num_layers, 
    embedding_dim, 
    num_heads, 
    fully_connected_dim,
    vocab_size, 
    vocab_size, 
    positional_encoding_length, 
    positional_encoding_length,
).to(device)

criterion = nn.NLLLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters())


In [138]:
model = torch.load('transformer based summarizer.pt')


In [139]:
model.to(device)

test_example  = 0
true_summary  = summary_test[test_example]
true_document = document_test[test_example]


# Training Hyperparameters
num_epochs    = 10 #20
learning_rate = 0.001
batch_size    = 64

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


pad_idx    = 0
criterion  = nn.NLLLoss(ignore_index=pad_idx)
optimizer  = optim.Adam(model.parameters())



for epoch in range(num_epochs):
    print(f'Epoch [{epoch+1} / {num_epochs}]')

    start = time.time()

    running_loss = 0
    model.train()
    for (batch, (inp, tar)) in enumerate(dataset):

        inp1 = torch.tensor(inp.numpy()).long().clone().to(device)
        tar1 = torch.tensor(tar.numpy()).long().clone().to(device)

        # print('inp_data1: ', inp_data1.shape)
        # print('target1: ',   target1.shape)

        preds, _ = model(inp1, tar1[:, :-1])

        # Ensure outputs is of type Float
        outputs = preds.float().clone()
        outputs = outputs.reshape(-1, preds.shape[2])
        # print('outputs: ', outputs.shape)
        
        # Ensure targets is of type Long
        targets = tar1[:, 1:].long().clone()
        targets = targets.reshape(-1)
        # print('targets: ', targets.shape)


        
        optimizer.zero_grad()
        loss = criterion(outputs, targets)

        loss.backward()

        optimizer.step()


        running_loss = running_loss+loss.item()
        avg_running_loss = running_loss/(batch+1)

        if (batch+1)%20 == 0:
            print()
            print(f'{batch+1}: ', avg_running_loss)
            print()
        
        if (batch+1) >= len(dataset):
            break
    
    print (f'Time taken for one epoch: {time.time() - start} sec')
    print('Example summarization on the test set:')
    print('  True summarization:')
    print(f'    {true_summary}')
    print('  Predicted summarization:')
    
    model.eval()
    print(f'    {summarize(model, true_document)}\n')



Epoch [1 / 10]

20:  3.6275198578834535


40:  3.676102066040039


60:  3.722142791748047


80:  3.754060631990433


100:  3.7766186881065367


120:  3.7964085976282758


140:  3.818076319353921


160:  3.8356647089123728


180:  3.8478218966060216


200:  3.8601041400432585


220:  3.873857092857361

Time taken for one epoch: 58.92952537536621 sec
Example summarization on the test set:
  True summarization:
    [SOS] hannah needs betty's number but amanda doesn't have it. she needs to contact larry. [EOS]
  Predicted summarization:
    [SOS] amanda and hannah are going to the same place on the same place [EOS]

Epoch [2 / 10]

20:  3.5403417229652403


40:  3.5622637033462525


60:  3.579885896046956


80:  3.5937197685241697


100:  3.597118332386017


120:  3.605500167608261


140:  3.6236435277121406


160:  3.6328338205814363


180:  3.641001776854197


200:  3.6505386698246003


220:  3.658218042417006

Time taken for one epoch: 58.73113822937012 sec
Example summarization on the 

In [140]:
# torch.save(model, 'transformer_based_text_summarizer.pt')

<a name='13'></a>
# 13 - Summarize some Sentences!

Below you can see an example of summarization of a sentence from the training set and a sentence from the test set. See if you notice anything interesting about them!

In [141]:
model = torch.load('transformer_based_text_summarizer.pt')


In [142]:
training_set_example = 0

# Check a summary of a document from the training set
print('Training set example:')
print(document[training_set_example])
print('\nHuman written summary:')
print(summary[training_set_example])
print('\nModel written summary:')
print(summarize(model, document[training_set_example]))


Training set example:
[SOS] amanda: i baked  cookies. do you want some?  jerry: sure!  amanda: i'll bring you tomorrow :-) [EOS]

Human written summary:
[SOS] amanda baked cookies and will bring jerry some tomorrow. [EOS]

Model written summary:
[SOS] amanda baked cookies and amanda will bring some cookies tomorrow [EOS]


In [143]:
test_set_example = 3

# Check a summary of a document from the test set
print('Test set example:')
print(document_test[test_set_example])
print('\nHuman written summary:')
print(summary_test[test_set_example])
print('\nModel written summary:')
print(summarize(model, document_test[test_set_example]))


Test set example:
[SOS] will: hey babe, what do you want for dinner tonight?  emma:  gah, don't even worry about it tonight  will: what do you mean? everything ok?  emma: not really, but it's ok, don't worry about cooking though, i'm not hungry  will: well what time will you be home?  emma: soon, hopefully  will: you sure? maybe you want me to pick you up?  emma: no no it's alright. i'll be home soon, i'll tell you when i get home.   will: alright, love you.   emma: love you too.  [EOS]

Human written summary:
[SOS] emma will be home soon and she will let will know. [EOS]

Model written summary:
[SOS] emma will be home in 15 minutes [EOS]


If you critically examine the output of the model, you can notice a few things:
 - In the training set the model output is (almost) identical to the real output (already after 20 epochs and even more so with more epochs). This might be because the training set is relatively small and the model is relatively big and has thus learned the sentences in the training set by heart (overfitting).
 - While the performance on the training set looks amazing, it is not so good on the test set. The model overfits, and fails to generalize. Again an easy candidate to blame is the small training set and a comparatively large model, but there might be a variety of other factors.
 - Look at the test set example 3 and its summarization. Would you summarize it the same way as it is written here? Sometimes the data may be ambiguous. And the training of **your model can only be as good as your data**.

Here you only use a small dataset, to show that something can be learned in a reasonable amount of time in a relatively small environment. Generally, large transformers are trained on more than one task and on very large quantities of data to achieve superb performance. You will learn more about this in the rest of this course.

**Congratulations on finishing this week's assignment!** You did a lot of work and now you should have a better understanding of the Transformers and their building blocks (encoder and decoder) and how they can be used for text summarization. And remember: you dont need to change much to use the same model for a translator, just change the dataset and it should work!

**Keep it up!**

In [299]:
test_set_example      = '[SOS] he woke up at 5 am. he drank some water. he offered his prayer. he started to study. he completed his work. it was a productive day [EOS]'

# Check a summary of a document from the test set
print('Test set example:')
print(test_set_example)

print('\nModel written summary:')
print(summarize(model, test_set_example))


Test set example:
[SOS] he woke up at 5 am. he drank some water. he offered his prayer. he started to study. he completed his work. it was a productive day [EOS]

Model written summary:
[SOS] heidi was at work [EOS]
