 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DoranLyong/Awesome-Tensor-Architecture/blob/main/pytorch_reference/NYU-DL/12-Attention-and-Transformer/01-Attention-and-Transformer.ipynb)

## Transformers 
* [page](https://atcold.github.io/pytorch-Deep-Learning/en/week12/12-3/)
* [code review](https://github.com/Atcold/pytorch-Deep-Learning/blob/master/15-transformer.ipynb)

In [1]:
import numpy as np 

import torch 
import torch.nn as nn 
import torch.nn.functional as F 

In [2]:
# (ref) https://jeongwookie.github.io/2020/03/24/200324-pytorch-cuda-gpu-allocate/
# (ref) https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device

GPU_NUM = 0 # 원하는 GPU 번호 입력
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')
torch.cuda.set_device(device) # change allocation of current GPU
print (f'Current cuda device number: {torch.cuda.current_device()}') # check

# Additional Infos
if device.type == 'cuda':
    print(torch.cuda.get_device_name(GPU_NUM))
    print('Memory Usage:')
    print(f'Allocated: {round(torch.cuda.memory_allocated(GPU_NUM)/1024**3,1)} GB')
    print(f'Cached: {round(torch.cuda.memory_reserved(GPU_NUM)/1024**3,1)} GB')

Current cuda device number: 0
NVIDIA TITAN Xp
Memory Usage:
Allocated: 0.0 GB
Cached: 0.0 GB


In [3]:
nn_Softargmax = nn.Softmax  # (ref) https://pytorch.org/docs/stable/generated/torch.nn.Softmax.html

### Multi-head attention 

In [4]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, p, d_input=None):
        super(MultiHeadAttention, self).__init__()

        self.num_heads = num_heads
        self.d_model = d_model

        if d_input is None:
            d_xq = d_xk = d_xv = d_model  # query = key = value 
        else:
            d_xq, d_xk, d_xv = d_input    

        assert d_model % self.num_heads == 0 # Make sure that the embedding dimension of model is a multiple of number of heads

        self.d_k = d_model // self.num_heads
        
        # === These are still of dimension d_model. They will be split into number of heads === # 
        self.W_q = nn.Linear(d_xq, d_model, bias=False)
        self.W_k = nn.Linear(d_xk, d_model, bias=False)
        self.W_v = nn.Linear(d_xv, d_model, bias=False)
        
        # === Outputs of all sub-layers need to be of dimension d_model === #
        self.W_h = nn.Linear(d_model, d_model)        


    def scaled_dot_product_attention(self, Q, K, V):
        batch_size = Q.size(0) 
        k_length = K.size(-2) 
        
        # === Scaling by d_k so that the soft(arg)max doesnt saturate === # 
        Q = Q / np.sqrt(self.d_k)                      # (bs, n_heads, q_length, dim_per_head)
        scores = torch.matmul(Q, K.transpose(2,3))     # (bs, n_heads, q_length, k_length)
        
        A = nn_Softargmax(dim=-1)(scores)   # (bs, n_heads, q_length, k_length)
        
        # === Get the weighted average of the values === # 
        H = torch.matmul(A, V)     # (bs, n_heads, q_length, dim_per_head)

        return H, A 

        
    def split_heads(self, x, batch_size):
        """
        Split the last dimension into (heads X depth)
        Return after transpose to put in shape (batch_size X num_heads X seq_length X d_k)
        """
        return x.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

    def group_heads(self, x, batch_size):
        """
        Combine the heads again to get (batch_size X seq_length X (num_heads times d_k))
        """
        return x.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_k)
    

    def forward(self, X_q, X_k, X_v):
        batch_size, seq_length, dim = X_q.size()

        # === After transforming, split into num_heads === # 
        Q = self.split_heads(self.W_q(X_q), batch_size)  # (bs, n_heads, q_length, dim_per_head)
        K = self.split_heads(self.W_k(X_k), batch_size)  # (bs, n_heads, k_length, dim_per_head)
        V = self.split_heads(self.W_v(X_v), batch_size)  # (bs, n_heads, v_length, dim_per_head)
        
        # === Calculate the attention weights for each of the heads === # 
        H_cat, A = self.scaled_dot_product_attention(Q, K, V)
        
        # === Put all the heads back together by concat === # 
        H_cat = self.group_heads(H_cat, batch_size)    # (bs, q_length, dim)
        
        # === Final linear layer === # 
        H = self.W_h(H_cat)          # (bs, q_length, dim)
        
        return H, A                

In [5]:
# Check 

temp_mha = MultiHeadAttention(d_model=512, num_heads=8, p=0)

def print_out(Q, K, V):
    temp_out, temp_attn = temp_mha.scaled_dot_product_attention(Q, K, V)

    print('Attention weights are:', temp_attn.squeeze())
    print('Output is:', temp_out.squeeze())

To check our self attention works - 
* if the query matches with one of the key values, it should have all the attention focused there, with the value returned as the value at that index

In [6]:
test_K = torch.tensor( [[10, 0, 0],
                        [ 0,10, 0],
                        [ 0, 0,10],
                        [ 0, 0,10]]).float()[None,None]  # unsqueeze; (4, 3) -> (1, 1, 4, 3)

test_V = torch.tensor( [[   1,0,0],
                        [  10,0,0],
                        [ 100,5,0],
                        [1000,6,0]]).float()[None,None]


# Case1 
test_Q = torch.tensor( [[0, 10, 0]]).float()[None,None]

print_out(test_Q, test_K, test_V) # Query, Key, Value 

Attention weights are: tensor([3.7266e-06, 9.9999e-01, 3.7266e-06, 3.7266e-06])
Output is: tensor([1.0004e+01, 4.0993e-05, 0.0000e+00])


* We can see that it focuses on the ```second``` ```key```  (watch attention weights)
* and returns the ```second value```. <br/>

If we give a ```query``` that matches ```two keys exactly```, it should return the ```averaged value``` of the two values for those two keys.



In [7]:
# Case2 
test_Q = torch.tensor([[0, 0, 10]]).float()  
print_out(test_Q, test_K, test_V)

Attention weights are: tensor([1.8633e-06, 1.8633e-06, 5.0000e-01, 5.0000e-01])
Output is: tensor([549.9979,   5.5000,   0.0000])


We see that it focuses equally on the third and fourth key and returns the average of their values.

Now giving all the queries at the same time:

In [8]:
test_Q = torch.tensor( [[0, 0, 10], 
                        [0, 10, 0], 
                        [10, 10, 0]]).float()[None,None]

print_out(test_Q, test_K, test_V)

Attention weights are: tensor([[1.8633e-06, 1.8633e-06, 5.0000e-01, 5.0000e-01],
        [3.7266e-06, 9.9999e-01, 3.7266e-06, 3.7266e-06],
        [5.0000e-01, 5.0000e-01, 1.8633e-06, 1.8633e-06]])
Output is: tensor([[5.5000e+02, 5.5000e+00, 0.0000e+00],
        [1.0004e+01, 4.0993e-05, 0.0000e+00],
        [5.5020e+00, 2.0497e-05, 0.0000e+00]])


***
### 1D convolution with ```kernel_size=1```
This is basically an MLP with one hidden layer and ReLU activation applied to each and every element in the set.

In [9]:
class CNN(nn.Module):
    def __init__(self, d_model, hidden_dim, p):
        super(CNN, self).__init__()

        self.k1convL1 = nn.Linear(in_features = d_model, out_features = hidden_dim)
        self.k1convL2 = nn.Linear(hidden_dim, d_model)

        self.activation = nn.ReLU()

    def forward(self, x):
        x = self.k1convL1(x)
        x = self.activation(x)
        x = self.k1convL2(x)
        return x

### Transformer encoder
all components for our Transformer Encoder block 

In [10]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, conv_hidden_dim, p=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads, p)
        self.cnn = CNN(d_model, conv_hidden_dim, p)

        self.layernorm1 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.layernorm2 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
    
    def forward(self, x):
        
        # === Multi-head attention 
        attn_output, _ = self.mha(x, x, x)  # (batch_size, input_seq_len, d_model)
        
        # === Layer norm after adding the residual connection 
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
        
        # === Feed forward 
        cnn_output = self.cnn(out1)  # (batch_size, input_seq_len, d_model)
        
        # === Second layer norm after adding residual connection 
        out2 = self.layernorm2(out1 + cnn_output)  # (batch_size, input_seq_len, d_model)

        return out2

### Encoder (refer to [Attention Is All You Need](https://youtu.be/iDulhoQ2pro))
#### Blocks of N Encoder Layers + Positional encoding + Input embedding

```Self-attention``` by itself does not have any recurrence or convolutions so to make it ```sensitive``` to ```position``` we must provide additional positional encodings. <br/>
These are calculated as follows:

\begin{aligned}
E(p, 2i)    &= \sin(p / 10000^{2i / d}) \\
E(p, 2i+1) &= \cos(p / 10000^{2i / d})
\end{aligned}

In [11]:
def create_sinusoidal_embeddings(nb_p, dim, E):
    
    E.requires_grad = False   
    E.detach_() # detach_() ; in-palce version of detach() # (ref) https://subinium.github.io/pytorch-Tensor-Variable/
                                                            # (ref) https://pytorch.org/docs/stable/generated/torch.Tensor.detach.html
                # Why do we call .detach() before calling .numpy() on a Pytorch Tensor? ; (ref) https://stackoverflow.com/questions/63582590/why-do-we-call-detach-before-calling-numpy-on-a-pytorch-tensor

    theta = np.array([ [p / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for p in range(nb_p)])
    E[:, 0::2] = torch.FloatTensor(np.sin(theta[:, 0::2]))
    E[:, 1::2] = torch.FloatTensor(np.cos(theta[:, 1::2]))
    
    E = E.to(device)



class Embeddings(nn.Module):
    def __init__(self, d_model, vocab_size, max_position_embeddings, p):
        super(Embeddings, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, d_model, padding_idx=1)   # (ref) https://wikidocs.net/64779
        self.position_embeddings = nn.Embedding(max_position_embeddings, d_model)

        create_sinusoidal_embeddings(   nb_p=max_position_embeddings,
                                        dim=d_model,
                                        E=self.position_embeddings.weight
                                    )

        self.LayerNorm = nn.LayerNorm(d_model, eps=1e-12)

    def forward(self, input_ids):
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)                      # (bs, max_seq_length)
        
        # === Get word embeddings for each input id
        word_embeddings = self.word_embeddings(input_ids)                   # (bs, max_seq_length, dim)
        
        # === Get position embeddings for each position id 
        position_embeddings = self.position_embeddings(position_ids)        # (bs, max_seq_length, dim)
        
        # === Add them both 
        embeddings = word_embeddings + position_embeddings  # (bs, max_seq_length, dim)
        
        # === Layer norm 
        embeddings = self.LayerNorm(embeddings)             # (bs, max_seq_length, dim)

        return embeddings

In [12]:
class Encoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, ff_hidden_dim, input_vocab_size, maximum_position_encoding, p=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = Embeddings(d_model, input_vocab_size,maximum_position_encoding, p)

        self.enc_layers = nn.ModuleList()

        for _ in range(num_layers):
            self.enc_layers.append(EncoderLayer(d_model, num_heads, ff_hidden_dim, p))
        
    def forward(self, x):
        x = self.embedding(x) # Transform to (batch_size, input_seq_length, d_model)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x)

        return x  # (batch_size, input_seq_len, d_model)

***
### IMDB 리뷰 감성 분류 (Movie Review Sentiment Analysis) - ([ref](https://wikidocs.net/60691)) ([ref2](https://pytorch.org/tutorials/beginner/transformer_tutorial.html)) ([pytorch-sentiment-analysis](https://github.com/bentrevett/pytorch-sentiment-analysis))

In [13]:
import torchtext.legacy.data as data  # (ref) https://stackoverflow.com/questions/66516388/attributeerror-module-torchtext-data-has-no-attribute-field
import torchtext.legacy.datasets as datasets

### Data Loading 

In [14]:
# Define Field

max_len = 200

TEXT = data.Field(sequential=True, fix_length=max_len, batch_first=True, lower=True, dtype=torch.long)
LABEL = data.LabelField(sequential=False, dtype=torch.long)

In [15]:
# Build Datasets

datasets.IMDB.download('./')
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')
print(f'train.fields: {train_data.fields}')

Number of training examples: 25000
Number of testing examples: 25000
train.fields: {'text': <torchtext.legacy.data.field.Field object at 0x7fd160385a30>, 'label': <torchtext.legacy.data.field.LabelField object at 0x7fd160385d60>}


In [16]:
train_data, val_data = train_data.split(0.9)


print(f"train: {len(train_data)}")
print(f"valid: {len(val_data)}")
print(f"test: {len(test_data)}")


train: 22500
valid: 2500
test: 25000


In [17]:
# Build Vocabulary Bag
num_words = 50_000

TEXT.build_vocab(train_data, max_size=num_words)
LABEL.build_vocab(train_data)

vocab = TEXT.vocab

In [18]:
# Build DataLoader (=BucketIterator)

batch_size = 164
train_loader, valid_loader, test_loader = data.BucketIterator.splits(   (train_data, val_data, test_data), 
                                                                        batch_size=batch_size, 
                                                                        sort_key=lambda x: len(x.text), 
                                                                        repeat=False
                                                                    )

### Model Design

In [19]:
class TransformerClassifier(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, conv_hidden_dim, input_vocab_size, num_answers):
        super(TransformerClassifier, self).__init__()
        
        self.encoder = Encoder( num_layers, d_model, num_heads, conv_hidden_dim, input_vocab_size,
                                maximum_position_encoding=10000)
        self.dense = nn.Linear(d_model, num_answers)

    def forward(self, x):
        x = self.encoder(x)
        
        x, _ = torch.max(x, dim=1)
        x = self.dense(x)
        return x

In [20]:
model = TransformerClassifier(  num_layers=1, d_model=32, num_heads=2, 
                                conv_hidden_dim=128, input_vocab_size=50002, num_answers=2)
model.to(device)

TransformerClassifier(
  (encoder): Encoder(
    (embedding): Embeddings(
      (word_embeddings): Embedding(50002, 32, padding_idx=1)
      (position_embeddings): Embedding(10000, 32)
      (LayerNorm): LayerNorm((32,), eps=1e-12, elementwise_affine=True)
    )
    (enc_layers): ModuleList(
      (0): EncoderLayer(
        (mha): MultiHeadAttention(
          (W_q): Linear(in_features=32, out_features=32, bias=False)
          (W_k): Linear(in_features=32, out_features=32, bias=False)
          (W_v): Linear(in_features=32, out_features=32, bias=False)
          (W_h): Linear(in_features=32, out_features=32, bias=True)
        )
        (cnn): CNN(
          (k1convL1): Linear(in_features=32, out_features=128, bias=True)
          (k1convL2): Linear(in_features=128, out_features=32, bias=True)
          (activation): ReLU()
        )
        (layernorm1): LayerNorm((32,), eps=1e-06, elementwise_affine=True)
        (layernorm2): LayerNorm((32,), eps=1e-06, elementwise_affine=True)
   

### Training Loop

In [21]:
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
epochs = 10
t_total = len(train_loader) * epochs

In [22]:
def train(train_loader, valid_loader):
    
    for epoch in range(epochs):
        train_iterator, valid_iterator = iter(train_loader), iter(valid_loader)
        nb_batches_train = len(train_loader)
        train_acc = 0
        model.train()
        losses = 0.0

        for batch in train_iterator:
            x = batch.text.to(device)
            y = batch.label.to(device)
            
            out = model(x)  # ①

            loss = F.cross_entropy(out, y)  # ②
            
            model.zero_grad()  # ③

            loss.backward()  # ④
            losses += loss.item()

            optimizer.step()  # ⑤
                        
            train_acc += (out.argmax(1) == y).cpu().numpy().mean()
        
        print(f"Training loss at epoch {epoch} is {losses / nb_batches_train}")
        print(f"Training accuracy: {train_acc / nb_batches_train}")
        print('Evaluating on validation:')
        evaluate(valid_loader)

In [23]:
def evaluate(data_loader):
    data_iterator = iter(data_loader)
    nb_batches = len(data_loader)
    model.eval()
    acc = 0 
    for batch in data_iterator:
        x = batch.text.to(device)
        y = batch.label.to(device)
                
        out = model(x)
        acc += (out.argmax(1) == y).cpu().numpy().mean()

    print(f"Eval accuracy: {acc / nb_batches}")

In [24]:
train(train_loader, valid_loader)

Training loss at epoch 0 is 0.6787079024142113
Training accuracy: 0.5716297720042416
Evaluating on validation:
Eval accuracy: 0.6575457317073171
Training loss at epoch 1 is 0.5932481729465983
Training accuracy: 0.6924818840579708
Evaluating on validation:
Eval accuracy: 0.708689024390244
Training loss at epoch 2 is 0.5249270358379337
Training accuracy: 0.7443995227995754
Evaluating on validation:
Eval accuracy: 0.7384146341463416
Training loss at epoch 3 is 0.4583083831745645
Training accuracy: 0.7894353128313891
Evaluating on validation:
Eval accuracy: 0.7781631097560975
Training loss at epoch 4 is 0.38871721944947174
Training accuracy: 0.8303894927536232
Evaluating on validation:
Eval accuracy: 0.7872713414634145
Training loss at epoch 5 is 0.3227095889008563
Training accuracy: 0.8645999027925064
Evaluating on validation:
Eval accuracy: 0.796798780487805
Training loss at epoch 6 is 0.2737854348792546
Training accuracy: 0.8886587575114883
Evaluating on validation:
Eval accuracy: 0.798

In [25]:
evaluate(test_loader)

Eval accuracy: 0.8132693908638432
