# Transformer Tutorial

[Attention Is All You Need](https://arxiv.org/abs/1706.03762) 논문 관련 실습

---

### Goal
1. Self Attention & Multi-Head Attention 을 이해한다.
2. Transformer 를 pytorch 로 구현한다.

### Pre-requisite
1. DL 에 대한 기본적인 이해
2. RNN 에 대한 기본적인 이해 (RNN, LSTM, GRU, Seq2Seq, ...)
3. Pytorch Library 사용법

### Reference
- reference : [The Illustrated Transformer](https://jalammar.github.io/illustrated-transformer/).

## Step I. Self Attention

i. Scaled Dot-Product Attention
ii. Multi-Head Attention

In [2]:
import torch
from torch import nn
from torch.nn import functional as F

In [None]:
class ScaledDotProductAttentionLayer(nn.Module):
    """Scaled Dot-Product Attention Layer.
    
    """
    def __init__(self, d_k: int, d_v: int, dropout: float = 0.1) -> None:
        super().__init__()
        self.d_k = d_k
        self.d_v = d_v
        
        # Query, Key, Value 에 대한 Linear Transformation
        self.k_linear = nn.Linear(d_k, d_k)
        self.q_linear = nn.Linear(d_k, d_k)
        self.v_linear = nn.Linear(d_v, d_v)

        # 
        self.out = nn.Linear(d_v, d_v)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
        # Encoding
        # (batch_size, seq_len, d_k) -> (batch_size, seq_len, d_k)
        k = self.k_linear(k)
        q = self.q_linear(q)
        # (batch_size, seq_len, d_b) -> (batch_size, seq_len, d_b)
        v = self.v_linear(v)
        
        # attn_probs : (batch_size, seq_len, seq_len)
        attn_probs = torch.matmul(q, k.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32))
        attn_probs = F.softmax(attn_probs, dim=-1)
        attn_probs = self.dropout(attn_probs)
        
        # (batch_size, seq_len, seq_len) x (batch_size, seq_len, d_v) -> (batch_size, seq_len, d_v)
        attn = torch.matmul(attn_probs, v)
        attn = self.out(attn)
        return attn

In [31]:
class MultiHeadAttentionLayer(nn.Module):
    """Multi-Head Attention layer.
    
    Args:
        d_embed: dimension of embedding
        num_heads: number of heads
    
    Note:
        `d_model` is
    """
    def __init__(self, d_model: int, num_heads: int, dropout: float = 0.1):
        super().__init__() 

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # Query, Key, Value 에 대한 Multi-Head Linear Transformation
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        
        self.out = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: torch.Tensor | None = None,
    ) -> torch.Tensor:
        batch_size = query.size(0)

        # Perform linear and split int `num_heads` heads.
        # batch_size * seq_len * num_heads * d_model
        k = self.k_linear(key).view(batch_size, -1, self.num_heads, self.d_k)
        q = self.q_linear(query).view(batch_size, -1, self.num_heads, self.d_k)
        v = self.v_linear(value).view(batch_size, -1, self.num_heads, self.d_k)

        # Transpose to get dimensions batch_size * num_heads * seq_len * d_model
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        scores = torch.matmul(q, k.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32))
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        scores = F.softmax(scores, dim=-1)
        scores = torch.matmul(scores, v)

        # batch_size * seq_len * num_heads * d_model
        scores = scores.transpose(1, 2)

        concat = scores.contiguous().view(batch_size, -1, self.d_model)
        return self.out(concat)