### Импорт модулей


In [60]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import logging
from typing import Optional

# Module-level logger registered under the name "my_transformer".
logger = logging.getLogger("my_transformer")

# Select the CUDA device when torch reports one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Positional Encoding

Формулы для расчета:
$$PE_{(pos,\, 2i)} = \sin\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$
$$PE_{(pos,\, 2i+1)} = \cos\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$

In [56]:
def positional_encoding(max_len: int, d_model: int) -> torch.Tensor:
    """Build the sinusoidal positional-encoding table.

    Even columns hold ``sin(pos / 10000^(2i / d_model))`` and odd columns hold
    the cosine of the same angle.

    Args:
        max_len: Number of positions (rows of the table).
        d_model: Embedding size (columns of the table); may be odd.

    Returns:
        A float32 tensor of shape ``[max_len, d_model]``.
    """
    positions = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)
    denom = torch.pow(torch.tensor(10_000.0), torch.arange(0, d_model, 2) / d_model)
    # One angle per (position, frequency) pair: [max_len, ceil(d_model / 2)].
    angles = positions / denom

    pos_encoding = torch.zeros(max_len, d_model, dtype=torch.float32)
    pos_encoding[:, 0::2] = torch.sin(angles)
    # With an odd d_model there is one fewer cosine column than sine columns,
    # so drop the last frequency instead of failing to broadcast.
    pos_encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])

    return pos_encoding

In [59]:
# Demo: evaluate the encoding table for max_len=10 and d_model=8.
positional_encoding(10, 8)

tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
          9.9995e-01,  1.0000e-03,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
          9.9980e-01,  2.0000e-03,  1.0000e+00],
        [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
          9.9955e-01,  3.0000e-03,  1.0000e+00],
        [-7.5680e-01, -6.5364e-01,  3.8942e-01,  9.2106e-01,  3.9989e-02,
          9.9920e-01,  4.0000e-03,  9.9999e-01],
        [-9.5892e-01,  2.8366e-01,  4.7943e-01,  8.7758e-01,  4.9979e-02,
          9.9875e-01,  5.0000e-03,  9.9999e-01],
        [-2.7942e-01,  9.6017e-01,  5.6464e-01,  8.2534e-01,  5.9964e-02,
          9.9820e-01,  6.0000e-03,  9.9998e-01],
        [ 6.5699e-01,  7.5390e-01,  6.4422e-01,  7.6484e-01,  6.9943e-02,
          9.9755e-01,  6.9999e-03,  9.9998e-01],
        [ 9.8936

### Attention

$$Attention(Q, K, V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$

In [None]:
def attention(
    Q: torch.Tensor,
    K: torch.Tensor,
    V: torch.Tensor,
    dropout: Optional[nn.Dropout] = None,
    mask: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Compute scaled dot-product attention.

    The computation only touches the last two dimensions, so any number of
    leading batch/head dimensions is accepted.

    Args:
        Q: Queries, shape ``[..., m_q, d_k]``.
        K: Keys, shape ``[..., m_k, d_k]``.
        V: Values, shape ``[..., m_k, d_v]``.
        dropout: Optional dropout module applied to the attention weights.
        mask: Optional tensor broadcastable to ``[..., m_q, m_k]``; positions
            where ``mask == 0`` are excluded from attention.

    Returns:
        A tuple ``(values, attention_weights)`` where ``values`` has shape
        ``[..., m_q, d_v]`` and ``attention_weights`` has shape
        ``[..., m_q, m_k]`` (after dropout, when dropout is given).
    """
    d_k = K.size(-1)
    # A plain Python float keeps the scale independent of any global device,
    # so the function works for inputs on CPU and GPU alike.
    scale = d_k ** 0.5
    attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / scale

    if mask is not None:
        # A large negative score becomes ~0 probability after the softmax.
        attention_scores = attention_scores.masked_fill(mask == 0, -1e9)

    attention_weights = F.softmax(attention_scores, dim=-1)

    if dropout is not None:
        attention_weights = dropout(attention_weights)

    values = torch.matmul(attention_weights, V)

    return values, attention_weights


In [61]:
def split_heads(m_tensor: torch.Tensor, num_heads: int) -> torch.Tensor:
    """Split the model dimension into attention heads.

    Args:
        m_tensor: Input of shape ``[batch_size, max_len, d_model]``.
        num_heads: Number of heads; ``d_model`` must be divisible by it.

    Returns:
        A tensor of shape ``[batch_size, num_heads, max_len, head_dim]`` with
        ``head_dim = d_model // num_heads``.
    """
    batch_size, max_len, d_model = m_tensor.size()
    head_dim = d_model // num_heads

    m_tensor = m_tensor.view(batch_size, max_len, num_heads, head_dim)

    # transpose() is not in-place: its result must be returned so that the
    # head axis sits before the sequence axis.
    return m_tensor.transpose(1, 2)

tensor([[[[1, 2],
          [3, 4]],

         [[5, 6],
          [7, 8]],

         [[1, 2],
          [3, 4]],

         [[5, 6],
          [7, 8]]]])

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention built on ``split_heads`` and ``attention``.

    Queries, keys and values are linearly projected, split into heads,
    attended per head, merged back and passed through an output projection.
    """

    def __init__(self, d_model: int, num_heads: int, dropout: float = 0.1):
        """Create the projection layers.

        Args:
            d_model: Model (embedding) size.
            num_heads: Number of attention heads; must divide ``d_model``.
            dropout: Dropout probability applied to the attention weights.

        Raises:
            ValueError: If ``d_model`` is not divisible by ``num_heads``.
        """
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)

        self.W_o = nn.Linear(d_model, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(
        self,
        Q: torch.Tensor,
        K: torch.Tensor,
        V: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
    ):
        """Apply multi-head attention.

        Args:
            Q: Queries, shape ``[batch_size, len_q, d_model]``.
            K: Keys, shape ``[batch_size, len_k, d_model]``.
            V: Values, shape ``[batch_size, len_k, d_model]``.
            mask: Optional mask; zeros mark positions to exclude. A 3-D mask
                ``[batch_size, len_q, len_k]`` is shared by all heads.

        Returns:
            A tuple ``(output, attention_weights)``; ``output`` has shape
            ``[batch_size, len_q, d_model]``.
        """
        Q = self.W_q(Q)
        K = self.W_k(K)
        V = self.W_v(V)
        batch_size = Q.size(0)
        d_model = Q.size(-1)

        Q = split_heads(Q, self.num_heads)
        K = split_heads(K, self.num_heads)
        V = split_heads(V, self.num_heads)

        if mask is not None and mask.dim() == 3:
            # Insert a head axis so one mask broadcasts over every head.
            mask = mask.unsqueeze(1)

        # Use the mask passed to forward(); the module stores no mask attribute.
        output, attention_weights = attention(
            Q, K, V, dropout=self.dropout, mask=mask
        )
        # Merge heads back; this expects the per-head layout
        # [batch_size, num_heads, len_q, head_dim].
        output = output.permute(0, 2, 1, 3).reshape(batch_size, -1, d_model)
        output = self.W_o(output)

        return output, attention_weights


## LayerNorm

In [None]:
class LayerNorm(nn.Module):
    """Layer normalization over the last (feature) dimension.

    Each feature vector is normalized independently to zero mean and unit
    variance, then scaled by ``gamma`` and shifted by ``beta``.
    """

    def __init__(self, d_model: int, eps: float = 1e-5):
        """Create the learnable scale and shift.

        Args:
            d_model: Size of the last dimension of the inputs.
            eps: Small constant added to the variance for numerical stability.
        """
        super().__init__()
        self.d_model = d_model
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize ``x`` along its last dimension.

        Args:
            x: Tensor of shape ``[..., d_model]``.

        Returns:
            A tensor of the same shape as ``x``.
        """
        # Statistics are per feature vector, not over the whole batch;
        # keepdim=True lets them broadcast back against x.
        mean = x.mean(dim=-1, keepdim=True)
        # Biased variance stays finite even when d_model == 1.
        var = x.var(dim=-1, keepdim=True, unbiased=False)

        x_normalized = (x - mean) / torch.sqrt(var + self.eps)
        return self.gamma * x_normalized + self.beta

## FFN

In [67]:
class FFN(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model: int, d_out: int, dropout: float = 0.1):
        """Create the two linear layers, the activation and the dropout.

        Args:
            d_model: Input and output feature size.
            d_out: Hidden feature size between the two linear layers.
            dropout: Dropout probability applied after the activation.
        """
        super().__init__()
        self.W_1 = nn.Linear(in_features=d_model, out_features=d_out)
        self.relu = nn.ReLU()
        self.W_2 = nn.Linear(in_features=d_out, out_features=d_model)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x: torch.tensor):
        """Apply the block to ``x`` and return a tensor whose last dimension is ``d_model``."""
        hidden = self.relu(self.W_1(x))
        hidden = self.dropout(hidden)
        return self.W_2(hidden)

## Полезные ссылки:

* ["*On Layer Normalization in the Transformer Architecture*"](https://arxiv.org/pdf/2002.04745.pdf)
* ["*Attention is all you Need*"](https://ailab-ua.github.io/courses/resources/Attention_Vaswani_2017.pdf)
* ["*The Illustrated Transformer by J. Alammar*"](https://jalammar.github.io/illustrated-transformer/)