# Transformer 구현하기

## 필요한 라이브러리 import하기

In [5]:
%%capture
!pip install gdown
!pip install transformers
!pip install sentencepiece # MarianTokenizer 불러올 때 필요
!pip install sacremoses # MarianMTModel 에서 불러올 때 warning 뜨는 것 방지
!pip install einops # 지리는 einops 쓰기 (Einstein operations)

In [6]:
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount('/content/drive')
import torch
from torch import nn, optim
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from transformers import MarianMTModel, MarianTokenizer
import pandas as pd
from tqdm import tqdm
import math
from einops import rearrange

# Pick the compute device once at start-up: use the GPU when PyTorch can see
# one, otherwise fall back to the CPU. The choice is printed for the notebook log.
if torch.cuda.is_available():
  DEVICE = "cuda"
else:
  DEVICE = "cpu"
print(DEVICE)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
cpu


## 하이퍼 파라미터

In [1]:
# Model hyper-parameters (presumably passed to the Encoder defined below).
n_layers = 3   # number of stacked layers
d_model = 256  # width of the token representation
d_ff = 512     # hidden width of the feed-forward sub-layer
n_heads = 8    # number of attention heads
# NOTE(review): the model classes name their dropout argument `drop_p`;
# `drop_o` may be a typo -- confirm against the code that consumes it.
drop_o = 0.1   # dropout probability

# 모델

## Multi-Head Attention

In [7]:
class MHA(nn.Module):
  """Multi-head scaled dot-product attention.

  Q, K and V are linearly projected, split into `n_heads` heads of size
  `d_model // n_heads`, attended independently, merged back and passed
  through a final output projection.

  Args:
    d_model: Size of the last (feature) dimension of the inputs.
    n_heads: Number of attention heads; must divide `d_model`.

  Raises:
    ValueError: If `d_model` is not divisible by `n_heads`.
  """

  def __init__(self, d_model, n_heads):
    super().__init__()
    if d_model % n_heads != 0:
      raise ValueError(
          f"d_model ({d_model}) must be divisible by n_heads ({n_heads})")

    self.n_heads = n_heads
    self.d_head = d_model // n_heads

    self.fc_q = nn.Linear(d_model, d_model)
    self.fc_k = nn.Linear(d_model, d_model)
    self.fc_v = nn.Linear(d_model, d_model)

    # Output projection that mixes the information gathered by the heads.
    self.fc_o = nn.Linear(d_model, d_model)

    # Scores are divided by sqrt(per-head dimension).
    self.scale = torch.sqrt(torch.tensor(d_model / n_heads))

  def _split_heads(self, x):
    """Reshape (..., length, d_model) into (..., n_heads, length, d_head)."""
    return x.unflatten(-1, (self.n_heads, self.d_head)).transpose(-3, -2)

  def forward(self, Q, K, V, mask=None):
    """Attend from the queries to the keys/values.

    Args:
      Q: Query tensor of shape (..., len_q, d_model).
      K: Key tensor of shape (..., len_k, d_model).
      V: Value tensor of shape (..., len_k, d_model).
      mask: Optional boolean tensor, True where attention is NOT allowed.
        It must broadcast to (..., n_heads, len_q, len_k); a mask without
        the head axis, i.e. (..., len_q, len_k), is also accepted.

    Returns:
      A tuple `(x, attention_weights)`: `x` has shape (..., len_q, d_model)
      and `attention_weights` has shape (..., n_heads, len_q, len_k).
    """
    Q = self._split_heads(self.fc_q(Q))
    K = self._split_heads(self.fc_k(K))
    V = self._split_heads(self.fc_v(V))

    # Swap the last two axes of K to get per-head (len_q, len_k) scores.
    attention_score = Q @ K.transpose(-2, -1) / self.scale

    # NOTE: for time-series inputs it is unclear whether the encoder needs
    # masking at all; the mask is therefore optional.
    if mask is not None:
      if mask.dim() == attention_score.dim() - 1:
        # Mask has no head axis: insert one so it applies to every head.
        mask = mask.unsqueeze(-3)
      # masked_fill broadcasts, unlike boolean-index assignment.
      attention_score = attention_score.masked_fill(mask, -1e10)

    attention_weights = torch.softmax(attention_score, dim=-1)

    attention = attention_weights @ V

    # Merge the heads back: (..., n_heads, len_q, d_head) -> (..., len_q, d_model).
    x = attention.transpose(-3, -2).flatten(-2)
    x = self.fc_o(x)
    return x, attention_weights

class FeedForward(nn.Module):
  """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

  Args:
    d_model: Size of the input and output feature dimension.
    d_ff: Size of the hidden feature dimension.
    drop_p: Dropout probability applied after the activation.
  """

  def __init__(self, d_model, d_ff, drop_p):
    super().__init__()

    self.linear = nn.Sequential(
        nn.Linear(d_model, d_ff),
        nn.ReLU(),
        nn.Dropout(drop_p),
        nn.Linear(d_ff, d_model),  # was `nn,Linear`, which raised NameError
    )

  def forward(self, x):
    """Apply the network to the last dimension of `x`; the shape is preserved."""
    return self.linear(x)



## Encoder

In [8]:
class EncoderLayer(nn.Module):
  """One encoder layer: self-attention and feed-forward sub-layers.

  Each sub-layer's output goes through dropout, is added to its input
  (residual connection) and is then layer-normalised.

  Args:
    d_model: Feature dimension of the layer input and output.
    d_ff: Hidden dimension of the feed-forward sub-layer.
    n_heads: Number of attention heads passed to `MHA`.
    drop_p: Dropout probability.
  """

  def __init__(self, d_model, d_ff, n_heads, drop_p):
    # `super` must be called; the bare `super.__init__()` raised TypeError.
    super().__init__()

    self.self_atten = MHA(d_model, n_heads)
    self.self_atten_LN = nn.LayerNorm(d_model)

    self.FF = FeedForward(d_model, d_ff, drop_p)
    self.FF_LN = nn.LayerNorm(d_model)

    self.dropout = nn.Dropout(drop_p)

  def forward(self, x, enc_mask):
    """Run the layer.

    Args:
      x: Input tensor whose last dimension is `d_model`.
      enc_mask: Mask forwarded unchanged to the self-attention module
        (may be None).

    Returns:
      Tensor with the same shape as `x`.
    """
    # MHA returns (output, attention_weights); only the output feeds the
    # residual branch, so the weights are discarded here.
    residual, _ = self.self_atten(x, x, x, enc_mask)
    residual = self.dropout(residual)
    x = self.self_atten_LN(x + residual)

    residual = self.FF(x)
    residual = self.dropout(residual)
    x = self.FF_LN(x + residual)

    return x

class Encoder(nn.Module):
  """Stack of encoder layers on top of scaled input + learned position embeddings.

  Args:
    input_embedding: Module mapping `src` to vectors of size `d_model`.
    max_len: Number of positions in the learned position-embedding table.
    n_layers: Number of `EncoderLayer` instances to stack.
    d_model: Feature dimension.
    d_ff: Hidden dimension of each layer's feed-forward sub-layer.
    n_heads: Number of attention heads per layer.
    drop_p: Dropout probability.
  """

  def __init__(self, input_embedding, max_len, n_layers, d_model, d_ff, n_heads, drop_p):
    super().__init__()

    # Cast to float first: torch.sqrt on an integer tensor is not supported
    # by every PyTorch version.
    self.scale = torch.sqrt(torch.tensor(float(d_model)))
    self.input_embedding = input_embedding
    self.pos_embedding = nn.Embedding(max_len, d_model)

    self.dropout = nn.Dropout(drop_p)

    self.layers = nn.ModuleList(
        [EncoderLayer(d_model, d_ff, n_heads, drop_p) for _ in range(n_layers)])

  def forward(self, src, mask, atten_map_save=False):
    """Encode `src`.

    Args:
      src: Input whose dimension 1 is the sequence length (must not exceed
        the `max_len` given at construction).
      mask: Mask forwarded unchanged to every layer (may be None).
      atten_map_save: Accepted for interface compatibility; currently unused.

    Returns:
      The output of the last encoder layer.
    """
    # TODO: for time-series inputs, reconsider the position-embedding dimensions.
    # Build the position indices on the input's own device (not a global one);
    # the (length, d_model) embedding broadcasts over the batch dimension.
    pos = torch.arange(src.shape[1], device=src.device)

    x = self.scale * self.input_embedding(src) + self.pos_embedding(pos)
    x = self.dropout(x)

    for layer in self.layers:
      x = layer(x, mask)

    return x  # final encoder output

