# Transformer 구현하기

## 필요한 라이브러리 import하기

In [2]:
%%capture
!pip install gdown
!pip install transformers
!pip install sentencepiece # MarianTokenizer 불러올 때 필요
!pip install sacremoses # MarianMTModel 에서 불러올 때 warning 뜨는 것 방지
!pip install einops # 지리는 einops 쓰기 (Einstein operations)

In [3]:
# Notebook setup: plotting, Google Drive access, PyTorch, Marian models and helpers.
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount('/content/drive')  # mount Google Drive at /content/drive (Colab only)
import torch
from torch import nn, optim
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from transformers import MarianMTModel, MarianTokenizer
import pandas as pd
from tqdm import tqdm
import math
from einops import rearrange

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

Mounted at /content/drive
cpu


## 하이퍼 파라미터

In [4]:
# Model hyper-parameters (module-level constants read by later cells).
n_layers = 3    # number of stacked encoder / decoder layers
d_model = 256   # presumably the embedding width passed as `d_model` below
d_ff = 512      # presumably the feed-forward hidden width passed as `d_ff` below
n_heads = 8     # presumably the attention head count passed as `n_heads` below
drop_o = 0.1    # NOTE(review): the classes below call this `drop_p` -- confirm the name

# 모델

## Multi-Head Attention

In [5]:
class MHA(nn.Module):
  """Multi-head scaled dot-product attention.

  The inputs are projected with three linear layers, split into `n_heads`
  heads of width `d_model // n_heads`, attended independently per head,
  merged back to `d_model` and mixed by a final linear layer.

  Args:
    d_model: Width of the input and output features.
    n_heads: Number of attention heads; must divide `d_model`.

  Raises:
    ValueError: If `d_model` is not divisible by `n_heads`.
  """

  def __init__(self, d_model, n_heads):
    super().__init__()
    if d_model % n_heads != 0:
      raise ValueError(
          f"d_model ({d_model}) must be divisible by n_heads ({n_heads})")

    self.n_heads = n_heads
    self.d_head = d_model // n_heads

    self.fc_q = nn.Linear(d_model, d_model)
    self.fc_k = nn.Linear(d_model, d_model)
    self.fc_v = nn.Linear(d_model, d_model)

    # Output projection that mixes the information gathered by each head.
    self.fc_o = nn.Linear(d_model, d_model)

    # Scores are divided by sqrt(per-head width).
    self.scale = torch.sqrt(torch.tensor(d_model / n_heads))

  def _split_heads(self, x):
    """Reshape (batch, seq, d_model) into (batch, n_heads, seq, d_head)."""
    batch, seq_len, _ = x.shape
    return x.reshape(batch, seq_len, self.n_heads, self.d_head).transpose(1, 2)

  def forward(self, Q, K, V, mask=None):
    """Attend from `Q` over `K` / `V`.

    Args:
      Q: Query tensor of shape (batch, q_len, d_model).
      K: Key tensor of shape (batch, k_len, d_model).
      V: Value tensor of shape (batch, k_len, d_model).
      mask: Optional boolean tensor broadcastable to
        (batch, n_heads, q_len, k_len); positions that are True are
        excluded from attention.

    Returns:
      A tuple `(x, attention_weights)` where `x` has shape
      (batch, q_len, d_model) and `attention_weights` has shape
      (batch, n_heads, q_len, k_len).
    """
    Q = self._split_heads(self.fc_q(Q))
    K = self._split_heads(self.fc_k(K))
    V = self._split_heads(self.fc_v(V))

    # Transpose the last two dims of K so that Q @ K^T yields (q_len, k_len).
    attention_score = Q @ K.transpose(-2, -1) / self.scale

    # NOTE: for time-series inputs it is unclear whether the encoder needs a mask.
    if mask is not None:
      # Out-of-place fill keeps autograd happy and broadcasts over heads/queries.
      attention_score = attention_score.masked_fill(
          mask.to(attention_score.device), -1e10)

    attention_weights = torch.softmax(attention_score, dim=-1)

    attention = attention_weights @ V  # (batch, n_heads, q_len, d_head)

    # Merge the heads back into a single d_model-wide feature vector.
    batch, _, q_len, _ = attention.shape
    x = attention.transpose(1, 2).reshape(batch, q_len, self.n_heads * self.d_head)
    x = self.fc_o(x)
    return x, attention_weights

class FeedForward(nn.Module):
  """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

  Args:
    d_model: Width of the input and output features.
    d_ff: Width of the hidden layer.
    drop_p: Dropout probability applied after the activation.
  """

  def __init__(self, d_model, d_ff, drop_p):
    super().__init__()

    self.linear = nn.Sequential(
        nn.Linear(d_model, d_ff),
        nn.ReLU(),
        nn.Dropout(drop_p),
        nn.Linear(d_ff, d_model),  # project back to d_model
    )

  def forward(self, x):
    """Apply the network to the last dimension of `x` and return the result."""
    return self.linear(x)



## Encoder

In [6]:
class EncoderLayer(nn.Module):
  """One encoder layer: self-attention and feed-forward, each followed by
  dropout, a residual connection and layer normalisation.

  Args:
    d_model: Feature width.
    d_ff: Hidden width of the feed-forward sub-layer.
    n_heads: Number of attention heads.
    drop_p: Dropout probability.
  """

  def __init__(self, d_model, d_ff, n_heads, drop_p):
    super().__init__()

    self.self_atten = MHA(d_model, n_heads)
    self.self_atten_LN = nn.LayerNorm(d_model)

    self.FF = FeedForward(d_model, d_ff, drop_p)
    self.FF_LN = nn.LayerNorm(d_model)

    self.dropout = nn.Dropout(drop_p)

  def forward(self, x, enc_mask):
    """Run the layer on `x`, masking attention with `enc_mask`.

    Returns the transformed tensor; the attention weights returned by the
    attention module are discarded.
    """
    # MHA returns (output, attention_weights); only the output feeds the residual.
    residual, _ = self.self_atten(x, x, x, enc_mask)
    residual = self.dropout(residual)
    x = self.self_atten_LN(x + residual)

    residual = self.FF(x)
    residual = self.dropout(residual)
    x = self.FF_LN(x + residual)

    return x

class Encoder(nn.Module):
  """Stack of encoder layers on top of token + learned position embeddings.

  Args:
    input_embedding: Module mapping token ids to `d_model`-wide vectors.
    max_len: Number of positions in the learned position embedding.
    n_layers: Number of `EncoderLayer` instances to stack.
    d_model: Feature width.
    d_ff: Hidden width of each feed-forward sub-layer.
    n_heads: Number of attention heads per layer.
    drop_p: Dropout probability.
  """

  def __init__(self, input_embedding, max_len, n_layers, d_model, d_ff, n_heads, drop_p):
    super().__init__()

    self.scale = torch.sqrt(torch.tensor(d_model))
    self.input_embedding = input_embedding
    self.pos_embedding = nn.Embedding(max_len, d_model)

    self.dropout = nn.Dropout(drop_p)

    self.layers = nn.ModuleList(
        [EncoderLayer(d_model, d_ff, n_heads, drop_p) for _ in range(n_layers)])

  def forward(self, src, mask, atten_map_save=False):
    """Encode `src`.

    Args:
      src: Token-id tensor of shape (batch, seq_len).
      mask: Attention mask forwarded unchanged to every layer.
      atten_map_save: Accepted for interface compatibility; currently unused.

    Returns:
      The output of the last layer (or the embedded input when there are
      no layers).
    """
    # TODO: for time-series inputs, reconsider the position-embedding dimensions.
    # Build the position ids on the same device as the input so the lookup
    # works regardless of where the model was placed.
    pos = torch.arange(src.shape[1], device=src.device).repeat(src.shape[0], 1)

    x = self.scale * self.input_embedding(src) + self.pos_embedding(pos)
    x = self.dropout(x)

    for layer in self.layers:
      x = layer(x, mask)

    return x  # final encoder output



## Decoder

In [10]:
class DecoderLayer(nn.Module):
  """One decoder layer: masked self-attention, encoder-decoder attention and
  feed-forward, each followed by dropout, a residual connection and layer
  normalisation.

  Args:
    d_model: Feature width.
    d_ff: Hidden width of the feed-forward sub-layer.
    n_heads: Number of attention heads.
    drop_p: Dropout probability.
  """

  def __init__(self, d_model, d_ff, n_heads, drop_p):
    super().__init__()

    self.self_atten = MHA(d_model, n_heads)
    self.self_atten_LN = nn.LayerNorm(d_model)

    self.enc_dec_atten = MHA(d_model, n_heads)
    self.enc_dec_atten_LN = nn.LayerNorm(d_model)

    self.FF = FeedForward(d_model, d_ff, drop_p)
    self.FF_LN = nn.LayerNorm(d_model)

    self.dropout = nn.Dropout(drop_p)

  def forward(self, x, enc_out, dec_mask, enc_dec_mask):
    """Run the layer.

    Args:
      x: Decoder-side input tensor.
      enc_out: Encoder output used as keys and values in cross-attention.
      dec_mask: Mask for the decoder self-attention.
      enc_dec_mask: Mask for the encoder-decoder attention.

    Returns:
      The transformed decoder tensor; attention weights are discarded.
    """
    # MHA returns (output, attention_weights); only the output is used here.
    residual, _ = self.self_atten(x, x, x, dec_mask)
    residual = self.dropout(residual)
    x = self.self_atten_LN(x + residual)

    # Q comes from the decoder, K and V come from the encoder.
    residual, _ = self.enc_dec_atten(x, enc_out, enc_out, enc_dec_mask)
    residual = self.dropout(residual)
    x = self.enc_dec_atten_LN(x + residual)

    residual = self.FF(x)
    residual = self.dropout(residual)
    x = self.FF_LN(x + residual)

    return x

class Decoder(nn.Module):
  """Stack of decoder layers on top of token + learned position embeddings,
  followed by a linear projection to vocabulary logits.

  Args:
    input_embedding: Embedding module mapping token ids to `d_model`-wide
      vectors; its `num_embeddings` sets the width of the output projection.
    max_len: Number of positions in the learned position embedding.
    n_layers: Number of `DecoderLayer` instances to stack.
    d_model: Feature width.
    d_ff: Hidden width of each feed-forward sub-layer.
    n_heads: Number of attention heads per layer.
    drop_p: Dropout probability.
  """

  def __init__(self, input_embedding, max_len, n_layers, d_model, d_ff, n_heads, drop_p):
    super().__init__()

    self.scale = torch.sqrt(torch.tensor(d_model))
    self.input_embedding = input_embedding
    self.pos_embedding = nn.Embedding(max_len, d_model)

    self.dropout = nn.Dropout(drop_p)

    self.layers = nn.ModuleList(
        [DecoderLayer(d_model, d_ff, n_heads, drop_p) for _ in range(n_layers)])

    # The output vocabulary is the one the shared embedding was built for, so
    # take its size from the embedding instead of an undeclared global.
    # TODO: for time-series targets this projection needs to be replaced.
    self.fc_out = nn.Linear(d_model, input_embedding.num_embeddings)

  def forward(self, trg, enc_out, dec_mask, enc_dec_mask, atten_map_save=False):
    """Decode `trg` against `enc_out`.

    Args:
      trg: Token-id tensor of shape (batch, seq_len).
      enc_out: Encoder output forwarded to every layer.
      dec_mask: Mask for decoder self-attention, forwarded to every layer.
      enc_dec_mask: Mask for encoder-decoder attention, forwarded to every layer.
      atten_map_save: Accepted for interface compatibility; currently unused.

    Returns:
      Logits of shape (batch, seq_len, num_embeddings).
    """
    # Position ids live on the input's device. TODO: revisit for time series.
    pos = torch.arange(trg.shape[1], device=trg.device).repeat(trg.shape[0], 1)

    # Scaling the token embedding makes it weigh more than the position signal.
    x = self.scale * self.input_embedding(trg) + self.pos_embedding(pos)
    x = self.dropout(x)

    for layer in self.layers:
      x = layer(x, enc_out, dec_mask, enc_dec_mask)

    x = self.fc_out(x)

    return x



## 모델

In [None]:
class Transformer(nn.Module):
  """Encoder-decoder Transformer with a token embedding shared by both sides.

  The mask helpers compare token ids against a module-level `pad_idx`, which
  must be defined before the model is called.

  Args:
    vocab_size: Number of rows in the shared token embedding.
    max_len: Number of positions in each learned position embedding.
    ne_layers: Number of layers in both the encoder and the decoder
      (parameter name kept as-is for backward compatibility).
    d_model: Feature width.
    d_ff: Hidden width of the feed-forward sub-layers.
    n_heads: Number of attention heads.
    drop_p: Dropout probability.
  """

  def __init__(self, vocab_size, max_len, ne_layers, d_model, d_ff, n_heads, drop_p):
    super().__init__()

    self.input_embedding = nn.Embedding(vocab_size, d_model)

    # Use the constructor argument, not the notebook-level n_layers global.
    self.encoder = Encoder(self.input_embedding, max_len, ne_layers, d_model, d_ff, n_heads, drop_p)
    self.decoder = Decoder(self.input_embedding, max_len, ne_layers, d_model, d_ff, n_heads, drop_p)

    self.n_heads = n_heads

    # Xavier-initialise every sub-module weight with more than one dimension;
    # 1-D weights are left at their defaults.
    # Xavier's variance is 2 / (fan_in + fan_out), i.e. comparatively small,
    # which suits sigmoid/tanh by limiting vanishing gradients.
    for m in self.modules():
      if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight)

  def make_enc_mask(self, src):
    """Return a (batch, n_heads, src_len, src_len) mask, True at padded keys."""
    # TODO: for time-series inputs the padding value must be unified to pad_idx.
    enc_mask = (src == pad_idx).unsqueeze(1).unsqueeze(2)
    enc_mask = enc_mask.repeat(1, self.n_heads, src.shape[1], 1)

    return enc_mask

  def make_dec_mask(self, trg):
    """Return a (batch, n_heads, trg_len, trg_len) mask that is True at padded
    keys and at future positions (key index greater than query index)."""
    trg_len = trg.shape[1]

    trg_pad_mask = (trg == pad_idx).unsqueeze(1).unsqueeze(2)
    trg_pad_mask = trg_pad_mask.repeat(1, self.n_heads, trg_len, 1)

    # Zeros above the diagonal of the lower-triangular matrix mark the future.
    trg_future_mask = torch.tril(torch.ones(trg_len, trg_len, device=trg.device)) == 0

    dec_mask = trg_pad_mask | trg_future_mask

    return dec_mask

  # Backward-compatible alias for the original (misspelled) method name.
  make_dec_maks = make_dec_mask

  def make_enc_dec_mask(self, src, trg):
    """Return a (batch, n_heads, trg_len, src_len) mask, True at padded
    source keys."""
    enc_dec_mask = (src == pad_idx).unsqueeze(1).unsqueeze(2)
    enc_dec_mask = enc_dec_mask.repeat(1, self.n_heads, trg.shape[1], 1)

    return enc_dec_mask

  def forward(self, src, trg):
    """Encode `src`, decode `trg` against it and return the decoder output."""
    enc_mask = self.make_enc_mask(src)
    dec_mask = self.make_dec_mask(trg)  # built from the target, not the source
    enc_dec_mask = self.make_enc_dec_mask(src, trg)

    enc_out = self.encoder(src, enc_mask)
    out = self.decoder(trg, enc_out, dec_mask, enc_dec_mask)

    return out




