https://github.com/bentrevett/pytorch-seq2seq/blob/master/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

In [2]:
seed=1234

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic=True

In [3]:
spacy_de=spacy.load('de_core_news_sm')
spacy_en=spacy.load('en_core_web_sm')

In [4]:
def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [5]:
SRC=Field(tokenize=tokenize_de, init_token='', eos_token='',lower=True)
TRG=Field(tokenize=tokenize_en, init_token='',eos_token='',lower=True)

In [6]:
train_data,valid_data,test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC,TRG))#exts는 source와 target으로 사용할 언어를 지정

downloading training.tar.gz


C:\Users\Lee_JongWoo\private\20221115\.data\multi30k\training.tar.gz: 100%|████████| 1.21M/1.21M [00:06<00:00, 194kB/s]


downloading validation.tar.gz


C:\Users\Lee_JongWoo\private\20221115\.data\multi30k\validation.tar.gz: 100%|█████| 46.3k/46.3k [00:00<00:00, 64.0kB/s]


downloading mmt_task1_test2016.tar.gz


C:\Users\Lee_JongWoo\private\20221115\.data\multi30k\mmt_task1_test2016.tar.gz: 100%|█| 66.2k/66.2k [00:00<00:00, 94.3k


In [7]:
print(len(vars(train_data.examples[0])['src']))
print(len(vars(train_data.examples[1])['src']))

print(vars(train_data.examples[0]))
print(vars(train_data.examples[1]))

13
8
{'src': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}
{'src': ['mehrere', 'männer', 'mit', 'schutzhelmen', 'bedienen', 'ein', 'antriebsradsystem', '.'], 'trg': ['several', 'men', 'in', 'hard', 'hats', 'are', 'operating', 'a', 'giant', 'pulley', 'system', '.']}


In [8]:
SRC.build_vocab(train_data, min_freq=2)#build_vocab함수를 이용하여 각 token을 indexing해줌. 이때, source와 target의 vocabulary는 다름
TRG.build_vocab(train_data, min_freq=2)#이때, vocabulary는 training set에서만 만들어져야함

In [9]:
device= torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [10]:
BATCH_SIZE=128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device)
#BucketIterator를 이용해 batch size별로 token묶고 어휘를 읽은수 있는 token에서 index로 변환

In [11]:
print(TRG.vocab.stoi['pitched'])#의 index=1

for i, batch in enumerate(train_iterator):
    src=batch.src
    trg=batch.trg
    
    src=src.transpose(1,0)
    print(f"첫번째 배치의 text크기:{src.shape}")
    print(src[0])
    print(src[1])
    
    break

        
print(len(train_iterator))
print(len(train_iterator)*128)

4284
첫번째 배치의 text크기:torch.Size([128, 23])
tensor([  2,   7,  15,   6,  13, 330, 255,  30,  11,  23, 267,   3,   2,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1])
tensor([   2,   53, 1550,    8,   34,    4,  307,   10,   76,   86,   21,   13,
         500,   61,    3,    2,    1,    1,    1,    1,    1,    1,    1])
227
29056


In [23]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim,emb_dim)
        #양방향=True
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True) #ht=EncoderGRU^->(e(xt^->),ht-1^->) 두개 넣음.
         # 양방향 rnn의 출력값을 concat 한 후에 fc layer에 전달합니다.
        self.fc=nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout=nn.Dropout(dropout)
    def forward(self,src):
        #src=[src len, batch_size]
        embedded= self.dropout(self.embedding(src))
        #embedded = [src len, batch_size, emb dim]
        outputs, hidden = self.rnn(embedded)
        #outputs=[단어길이, 배치사이즈, 은닉차원 * num_directions] 이건 time step마다 나오는 outputs임 //output은 언제나 hidden layer의 top에 있음
        #hidden=[n_layers*num_direction(2), batch size, hid_dim]
        #hidden layer는 [forward1,backward1,forward2,backward2... 식으로 쌓임]
        #hidden[-2,:,:]--> 마지막 forward, hidden[-1,:,:]->마지막 backward //hidden: 마지막 은닉상태 ht값임.
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        
        return outputs, hidden

In [24]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        
        self.attn = nn.Linear((enc_hid_dim * 2) +dec_hid_dim, dec_hid_dim) #attn= 얘는뭘까 in= (enc_hid_dim * 2) +dec_hid_dim, out=dec_hid_dim ?
        #Energy 계산시 attn에 st-1과 H가 들어감. 
        self.v = nn.Linear(dec_hid_dim,1, bias=False) #[1,dec_hid_dim] tensor ^at=vEt
    
    def forward(self,hidden,encoder_outputs):
        #hidden = [batch size, dec hid dim]
        #encoderoutput=(모든 h들) [src len,batchsize,enc_hid_dim*2]
        batch_size=encoder_outputs.shape[1]
        src_len=encoder_outputs.shape[0]
        
        hidden=hidden.unsqueeze(1).repeat(1, src_len, 1)#unsqueeze dim1에 1인차원채우기 torch.repeat(*sizes)
        
        encoder_outputs=encoder_outputs.permute(1,0,2)#차원의 자리만 바꿈 (0,1,2)->(1,0,2) 
        #hidden = [batchsize, src_len, dec_hid_dim]
        #encoder=[bathsize, src_len, enc_hid_dim*2]
        #위의 두코드는 아마 차원맞추기 위해서 짠듯 cat할려고 나중에 print로 한번 넣어보면 알듯?
        
        energy=torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))# Et=tanh(attn(st-1,H))
        #energy=[bath_size, src_len,dec_hid_dim]-->맞나? 나중에 print 넣어서 shape 보기[(enc_hid_dim*2)+dec_hid dim 왜아님]
        
        attention=self.v(energy).squeeze(2)# at=vEt로 a 얘도 차원봐야알듯
        #attention=[batch size, src len]
        return F.softmax(attention, dim=1)#A 계산

In [25]:
class Decoder(nn.Module):
    def __init__(self,output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        
        self.output_dim= output_dim
        
        self.attention=attention
        
        self.embedding=nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.GRU((enc_hid_dim*2) + emb_dim,dec_hid_dim)##rnn_input=torch.cat((embedded, weighted), dim=2) 하단에
        
        self.fc_out=nn.Linear((enc_hid_dim*2)+dec_hid_dim + emb_dim, output_dim)
        ###prediction=[batchsize,output_dim=(end_hid_dim*2]+dec_hid_dim+emb_dim)]
        self.dropout=nn.Dropout(dropout)
        
    def forward(self, input, hidden, encoder_outputs):
        #input=[batch size]
        #hidden=[batch size, dec_hid_dim]
        #eocnder_outputs=[src len, batch size, end hid dim*2]
        
        input= input.unsqueeze(0)
        #input =[1, batch size]
        embedded=self.dropout(self.embedding(input))
        #embedded = [1,batch size, emd_dim]
        a=self.attention(hidden, encoder_outputs)
        # 위에 Attention 클래스에 넣는 것과 동일 
        #a = [batch size, src len]
        a=a.unsqueeze(1)
        #a= [batch size, 1, src len]
        encoder_outputs = encoder_outputs.permute(1,0,2)
        #encoder_outputs = [batch size, src len, end_hid dim*2]
        weighted=torch.bmm(a, encoder_outputs)# bmm [Batch, n, m] x [Batch,m,p]=[Batch,n,p] 배치는 나두고 내적
        #weighted=[batch size, 1, end_hid_dim*2 w=atxH
        weighted=weighted.permute(1,0,2)
        #weighted=[1,batch size, end_hid_dim*2]
        rnn_input=torch.cat((embedded, weighted), dim=2)
        ##rnn_input=[1,batch_size,emb_dim+(enc_hid_dim)*2]
        output, hidden=self.rnn(rnn_input, hidden.unsqueeze(0))
        #output=[src len, batch_size, dec_hid_dim * n_directions(1->단방향)] 디코더에서 n_directions=1
        #hidden=[n_layers*n_directions(1*1),batch_size,dec hid dim] 디코더에서 n_layers는 1
        assert (output == hidden).all()
        #this also means that output == hidden
        embedded = embedded.squeeze(0)
        #embedded=[batch_size,emb_dim]
        output=output.squeeze(0)
        #output=[batch_size,dec_hid_dim*n_directions]
        weighted=weighted.squeeze(0)
        #weight=[batch_size,end_hid_dim*2]
        prediction=self.fc_out(torch.cat((output, weighted, embedded),dim=1))
        ###prediction=[batchsize,output_dim=(end_hid_dim*2]+dec_hid_dim+emb_dim)]
        return prediction, hidden.squeeze(0) #decoder에서는 y와 각gru에서나온 hidden state 반환

In [26]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder=encoder
        self.decoder=decoder
        self.device=device
    
    def forward(self,src,trg,teacher_forcing_ratio=0.5):
        
        #src=[src len, batch size]
        #trg=[trg len, batch size]
        
        batch_size=src.shape[1]
        trg_len=trg.shape[0]
        
        outputs= torch.zeros(trg_len, bath_size, trg_vocab_size).to(self.device)
        #enc_output 은 H중에 forward와 backward 들의 모임
        #hidden 은 마지막 forward 와 backwad h
        encoder_outputs, hidden = self.encoder(src)
        
        input=trg[0,:] #첫번째 trg 입력값은 토큰<sos>인듯 
        
        for t in range (1, trg_len):
            #디코더에 token_embedding, 이전 hidden state,H 넣기
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            
            #각각 토큰에 대한 predcition outputs에 넣기
            outputs[t]=output
            #teacher force 임계값 정하기
            teacher_force =random.random()<teacher_forcing_ratio
            #우리 예측에서 가장 높은 prediction token 구하기
            top1=output.argmax(1)
            #teacher force시, 실제 다음 토큰을 다음 input으로써 사용
            #아니라면 predicted token 사용
            input=trg[t] if teacher_force else top1
            
        return outputs

In [29]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM= len(TRG.vocab)
ENC_EMB_DIM=256
DEC_EMB_DIM=256
ENC_HID_DIM=512
DEC_HID_DIM=512
ENC_DROPOUT=0.5
DEC_DROPOUT=0.5

attn=Attention(ENC_HID_DIM,DEC_HID_DIM)
enc=Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec=Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model=Seq2Seq(enc, dec, attn).to(device)

In [30]:
#모든 bias=0 모든 weights N(0,0.01)
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data,0)
    
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7852, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(5892, 256)
    (rnn): GRU(1280, 512)
    (fc_out): Linear(in_features=1792, out_features=5892, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (device): Attention(
    (attn): Linear(in_features=1536, out_features=512, bias=True)
    (v): Linear(in_features=512, out_features=1, bias=False)
  )
)

In [33]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 20,516,100 trainable parameters


In [34]:
optimizer=torch.optim.Adam(model.parameters())

In [35]:
TRG_PAD_IDX= TRG.vocab.stoi[TRG.pad_token]
criterion=nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)#pad들어간건 무시

In [38]:
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss=0
    
    for i, batch in enumerate(iterator):
        src=batch.src
        trg=batch.trg
        
        optimizer.zero_grad()
        
        output=model(src,trg)
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        output_dim=output.shape[-1]
        
        output=output[1:].view(-1, output_dim)
        trg=trg[1:].view(-1)
        #trg=[(trg len-1)*batch_size]
        #output=[(trg len-1)*batch_size, output_dim]
        
        loss=criterion(output,trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(),clip)
        
        optimizer.step()
        
        epoch_loss+=epoch.itme()
        
    return epoch_loss/len(iterator)

In [40]:
def evaluate(model, iterator, criterion):
    model.eval()
    
    epoch_loss=0
    
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            
            src=batch.src
            trg=batch.trg
            
            output=model(src,trg,0)#teaching force 정지
            
            output_dim=output.shape[-1]
            
            output=output[1:].view(-1, output_dim)
            trg=trg[1:].view(-1)
            
            loss=criterion(output,trg)
            
            epoch_loss+=loss.item()
            
        return epoch_loss/len(iterator)

In [42]:
def epoch_time(start_time, end_time):
    elapsed_time=end_time-start_time
    elapsed_mins=int(elapsed_time/60)
    elapsed_secs=int(elapsed_time- (elapsed_mins*60))
    return elapsed_mins, elapsed_secs

In [None]:
N_EPOCHS=10
CLIP=1
best_valid_loss= int('inf')

for 