In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
%cd /content/drive/MyDrive/paper codes

/content/drive/MyDrive/paper codes


In [6]:
%pwd

'/content/drive/MyDrive/paper codes'

## Data 준비
- torch 버전과 torchtext버전이 잘 맞아야함,,
- 여기서는 torch 1.9와 torchtext 0.10을 사용하였음

In [None]:
!pip install torch==1.9

In [None]:
!pip install torchtext==0.10

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

In [8]:
import torchtext

In [9]:
print(f"torchtext version: {torchtext.__version__}")
print(f"pytorch version: {torch.__version__}")

torchtext version: 0.10.0
pytorch version: 1.9.0+cu102


In [None]:
!python -m spacy download en_core_web_sm

In [3]:
!python -m spacy download de_core_news_sm

Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [4]:
#Tokenizer 정의
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [6]:
#Source sentece를 뒤집어서 학습시키는 것이 효과적임을 보였음.
def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [10]:
#Field는 아마 버전에 따라서 import하는 방식이 다를 것이다.
        #Torchtext : 0.10.0, Torch:1.9.0+cu102
#어쨌든 Fiel는 데이터가 어떻게 전처리되는지 파이프라인을 한번에 확인할 수 있다.
SRC = Field(tokenize = tokenize_de,
            init_token = '',
            eos_token = '',
            lower = True)

TRG = Field(tokenize = tokenize_en,
            init_token = '',
            eos_token = '',
            lower = True)

In [13]:
#데이터 다운받기 << Multi30k dataset
    #3만개의 영어, 독일어 문장 쌍
    #각 문장은 최대 12개의 단어를 가지고 있음
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                         #exts는 어떤 언어를 source와 target으로 할 지 결정한다
                                         fields = (SRC, TRG))
                                        #field를 통해 한번에 전처리 진행

In [14]:
#데이터의 크기를 확인
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [15]:
print(vars(train_data.examples[0]))

{'src': ['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


In [16]:
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2) #min_freq: 최소 등장 조건
            #build_vocab --> vectors = '' 으로 사전 훈련된 임베딩 벡터도 설정할 수 있음!
                            #unit_init << 사전훈련된 단어 집합에 없는 단어를 0으로 만드는 것을 방지하기 위한 파라미터

In [17]:
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (de) vocabulary: 7852
Unique tokens in target (en) vocabulary: 5892


In [18]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [20]:
BATCH_SIZE  = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),#data
    batch_size = BATCH_SIZE, 
    device = device)
#BucketIterator << NLP dataset에 특화된 iterator
    #전처리 파이프라인을 통과하게 할 수 있고, 
    #배치/패팅/정수인코딩/어휘집합 구축 등 다양한 기능 제공

In [23]:
train_iterator

<torchtext.legacy.data.iterator.BucketIterator at 0x7f628f4ea450>

## 모델 구축

#### 1. Encoder

In [24]:
class Encoder(nn.Module):
    #input dim: 입력 벡터의 차원
    #emb_dim; 임베딩 벡터의 차원
    #hid_dim: 은닉상태 벡터의 차원
    #n_layers: layer의 개수 <<, 논문에서는 LSTM을 Encoder, Decoder각각 4개씩 쌓았음
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        #임베딩 층
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        #LSTM사용 / n_layer만큼 쌓는다

        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [src len, batch size] << 이런 식의 shape
        
        embedded = self.dropout(self.embedding(src)) 
        
        #embedded = [src len, batch size, emb dim]
        
        outputs, (hidden, cell) = self.rnn(embedded)
        #임베딩 된
        
        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #outputs are always from the top hidden layer
        
        return hidden, cell

### 2. Decoder

In [29]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        
        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
                
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #seq len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]
        
        prediction = self.fc_out(output.squeeze(0))
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden, cell

### 3. Seq2seq

In [30]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        
        #first input to the decoder is the  tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1
        
        return outputs

## 훈련하기