# Seq2Seq

## 1) 필요한 모듈을 import 합니다

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

## 2) manual seed를 고정시킵니다

In [2]:
# Seed Python's `random` module as well as torch: train() draws its training
# samples with random.choice, so seeding torch alone does not make a run
# reproducible. torch.manual_seed stays last so the cell still displays the
# returned Generator.
random.seed(1)
torch.manual_seed(1)

<torch._C.Generator at 0x19eac2724b0>

## 3) raw 데이터를 변수에 저장합니다

In [3]:
# Toy parallel corpus: each entry is "<Korean source>\t<German target>".
# The separator is written as an explicit \t escape (instead of an invisible
# literal tab) so that editors which expand tabs cannot silently break the
# split('\t') performed in preprocess().
raw = ["나는 배가 고프다.\tIch habe hunger.",
       "파이토치는 매우 쉽다.\tPytorch ist sehr einfach.",
       "파이토치는 딥러닝을 위한 프레임워크이다.\tPytorch ist ein Framework für das Deep-learning.",
       "파이토치는 사용하기 매우 직관적이다.\tPytorch ist sehr klar zu bedienen."]

## 4) 토큰을 생성합니다
- SOS_token = Start Of Sentence
- EOS_token = End Of Sentence

In [4]:
# Vocabulary indices reserved for the sentence-boundary markers:
# SOS (start of sentence) is fed to the decoder as its first input,
# EOS (end of sentence) terminates every tensorized sentence.
SOS_token, EOS_token = 0, 1

## 5) vocab 생성을 위한 클래스를 구성합니다

In [5]:
class Vocab():
    """Word <-> index mapping with per-word occurrence counts.

    Attributes:
        vocab2index: dict mapping a word to its integer index.
        index2vocab: dict mapping an integer index back to its word.
        vocab_count: dict mapping a word to how often add_vocab has seen it.
        n_vocab: number of entries in the vocabulary (next free index).

    The markers '<SOS>' and '<EOS>' are pre-registered at the indices given
    by SOS_token and EOS_token.
    """

    def __init__(self):
        self.vocab2index = {'<SOS>': SOS_token, '<EOS>': EOS_token}
        self.index2vocab = {SOS_token: '<SOS>', EOS_token: '<EOS>'}
        self.vocab_count = {}
        self.n_vocab = len(self.vocab2index)

    def add_vocab(self, sentence):
        """Register every word of `sentence` (split on single spaces).

        Unseen words receive the next free index; every occurrence
        increments the word's entry in `vocab_count`.
        """
        for word in sentence.split(' '):
            if word not in self.vocab2index:
                self.vocab2index[word] = self.n_vocab
                self.index2vocab[self.n_vocab] = word
                self.n_vocab += 1
            # The reserved markers are in vocab2index but start without a
            # count, so use .get() instead of assuming the key exists
            # (a sentence containing '<EOS>' used to raise KeyError here).
            self.vocab_count[word] = self.vocab_count.get(word, 0) + 1

## 6) source, target data로부터 긴 문장을 구분하는 함수를 생성합니다

In [6]:
def filter_pair(pair, source_max_length, target_max_length):
    """Return True when both sentences of `pair` are short enough.

    Sentence length is the number of space-separated tokens. A pair is kept
    only if the source (pair[0]) has fewer than `source_max_length` tokens
    and the target (pair[1]) has fewer than `target_max_length` tokens.
    The target is inspected only when the source already passed.
    """
    source_words = pair[0].split(' ')
    if len(source_words) >= source_max_length:
        return False

    target_words = pair[1].split(' ')
    return len(target_words) < target_max_length

## 7) preprocessing 함수를 생성합니다

In [7]:
def preprocess(corpus, source_max_length, target_max_length):
    """Turn raw tab-separated lines into sentence pairs and two vocabularies.

    Each line is stripped, lower-cased and split on tab characters. Pairs
    rejected by filter_pair are dropped, and the words of the remaining
    pairs are registered in a fresh source and target Vocab. Progress is
    reported with print().

    Args:
        corpus: iterable of strings, one "<source>\\t<target>" pair per item.
        source_max_length: exclusive upper bound on source word count.
        target_max_length: exclusive upper bound on target word count.

    Returns:
        (pairs, source_vocab, target_vocab), where pairs is the filtered
        list of [source, target] string lists.
    """
    print('reading corpus...')
    pairs = [line.strip().lower().split('\t') for line in corpus]
    print('Read {} sentence pairs'.format(len(pairs)))

    def short_enough(candidate):
        return filter_pair(candidate, source_max_length, target_max_length)

    pairs = list(filter(short_enough, pairs))
    print('Trimmed to {} sentence pairs'.format(len(pairs)))

    source_vocab, target_vocab = Vocab(), Vocab()

    print('Counting words...')
    for kept in pairs:
        source_vocab.add_vocab(kept[0])
        target_vocab.add_vocab(kept[1])
    print('source vocab size = ', source_vocab.n_vocab)
    print('target vocab size = ', target_vocab.n_vocab)

    return pairs, source_vocab, target_vocab

## 8) 인코더 클래스를 선언합니다

In [8]:
class Encoder(nn.Module):
    """Single-step GRU encoder: embeds one token index and advances the GRU."""

    def __init__(self, input_size, hidden_size):
        """Create the embedding table and GRU.

        Args:
            input_size: number of rows in the embedding table (vocab size).
            hidden_size: embedding width, also used as the GRU hidden width.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size)

    def forward(self, x, hidden):
        """Consume one token index and return (gru_output, new_hidden)."""
        # Reshape the embedding to (1, 1, -1) before feeding it to the GRU.
        step_input = self.embedding(x).view(1, 1, -1)
        return self.gru(step_input, hidden)

## 9) 디코더 클래스를 선언합니다

In [19]:
class Decoder(nn.Module):
    """Single-step GRU decoder producing log-probabilities over the vocabulary."""

    def __init__(self, hidden_size, output_size):
        """Create embedding, GRU, output projection and log-softmax.

        Args:
            hidden_size: embedding width, also used as the GRU hidden width.
            output_size: number of embedding rows and of output classes.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings=output_size, embedding_dim=hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size)
        self.out = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        """Consume one token index and return (log_probs, new_hidden)."""
        step_input = self.embedding(x).view(1, 1, -1)
        gru_out, hidden = self.gru(step_input, hidden)
        # gru_out[0] drops the leading dimension before the projection, so
        # the log-softmax over dim=1 runs across the output classes.
        scores = self.out(gru_out[0])
        return self.softmax(scores), hidden

## 10) sentence를 idx tensor로 변환하는 함수를 선언합니다

In [20]:
def tensorize(vocab, sentence):
    """Convert a sentence into a column tensor of vocabulary indices.

    The sentence is split on single spaces, each word is looked up in
    `vocab.vocab2index`, and the index of '<EOS>' is appended.

    Args:
        vocab: object exposing a `vocab2index` dict (word -> int index).
        sentence: space-separated string.

    Returns:
        An int64 tensor of shape (number of words + 1, 1).

    Raises:
        KeyError: if a word of `sentence` is not in the vocabulary.
    """
    indices = [vocab.vocab2index[word] for word in sentence.split(' ')]
    indices.append(vocab.vocab2index['<EOS>'])

    # Build the integer tensor directly. torch.Tensor(...).long() would go
    # through float32 first, which cannot represent every index above 2**24.
    return torch.tensor(indices, dtype=torch.long).view(-1, 1)

## 11) Seq2Seq training을 위한 function을 선언합니다

In [21]:
def train(pairs, source_vocab, target_vocab, encoder, decoder, n_iter, print_every=1000, learning_rate=1e-2):
    """Train the encoder/decoder with SGD on randomly sampled sentence pairs.

    All `n_iter` training samples are drawn (with replacement) and
    tensorized before the first step. Each step encodes one source sentence
    token by token, then decodes with teacher forcing (the ground-truth
    token is always fed as the next decoder input) and sums the NLL loss
    over the target tokens.

    Args:
        pairs: list of [source, target] sentence pairs.
        source_vocab: vocabulary used to tensorize source sentences.
        target_vocab: vocabulary used to tensorize target sentences.
        encoder: encoder module; must expose `hidden_size`.
        decoder: decoder module.
        n_iter: number of single-pair update steps.
        print_every: report the mean per-token loss every this many steps.
        learning_rate: learning rate for both SGD optimizers.
    """
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    # Sample and tensorize the whole training schedule up front.
    sampled_pairs = [random.choice(pairs) for _ in range(n_iter)]
    source_tensors = [tensorize(source_vocab, sample[0]) for sample in sampled_pairs]
    target_tensors = [tensorize(target_vocab, sample[1]) for sample in sampled_pairs]

    running_loss = 0
    schedule = zip(source_tensors, target_tensors)
    for step, (source_tensor, target_tensor) in enumerate(schedule, start=1):
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        # Encode: only the final hidden state is kept.
        hidden = torch.zeros(1, 1, encoder.hidden_size)
        for source_token in source_tensor:
            _, hidden = encoder(source_token, hidden)

        # Decode with teacher forcing, starting from <SOS>.
        decoder_input = torch.tensor([[SOS_token]], dtype=torch.long)
        loss = 0
        for target_token in target_tensor:
            log_probs, hidden = decoder(decoder_input, hidden)
            loss += criterion(log_probs, target_token)
            decoder_input = target_token

        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()

        # Accumulate the per-token loss of this step.
        running_loss += loss.item() / len(target_tensor)

        if step % print_every == 0:
            loss_avg = running_loss / print_every
            running_loss = 0
            print('[{} - {}%] loss = {:05.4f}'.format(step, step / n_iter * 100, loss_avg))

## 12) Seq2Seq evaluation을 위한 function을 선언합니다

In [22]:
def evaluate(pairs, source_vocab, target_vocab, encoder, decoder, target_max_length):
    """Greedy-decode every pair and print source, reference and prediction.

    For each pair the source sentence is encoded token by token; the decoder
    then starts from <SOS> and repeatedly feeds back its own most likely
    token until it emits <EOS> or `target_max_length` steps have been taken.

    Printed per pair: '> source', '= reference', '< prediction', blank line.

    Args:
        pairs: list of [source, target] sentence pairs.
        source_vocab: vocabulary used to tensorize source sentences.
        target_vocab: vocabulary providing `index2vocab` for the output.
        encoder: encoder module; must expose `hidden_size`.
        decoder: decoder module.
        target_max_length: maximum number of decoding steps per sentence.
    """
    # Inference only: with autograd disabled no graph is recorded for the
    # forward passes, and the legacy `.data` / `.detach()` accesses are not
    # needed.
    with torch.no_grad():
        for pair in pairs:
            print('>', pair[0])
            print('=', pair[1])

            source_tensor = tensorize(source_vocab, pair[0])
            encoder_hidden = torch.zeros(1, 1, encoder.hidden_size)
            for ei in range(source_tensor.size(0)):
                _, encoder_hidden = encoder(source_tensor[ei], encoder_hidden)

            decoder_input = torch.tensor([[SOS_token]], dtype=torch.long)
            decoder_hidden = encoder_hidden
            decoded_words = []

            for _ in range(target_max_length):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                _, top_index = decoder_output.topk(1)
                predicted = top_index.item()

                if predicted == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                decoded_words.append(target_vocab.index2vocab[predicted])

                # Feed the prediction back in as the next decoder input.
                decoder_input = top_index.squeeze()

            print('<', ' '.join(decoded_words))
            print('')

## 13) sentence의 max length를 지정해줍니다

In [23]:
# Length limits in words: preprocess() keeps a pair only when each side has
# fewer words than its limit, and TARGET_MAX_LENGTH is also passed to
# evaluate() as the maximum number of decoding steps.
SOURCE_MAX_LENGTH = TARGET_MAX_LENGTH = 20

## 14) Corpus preprocessing을 시작합니다

In [24]:
# Build the filtered sentence pairs and both vocabularies from the raw corpus,
# then print one randomly chosen pair as a quick sanity check.
load_pairs, load_source_vocab, load_target_vocab = preprocess(raw, SOURCE_MAX_LENGTH, TARGET_MAX_LENGTH)
print(random.choice(load_pairs))

reading corpus...
Read 4 sentence pairs
Trimmed to 4 sentence pairs
Counting words...
source vocab size =  13
target vocab size =  17
['파이토치는 사용하기 매우 직관적이다.', 'pytorch ist sehr klar zu bedienen.']


## 15) Encoder와 decoder를 선언합니다

In [25]:
# train() and evaluate() hand the encoder's final hidden state directly to
# the decoder, so both networks are given the same hidden size.
enc_hidden_size = 20
dec_hidden_size = enc_hidden_size

# Embedding/output sizes come from the vocabularies built by preprocess().
enc = Encoder(load_source_vocab.n_vocab, enc_hidden_size)
dec = Decoder(dec_hidden_size, load_target_vocab.n_vocab)

## 16) Seq2seq model training을 시작합니다

In [26]:
# Run 5000 single-pair SGD steps, printing the average loss every 1000 steps.
train(load_pairs, load_source_vocab, load_target_vocab, enc, dec, 5000, print_every=1000)

[1000 - 20.0%] loss = 0.6077
[2000 - 40.0%] loss = 0.1616
[3000 - 60.0%] loss = 0.1434
[4000 - 80.0%] loss = 0.1313
[5000 - 100.0%] loss = 0.0820


## 17) Evaluation을 수행합니다

In [27]:
# Decode every training pair and print source (>), reference (=) and
# prediction (<); decoding is capped at TARGET_MAX_LENGTH steps.
evaluate(load_pairs, load_source_vocab, load_target_vocab, enc, dec, TARGET_MAX_LENGTH)

> 나는 배가 고프다.
= ich habe hunger.
< ich habe hunger. <EOS>

> 파이토치는 매우 쉽다.
= pytorch ist sehr einfach.
< pytorch ist sehr klar zu bedienen. <EOS>

> 파이토치는 딥러닝을 위한 프레임워크이다.
= pytorch ist ein framework für das deep-learning.
< pytorch ist ein framework für das deep-learning. <EOS>

> 파이토치는 사용하기 매우 직관적이다.
= pytorch ist sehr klar zu bedienen.
< pytorch ist sehr klar zu bedienen. <EOS>

