In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import torchtext
import nltk
from konlpy.tag import Kkma
from torchtext.data import Field,Iterator,Example, TabularDataset

## TODO

data/parallel_data.txt를 torchtext를 이용하여 loader로 만드시오

In [3]:
kor_tagger = Kkma()

kor_tagger = kor_tagger.morphs
eng_tagger = nltk.word_tokenize

In [4]:
SOURCE = Field(tokenize=kor_tagger,use_vocab=True,init_token="<s>",eos_token="</s>",lower=True, include_lengths=True, batch_first=True)
TARGET = Field(tokenize=eng_tagger,use_vocab=True,init_token="<s>",eos_token="</s>",lower=True, batch_first=True)

In [5]:
train_data = TabularDataset(
                                   path="data/parallel_data.txt",
                                   format='tsv', # \t로 구분
                                   #skip_header=True, # 헤더가 있다면 스킵
                                   fields=[('inputs',SOURCE),('targets',TARGET)])

KeyboardInterrupt: 

In [None]:
SOURCE.build_vocab(train_data)
TARGET.build_vocab(train_data)

In [None]:
print(len(SOURCE.vocab),len(TARGET.vocab))

In [None]:
train_loader = Iterator(
    train_data, batch_size=32, device=-1, # device -1 : cpu, device 0 : 남는 gpu
    sort_key=lambda x: len(x.inputs),sort_within_batch=True,repeat=False) 

## Encoder 

In [1]:
class Encoder(nn.Module):
    def __init__(self,V,E,H,num_layers=1,bidirec=False):
        super(Encoder,self).__init__()
        
        self.embed = nn.Embedding(V,E)
        self.gru = nn.GRU(E,H,num_layers,batch_first=True,bidirectional=bidirec)
    
    def forward(self,inputs):
        
        embed = self.embed(inputs)
        # hidden을 명시적으로 넣어줄 수도 있지만
        # 안 넣어주면 디폴트로 내부에서 제로벡터 hidden_state(0)를 생성
        output, hidden = self.gru(embed)
        
        # last hidden 선택하기 , concat
        hidden = hidden[-self.num_directions:] # num_layers*num_directions,batch_size,hidden_size
        hidden = torch.cat([h for h in hidden],1).unsqueeze(0) # 1,B,2H
        
        return output, hidden

NameError: name 'nn' is not defined

## Decoder 

In [42]:
class Decoder(nn.Module):
    def __init__(self,V,E,H,sos_idx,max_len=15):
        super(Decoder,self).__init__()
        
        self.hidden_size = H
        self.max_len = max_len
        self.sos_idx = sos_idx
        self.embed = nn.Embedding(V,E)
        self.gru = nn.GRU(E,H,batch_first=True)
        self.linear = nn.Linear(H,V)
        
    def start_token(self,batch_size):
        return Variable(torch.LongTensor([self.sos_idx]*batch_size)).unsqueeze(1)
        
    def forward(self,hidden, max_len=None):
        
        if max_len is None: max_len = self.max_len
        
        inputs = self.start_token(hidden.size(1)) # Batch_size
        embed = self.embed(inputs)
                    
        scores=[]
        for _ in range(max_len):
            _, hidden = self.gru(embed,hidden)
            score = self.linear(hidden.squeeze(0))
            scores.append(score)
            decoded = score.max(1)[1]
            embed = self.embed(decoded).unsqueeze(1) # y_{t-1}
            
        #  column-wise concat, reshape!!
        scores = torch.cat(scores,1)
        return scores.view(inputs.size(0)*max_len,-1)

## Train 

In [43]:
HIDDEN = 100
EMBED = 50
STEP = 100
LR = 0.01

encoder = Encoder(len(SOURCE.vocab),EMBED,HIDDEN,bidirec=True)
decoder = Decoder(len(TARGET.vocab),EMBED,HIDDEN*2,TARGET.vocab.stoi['<s>'])

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()),lr=LR)
scheduler = optim.lr_scheduler.StepLR(gamma=0.1,step_size=50,optimizer=optimizer)

In [44]:
for step in range(STEP):
    losses=[]
    scheduler.step()
    for i, batch in enumerate(train_loader):
        inputs,lengths = batch.inputs
        targets = batch.targets
        
        encoder.zero_grad()
        decoder.zero_grad()
        
        output, hidden = encoder(inputs)
        preds = decoder(hidden,targets.size(1))
        
        loss = loss_function(preds,targets.view(-1))
        losses.append(loss.data[0])
        
        loss.backward()
        optimizer.step()
        
    if step % 10 == 0:
        print(np.mean(losses))
        losses=[]

4.757963046431541
1.369030237197876
0.33919186098501086
0.2684489921666682
1.4195171743631363
0.750604840926826
0.1817042720504105
0.11466072034090757
0.07586698874365538
0.060892949579283595


## TEST 

In [46]:
while 1:
    try:
        text = input()
        tokenized = ["<s>"] + kor_tagger(text) +["</s>"]
        input_,length = SOURCE.numericalize(([tokenized],[len(tokenized)]),train=False,device=-1)

        o,h = encoder(input_)
        preds = decoder(h)
        reply = [TARGET.vocab.itos[i] for i in preds.max(1)[1].data.tolist() if i not in [0,1,2,3]]

        print(" ".join(reply))
    except KeyboardInterrupt as e:
        break

안녕하세요.
hello . feeling you . feeling feeling
도와주세요!
help ! help ! me
무슨 문제가 있나요?
well reservation time accident on ?
어떻게 지내세요?
what day mr. to meet in ?
처음 뵙겠습니다.
pleased to meet you . .
실례합니다.
excuse me . be you
