In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import torchtext
import nltk
from konlpy.tag import Kkma
from torchtext.data import Field,Iterator,Example, TabularDataset

In [2]:
# Whether PyTorch can see a CUDA device on this machine.
USE_CUDA = torch.cuda.is_available()
# Device id passed to the torchtext iterator: 0 when CUDA is available, otherwise -1.
DEVICE = -1 if not USE_CUDA else 0

## TODO

`data/parallel_data.txt`를 torchtext로 읽어 배치 단위 loader(`Iterator`)로 만드시오.

In [4]:
# Korean side: tokenize with the `morphs` method of a KoNLPy Kkma tagger.
kor_tagger = Kkma().morphs
# English side: tokenize with NLTK's word tokenizer.
eng_tagger = nltk.word_tokenize

In [5]:
# Source (Korean) field. Sequences are wrapped in <s> ... </s>, lower-cased,
# and batched with the batch dimension first.
SOURCE = Field(
    tokenize=kor_tagger,
    use_vocab=True,
    init_token="<s>",
    eos_token="</s>",
    lower=True,
    include_lengths=True,  # batch.inputs is unpacked as an (ids, lengths) pair in the training loop
    batch_first=True,
)
# Target (English) field. Same options as SOURCE except that lengths are not included.
TARGET = Field(
    tokenize=eng_tagger,
    use_vocab=True,
    init_token="<s>",
    eos_token="</s>",
    lower=True,
    batch_first=True,
)

In [6]:
# Load the parallel corpus; the two fields are bound, in order, to the
# attributes `inputs` (SOURCE) and `targets` (TARGET) of every example.
train_data = TabularDataset(
    path="data/parallel_data.txt",
    format='tsv',  # columns are separated by \t
    # skip_header=True,  # enable this if the file starts with a header row
    fields=[
        ('inputs', SOURCE),
        ('targets', TARGET),
    ],
)

In [7]:
# Build one vocabulary per field from the loaded training examples.
SOURCE.build_vocab(train_data)
TARGET.build_vocab(train_data)

In [8]:
# Show the vocabulary sizes: source first, then target.
print(len(SOURCE.vocab),len(TARGET.vocab))

1307 1149


In [12]:
# Batch iterator over the training set. Examples are ordered by source length
# within each batch, and the iterator stops after one pass (repeat=False).
train_loader = Iterator(
    train_data,
    batch_size=32,
    device=DEVICE,  # -1: CPU, 0: an available GPU
    sort_key=lambda example: len(example.inputs),
    sort_within_batch=True,
    repeat=False,
)

## Encoder 

## TODO : 인코더의 forward 부분을 완성하시오 

In [17]:
class Encoder(nn.Module):
    """GRU encoder: embeds token ids and summarizes each sequence in one vector.

    Args:
        V: vocabulary size (number of rows of the embedding table).
        E: embedding dimension.
        H: GRU hidden size per direction.
        num_layers: number of stacked GRU layers.
        bidirec: run the GRU in both directions when True.
    """
    def __init__(self,V,E,H,num_layers=1,bidirec=False):
        super(Encoder,self).__init__()
        
        self.num_directions = 2 if bidirec else 1
        self.embed = nn.Embedding(V,E)
        self.gru = nn.GRU(E,H,num_layers,batch_first=True,bidirectional=bidirec)
    
    def forward(self,inputs):
        """Encode a batch of token-id sequences.

        Args:
            inputs: LongTensor of token ids, shape (batch, seq_len)
                (the GRU is built with batch_first=True).

        Returns:
            output: per-step GRU outputs, shape (batch, seq_len, H * num_directions).
            hidden: final state of the last GRU layer with its directions
                concatenated on the feature axis, shape
                (1, batch, H * num_directions), ready to seed a single-layer
                decoder GRU of that hidden size.

        Note:
            Sequence lengths are not passed in, so padded positions are
            processed like any other token.
        """
        embedded = self.embed(inputs)
        output, hidden = self.gru(embedded)
        
        # hidden is (num_layers * num_directions, batch, H); the trailing
        # `num_directions` entries belong to the last layer.
        last_layer = hidden[-self.num_directions:]
        # Join forward/backward states per example -> (1, batch, H * num_directions).
        hidden = torch.cat([state for state in last_layer],1).unsqueeze(0)
        
        return output, hidden

## Decoder 

In [18]:
class Decoder(nn.Module):
    """Greedy GRU decoder that feeds its own arg-max prediction back as input.

    Args:
        V: target vocabulary size (embedding rows and output classes).
        E: embedding dimension.
        H: GRU hidden size; must match the last dimension of the hidden
            state passed to `forward`.
        sos_idx: vocabulary index of the start-of-sequence token.
        max_len: default number of decoding steps.
    """
    def __init__(self,V,E,H,sos_idx,max_len=15):
        super(Decoder,self).__init__()
        
        self.hidden_size = H
        self.max_len = max_len
        self.sos_idx = sos_idx
        self.embed = nn.Embedding(V,E)
        self.gru = nn.GRU(E,H,batch_first=True)
        self.linear = nn.Linear(H,V)
        
    def start_token(self,batch_size):
        """Return a (batch_size, 1) LongTensor filled with `sos_idx`.

        The tensor is placed on the same device as the embedding weights, so
        the module does not depend on any notebook-level CUDA flag.
        """
        sos = Variable(torch.LongTensor([self.sos_idx]*batch_size)).unsqueeze(1)
        weight = self.embed.weight
        if weight.is_cuda:
            sos = sos.cuda(weight.get_device())
        return sos
        
    def forward(self,hidden, max_len=None):
        """Decode `max_len` steps starting from `hidden`.

        Args:
            hidden: initial GRU state, shape (1, batch, H).
            max_len: number of steps to decode; defaults to `self.max_len`.

        Returns:
            Unnormalized scores of shape (batch * max_len, V). Rows are
            batch-major (all steps of example 0, then example 1, ...), which
            matches `targets.view(-1)` for a (batch, max_len) target tensor.
        """
        if max_len is None: max_len = self.max_len
        
        batch_size = hidden.size(1)
        inputs = self.start_token(batch_size)
        embed = self.embed(inputs) # (batch, 1, E)
                    
        scores=[]
        for _ in range(max_len):
            _, hidden = self.gru(embed,hidden)
            score = self.linear(hidden.squeeze(0)) # (batch, V)
            scores.append(score)
            decoded = score.max(1)[1] # greedy choice for this step
            embed = self.embed(decoded).unsqueeze(1) # y_{t-1}
            
        # (batch, max_len, V) -> (batch * max_len, V)
        scores = torch.stack(scores,1)
        return scores.view(batch_size*max_len,-1)

## Train 

In [19]:
# Model sizes and optimisation settings.
HIDDEN, EMBED = 100, 50
STEP = 100   # number of passes over train_loader
LR = 0.01

# The encoder is bidirectional, so the decoder is built with twice the hidden size.
encoder = Encoder(len(SOURCE.vocab), EMBED, HIDDEN, bidirec=True)
decoder = Decoder(len(TARGET.vocab), EMBED, HIDDEN * 2, TARGET.vocab.stoi['<s>'])

if USE_CUDA:
    encoder, decoder = encoder.cuda(), decoder.cuda()

loss_function = nn.CrossEntropyLoss()
# A single optimizer updates both networks.
optimizer = optim.Adam(
    list(encoder.parameters()) + list(decoder.parameters()),
    lr=LR,
)
# Scale the learning rate by 0.1 every 50 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)

In [20]:
# Train for STEP passes over the data, printing the mean batch loss every 10th pass.
for step in range(STEP):
    scheduler.step()
    losses = []
    for i, batch in enumerate(train_loader):
        (inputs, lengths), targets = batch.inputs, batch.targets

        # Drop gradients accumulated by the previous batch.
        encoder.zero_grad()
        decoder.zero_grad()

        # Decode as many steps as the targets have columns so that
        # preds lines up row-for-row with targets.view(-1).
        output, hidden = encoder(inputs)
        preds = decoder(hidden, targets.size(1))

        loss = loss_function(preds, targets.view(-1))
        losses.append(loss.data[0])

        loss.backward()
        optimizer.step()

    if step % 10 != 0:
        continue
    print(np.mean(losses))
    losses = []

3.52738273144
0.917744835839
0.318660788704
0.0576301888505
0.0232011171465
0.0180320216823
0.00937887139298
0.00794518835028
0.00731793000159
0.00556319087991


## TEST 

In [22]:
# Interactive demo: read a sentence, translate it, print the reply.
# Interrupt the kernel (KeyboardInterrupt) to leave the loop.
while True:
    try:
        text = input()
        tokenized = ["<s>"] + kor_tagger(text) + ["</s>"]
        input_, length = SOURCE.numericalize(
            ([tokenized], [len(tokenized)]),
            train=False,
            device=DEVICE,
        )

        o, h = encoder(input_)
        preds = decoder(h)
        token_ids = preds.max(1)[1].data.tolist()
        # Indices 0-3 are presumably the special tokens (<unk>, <pad>, <s>, </s>)
        # -- TODO confirm against TARGET.vocab.itos.
        reply = [TARGET.vocab.itos[i] for i in token_ids if i not in [0,1,2,3]]

        print(" ".join(reply))
    except KeyboardInterrupt:
        break

안녕하세요
hello .
누구세요
are this .
고마워요
we were to new york .
감사합니다
thank you .
잘이썽요
please help
도와줘용
yes . do you .
도와줘요!
yes .
ㅡㅡ
in you eat ?
ㅗㅗ
what should help ?
