In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import torchtext
import nltk
from konlpy.tag import Kkma
from torchtext.data import Field, BucketIterator, TabularDataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from attention import Attention

In [14]:
USE_CUDA = torch.cuda.is_available()
DEVICE = 0 if USE_CUDA else -1

## 데이터 준비 

In [3]:
kor_tagger = Kkma()

kor_tagger = kor_tagger.morphs
eng_tagger = nltk.word_tokenize

In [4]:
SOURCE = Field(tokenize=kor_tagger,use_vocab=True,init_token="<s>",eos_token="</s>",lower=True, include_lengths=True, batch_first=True)
TARGET = Field(tokenize=eng_tagger,use_vocab=True,init_token="<s>",eos_token="</s>",lower=True, batch_first=True)

In [6]:
train_data = TabularDataset(
                                   path="data/parallel_data.txt",
                                   format='tsv', # \t로 구분
                                   #skip_header=True, # 헤더가 있다면 스킵
                                   fields=[('inputs',SOURCE),('targets',TARGET)])

In [7]:
SOURCE.build_vocab(train_data)
TARGET.build_vocab(train_data)

In [8]:
print(len(SOURCE.vocab),len(TARGET.vocab))

1307 1149


In [15]:
train_loader = BucketIterator(
    train_data, batch_size=32, device=DEVICE, # device -1 : cpu, device 0 : 남는 gpu
    sort_key=lambda x: len(x.inputs),sort_within_batch=True,repeat=False) 

## Encoder 

In [16]:
class Encoder(nn.Module):
    def __init__(self,V,E,H,num_layers=1,bidirec=False):
        super(Encoder,self).__init__()
        
        self.num_directions = 2 if bidirec else 1
        self.embed = nn.Embedding(V,E)
        self.dropout = nn.Dropout(0.5)
        self.gru = nn.GRU(E,H,num_layers,batch_first=True,bidirectional=bidirec)
            
    def forward(self,inputs,input_lengths):
        """
        inputs : B,T # LongTensor
        input_lengths : B # list
        """
        
        embed = self.embed(inputs)
        embed = self.dropout(embed)
        
        # 패딩된 문장을 패킹(패딩은 연산 안들어가도록)
        packed = pack_padded_sequence(embed, input_lengths,batch_first=True) 
        output, hidden = self.gru(packed)
        
        # 패킹된 문장을 다시 unpack
        output, output_lengths = pad_packed_sequence(output,batch_first=True) 
        
        # last hidden 선택하기 , concat
        hidden = hidden[-self.num_directions:] # num_layers*num_directions,batch_size,hidden_size
        hidden = torch.cat([h for h in hidden],1).unsqueeze(0) # 1,B,2H
        
        return output, hidden

## Decoder 

In [34]:
class Decoder(nn.Module):
    def __init__(self,V,E,H,sos_idx,max_len=15):
        super(Decoder,self).__init__()
        
        self.hidden_size = H
        self.max_len = max_len
        self.sos_idx = sos_idx
        self.embed = nn.Embedding(V,E)
        self.gru = nn.GRU(E+H,H,batch_first=True)
        self.dropout = nn.Dropout(0.5)
        self.linear = nn.Linear(2*H,V)
        self.attention = Attention(H,'general') # 어텐션
        
    def start_token(self,batch_size):
        sos = Variable(torch.LongTensor([self.sos_idx]*batch_size)).unsqueeze(1)
        if USE_CUDA: sos = sos.cuda()
        return sos
       
    def forward(self,hidden, encoder_hiddens, encoder_lengths=None, max_len=None):
        """
        hidden : 1,B,H (인코더 라스트 히든)
        encoder_hiddens : B,T,H (인코더 모든 타임스텝에서 히든스테이트들)
        encoder_lengths : B (인풋의 진짜 길이 list)
        """
        if max_len is None: max_len = self.max_len
        
        inputs = self.start_token(hidden.size(1)) # Batch_size
        embed = self.embed(inputs)
        embed= self.dropout(embed)
        scores=[]
        attn_weights=[]
        for _ in range(max_len):
            
            # context vector 계산
            context, attn_weight = self.attention(hidden.transpose(0,1), encoder_hiddens, encoder_lengths,True)
            attn_weights.append(attn_weight.squeeze(1))
            
            # concat해서 rnn에
            rnn_input = torch.cat([embed,context],2)
            _, hidden = self.gru(rnn_input,hidden)
            
            # concat해서 linear에
            concated = torch.cat([hidden.transpose(0,1),context],2)
            score = self.linear(concated.squeeze(1))
            scores.append(score)
            decoded = score.max(1)[1]
            embed = self.embed(decoded).unsqueeze(1) # y_{t-1}
            embed = self.dropout(embed)
            
        #  column-wise concat, reshape!!
        scores = torch.cat(scores,1)
        return scores.view(inputs.size(0)*max_len,-1), torch.cat(attn_weights)

## Train 

In [46]:
HIDDEN = 100
EMBED = 50
STEP = 200
LR = 0.001

encoder = Encoder(len(SOURCE.vocab),EMBED,HIDDEN,bidirec=True)
decoder = Decoder(len(TARGET.vocab),EMBED,HIDDEN*2,TARGET.vocab.stoi['<s>'])

if USE_CUDA:
    encoder = encoder.cuda()
    decoder = decoder.cuda()

loss_function = nn.CrossEntropyLoss(ignore_index=TARGET.vocab.stoi['<pad>'])
optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()),lr=LR)
scheduler = optim.lr_scheduler.MultiStepLR(gamma=0.1,milestones=[100],optimizer=optimizer)

In [47]:
encoder.train()
decoder.train()
for step in range(STEP):
    losses=[]
    scheduler.step()
    for i, batch in enumerate(train_loader):
        inputs,lengths = batch.inputs
        targets = batch.targets
        
        encoder.zero_grad()
        decoder.zero_grad()
        
        output, hidden = encoder(inputs,lengths.tolist())
        preds, _ = decoder(hidden,output,lengths.tolist(),targets.size(1))
        
        loss = loss_function(preds,targets.view(-1))
        losses.append(loss.data[0])
        
        loss.backward()
        optimizer.step()
        
    if step % 10 == 0:
        print(np.mean(losses))
        losses=[]

5.36419428885
3.25571791083
2.57027836516
2.08639037982
1.69171138667
1.39513733797
1.17431018874
0.968443433754
0.791452653706
0.665851117112
0.551195865031
0.488491506316
0.461174290162
0.44934902899
0.445566720329
0.420567046851
0.412186958827
0.390605855267
0.403296662029
0.369189135497


## TEST 

In [48]:
encoder.eval()
decoder.eval()
while 1:
    try:
        text = input()
        tokenized = ["<s>"] + kor_tagger(text) +["</s>"]
        input_,length = SOURCE.numericalize(([tokenized],[len(tokenized)]),train=False,device=DEVICE)

        o,h = encoder(input_,length.tolist())
        preds, _ = decoder(h,o,length.tolist())
        reply = [TARGET.vocab.itos[i] for i in preds.max(1)[1].data.tolist() if i not in [0,1,2,3]]

        print(" ".join(reply))
    except KeyboardInterrupt as e:
        break

안녕하세요.
good morning .
도와줘요!
help ! .
미안해요.
i 'm bored to .
네, 감사합니다.
yes , thank you .
