In [15]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import json
import pickle
import random
import time
import math
import numpy as np
from konlpy.tag import Mecab;tagger=Mecab()
from collections import Counter
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

torch.manual_seed(1)

<torch._C.Generator at 0x7f93580be5e8>

In [2]:
USE_CUDA = False

# 데이터 

일단 최대 길이 (10,10)으로 고정하고 PAD & Batch

In [3]:
SEQ_LENGTH=10
SOS_token = 0
EOS_token = 1

In [5]:
data = open('../../dataset/corpus/dsksd_chat.txt').readlines()
data = [[t.split('\\t')[0],t.split('\\t')[1][:-1]] for t in data if t !='\n']

In [6]:
DATA_SIZE = len(data) # 배치 사이즈
DATA_SIZE

153

### 전처리 

1. 형태소 분석
2. 최대 길이 10보다 긴 것들 10으로 제한
3. EOS 태그 달기
4. 길이 10이 안되는 것들 PADDING
5. [[Q,A]...] 

In [7]:
train=[]

In [8]:
for t0,t1 in data:
    token0 = tagger.morphs(t0)
    
    if len(token0)>=SEQ_LENGTH:
        token0= token0[:SEQ_LENGTH-1]
    token0.append("EOS")

    token1 = tagger.morphs(t1)
    if len(token1)>=SEQ_LENGTH:
        token1=token1[:SEQ_LENGTH-1]
    
    token1.append("EOS")
    while len(token0)<SEQ_LENGTH:
        token0.append('PAD')
    
    while len(token1)<SEQ_LENGTH:
        token1.append('PAD')
    
    train.append([token0,token1])

In [9]:
train[-1]

[['끝말잇기', '고', '?', 'EOS', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD'],
 ['저', '바보', '라', '몰라요', 'ㅠㅠ', 'EOS', 'PAD', 'PAD', 'PAD', 'PAD']]

### Indexing words 

인덱스 딕셔너리 선언

In [10]:
n_words=3
word2index={"SOS":0,"EOS":1,"PAD":2}

for t0,t1 in train:
    for token in t0+t1:
        if token not in word2index:
            word2index[token]=n_words
            n_words+=1

index2word = {v:k for k,v in word2index.items()}

### Data to Tensor(LongTensor) 

각 토큰을 인덱스로 바꾼 후, LongTensor로 만든 후, autograd.Variable로 wrapping

In [11]:
def prepare_sequence(seq, to_ix):
    idxs = list(map(lambda w: to_ix[w], seq))
    tensor = torch.LongTensor(idxs)
    return Variable(tensor)

In [12]:
train_x=[]
train_y=[]
lengths=[]
for tr in train:
    temp = prepare_sequence(tr[0], word2index)
    temp = temp.view(1,-1)
    train_x.append(temp)

    temp2 = prepare_sequence(tr[1],word2index)
    temp2 = temp2.view(1,-1)
    train_y.append(temp2)
    
    length = [t for t in tr[1] if t !='PAD']
    lengths.append(len(length))

inputs = torch.cat(train_x)
targets = torch.cat(train_y)

del train_x
del train_y

In [13]:
train[0][-1]

['안녕', '하', '세요', '!', 'EOS', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']

In [14]:
lengths[0]

5

## 모델링  

### Encoder 

In [16]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        super(EncoderRNN, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,batch_first=True)
        
    def forward(self, input):
        hidden = Variable(torch.zeros(self.n_layers, input.size(0), self.hidden_size)) 
        
        embedded = self.embedding(input)
        output, hidden = self.gru(embedded, hidden)

        return output, hidden

In [271]:
encoder_test = EncoderRNN(len(word2index), 30, 2)
print(encoder_test)

EncoderRNN (
  (embedding): Embedding(452, 30)
  (gru): GRU(30, 30, num_layers=2, batch_first=True)
)


In [272]:
out, hidden = encoder_test(inputs.view(DATA_SIZE,-1))
print(out.size(), hidden.size())

torch.Size([153, 10, 30]) torch.Size([2, 153, 30])


In [284]:
attenW = nn.Linear(10,30)

In [285]:
pres =attenW(out.contiguous().view(out.size(0)*out.size(2),-1))

In [286]:
pres

Variable containing:
-1.8987e-02 -2.0545e-01 -1.1391e-01  ...   3.9493e-02  1.1909e-01  4.7588e-02
-2.2662e-02 -1.7350e-01 -1.0588e-01  ...   7.4951e-02 -1.1513e-02  1.6381e-01
-2.7924e-02 -1.2639e-01 -1.2584e-01  ...   9.7742e-02  1.3633e-02  7.0687e-02
                ...                   ⋱                   ...                
 6.8057e-03 -4.3524e-01 -7.4464e-03  ...   1.1046e-01  1.6226e-01 -2.2756e-01
-1.5275e-03 -3.4376e-01 -2.7320e-02  ...   1.5413e-01 -8.9346e-03  2.1097e-01
-5.9231e-02 -3.5677e-01 -6.4551e-02  ...   3.4149e-01 -3.1862e-02 -1.2726e-01
[torch.FloatTensor of size 4590x30]

In [287]:
h_s = hidden.detach()
h_s_t = h_s.view(hidden.size(1)*hidden.size(2),-1).transpose(1,0).contiguous().view(hidden.size(1),hidden.size(2),-1)

In [290]:
h_s_t

Variable containing:
( 0 ,.,.) = 
  0.1186  0.5221
 -0.1144 -0.1200
  0.6512  0.4637
       ⋮        
 -0.5866 -0.5544
 -0.4265 -0.5052
  0.0741  0.4383

( 1 ,.,.) = 
  0.1333  0.4689
 -0.0850 -0.1517
  0.4521  0.2581
       ⋮        
 -0.5850 -0.5478
 -0.4225 -0.5040
  0.0720  0.4517

( 2 ,.,.) = 
  0.1048  0.5146
 -0.1132 -0.1188
  0.6140  0.4401
       ⋮        
 -0.6055 -0.5594
 -0.4272 -0.5605
  0.0839  0.4413
... 

(150,.,.) = 
 -0.0407 -0.0495
  0.1782 -0.0403
 -0.1609  0.0860
       ⋮        
  0.3053  0.3487
 -0.0665 -0.0814
 -0.1082  0.0003

(151,.,.) = 
  0.1433  0.2728
 -0.0746  0.0346
 -0.5980  0.0519
       ⋮        
  0.2436  0.1646
  0.3391  0.1187
 -0.0113 -0.1584

(152,.,.) = 
  0.1411  0.2893
 -0.0714  0.0219
 -0.5979  0.0698
       ⋮        
  0.2557  0.2376
  0.3860  0.1438
  0.0976 -0.1481
[torch.FloatTensor of size 153x30x2]

In [293]:
new_context = pres.view(out.size(0),out.size(1),-1).bmm(h_s_t)

RuntimeError: wrong matrix size, batch1: 10x90, batch2: 30x2 at /b/wheel/pytorch-src/torch/lib/TH/generic/THTensorMath.c:1441

In [292]:
new_context

Variable containing:
( 0 ,.,.) = 
  2.2535e-01  2.0677e-01
  2.1020e-01  1.6996e-01
  1.5623e-01  1.5868e-01
           ⋮            
  5.2848e-02  1.6493e-01
  4.8768e-02  1.6331e-01
  4.5525e-02  1.6183e-01

( 1 ,.,.) = 
  1.3036e-01  1.0524e-01
  2.7541e-01  1.0715e-01
  1.4126e-01  4.9466e-03
           ⋮            
  8.6346e-02 -5.3067e-02
  3.9827e-02  3.8982e-03
  1.3021e-02  4.5903e-02

( 2 ,.,.) = 
  2.5046e-01  2.0593e-01
  3.7503e-01  2.4798e-01
  3.3191e-01  3.8001e-01
           ⋮            
  2.9911e-01  3.7111e-01
  2.8985e-01  3.6531e-01
  2.8672e-01  3.5644e-01
... 

(150,.,.) = 
 -1.1240e-01 -1.6369e-01
 -1.3291e-01 -1.9411e-01
 -1.4539e-01 -2.4040e-01
           ⋮            
 -2.1075e-01 -3.3737e-01
 -2.1882e-01 -3.4003e-01
 -2.2490e-01 -3.4161e-01

(151,.,.) = 
 -1.2620e-02 -1.5515e-01
  2.7235e-02 -2.3201e-01
 -3.6732e-02 -2.0563e-01
           ⋮            
 -4.1697e-02 -2.2070e-01
 -4.5115e-02 -2.4145e-01
 -4.7682e-02 -2.5632e-01

(152,.,.) = 
 -4.6085e-02 -1.

### Decoder with Attention 

In [267]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=0.1):
        super(DecoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        self.attenW = nn.Linear(10,10)
        
        # Define the layers
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)

        #self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size, self.n_layers,batch_first=True)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        
    def forward(self, input,output,hidden,lengths,seq_length,training=True):
        
        # Get the embedding of the current input word
        embedded = self.embedding(input)
        #embedded = self.dropout(embedded)
        
        decode=[]
        # Apply GRU to the output so far
        for i in range(seq_length):
            
            _, hidden = self.gru(embedded, hidden)
            # Attention 계산으로 그 다음 hidden 계산
            pres =self.attenW(output.contiguous().view(output.size(0)*output.size(2),-1))
            h_s = hidden.detach()
            h_s_t = h_s.view(hidden.size(1)*hidden.size(2),-1).transpose(1,0).contiguous().view(hidden.size(1),hidden.size(2),-1)
            print(pres.view(output.size(0),output.size(1),-1).size(),h_s_t.size())
            hidden = pres.view(output.size(0),output.size(1),-1).bmm(h_s_t).view(self.n_layers,hidden.size(1),hidden.size(2)) # New hidden
            score = self.out(hidden.view(hidden.size(0)*hidden.size(1),-1))
            softmaxed = F.log_softmax(score)
            decode.append(softmaxed)
            _,input = torch.max(softmaxed,1)
            embedded = self.embedding(input)
            #embedded = self.dropout(embedded)
        
        # if training:
        # TODO 패딩이 아닌 진짜 length만 cost 계산하기...
            
        # 요고 주의! time-step을 column-wise concat한 후, reshape!!
        scores = torch.cat(decode,1)
        return scores.view(input.size(0)*seq_length,-1) 

## 트레이닝 

In [268]:
HIDDEN_SIZE = 30
LEARNING_RATE=0.01

In [269]:
encoder =  EncoderRNN(len(word2index), HIDDEN_SIZE, 2)
decoder = DecoderRNN(HIDDEN_SIZE,len(word2index))

loss_function = nn.CrossEntropyLoss()
enc_optim= optim.Adam(encoder.parameters(), lr=LEARNING_RATE)
dec_optim = optim.Adam(decoder.parameters(),lr=LEARNING_RATE)

In [270]:
decoder_input = Variable(torch.LongTensor([[SOS_token]*DATA_SIZE])).transpose(1,0)
outputs,context = encoder(inputs)

score = decoder(decoder_input,outputs,context,lengths,SEQ_LENGTH)

torch.Size([153, 10, 30]) torch.Size([153, 30, 1])


RuntimeError: size '[1 x 153 x 30]' is invalid for input of with 1530 elements at /b/wheel/pytorch-src/torch/lib/TH/THStorage.c:59

In [83]:
losses=[]
for epoch in range(1000):
    
    encoder.zero_grad()
    decoder.zero_grad()
    decoder_input = Variable(torch.LongTensor([[SOS_token]*DATA_SIZE])).transpose(1,0)
    _,context = encoder(inputs)

    score = decoder(decoder_input,context,lengths,SEQ_LENGTH)
    loss=loss_function(score,targets.view(-1))
    losses.append(loss)
    loss.backward()
    
    torch.nn.utils.clip_grad_norm(encoder.parameters(), 5.0)
    torch.nn.utils.clip_grad_norm(decoder.parameters(), 5.0)
    
    enc_optim.step()
    dec_optim.step()
    
    if epoch % 100==0:
        print("[{epoch}] loss : {loss}".format(epoch=epoch,loss=loss.data.numpy()[0]))

[0] loss : 6.0109076499938965
[100] loss : 1.4232630729675293
[200] loss : 0.49393996596336365
[300] loss : 0.15646184980869293
[400] loss : 0.05080809444189072
[500] loss : 0.032962311059236526
[600] loss : 0.025963518768548965
[700] loss : 0.023465832695364952
[800] loss : 0.021949483081698418
[900] loss : 0.020666683092713356


## 테스트 

In [126]:
index = random.choice(range(DATA_SIZE))
input_ = train[index][0]
target = train[index][1]
print('Q: ', ' '.join([i for i in input_ if i !='PAD' and i != 'EOS'])+'\n')


decoder_input = Variable(torch.LongTensor([[SOS_token]])).transpose(1,0)
_,context = encoder(inputs[index].view(1,-1))

score = decoder(decoder_input,context,lengths,SEQ_LENGTH)

v,i = torch.max(score,1)

decoded=[]
for t in range(i.size()[0]):
    decoded.append(index2word[i.data.numpy()[t][0]])

print('A: ', ' '.join([i for i in decoded if i !='PAD' and i != 'EOS'])+'\n')

Q:  그나저나 내일 오 시 는 분

A:  저 갑니다 용

