In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import json
import pickle
import random
import time
import math
import numpy as np
from konlpy.tag import Mecab;tagger=Mecab()
from collections import Counter
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

torch.manual_seed(1)

<torch._C.Generator at 0x7fd62590b798>

In [2]:
# Device switch. None of the cells shown here read this flag, and no tensor is moved with .cuda().
USE_CUDA = False

# 데이터 

일단 질문과 답변의 최대 길이를 각각 10으로 고정한 뒤, PAD 토큰으로 패딩하고 전체 데이터를 하나의 배치로 묶는다.

In [3]:
SEQ_LENGTH=10  # fixed token count of every question/answer after truncation, EOS and padding
SOS_token = 0  # index of "SOS" in word2index; fed to the decoder as its first input
EOS_token = 1  # index of "EOS" in word2index (not referenced by the cells shown here)

In [4]:
# Load the chat corpus: one "question<sep>answer" pair per line.
# NOTE(review): the separator below is the two-character sequence backslash + "t",
# not a tab character -- confirm that this is what the corpus file really contains.
# `with` guarantees the file handle is closed once the lines have been read.
with open('../../dataset/corpus/dsksd_chat.txt') as corpus_file:
    raw_lines = corpus_file.readlines()

data = []
for line in raw_lines:
    if line == '\n':
        continue  # skip blank lines
    fields = line.split('\\t')
    # rstrip('\n') rather than [:-1]: a final line without a trailing newline
    # must not lose the last character of its answer.
    data.append([fields[0], fields[1].rstrip('\n')])

In [5]:
DATA_SIZE = len(data) # number of Q/A pairs; also the batch size, since the whole dataset is fed as one batch
DATA_SIZE

153

### 전처리 

1. 형태소 분석
2. EOS를 포함한 길이가 10을 넘지 않도록, 토큰이 10개 이상이면 앞의 9개만 사용
3. EOS 태그 달기
4. 길이 10이 안되는 것들 PADDING
5. 각 질문/답변 쌍을 [[Q, A], ...] 형태의 리스트(`train`)로 저장

In [6]:
# Collects [question_tokens, answer_tokens] pairs; the next cell fills it.
train=[]

In [7]:
def _to_fixed_length(tokens, seq_length=SEQ_LENGTH):
    """Truncate, terminate and pad a token list to exactly ``seq_length`` items.

    At most ``seq_length - 1`` tokens are kept so there is always room for the
    closing "EOS" marker; any remaining slots are filled with "PAD".
    """
    kept = tokens[:seq_length - 1]
    padding = ['PAD'] * (seq_length - len(kept) - 1)
    return kept + ['EOS'] + padding


# Rebuild `train` from scratch instead of appending to the existing list, so that
# re-running this cell cannot duplicate pairs and drift out of sync with DATA_SIZE.
train = [
    [_to_fixed_length(tagger.morphs(question)), _to_fixed_length(tagger.morphs(answer))]
    for question, answer in data
]

In [8]:
train[-1]

[['끝말잇기', '고', '?', 'EOS', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD'],
 ['저', '바보', '라', '몰라요', 'ㅠㅠ', 'EOS', 'PAD', 'PAD', 'PAD', 'PAD']]

### Indexing words 

인덱스 딕셔너리 선언

In [9]:
# Vocabulary: the three special tokens take the first indices, then every unseen token
# receives the next free index in order of first appearance (question before answer).
word2index = {"SOS": 0, "EOS": 1, "PAD": 2}

for question_tokens, answer_tokens in train:
    for token in question_tokens + answer_tokens:
        if token in word2index:
            continue
        word2index[token] = len(word2index)

n_words = len(word2index)  # vocabulary size, special tokens included

# Reverse lookup, used to turn predicted indices back into tokens.
index2word = dict((idx, word) for word, idx in word2index.items())

### Data to Tensor(LongTensor) 

각 토큰을 인덱스로 바꿔 LongTensor로 만든 뒤, autograd.Variable로 감싼다.

In [10]:
def prepare_sequence(seq, to_ix):
    """Map every token of ``seq`` to its index in ``to_ix``.

    Returns a 1-D ``LongTensor`` of indices wrapped in ``Variable``.
    A token that is missing from ``to_ix`` raises ``KeyError``.
    """
    indices = [to_ix[token] for token in seq]
    return Variable(torch.LongTensor(indices))

In [11]:
# Encode every pair as (1, SEQ_LENGTH) rows of indices; they are stacked into batch tensors below.
question_rows = [prepare_sequence(question, word2index).view(1, -1) for question, _ in train]
answer_rows = [prepare_sequence(answer, word2index).view(1, -1) for _, answer in train]

# Number of non-PAD tokens in each answer (the EOS marker is counted).
lengths = [sum(1 for token in answer if token != 'PAD') for _, answer in train]

inputs = torch.cat(question_rows)   # (number of pairs, SEQ_LENGTH)
targets = torch.cat(answer_rows)    # (number of pairs, SEQ_LENGTH)

# The per-example rows are no longer needed once concatenated.
del question_rows
del answer_rows

In [15]:
train[0][-1]

['안녕', '하', '세요', '!', 'EOS', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']

In [16]:
lengths[0]

5

## 모델링  

### Encoder 

In [66]:
class EncoderRNN(nn.Module):
    """Token embedding followed by a (possibly multi-layer) batch-first GRU."""

    def __init__(self, input_size, hidden_size, n_layers=1):
        """
        Args:
            input_size: vocabulary size (number of embedding rows).
            hidden_size: width of both the embeddings and the GRU state.
            n_layers: number of stacked GRU layers.
        """
        super().__init__()

        self.input_size, self.hidden_size, self.n_layers = input_size, hidden_size, n_layers

        # Creation order (embedding, then GRU) is kept so seeded initialisation is unchanged.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            batch_first=True,
        )

    def forward(self, input):
        """Encode a batch of index sequences.

        Args:
            input: LongTensor of token indices, shape (batch, seq_len).

        Returns:
            ``(output, hidden)``: the GRU outputs for every time step,
            (batch, seq_len, hidden_size), and the final state of every layer,
            (n_layers, batch, hidden_size).
        """
        batch_size = input.size(0)
        # The recurrent state always starts from zeros.
        initial_state = Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size))
        return self.gru(self.embedding(input), initial_state)

In [67]:
# Smoke test: a throw-away 2-layer encoder with hidden size 30, printed to inspect its layers.
encoder_test = EncoderRNN(len(word2index), 30, 2)
print(encoder_test)

EncoderRNN (
  (embedding): Embedding(455, 30)
  (gru): GRU(30, 30, num_layers=2, batch_first=True)
)


In [69]:
# Run the whole dataset through the test encoder as one batch and check the output shapes.
# `inputs` is already (DATA_SIZE, SEQ_LENGTH), so the view() does not change its shape.
out, hidden = encoder_test(inputs.view(DATA_SIZE,-1))
print(out.size(), hidden.size())

torch.Size([153, 10, 30]) torch.Size([2, 153, 30])


### Decoder with Attention 

* https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation.ipynb
* https://arxiv.org/pdf/1409.0473.pdf
* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture10.pdf

In [278]:
class DecoderRNN(nn.Module):
    """Greedy GRU decoder with attention over the encoder outputs.

    At every step the GRU consumes the embedding of the previously predicted
    token concatenated with the current context vector. Its top-layer state,
    again concatenated with that context, is projected to vocabulary
    log-probabilities, and the next context is computed by attention.

    Shape legend: B = batch, T = encoder time steps, D = hidden_size, V = output_size.
    """

    def __init__(self, hidden_size, output_size,max_len=10,n_layers=1,dropout_p=0.1):
        """
        Args:
            hidden_size: width D of embeddings, GRU state and encoder outputs.
            output_size: vocabulary size V.
            max_len: number of decoding steps performed by ``forward``.
            n_layers: number of stacked GRU layers.
            dropout_p: stored on the instance only; no dropout layer is applied.
        """
        super(DecoderRNN, self).__init__()

        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        self.max_len=max_len

        # Layer creation order is kept (attn, embedding, gru, out) so that seeded
        # initialisation matches earlier runs.
        self.attn = nn.Linear(self.hidden_size,self.hidden_size)  # attention projection
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        # The GRU input is [token embedding ; context], hence 2 * D.
        self.gru = nn.GRU(self.hidden_size*2, self.hidden_size, self.n_layers,batch_first=True)
        # The output layer sees [top-layer hidden state ; context], hence 2 * D.
        self.out = nn.Linear(self.hidden_size*2, self.output_size)

    def Attention(self, hidden, encoder_outputs):
        """Compute the attention context for the current decoder state.

        Args:
            hidden: decoder GRU state, (n_layers, B, D); only the top layer is
                used as the query.
            encoder_outputs: encoder outputs, (B, T, D).

        Returns:
            Context vectors of shape (B, 1, D): the attention-weighted sum of
            ``encoder_outputs`` over time.
        """
        # hidden[-1] (instead of squeeze(0)) also works when n_layers > 1.
        query = hidden[-1].unsqueeze(2)  # (B, D) -> (B, D, 1)

        batch_size = encoder_outputs.size(0) # B
        max_len = encoder_outputs.size(1) # T
        energies = self.attn(encoder_outputs.contiguous().view(batch_size*max_len,-1)) # (B*T, D)
        energies = energies.view(batch_size,max_len,-1) # (B, T, D)

        attn_energies = energies.bmm(query).transpose(1,2) # (B,T,D) x (B,D,1) -> (B,1,T)

        # Called without `dim` to stay compatible with the old PyTorch API this notebook
        # targets; for a 2-D input the implicit choice is dim 1, i.e. normalisation over T.
        alpha = F.softmax(attn_energies.squeeze(1)) # (B, T)
        alpha = alpha.unsqueeze(1) # (B, 1, T)
        context = alpha.bmm(encoder_outputs) # (B,1,T) x (B,T,D) -> (B,1,D)

        return context

    def forward(self, input,context,encoder_outputs,training=True):
        """Greedily decode ``max_len`` steps.

        Args:
            input: LongTensor (B, 1) holding the first token index (SOS).
            context: initial context vector, (B, 1, D).
            encoder_outputs: encoder outputs attended over, (B, T, D).
            training: accepted but currently ignored -- the decoder always feeds
                back its own greedy prediction (no teacher forcing).

        Returns:
            Log-probabilities of shape (B * max_len, V). Row ``b * max_len + t``
            belongs to batch item ``b`` at step ``t``, which lines up with a
            (B, max_len) target tensor flattened by ``view(-1)``.
        """
        batch_size = input.size(0)

        # Embedding of the current input token: (B, 1, D).
        embedded = self.embedding(input)
        # The decoder state always starts from zeros.
        hidden = Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size))

        decode=[]

        for _step in range(self.max_len):
            _, hidden = self.gru(torch.cat((embedded,context),2), hidden)

            # Top-layer state and context, both flattened to (B, D) -> (B, 2D).
            # Indexing the top layer (instead of squeeze(0)) also works when n_layers > 1.
            concated = torch.cat((hidden[-1], context.squeeze(1)), 1)
            score = self.out(concated)
            softmaxed = F.log_softmax(score)  # (B, V); implicit dim 1 for a 2-D input
            decode.append(softmaxed)

            # Greedy choice of the next token. torch.max keeps or drops the reduced
            # dimension depending on the PyTorch version, so force shape (B, 1).
            _,input = torch.max(softmaxed,1)
            input = input.view(-1, 1)
            embedded = self.embedding(input)

            # Compute the next context vector with attention.
            context = self.Attention(hidden, encoder_outputs)

        # TODO: compute the cost only over the real (non-padding) length of each target.

        # Careful: the time steps are concatenated column-wise, then reshaped to (B * max_len, V).
        scores = torch.cat(decode,1)
        return scores.view(batch_size*self.max_len,-1)

## 트레이닝 

In [279]:
HIDDEN_SIZE = 30  # shared width of the embeddings and GRU states in encoder and decoder
LEARNING_RATE=0.01  # learning rate passed to both Adam optimizers

In [288]:
# The encoder stacks 2 GRU layers; the decoder keeps its default single layer.
encoder =  EncoderRNN(len(word2index), HIDDEN_SIZE, 2)
decoder = DecoderRNN(HIDDEN_SIZE,len(word2index))

# The decoder already returns log-probabilities (log_softmax), so NLLLoss is the matching
# criterion. CrossEntropyLoss would apply a second, redundant log-softmax on top of them.
loss_function = nn.NLLLoss()

# Encoder and decoder are optimised separately with the same learning rate.
enc_optim= optim.Adam(encoder.parameters(), lr=LEARNING_RATE)
dec_optim = optim.Adam(decoder.parameters(),lr=LEARNING_RATE)

In [289]:
# Dry run of one forward pass over the whole dataset before training.
# A (1, DATA_SIZE) row of SOS indices is transposed into a (DATA_SIZE, 1) column: one start token per example.
decoder_input = Variable(torch.LongTensor([[SOS_token]*DATA_SIZE])).transpose(1,0)
outputs,context = encoder(inputs)

# context[-1] is the final state of the encoder's last layer; it is reshaped to
# (DATA_SIZE, 1, hidden) and used as the decoder's initial context vector.
score = decoder(decoder_input,context[-1].view(DATA_SIZE,1,-1),outputs)

In [290]:
%%time
losses=[]
for epoch in range(500):
    
    encoder.zero_grad()
    decoder.zero_grad()
    decoder_input = Variable(torch.LongTensor([[SOS_token]*DATA_SIZE])).transpose(1,0)
    outputs,context = encoder(inputs)

    score = decoder(decoder_input,context[-1].view(DATA_SIZE,1,-1),outputs)
    loss=loss_function(score,targets.view(-1))
    losses.append(loss)
    loss.backward()
    
    torch.nn.utils.clip_grad_norm(encoder.parameters(), 5.0)
    torch.nn.utils.clip_grad_norm(decoder.parameters(), 5.0)
    
    enc_optim.step()
    dec_optim.step()
    
    if epoch % 100==0:
        print("[{epoch}] loss : {loss}".format(epoch=epoch,loss=loss.data.numpy()[0]))

[0] loss : 6.2091264724731445
[100] loss : 1.30751633644104
[200] loss : 0.2609458267688751
[300] loss : 0.07936285436153412
[400] loss : 0.03935229405760765
CPU times: user 4min 58s, sys: 4min 40s, total: 9min 38s
Wall time: 2min 33s


## 테스트 

In [299]:
# Pick a random training pair and let the model answer its question.
index = random.choice(range(DATA_SIZE))
input_ = train[index][0]
target = train[index][1]  # reference answer tokens (not printed)
print('Q: ', ' '.join([i for i in input_ if i !='PAD' and i != 'EOS'])+'\n')


# Batch of one: a single SOS index with shape (1, 1) and the chosen question as a (1, SEQ_LENGTH) row.
decoder_input = Variable(torch.LongTensor([[SOS_token]])).transpose(1,0)
outputs,context = encoder(inputs[index].view(1,-1))

score = decoder(decoder_input,context[-1].view(1,1,-1),outputs)

# Most likely token per decoding step. torch.max keeps or drops the reduced dimension
# depending on the PyTorch version, so flatten the indices before reading them.
v,i = torch.max(score,1)
predicted_ids = i.view(-1).data.numpy()

decoded = [index2word[int(token_id)] for token_id in predicted_ids]

print('A: ', ' '.join([i for i in decoded if i !='PAD' and i != 'EOS'])+'\n')

Q:  너 성별 뭐 야

A:  남자 요 !

