In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import torchtext
import nltk
from konlpy.tag import Kkma
from torchtext.data import Field, BucketIterator, TabularDataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

## 데이터 준비 

대화 모델 더미 데이터

In [2]:
kor_tagger = Kkma()

kor_tagger = kor_tagger.morphs

In [3]:
SOURCE = Field(tokenize=kor_tagger,use_vocab=True,init_token="<s>",eos_token="</s>",lower=True, include_lengths=True, batch_first=True)
TARGET = Field(tokenize=kor_tagger,use_vocab=True,init_token="<s>",eos_token="</s>",lower=True, batch_first=True)

In [4]:
train_data = TabularDataset(
                                   path="data/dsksd_chat.txt",
                                   format='tsv', # \t로 구분
                                   #skip_header=True, # 헤더가 있다면 스킵
                                   fields=[('inputs',SOURCE),('targets',TARGET)])

In [5]:
SOURCE.build_vocab(train_data)
TARGET.build_vocab(train_data)

In [6]:
print(len(SOURCE.vocab),len(TARGET.vocab))

316 272


In [7]:
train_loader = BucketIterator(
    train_data, batch_size=32, device=-1, # device -1 : cpu, device 0 : 남는 gpu
    sort_key=lambda x: len(x.inputs),sort_within_batch=True,repeat=False) 

In [8]:
for batch in train_loader:
    break

## Encoder 

In [9]:
class Encoder(nn.Module):
    def __init__(self,V,E,H,num_layers=1,bidirec=False):
        super(Encoder,self).__init__()
        
        self.num_directions = 2 if bidirec else 1
        
        self.embed = nn.Embedding(V,E)
        self.dropout = nn.Dropout(0.5)
        self.gru = nn.GRU(E,H,num_layers,batch_first=True,bidirectional=bidirec)
    
    def forward(self,inputs,input_lengths):
        """
        inputs : B,T # LongTensor
        input_lengths : B # list
        """
        
        embed = self.embed(inputs)
        embed = self.dropout(embed)
        
        # 패딩된 문장을 패킹(패딩은 연산 안들어가도록)
        packed = pack_padded_sequence(embed, input_lengths,batch_first=True) 
        output, hidden = self.gru(packed)
        
        # 패킹된 문장을 다시 unpack
        output, output_lengths = pad_packed_sequence(output,batch_first=True) 
        
        # last hidden 선택하기 , concat
        hidden = hidden[-self.num_directions:] # num_layers*num_directions,batch_size,hidden_size
        hidden = torch.cat([h for h in hidden],1).unsqueeze(0) # 1,B,2H
        
        return output, hidden

## Decoder 

In [10]:
class Decoder(nn.Module):
    def __init__(self,V,E,H,sos_idx,max_len=15):
        super(Decoder,self).__init__()
        
        self.hidden_size = H
        self.max_len = max_len
        self.sos_idx = sos_idx
        self.embed = nn.Embedding(V,E)
        self.gru = nn.GRU(E,H,batch_first=True)
        self.linear = nn.Linear(H,V)
        
    def start_token(self,batch_size):
        return Variable(torch.LongTensor([self.sos_idx]*batch_size)).unsqueeze(1)
        
    def forward(self,hidden, max_len=None):
        
        if max_len is None: max_len = self.max_len
        
        inputs = self.start_token(hidden.size(1)) # Batch_size
        embed = self.embed(inputs)
                    
        scores=[]
        for _ in range(max_len):
            _, hidden = self.gru(embed,hidden)
            score = self.linear(hidden.squeeze(0))
            scores.append(score)
            decoded = score.max(1)[1]
            embed = self.embed(decoded).unsqueeze(1) # y_{t-1}
            
        #  column-wise concat, reshape!!
        scores = torch.cat(scores,1)
        return scores.view(inputs.size(0)*max_len,-1)

## Train 

In [11]:
HIDDEN = 30
EMBED = 30
STEP = 150
LR = 0.01

encoder = Encoder(len(SOURCE.vocab),EMBED,HIDDEN,bidirec=True)
decoder = Decoder(len(TARGET.vocab),EMBED,HIDDEN*2,TARGET.vocab.stoi['<s>'])

loss_function = nn.CrossEntropyLoss(ignore_index=TARGET.vocab.stoi['<pad>'])
optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()),lr=LR)

In [12]:
encoder.train()
for step in range(STEP):
    losses=[]
    for i, batch in enumerate(train_loader):
        inputs,lengths = batch.inputs
        targets = batch.targets
        
        encoder.zero_grad()
        decoder.zero_grad()
        
        output, hidden = encoder(inputs,lengths.tolist())
        preds = decoder(hidden,targets.size(1))
        
        loss = loss_function(preds,targets.view(-1))
        losses.append(loss.data[0])
        
        loss.backward()
        optimizer.step()
        
    if step % 10 == 0:
        print(np.mean(losses))
        losses=[]

5.16219909986
3.30701418718
2.75705514352
2.16114969055
1.82994547983
1.55784422159
1.33134998381
1.16969255731
1.02958300213
0.834863356625
0.783482762054
0.556017734731
0.58878508086
0.571181234593
0.575213879657


## TEST 

In [13]:
encoder.eval()
while 1:
    try:
        text = input()
        tokenized = ["<s>"] + kor_tagger(text) +["</s>"]
        input_,length = SOURCE.numericalize(([tokenized],[len(tokenized)]),train=False,device=-1)

        o,h = encoder(input_,length.tolist())
        preds = decoder(h)
        reply = [TARGET.vocab.itos[i] for i in preds.max(1)[1].data.tolist() if i not in [0,1,2,3]]

        print(" ".join(reply))
    except KeyboardInterrupt as e:
        break

하이
안녕 하세 요 ~
파이토치 좋아?
한번 사용 하 어 보시 면 알 시 ㄹ 거 에 요 !
뭘 알아
감사 하 ㅂ니다
니가 뭐 아니
역시 만 요 찾아보 ㄹ게요
배고파
죄송 하 ㅂ니다 ㄹ게요
ㅡㅡ
죄송 하 ㅂ니다 ㄹ게요
ㅗ
죄송 하 세요 다시 찾아보 ㄹ게요
바보
죄송 하 세요 다시 찾아보 ㄹ게요
멍처앙
저 는 힙합 다시 찾아보 ㄹ게요
죽을래
죄송 하 ㅂ니다
쯧
죄송 하 세요 다시 찾아보 ㄹ게요
ㅡㅡ
죄송 하 ㅂ니다 ㄹ게요
모야 죄송하다 봇이니
떡볶이 해 요 ~
ㅗㅗㅗ
저 는 ㅂ니다 다시 찾아보 ㄹ게요
