https://arxiv.org/pdf/1506.06726.pdf

In [1]:
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import random
import numpy as np
from konlpy.tag import Mecab;tagger=Mecab()
from collections import Counter
import nltk
import pickle
import re
%matplotlib inline  

In [2]:
torch.__version__

'0.2.0+751198f'

In [3]:
# True when PyTorch reports a usable CUDA device; the notebook moves tensors
# and models to the GPU only when this flag is set.
USE_CUDA = torch.cuda.is_available()

In [4]:
USE_CUDA

True

In [5]:
def prepare_sequence(seq, to_ix):
    """Convert a token sequence into a 1-D LongTensor ``Variable`` of ids.

    Args:
        seq: iterable of tokens.
        to_ix: mapping from token to integer id; tokens that are not in the
            mapping are replaced by the id stored under ``"<UNK>"``.

    Returns:
        A ``Variable`` wrapping a ``LongTensor`` with one id per token, moved
        to the GPU when the module-level ``USE_CUDA`` flag is true.
    """
    idxs = [to_ix[w] if w in to_ix else to_ix["<UNK>"] for w in seq]
    tensor = Variable(torch.LongTensor(idxs))
    if USE_CUDA:
        tensor = tensor.cuda()
    return tensor


def flatten(l):
    """Return one flat list holding the items of every sub-iterable of ``l``.

    Only one level of nesting is removed.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

### 데이터 준비 & 전처리 (문장 단위로 나누기) 

In [6]:
# Load the pickled corpus. The context manager guarantees the file handle is
# closed even if unpickling fails.
# NOTE: pickle can execute arbitrary code while loading -- only open files
# from a trusted source.
with open("insight_life_sent.pkl", "rb") as f:
    data = pickle.load(f)

In [7]:
# Pre-compiled patterns used to scrub boilerplate from the sentences.
# Raw strings are used so that regex escapes such as \d, \. and \( reach the
# regex engine unchanged instead of relying on Python passing through
# unknown string escapes (which triggers a warning on current Python).

# Decimal percentages such as "12.5%".
percentage = re.compile(r"\d+[.]\d+%")
# E-mail addresses.
email = re.compile(r"[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*")
# http(s):// URLs, or text starting with "www" followed by any character.
url = re.compile(r"((http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)|www.(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)")
# Runs of two or more dots.
com = re.compile(r"[.]{2,}")
# Digits followed by a dot, e.g. list numbering "1.".
numbering = re.compile(r"\d+[.]")
# Hangul word ending in "뉴스" or "일보".
news = re.compile(r"[가-힣]+(뉴스|일보)")
# Reporter byline: three Hangul syllables followed by " 기자 = ".
reporter = re.compile(r"[가-힣]{3} 기자 = ")
# Empty placeholder pattern (matches the empty string everywhere); it is not
# used by the code visible in this notebook.
photo = re.compile(r"")

In [8]:
# Drop the first and last entry of every document
# (presumably non-body lines such as a headline/footer -- TODO confirm).
data = [d[1:-1] for d in data]

In [9]:
# Clean every document in a single pass. For each sentence the steps run in
# this order: drop wire-service / broadcaster credits, strip apostrophes,
# drop social-media / stock-photo credit lines, then erase outlet names,
# e-mail addresses, URLs, the "[인사이트]" tag and the reporter byline.
cleaned = []
for dd in data:
    kept = []
    for d in dd:
        if d == "연합뉴스":
            continue
        if "SBS" in d or "KBS" in d or "MBC" in d:
            continue
        d = d.replace("\'", "")
        if "Facebook" in d or "Youtube" in d or "Instagram" in d:
            continue
        if d == "Littlethings":
            continue
        if "imagesbank" in d.lower():
            continue
        if "기사와 관련 없는 자료 사진" in d:
            continue
        d = news.sub("", d)
        d = email.sub("", d)
        d = url.sub("", d)
        d = d.replace("[인사이트]", "")
        kept.append(reporter.sub("", d).strip())
    cleaned.append(kept)
data = cleaned

In [10]:
# Build skip-thought style training pairs: for every window of three
# consecutive sentences, the middle sentence is the input (X) and the
# [previous, next] sentences are the targets (y).
X = []
y = []
for sentences in data:
    for prev_sent, mid_sent, next_sent in nltk.trigrams(sentences):
        X.append(mid_sent.strip())
        y.append([prev_sent.strip(), next_sent.strip()])

### 길이 분포  파악

In [11]:
from collections import Counter

In [12]:
# Morpheme-tokenize every input sentence with Mecab (this cell sits under the
# "length distribution" heading; c_X is not referenced again in the visible code).
c_X = [tagger.morphs(x) for x in X]

In [13]:
# Fixed token length every sentence is padded or truncated to.
LENGTH=60

### 데이터 프로세싱(패딩) 

In [14]:
# Accumulators for the padded token lists: p_X gets one list per input
# sentence, p_y one [previous, next] pair of lists per target.
p_X=[]
p_y=[]

In [15]:
def _pad_tokens(tokens, length, eos='<EOS>', pad='<PAD>'):
    """Return ``tokens`` terminated by ``eos`` and sized to exactly ``length``.

    Shorter sequences get ``eos`` appended and are then filled with ``pad``;
    sequences of ``length`` or more tokens are cut to ``length`` and their
    last kept token is overwritten with ``eos``.
    """
    if len(tokens) < length:
        return tokens + [eos] + [pad] * (length - len(tokens) - 1)
    clipped = tokens[:length]
    clipped[-1] = eos
    return clipped


# Tokenize with Mecab and pad/truncate every input sentence ...
for x in X:
    p_X.append(_pad_tokens(tagger.morphs(x), LENGTH))

# ... and both target sentences (previous, next) of every pair.
for prev_sent, next_sent in y:
    p_y.append([
        _pad_tokens(tagger.morphs(prev_sent), LENGTH),
        _pad_tokens(tagger.morphs(next_sent), LENGTH),
    ])

### Vocab index dic 준비 

In [16]:
# Split the [previous, next] target pairs into two parallel tuples.
y1,y2 = zip(*p_y)

In [17]:
# Every token occurrence (inputs and both targets, padding included) in one flat list.
all_vocab = flatten(p_X) + flatten(y1) + flatten(y2)

In [18]:
# Token -> id table. Ids 0-3 are reserved for the special tokens; every other
# token receives the next free id in order of first appearance.
word2index = {'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3}
for token in all_vocab:
    # setdefault leaves existing entries untouched; len() is evaluated before
    # the insertion, so a new token gets the current table size as its id.
    word2index.setdefault(token, len(word2index))

# Reverse table: id -> token.
index2word = dict((idx, tok) for tok, idx in word2index.items())

In [19]:
len(word2index)

13069

In [20]:
# Pair each padded input sentence with its [previous, next] padded targets.
train_data = list(zip(p_X,p_y))

In [21]:
# Convert every (input, [previous, next]) example into a triple of id tensors,
# each reshaped to a single row (1 x sequence length) so that rows can later
# be concatenated into a batch.
inputs = []
for center_tokens, (prev_tokens, next_tokens) in train_data:
    inputs.append(tuple(
        prepare_sequence(tokens, word2index).view(1, -1)
        for tokens in (center_tokens, prev_tokens, next_tokens)
    ))

In [22]:
len(train_data)

15299

In [23]:
def getBatch(batch_size, train_data, drop_last=True):
    """Shuffle ``train_data`` in place and yield consecutive mini-batches.

    Args:
        batch_size: number of examples per batch; must be positive.
        train_data: list of examples. It is shuffled IN PLACE on every call.
        drop_last: when true (default) a trailing batch with fewer than
            ``batch_size`` examples is skipped, so every yielded batch has
            exactly ``batch_size`` items. Pass ``False`` to also receive the
            short remainder.

    Yields:
        Slices of ``train_data`` of length ``batch_size``.

    Raises:
        ValueError: if ``batch_size`` is not positive (raised when iteration
            starts).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    random.shuffle(train_data)
    # range() also reaches the final full batch when len(train_data) is an
    # exact multiple of batch_size.
    for sindex in range(0, len(train_data), batch_size):
        batch = train_data[sindex:sindex + batch_size]
        if drop_last and len(batch) < batch_size:
            return
        yield batch

### 모델 선언 (Bi-Skip 모델)

In [24]:
class BiSkipEncoder(nn.Module):
    """Bidirectional LSTM sentence encoder.

    Args:
        input_size: vocabulary size (number of rows of the embedding table).
        embedding_size: dimension of the word embeddings.
        hidden_size: LSTM hidden size per direction; the encoder outputs have
            ``2 * hidden_size`` features.
        n_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size,embedding_size, hidden_size,n_layers=1):
        super(BiSkipEncoder, self).__init__()

        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, batch_first=True,bidirectional=True)

    def init_weights(self):
        """Re-initialise the embedding table uniformly in [-0.1, 0.1]."""
        self.embedding.weight.data.uniform_(-0.1, 0.1)
        # TODO: custom (e.g. orthogonal) initialisation of the LSTM weights.

    def init_hidden(self,input):
        """Return zero-filled initial ``(hidden, cell)`` states for ``input``.

        Zeros (rather than random noise) keep the encoding of a sentence
        deterministic: the same input always produces the same vector.
        The states are created on the GPU when ``input`` lives there.

        Args:
            input: B,T batch of token ids; only its batch size and device
                are used.

        Returns:
            Tuple of two ``(n_layers * 2, B, hidden_size)`` Variables.
        """
        shape = (self.n_layers * 2, input.size(0), self.hidden_size)
        hidden = Variable(torch.zeros(*shape))
        context = Variable(torch.zeros(*shape))
        if input.is_cuda:
            hidden = hidden.cuda()
            context = context.cuda()
        return (hidden, context)

    def make_translation_matrix(self,global_embedding_size):
        """Create the linear map from an external embedding space.

        Builds ``self.translation_weight``, a linear layer from
        ``global_embedding_size`` to ``embedding_size`` with zero bias and
        weights drawn uniformly from [-0.1, 0.1].
        """
        self.global_embedding_size = global_embedding_size
        self.translation_weight = nn.Linear(self.global_embedding_size,self.embedding_size)
        self.translation_weight.bias.data.fill_(0)
        self.translation_weight.weight.data.uniform_(-0.1, 0.1)

    def translation_matrix(self,word_vecs):
        """Project external word vectors into this encoder's embedding space.

        Requires ``make_translation_matrix`` to have been called first.

        Args:
            word_vecs: BxD (global_embedding_size) FloatTensor.

        Returns:
            BxD (embedding_size) tensor.
        """
        v_prime = self.translation_weight(word_vecs)

        return v_prime # BxD (embedding_size)

    def forward(self, input,input_masking):
        """Encode a batch of padded sentences.

        Args:
            input: B,T LongTensor of token ids.
            input_masking: B,T mask that is non-zero at <PAD> positions and
                zero at real tokens.

        Returns:
            Tuple ``(output, context)``: ``output`` is the B,T,2H sequence of
            LSTM outputs and ``context`` is B,1,2H, holding for every
            sentence the LSTM output at its last non-<PAD> position.
        """
        self.hidden = self.init_hidden(input)

        embedded = self.embedding(input)
        output, self.hidden = self.lstm(embedded, self.hidden)

        real_context=[]

        for i,o in enumerate(output): # o: T,2H
            # Number of zero (non-<PAD>) mask entries == real sentence length.
            # A row made only of <PAD> gives 0, so index -1 (the last time
            # step) is used for it.
            real_length = input_masking[i].data.tolist().count(0)
            real_context.append(o[real_length-1])

        return output, torch.stack(real_context).unsqueeze(1)

In [25]:
class BiSkipDecoder(nn.Module):
    """Greedy LSTM decoder conditioned on a fixed sentence vector.

    At every step the current token embedding is concatenated with the
    encoder context and fed to a unidirectional LSTM; the most probable token
    becomes the next input.

    Args:
        output_size: vocabulary size.
        embedding_size: dimension of the word embeddings.
        hidden_size: LSTM hidden size; must equal the feature size of the
            encoder context that is concatenated to the embeddings.
        max_len: number of decoding steps.
        n_layers: number of stacked LSTM layers.
        dropout_p: stored on the module; no dropout layer is applied.
    """

    def __init__(self,output_size,embedding_size,hidden_size,max_len=60,n_layers=1,dropout_p=0.1):
        super(BiSkipDecoder, self).__init__()

        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        self.max_len=max_len
        self.embedding_size = embedding_size

        # Define the layers
        # TODO: share the embedding with the encoder and keep it frozen.
        self.embedding = nn.Embedding(self.output_size, self.embedding_size)

        # LSTM input = token embedding concatenated with the encoder context.
        self.lstm = nn.LSTM(self.embedding_size+self.hidden_size, self.hidden_size, self.n_layers, batch_first=True)

        self.out = nn.Linear(self.hidden_size, self.output_size)

    def init_weights(self):
        """Uniform [-0.1, 0.1] init for embedding/output weights, zero output bias."""
        self.embedding.weight.data.uniform_(-0.1, 0.1)
        self.out.bias.data.fill_(0)
        self.out.weight.data.uniform_(-0.1, 0.1)
        # TODO: custom (e.g. orthogonal) initialisation of the LSTM weights.

    def init_hidden(self,input):
        """Return zero-filled initial ``(hidden, cell)`` states for ``input``.

        The LSTM is unidirectional, so the leading dimension is ``n_layers``
        on every device. The states are created on the GPU when ``input``
        lives there. Zeros keep decoding deterministic.

        Args:
            input: B,L batch of token ids; only its batch size and device
                are used.

        Returns:
            Tuple of two ``(n_layers, B, hidden_size)`` Variables.
        """
        shape = (self.n_layers, input.size(0), self.hidden_size)
        hidden = Variable(torch.zeros(*shape))
        context = Variable(torch.zeros(*shape))
        if input.is_cuda:
            hidden = hidden.cuda()
            context = context.cuda()
        return (hidden, context)

    def forward(self, input,enc_context,training=True):
        """Greedily decode ``max_len`` tokens.

        Args:
            input : B,1 LongTensor holding the start token of every sequence.
            enc_context : B,1,D sentence vector from the encoder
                (D == hidden_size).
            training: accepted for interface compatibility; not used.

        Returns:
            (B * max_len, output_size) log-probabilities, ordered batch-major
            (all steps of sequence 0, then all steps of sequence 1, ...).
        """
        batch_size = input.size(0)
        # Get the embedding of the current input word
        embedded = self.embedding(input)
        hidden = self.init_hidden(input)

        decode=[]
        for _ in range(self.max_len):
            _, hidden = self.lstm(torch.cat((embedded,enc_context),2), hidden)
            # hidden[0] is (n_layers, B, H); take the top layer's state.
            score = self.out(hidden[0][-1])
            # score is 2-D, so the implicit softmax axis is the vocabulary
            # axis (dim 1); pass dim=1 explicitly once PyTorch >= 0.3 is required.
            softmaxed = F.log_softmax(score)
            decode.append(softmaxed)
            # Greedy choice: the arg-max token is the next input.
            _, next_token = torch.max(softmaxed,1)
            embedded = self.embedding(next_token.view(batch_size, 1))

        # Log-probabilities of all time steps, concatenated per sequence.
        scores = torch.cat(decode,1)
        del decode

        return scores.view(batch_size*self.max_len,-1)

In [26]:
LEARNING_RATE=0.001  # learning rate passed to both Adam optimizers
EMBEDDING_SIZE=300   # word-embedding dimension of encoder and decoder
HIDDEN_SIZE=600      # encoder LSTM hidden size per direction (the decoder is built with HIDDEN_SIZE*2)
BATCH_SIZE=16        # number of examples per mini-batch
LENGTH=60            # fixed sentence length in tokens; also the decoder's max_len
STEP_SIZE=1          # number of passes over the training data

In [27]:
# Build the encoder/decoder pair over the shared vocabulary.
vocab_size = len(word2index)
encoder = BiSkipEncoder(vocab_size, EMBEDDING_SIZE, HIDDEN_SIZE)
# The decoder is twice as wide because the encoder is bidirectional and emits
# the forward and backward states concatenated.
decoder = BiSkipDecoder(vocab_size, EMBEDDING_SIZE, HIDDEN_SIZE * 2, LENGTH)

if USE_CUDA:
    encoder, decoder = encoder.cuda(), decoder.cuda()

for module in (encoder, decoder):
    module.init_weights()

# Targets equal to 0 (the <PAD> id) do not contribute to the loss.
loss_function = nn.CrossEntropyLoss(ignore_index=0)
enc_optim = optim.Adam(encoder.parameters(), lr=LEARNING_RATE)
dec_optim = optim.Adam(decoder.parameters(), lr=LEARNING_RATE)

In [28]:
# Training loop: STEP_SIZE passes over the data; within a pass, `i` counts
# mini-batches and the mean loss since the last report is printed every
# 100 batches.
for step in range(STEP_SIZE):
    losses=[]
    for i, batch in enumerate(getBatch(BATCH_SIZE,inputs)):
        x,y_1,y_2 = zip(*batch)
        x = torch.cat(x)      # B,T input sentences
        y_1 = torch.cat(y_1)  # B,T previous sentences
        y_2 = torch.cat(y_2)  # B,T next sentences
        batch_size = x.size(0)

        # B,T mask that is non-zero exactly at <PAD> positions, computed in
        # one tensor op on the same device as x.
        x_mask = x.eq(word2index['<PAD>'])

        encoder.zero_grad()
        decoder.zero_grad()

        output, hidden_c = encoder(x,x_mask)

        # B,1 column of <SOS> ids: the first decoder input of every sequence.
        sos_input = Variable(torch.LongTensor([[word2index['<SOS>']]] * batch_size))
        if USE_CUDA:
            sos_input = sos_input.cuda()
        y_1_input = sos_input
        y_2_input = sos_input

        # NOTE(review): both targets are decoded by the same decoder from the
        # same start token and context; the skip-thought paper uses separate
        # decoders for the previous and next sentence -- confirm intent.
        y_1_score = decoder(y_1_input,hidden_c)
        y_2_score = decoder(y_2_input,hidden_c)

        loss_1 = loss_function(y_1_score,y_1.view(-1))
        loss_2 = loss_function(y_2_score,y_2.view(-1))

        loss = loss_1+loss_2
        # reshape(-1)[0] extracts the scalar whether the loss tensor has one
        # element (older PyTorch) or zero dimensions (newer PyTorch).
        losses.append(loss.data.cpu().numpy().reshape(-1)[0])
        loss.backward()

        # Clip the gradient norm of each model to 10 before the update.
        torch.nn.utils.clip_grad_norm(encoder.parameters(), 10.0)
        torch.nn.utils.clip_grad_norm(decoder.parameters(), 10.0)

        enc_optim.step()
        dec_optim.step()

        if i % 100==0:
            print("Step",step," epoch",i," : ",np.mean(losses))
            losses=[]

Step 0  epoch 0  :  18.968
Step 0  epoch 10  :  15.6805
Step 0  epoch 20  :  12.9234
Step 0  epoch 30  :  12.6665
Step 0  epoch 40  :  12.6242
Step 0  epoch 50  :  12.5704
Step 0  epoch 60  :  12.4588
Step 0  epoch 70  :  12.2012
Step 0  epoch 80  :  12.0843
Step 0  epoch 90  :  11.9483
Step 0  epoch 100  :  12.0611
Step 0  epoch 110  :  11.8709
Step 0  epoch 120  :  11.6767
Step 0  epoch 130  :  11.2557
Step 0  epoch 140  :  11.5428
Step 0  epoch 150  :  11.1637
Step 0  epoch 160  :  11.3054
Step 0  epoch 170  :  10.7758
Step 0  epoch 180  :  10.9701
Step 0  epoch 190  :  11.1216
Step 0  epoch 200  :  11.5359
Step 0  epoch 210  :  11.1288
Step 0  epoch 220  :  10.7807
Step 0  epoch 230  :  10.2768
Step 0  epoch 240  :  10.4039
Step 0  epoch 250  :  10.2145
Step 0  epoch 260  :  10.1327
Step 0  epoch 270  :  9.57228
Step 0  epoch 280  :  10.0183
Step 0  epoch 290  :  10.0192
Step 0  epoch 300  :  9.66686
Step 0  epoch 310  :  9.35285
Step 0  epoch 320  :  9.18062
Step 0  epoch 330  :  

# TODO

1. LSTM cell 직접 customize
2. LSTM orthogonal init
3. CrossEntropy(ignore_index=0) 적용 # 이건 버전업하면 될듯
4. Vocabulary Expansion with GloVe or Word2Vec
5. 실제 결과 검증 (How to?! => 영어 데이터로)
6. Gutenberg corpus로 학습시키고 SentEval해보기