In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import Hangulpy as hg
import pickle
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list (one level deep)."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat
from torch.nn.utils.rnn import PackedSequence,pack_padded_sequence

# True when PyTorch reports an available CUDA device; the code below checks this flag
# before moving tensors and the model to the GPU with .cuda().
USE_CUDA = torch.cuda.is_available()

In [2]:
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load these files if they come from a trusted source.
# `with` guarantees the file handles are closed once the objects are loaded.
with open("data/pretrained_word2vec.pkl","rb") as vector_file:
    # Later handed to MimickRNN.init_word_embed, which converts it with torch.from_numpy.
    pretrained_vector = pickle.load(vector_file)
with open("data/vocab.dict","rb") as vocab_file:
    # Mapping word -> integer index (it is inverted into index2word in the next cell).
    word2index = pickle.load(vocab_file)

In [3]:
# Invert the word -> index mapping so that a word can be looked up by its integer id.
index2word = dict((word_id, word) for word, word_id in word2index.items())

In [4]:
# Words ordered by id: vocab[k] is the word whose index is k.
# Requires the ids in index2word to be exactly 0..N-1 (a gap raises KeyError).
vocab = [index2word[word_id] for word_id in range(len(index2word))]

In [5]:
len(vocab)  # number of words in the id-ordered vocabulary list built above

148392

In [13]:
class MimickRNN(nn.Module):
    """Character-level BiLSTM trained to reproduce (mimic) existing word vectors.

    A word is turned into a sequence of character ids (Hangul characters are
    first split with ``hg.decompose``), embedded, and fed through a
    bidirectional LSTM. The final hidden states of both directions are
    concatenated and projected by a two-layer MLP to a ``D``-dimensional
    vector. ``train_mimick`` minimises the MSE between that vector and the
    word's row in ``word_embed``.
    """

    def __init__(self,vocab,D,char_embed,char_hidden,mlp_hidden):
        """Build the word-embedding table, the character vocabulary and the networks.

        Args:
            vocab: list of words; position ``i`` corresponds to row ``i`` of
                ``word_embed`` (this is how ``train_mimick`` pairs them).
            D: word-embedding dimension, also the output size of the MLP.
            char_embed: character-embedding dimension.
            char_hidden: hidden size of each LSTM direction.
            mlp_hidden: hidden size of the projection MLP.
        """
        super(MimickRNN,self).__init__()

        self.word_embed = nn.Embedding(len(vocab),D)
        self.vocab = vocab

        # Character inventory: two special tokens, Hangul jamo (consonants,
        # vowels, consonant clusters), digits, ASCII letters and punctuation.
        # Every symbol is listed exactly once; the original list was missing a
        # comma after "}" (which fused it with '-' into the bogus entry "}-").
        char_vocab = (
            ['<pad>','<other>']
            + list('ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ')
            + list('ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ')
            + list('ㄳㄵㄶㄺㄻㄼㄽㄾㄿㅀㅄ')
            + list('0123456789')
            + list('abcdefghijklmnopqrstuvwxyz')
            + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
            + list("{}-()!~?[],./<>#@$%^&*_+=:;'\"")
        )

        self.char_hidden = char_hidden
        # Assign ids in first-seen order. Using the running dict size (instead
        # of the list position) keeps ids contiguous even if a symbol were ever
        # listed twice, so no id can exceed the embedding table size.
        self.char2index = {}
        for ch in char_vocab:
            self.char2index.setdefault(ch, len(self.char2index))
        self.char_embed = nn.Embedding(len(self.char2index), char_embed)
        self.mimick_rnn = nn.LSTM(char_embed,char_hidden,1,batch_first=True,bidirectional=True)
        # Input is 2*char_hidden because forward and backward states are concatenated.
        self.mimick_linear = nn.Sequential(nn.Linear(char_hidden*2,mlp_hidden),
                                           nn.Tanh(),
                                           nn.Linear(mlp_hidden,D))

    def init_word_embed(self,pretrained_vectors):
        """Load pretrained word vectors into ``word_embed`` and freeze them.

        Args:
            pretrained_vectors: numpy array; it is converted with
                ``torch.from_numpy`` and cast to float.
        """
        weights = torch.from_numpy(pretrained_vectors).float()
        # requires_grad must be set on the Parameter itself; setting it on the
        # nn.Embedding module (as before) is a no-op and left the targets trainable.
        self.word_embed.weight = nn.Parameter(weights, requires_grad=False)

    def init_char_hidden(self,size):
        """Return zero-initialised ``(hidden, context)`` LSTM states for ``size`` sequences.

        Each state has shape ``(2, size, char_hidden)``: one slice per direction
        of the single-layer bidirectional LSTM.
        """
        hidden = Variable(torch.zeros(2,size,self.char_hidden))
        context = Variable(torch.zeros(2,size,self.char_hidden))
        if USE_CUDA:
            hidden = hidden.cuda()
            context = context.cuda()  # was `hidden.cuda()`, which aliased the two states
        return hidden, context

    def prepare_single_char(self,token):
        """Convert one word into a 1-D LongTensor of character ids.

        Characters for which ``hg.is_hangul`` is true are split with
        ``hg.decompose``; every resulting symbol (and every other character) is
        looked up in ``char2index``, falling back to ``'<other>'`` when unknown.
        An empty token yields a single ``'<other>'`` id so that its length is
        never zero (``pack_padded_sequence`` rejects zero-length sequences).
        """
        other = self.char2index['<other>']
        idxs = []
        for s in token:
            if hg.is_hangul(s):
                # Decompose into phoneme-level symbols (jamo).
                try:
                    emso = list(hg.decompose(s))
                    # A trailing '' is dropped (presumably "no final consonant" — confirm against Hangulpy).
                    if emso[-1]=='':
                        emso.pop()
                except Exception:
                    # Decomposition failed: use the character itself.
                    # `except Exception` (not a bare except) lets Ctrl+C reach train_mimick.
                    emso = [s]
                idxs.extend(self.char2index.get(w, other) for w in emso)
            else:
                # Look the character up directly. The previous code replaced every
                # alphabetic character with '<alpha>', which is not in the
                # vocabulary, so all letters silently collapsed to '<other>'.
                idxs.append(self.char2index.get(s, other))
        if not idxs:
            idxs.append(other)
        return Variable(torch.LongTensor(idxs))

    def prepare_char(self,seq,index=None):
        """Encode, sort (longest first) and pad a batch of words.

        Args:
            seq: iterable of words.
            index: optional list of word ids aligned with ``seq``; when given
                (and non-empty) it is reordered together with the words. If the
                two differ in length, the pair list is truncated to the shorter.

        Returns:
            ``(padded, lengths)`` or ``(padded, lengths, index_tensor)`` where
            ``padded`` is a ``(batch, max_len)`` LongTensor padded with the
            ``'<pad>'`` id and ``lengths`` are the unpadded lengths in
            descending order, as ``pack_padded_sequence`` expects.

        Raises:
            ValueError: if there is nothing to encode.
        """
        with_index = bool(index)  # None or empty list -> no id tracking
        encoded = [self.prepare_single_char(token) for token in seq]
        if with_index:
            pairs = sorted(zip(encoded,index), key=lambda pair: pair[0].size(0), reverse=True)
            encoded = [pair[0] for pair in pairs]
            index = [pair[1] for pair in pairs]
        else:
            encoded = sorted(encoded, key=lambda s: s.size(0), reverse=True)
        if not encoded:
            raise ValueError("prepare_char() needs at least one token")

        length = [s.size(0) for s in encoded]
        max_length = length[0]  # sorted in descending order, so the first is the longest
        pad_id = self.char2index['<pad>']
        rows = []
        for s in encoded:
            padding = Variable(torch.LongTensor([pad_id]*(max_length-s.size(0))))
            rows.append(torch.cat([s,padding]).view(1,-1))
        padded = torch.cat(rows)
        if with_index:
            return padded, length, Variable(torch.LongTensor(index))
        return padded, length

    def train_mimick(self,step,batch_size=32,lr=0.0001):
        """Train the character model to reproduce ``word_embed`` for every vocabulary word.

        Args:
            step: number of full passes over ``self.vocab``.
            batch_size: words per optimisation step.
            lr: Adam learning rate.

        A ``KeyboardInterrupt`` stops training early instead of propagating.
        """
        print("start training mimic-rnn with %d batch_size" % batch_size)
        # Only parameters that still require gradients are optimised, so a
        # frozen word_embed (see init_word_embed) is left untouched.
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.parameters()),lr=lr)
        # Start offset of every batch; unlike `len // batch_size + 1` this never
        # produces an empty trailing batch when the sizes divide evenly.
        offsets = list(range(0, len(self.vocab), batch_size))
        for step_index in range(step):
            try:
                for i, offset in enumerate(offsets):
                    voca = self.vocab[offset:offset+batch_size]
                    # The ids match the words actually present (the last batch may be short).
                    index = list(range(offset, offset+len(voca)))

                    inputs, lengths, index = self.prepare_char(voca,index)
                    if USE_CUDA:
                        inputs = inputs.cuda()
                        index = index.cuda()
                    self.zero_grad()
                    outputs = self.mimick(inputs,lengths)
                    # Targets are constants: never backpropagate into the word vectors.
                    targets = self.word_embed(index).detach()
                    loss = F.mse_loss(outputs,targets)
                    loss.backward()
                    optimizer.step()
                    if i % 100==0:
                        # .item() replaces loss.data[0], which fails on 0-dim tensors in current PyTorch.
                        print("[%d/%d] [%d/%d] mean_loss : %.7f" % (step_index,step,i,len(offsets),loss.item()))
            except KeyboardInterrupt:
                print("Early Stop!")
                break

    def mimick(self,inputs,lengths):
        """Predict word vectors from padded character-id batches.

        Args:
            inputs: ``(batch, max_len)`` LongTensor of character ids, rows
                sorted by decreasing length (as produced by ``prepare_char``).
            lengths: unpadded length of every row.

        Returns:
            ``(batch, D)`` tensor of predicted word vectors.
        """
        init_state = self.init_char_hidden(inputs.size(0))
        embedded = self.char_embed(inputs)
        packed = pack_padded_sequence(embedded,lengths,batch_first=True)
        # Only the final hidden state is needed; per-step outputs are discarded.
        _, (hidden, _) = self.mimick_rnn(packed, init_state)
        # hidden is (2, batch, char_hidden): join both directions -> (batch, 2*char_hidden).
        hidden = torch.cat([h for h in hidden], 1)
        return self.mimick_linear(hidden)
        

In [14]:
# Sizes are passed by keyword so each number is self-describing.
model = MimickRNN(
    vocab,
    D=300,            # word-vector dimension
    char_embed=50,    # character-embedding dimension
    char_hidden=100,  # hidden units per LSTM direction
    mlp_hidden=200,   # hidden units of the projection MLP
)
# Install the pretrained vectors as the regression targets.
model.init_word_embed(pretrained_vector)
if USE_CUDA:
    model.cuda()

In [11]:
# Arguments for the model.train_mimick(STEP, BATCH_SIZE) call in the next cell.
STEP = 100  # passed as `step`: number of full passes over model.vocab
BATCH_SIZE = 128  # passed as `batch_size`: words per optimisation step

In [12]:
# Run the training loop; interrupting the kernel stops it early.
model.train_mimick(step=STEP, batch_size=BATCH_SIZE)

start training mimic-rnn with 128 batch_size


RuntimeError: Expected hidden[0] size (2, 128, 100), got (2, 128, 50)

In [19]:
# Peek at the first 32 vocabulary entries.
model.vocab[:32]

['하',
 '이',
 '있',
 '년',
 '한',
 '들',
 '일',
 '되',
 '월',
 '적',
 '것',
 '분류',
 '수',
 '주',
 '인',
 '했',
 '그',
 '해',
 '않',
 '한다',
 '없',
 '할',
 '사용',
 '된',
 '보',
 '합니다',
 '말',
 '등',
 '때',
 '문서',
 '파일',
 '때문']