In [72]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import Hangulpy as hg
import datetime
import pickle
flatten = lambda l: [item for sublist in l for item in sublist]
from torch.nn.utils.rnn import PackedSequence,pack_padded_sequence

USE_CUDA = torch.cuda.is_available()

In [2]:
torch.__version__

'0.3.1'

In [3]:
pretrained_vector = pickle.load(open("data/pretrained_word2vec.pkl","rb"))
word2index = pickle.load(open("data/vocab_word2vec_300.dict","rb"))

In [4]:
index2word = {v:k for k,v in word2index.items()}

In [5]:
vocab = []
for i in range(len(index2word)):
    vocab.append(index2word[i])

In [6]:
len(vocab)

148392

In [8]:
class MimickRNN(nn.Module):
    
    def __init__(self,vocab,word_embed,char_embed,char_hidden,mlp_hidden):
        super(MimickRNN,self).__init__()
        
        self.word_embed = nn.Embedding(len(vocab),word_embed)
        self.vocab = vocab
        
        char_vocab = ['<pad>','<other>','ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 
              'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 
              'ㅎ', 'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 
              'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ',
              'ㄳ', 'ㄵ', 'ㄶ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ',
              'ㄾ', 'ㄿ', 'ㅀ', 'ㅄ',
              '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f','g','h','i','j','k',
              'l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','A','B','C','D','E','F','G',
              'H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',"{","}"
               '-','(',')','!','~','?','[',']',',','.','/','<','>','#','@','$','%','^','&','*','_',
               '+','-','=',':',';',"'",'"']
        
        self.char_hidden = char_hidden
        self.char2index = {v:i for i,v in enumerate(char_vocab)}
        self.char_embed = nn.Embedding(len(self.char2index), char_embed)
        self.mimick_rnn = nn.LSTM(char_embed,char_hidden,1,batch_first=True,bidirectional=True)
        self.mimick_linear = nn.Sequential(nn.Linear(char_hidden*2,mlp_hidden),
                                                           nn.Tanh(),
                                                           nn.Linear(mlp_hidden,word_embed))
        
    def init_word_embed(self,pretrained_vectors):
        self.word_embed.weight = nn.Parameter(torch.from_numpy(pretrained_vectors).float())
        self.word_embed.requires_grad = False # 고정
    
    def init_char_hidden(self,size):
        hidden = Variable(torch.zeros(2,size,self.char_hidden))
        context = Variable(torch.zeros(2,size,self.char_hidden))
        if USE_CUDA:
            hidden = hidden.cuda()
            context = hidden.cuda()
        return hidden, context
    
    def prepare_single_char(self,token):
        idxs=[]
        for s in token:
            if hg.is_hangul(s):
                # 음소 단위 분해
                try:
                    emso = list(hg.decompose(s))
                    if emso[-1]=='':
                        emso.pop()
                except:
                    emso = s
                idxs.extend(list(map(lambda w: self.char2index[w], emso)))
            else:
                candit=s
                if s.isalpha():
                    candit='<alpha>'
                try:
                    idxs.append(self.char2index[candit])
                except:
                    idxs.append(self.char2index['<other>']) # '' 가 OTHER같이
        tensor = torch.LongTensor(idxs)
        tensor = Variable(tensor)
        return tensor
    
    def prepare_char(self,seq,index=None):
        seq = list(map(lambda v: self.prepare_single_char(v), seq))
        if index:
            forsort = list(zip(seq,index))
            forsort = sorted(forsort,key = lambda s: s[0].size(0),reverse=True)
            seq,index = list(zip(*forsort))
            seq,index = list(seq),list(index)
        else:
            seq = sorted(seq,key = lambda s: s.size(0),reverse=True)
        length = [s.size(0) for s in seq]
        max_length = max(length)
        seq = [torch.cat([s,Variable(torch.LongTensor([self.char2index['<pad>']]*(max_length-s.size(0))))]).view(1,-1) for s in seq]
        seq = torch.cat(seq)
        if index:
            return seq, length, Variable(torch.LongTensor(index))
        else:
            return seq, length
        
    def train_mimick(self,step,batch_size=32,lr=0.0001):
        print("start training mimic-rnn with %d batch_size" % batch_size)
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.parameters()),lr=lr)
        for step_index in range(step):
            try:
                offset = 0
                iter_index = list(range(len(self.vocab)//batch_size + 1))
                for i in iter_index:
                    voca = self.vocab[offset:offset+batch_size]
                    index = list(range(offset,offset+batch_size))
                    offset+=batch_size
                    
                    inputs, lengths, index = self.prepare_char(voca,index)
                    if USE_CUDA:
                        inputs = inputs.cuda()
                        index = index.cuda()
                    self.zero_grad()
                    outputs = self.mimick(inputs,lengths)
                    targets = self.word_embed(index)
                    loss = F.mse_loss(outputs,targets)
                    loss.backward()
                    optimizer.step()
                    if i % 100==0:
                        print("[%d/%d] [%d/%d] mean_loss : %.7f" % (step_index,step,i,len(iter_index),loss.data[0]))
            except KeyboardInterrupt:
                print("Early Stop!")
                break
            
    def mimick(self,inputs,lengths):
        hidden = self.init_char_hidden(inputs.size(0))
        embedded = self.char_embed(inputs)
        packed = pack_padded_sequence(embedded,lengths,batch_first=True)
        outputs, (hidden,context) = self.mimick_rnn(packed, hidden)
        outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
        hidden = torch.cat([h for h in hidden], 1) # concat
        return self.mimick_linear(hidden)
        

In [16]:
model = MimickRNN(vocab,300,50,100,400) # vocab, word_embed, char_embed, char_hidden, mlp_hidden
model.init_word_embed(pretrained_vector)
if USE_CUDA:
    model = model.cuda()

In [17]:
STEP = 50
BATCH_SIZE = 256

In [43]:
model.train_mimick(STEP,BATCH_SIZE)

start training mimic-rnn with 256 batch_size
[0/50] [0/580] mean_loss : 0.0085805
[0/50] [100/580] mean_loss : 0.0117477
[0/50] [200/580] mean_loss : 0.0139335
[0/50] [300/580] mean_loss : 0.0108910
[0/50] [400/580] mean_loss : 0.0072766
[0/50] [500/580] mean_loss : 0.0033978
[1/50] [0/580] mean_loss : 0.0085238
[1/50] [100/580] mean_loss : 0.0116398
[1/50] [200/580] mean_loss : 0.0137777
[1/50] [300/580] mean_loss : 0.0107359
[1/50] [400/580] mean_loss : 0.0071467
[1/50] [500/580] mean_loss : 0.0033152
[2/50] [0/580] mean_loss : 0.0084109
[2/50] [100/580] mean_loss : 0.0114618
[2/50] [200/580] mean_loss : 0.0135673
[2/50] [300/580] mean_loss : 0.0105518
[2/50] [400/580] mean_loss : 0.0070053
[2/50] [500/580] mean_loss : 0.0032310
[3/50] [0/580] mean_loss : 0.0082807
[3/50] [100/580] mean_loss : 0.0112699
[3/50] [200/580] mean_loss : 0.0133459
[3/50] [300/580] mean_loss : 0.0103625
[3/50] [400/580] mean_loss : 0.0068629
[3/50] [500/580] mean_loss : 0.0031478
[4/50] [0/580] mean_loss : 

### Test Rare word

https://ko.wikipedia.org/wiki/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD%EC%9D%98_%EC%9D%B8%ED%84%B0%EB%84%B7_%EC%8B%A0%EC%A1%B0%EC%96%B4_%EB%AA%A9%EB%A1%9D

In [66]:
rare_words = ["가즈아","고고씽","갠톡","급식충","꿀잼","존잼","존잘","낫닝겐","덕통사고","먹튀","보이루","빝코","띵작","냥이","키드밀리",
                    "빈스빈스","ㅂㅇㄹ","아이오아이","동동키드","안뇽","옼돜","띵언","블랙핑크","프사기","선우정아"]

In [67]:
for word in rare_words:
    try:
        model.vocab.index(word)
    except:
        print("%s is not in voca" % word)

가즈아 is not in voca
갠톡 is not in voca
급식충 is not in voca
꿀잼 is not in voca
존잼 is not in voca
존잘 is not in voca
낫닝겐 is not in voca
덕통사고 is not in voca
먹튀 is not in voca
보이루 is not in voca
빝코 is not in voca
띵작 is not in voca
냥이 is not in voca
키드밀리 is not in voca
빈스빈스 is not in voca
ㅂㅇㄹ is not in voca
아이오아이 is not in voca
동동키드 is not in voca
안뇽 is not in voca
옼돜 is not in voca
띵언 is not in voca
블랙핑크 is not in voca
프사기 is not in voca
선우정아 is not in voca


In [68]:
def get_most_word_embedding(word,num=5):
    matrix = model.word_embed.weight
    inputs,lengths = model.prepare_char([word])
    if USE_CUDA: inputs = inputs.cuda()
    embedding = model.mimick(inputs,lengths)
    similarities = matrix.matmul(embedding.transpose(0,1))
    similarities = similarities.transpose(0,1)
    norm = matrix.norm(dim=1)*embedding.norm()
    similarities = similarities/norm
    
    _ , i = similarities.topk(num)
    index = i.data.tolist()[0]
    similar_words = [model.vocab[i] for i in index]
    print(word)
    print(similar_words)

In [69]:
for word in rare_words:
    get_most_word_embedding(word)
    print("\n")

가즈아
['가와나', '야바시라', '후타라', '고즈카', '기타우라']


고고씽
['밥투정', '압슬형', '훈시규정', '가오핑', '언필칭']


갠톡
['해칭', '거폭', '버섯바위', '뎃전', '저팬타운']


급식충
['간흡충', '십이지장충', '톡소포자충', '활축', '부식기']


꿀잼
['꼬리별', '가래미', '감천만', '큰입구몸', '비늘돔']


존잼
['전소호', '손인형', '이무성', '임두성', '조한구']


존잘
['손절', '약반', '곁말', '세변', '중탈']


낫닝겐
['메허샬레하쉬바즈', '뱌쿠엔', '오도노반', '렝가만', '하이위안']


덕통사고
['중간보고', '남사고', '동문휘고', '부사원', '동정남']


먹튀
['밭머리', '모래그릇', '새앙머리', '도들', '킬킬거리']


보이루
['캰도루', '다보아', '모리우', '소우리', '탐모라']


빝코
['펨코', '필리펜코', '올메카', '헵토미노', '줄라이카']


띵작
['띵호', '와작와작', '매타작', '시작', '팔건장']


냥이
['샹이', '뇽이', '멩이', '팔랑이', '쨩이']


키드밀리
['게이틀리', '케어필리', '킹즐리', '트링코말리', '몰로카이']


빈스빈스
['블레빈스', '라데팡스', '스타인펠드', '게이틀리', '부캐넌']


ㅂㅇㄹ
['흑', '죽', '걱', '잡', 'ㅇㅅㅁ']


아이오아이
['만타가오리', '아이조메', '아노아이', '아메노모리', '안가라노']


동동키드
['월포드', '도거뱅크', '민톤', '아칸서스', '위태로운']


안뇽
['안롱', '우망', '후란', '나궁', '세롱']


옼돜
['낰낰', '폿푸루', '가로놓이', '바께쓰', '시갓']


띵언
['띵호', '떠본', '나무저', '필은', '펍에']


블랙핑크
['메이플스', '스펀지케이크', '플링크', '크랜필드', '스푹스']


프사기
['엑타', '프린트기', '시데리

In [82]:
cdate = datetime.datetime.strftime(datetime.datetime.now(),"%m_%d")
config = {'model_path' : '/models/mimick_params_'+cdate+'.model','vocab_path' : '/models/mimick_'+cdate+'.vocab', 'word_embed' : 300, 'char_embed' : 50, 'char_hidden' : 100, 'mlp_hidden' : 400}


if USE_CUDA:
    model = model.cpu()
    
torch.save(model.state_dict(), 'models/mimick_params_'+cdate+'.model')
pickle.dump(config,open('models/mimick_'+cdate+'.config',"wb"))
pickle.dump(vocab,open('models/mimick_'+cdate+'.vocab',"wb"))