In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import Hangulpy as hg
import pickle
flatten = lambda l: [item for sublist in l for item in sublist]
from torch.nn.utils.rnn import PackedSequence,pack_padded_sequence

USE_CUDA = torch.cuda.is_available()

In [2]:
pretrained_vector = pickle.load(open("data/glove_50.pkl","rb"))
word2index = pickle.load(open("data/vocab_glove_50.dict","rb"))

In [3]:
index2word = {v:k for k,v in word2index.items()}

In [4]:
vocab = []
for i in range(len(index2word)):
    vocab.append(index2word[i])

In [5]:
len(vocab)

442026

In [6]:
class MimickRNN(nn.Module):
    
    def __init__(self,vocab,word_embed,char_embed,char_hidden,mlp_hidden):
        super(MimickRNN,self).__init__()
        
        self.word_embed = nn.Embedding(len(vocab),word_embed)
        self.vocab = vocab
        
        char_vocab = ['<pad>','<other>','ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 
              'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 
              'ㅎ', 'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 
              'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ',
              'ㄳ', 'ㄵ', 'ㄶ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ',
              'ㄾ', 'ㄿ', 'ㅀ', 'ㅄ',
              '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f','g','h','i','j','k',
              'l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','A','B','C','D','E','F','G',
              'H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',"{","}"
               '-','(',')','!','~','?','[',']',',','.','/','<','>','#','@','$','%','^','&','*','_',
               '+','-','=',':',';',"'",'"']
        
        self.char_hidden = char_hidden
        self.char2index = {v:i for i,v in enumerate(char_vocab)}
        self.char_embed = nn.Embedding(len(self.char2index), char_embed)
        self.mimick_rnn = nn.LSTM(char_embed,char_hidden,1,batch_first=True,bidirectional=True)
        self.mimick_linear = nn.Sequential(nn.Linear(char_hidden*2,mlp_hidden),
                                                           nn.Tanh(),
                                                           nn.Linear(mlp_hidden,word_embed))
        
    def init_word_embed(self,pretrained_vectors):
        self.word_embed.weight = nn.Parameter(torch.from_numpy(pretrained_vectors).float())
        self.word_embed.requires_grad = False # 고정
    
    def init_char_hidden(self,size):
        hidden = Variable(torch.zeros(2,size,self.char_hidden))
        context = Variable(torch.zeros(2,size,self.char_hidden))
        if USE_CUDA:
            hidden = hidden.cuda()
            context = hidden.cuda()
        return hidden, context
    
    def prepare_single_char(self,token):
        idxs=[]
        for s in token:
            if hg.is_hangul(s):
                # 음소 단위 분해
                try:
                    emso = list(hg.decompose(s))
                    if emso[-1]=='':
                        emso.pop()
                except:
                    emso = s
                idxs.extend(list(map(lambda w: self.char2index[w], emso)))
            else:
                candit=s
                if s.isalpha():
                    candit='<alpha>'
                try:
                    idxs.append(self.char2index[candit])
                except:
                    idxs.append(self.char2index['<other>']) # '' 가 OTHER같이
        tensor = torch.LongTensor(idxs)
        tensor = Variable(tensor)
        return tensor
    
    def prepare_char(self,seq,index=None):
        seq = list(map(lambda v: self.prepare_single_char(v), seq))
        if index:
            forsort = list(zip(seq,index))
            forsort = sorted(forsort,key = lambda s: s[0].size(0),reverse=True)
            seq,index = list(zip(*forsort))
            seq,index = list(seq),list(index)
        else:
            seq = sorted(seq,key = lambda s: s.size(0),reverse=True)
        length = [s.size(0) for s in seq]
        max_length = max(length)
        seq = [torch.cat([s,Variable(torch.LongTensor([self.char2index['<pad>']]*(max_length-s.size(0))))]).view(1,-1) for s in seq]
        seq = torch.cat(seq)
        if index:
            return seq, length, Variable(torch.LongTensor(index))
        else:
            return seq, length
        
    def train_mimick(self,step,batch_size=32,lr=0.0001):
        print("start training mimic-rnn with %d batch_size" % batch_size)
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.parameters()),lr=lr)
        for step_index in range(step):
            try:
                offset = 0
                iter_index = list(range(len(self.vocab)//batch_size + 1))
                for i in iter_index:
                    voca = self.vocab[offset:offset+batch_size]
                    index = list(range(offset,offset+batch_size))
                    offset+=batch_size
                    
                    inputs, lengths, index = self.prepare_char(voca,index)
                    if USE_CUDA:
                        inputs = inputs.cuda()
                        index = index.cuda()
                    self.zero_grad()
                    outputs = self.mimick(inputs,lengths)
                    targets = self.word_embed(index)
                    loss = F.mse_loss(outputs,targets)
                    loss.backward()
                    optimizer.step()
                    if i % 100==0:
                        print("[%d/%d] [%d/%d] mean_loss : %.7f" % (step_index,step,i,len(iter_index),loss.data[0]))
            except KeyboardInterrupt:
                print("Early Stop!")
                break
            
    def mimick(self,inputs,lengths):
        hidden = self.init_char_hidden(inputs.size(0))
        embedded = self.char_embed(inputs)
        packed = pack_padded_sequence(embedded,lengths,batch_first=True)
        outputs, (hidden,context) = self.mimick_rnn(packed, hidden)
        outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
        hidden = torch.cat([h for h in hidden], 1) # concat
        return self.mimick_linear(hidden)
        

In [35]:
model = MimickRNN(vocab,50,20,50,200) # vocab, word_embed, char_embed, char_hidden, mlp_hidden
model.init_word_embed(pretrained_vector)
if USE_CUDA:
    model = model.cuda()

In [36]:
STEP = 20
BATCH_SIZE = 256

In [38]:
model.train_mimick(STEP,BATCH_SIZE)

start training mimic-rnn with 256 batch_size
[0/20] [0/1727] mean_loss : 0.1575834
[0/20] [100/1727] mean_loss : 0.0663308
[0/20] [200/1727] mean_loss : 0.0603833
[0/20] [300/1727] mean_loss : 0.0354156
[0/20] [400/1727] mean_loss : 0.0559310
[0/20] [500/1727] mean_loss : 0.0354932
[0/20] [600/1727] mean_loss : 0.0389019
[0/20] [700/1727] mean_loss : 0.0283960
[0/20] [800/1727] mean_loss : 0.0220782
[0/20] [900/1727] mean_loss : 0.0194084
[0/20] [1000/1727] mean_loss : 0.0127001
[0/20] [1100/1727] mean_loss : 0.0113896
[0/20] [1200/1727] mean_loss : 0.0073245
[0/20] [1300/1727] mean_loss : 0.0063106
[0/20] [1400/1727] mean_loss : 0.0048851
[0/20] [1500/1727] mean_loss : 0.0027732
[0/20] [1600/1727] mean_loss : 0.0024465
[0/20] [1700/1727] mean_loss : 0.0000853
[1/20] [0/1727] mean_loss : 0.1538887
[1/20] [100/1727] mean_loss : 0.0664462
[1/20] [200/1727] mean_loss : 0.0597725
[1/20] [300/1727] mean_loss : 0.0348659
[1/20] [400/1727] mean_loss : 0.0552871
[1/20] [500/1727] mean_loss : 0

### Test Rare word

https://ko.wikipedia.org/wiki/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD%EC%9D%98_%EC%9D%B8%ED%84%B0%EB%84%B7_%EC%8B%A0%EC%A1%B0%EC%96%B4_%EB%AA%A9%EB%A1%9D

In [31]:
rare_words = ["가즈아","고고씽","갠톡","급식충","꿀잼","존잼","존잘","낫닝겐","덕통사고","먹튀","보이루","빝코","띵작","댕댕이"]

In [32]:
for word in rare_words:
    try:
        model.vocab.index(word)
    except:
        print("%s is not in voca" % word)

가즈아 is not in voca
갠톡 is not in voca
급식충 is not in voca
꿀잼 is not in voca
존잼 is not in voca
존잘 is not in voca
낫닝겐 is not in voca
덕통사고 is not in voca
먹튀 is not in voca
보이루 is not in voca
빝코 is not in voca
띵작 is not in voca


In [33]:
def get_most_word_embedding(word,num=5):
    matrix = model.word_embed.weight
    inputs,lengths = model.prepare_char([word])
    embedding = model.mimick(inputs,lengths)
    similarities = matrix.matmul(embedding.transpose(0,1))
    similarities = similarities.transpose(0,1)
    norm = matrix.norm(dim=1)*embedding.norm()
    similarities = similarities/norm
    
    _ , i = similarities.topk(num)
    index = i.data.tolist()[0]
    similar_words = [model.vocab[i] for i in index]
    print(word)
    print(similar_words)

In [34]:
for word in rare_words:
    get_most_word_embedding(word)
    print("\n")

가즈아
['대한', '특히', '들', '설빙학', '적']


고고씽
['대한', '설빙학', '대해', '특히', '들']


갠톡
['대한', '설빙학', '들', '특히', '이러']


급식충
['대한', '설빙학', '많', '적', '들']


꿀잼
['설빙학', '대한', '들', '이러', '많']


존잼
['대한', '설빙학', '들', '많', '적']


존잘
['설빙학', '대한', '많', '들', '특히']


낫닝겐
['설빙학', '들', '특크하우젠', '대한', '적']


덕통사고
['대한', '설빙학', '문제', '적극', '대해']


먹튀
['설빙학', '특크하우젠', '대한', '적', '들']


보이루
['설빙학', '대한', '문제', '들', '적']


빝코
['설빙학', '특크하우젠', '합네', '대한', '이러']


띵작
['설빙학', '들', '대한', '적', '특히']


댕댕이
['설빙학', '대한', '들', '대해', '많']




In [24]:
get_most_word_embedding("냥냥이")

냥냥이
['뎬바', '막동이', '응우엔차이', '나사돌리개', '동인전람']
