In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
from konlpy.tag import Kkma
import torch.utils.data as torchdata
from torchtext.data import Field
tagger = Kkma()
flatten = lambda l: [item for sublist in l for item in sublist]

In [2]:
print(torch.__version__)
print(nltk.__version__)

0.3.1
3.2.4


In [3]:
USE_CUDA = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

### Util 함수

In [4]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<unk>"], seq))
    return LongTensor(idxs)

def prepare_word(word, word2index):
    return LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<unk>"]])

## 1. 데이터 로드 

In [5]:
corpus = open('data/corpus.txt','r',encoding="utf-8").readlines()
corpus = [c[:-1] for c in corpus]

In [6]:
corpus[:10]

['음 저는 구글이 저 시스템을 지원할거라 믿습니다. 이번 eager처럼 파이토치에서 좋아보이는건 유저 안뺏기려 사람 갈아 넣는중이니..',
 '시스템 보니까 좋은 건 다 갈아넣었던데 좀 써봐야겠고... 기술된 것만 봐서는 추후 모델 프로토타이핑은 그냥 파이토치로 넘어가야하지 않나 싶을 정도.',
 '진짜 올해 안에 파이토치로 넘어가야하나? 넘어가도 될 유도가 하나씩 늘어나고 있음',
 '파이토치 한번도 공부 안 해봤는데 마이그레이션 하느라 하도 많이 봤더니 이제 나도 짤 수 있을 것 같음',
 '하아.. 파이토치 쓰다가 텐서플로 쓰니 넘나 피곤한 것.. -.-;; 확실히 파이토치가 편하게는 만들었구만. 그치만 프로덕션을 생각한다면 확실히 아직은 텐서플로가 압도적으로 우세한 것 같다.',
 '텐서플로 1.5에 파이토치모드라 불리는(ㅋㅋ) eager 모드의 프리뷰 버전이 들어간다고합니다. 그래프구성+연산을 나눠서하지 않고 합쳐서 하는건데요. 이게 정식으로 들어가면 연구하시는 분들은 텐서플로를 좀 더 편하게 쓰실 수 있을 것 같네요.',
 '인공지능(AI) 유통상품 인식 기술로 소매점 결제 무인화 앞당긴다  딥러닝과 특징점 매칭 기술로 농산물과 같이 개체마다 모양의 차이가 큰 자연물이나 비정형 상품에서부터 공산품까지 다양한 상품을 동시에 정확하게 인식',
 '구글 딥러닝 쌩씨발새끼',
 '근데 딥러닝 뭐시기 비디오게임도 나오지 않을까  보스가 너의 패턴을 학습한다',
 '브랜돈 포스팅에 따봉 1개월 이상 없는 분들은 페북 딥러닝 IoT 5G 블록체인 ICO 알고리즘으로 인해 자동 페삭됩니다.']

## 2. Tokenize 

In [7]:
tokenized = [tagger.morphs(c) for c in corpus]

In [8]:
tokenized[0]

['음',
 '저',
 '는',
 '구',
 '글',
 '이',
 '저',
 '시스템',
 '을',
 '지원',
 '하',
 'ㄹ',
 '거',
 '이',
 '라',
 '믿',
 '습니다',
 '.',
 '이번',
 'eager',
 '처럼',
 '파이',
 '토치',
 '에서',
 '좋',
 '아',
 '보이',
 '는',
 '것',
 '은',
 '유저',
 '안',
 '뺏기',
 '려',
 '사람',
 '갈',
 '아',
 '넣',
 '는',
 '중',
 '이',
 '니',
 '..']

## 3. 빈도수가 적은 단어 제외

stopwords 지정

In [9]:
from collections import Counter

In [10]:
word_count = Counter(flatten(tokenized))

In [11]:
list(reversed(word_count.most_common()))[:10]

[('!!!!!!', 1),
 ('대학', 1),
 ('응용', 1),
 ('깔', 1),
 ('대면', 1),
 ('T', 1),
 ('툴셋', 1),
 ('파시', 1),
 ('도움', 1),
 ('것들', 1)]

In [12]:
MIN_COUNT = 2 # 최소 2번 이상 등장한 단어만 사용
stopwords = []

In [13]:
for w, c in word_count.items():
    if c < MIN_COUNT:
        stopwords.append(w)

In [14]:
len(stopwords)

604

## 3. 단어셋 구축

In [15]:
vocab = list(set(flatten(tokenized)) - set(stopwords))

In [16]:
word2index = {'<unk>' : 0}
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
        
index2word = {v:k for k, v in word2index.items()}

In [17]:
len(word2index)

345

## 4. 학습 데이터 준비 

In [18]:
WINDOW_SIZE = 5
windows =  flatten([list(nltk.ngrams(['<DUMMY>'] * WINDOW_SIZE + c + ['<DUMMY>'] * WINDOW_SIZE, WINDOW_SIZE * 2 + 1)) for c in tokenized])

train_data = []

for window in windows:
    for i in range(WINDOW_SIZE * 2 + 1):
        # stopwords에 속하는 단어는 제외
        if window[i] in stopwords or window[WINDOW_SIZE] in stopwords: 
            continue # min_count
        if i == WINDOW_SIZE or window[i] == '<DUMMY>': 
            continue
        train_data.append((window[WINDOW_SIZE], window[i]))

X_p = []
y_p = []

for tr in train_data:
    X_p.append(prepare_word(tr[0], word2index))
    y_p.append(prepare_word(tr[1], word2index))
    
train_data = list(zip(X_p, y_p))

In [19]:
len(train_data)

20794

In [20]:
train_data[0] # (center word, context word) pairs

(
  184
 [torch.LongTensor of size 1], 
  43
 [torch.LongTensor of size 1])

각 페어를 토치 데이터셋 클래스로 래핑 후, 데이터로더 로딩

In [21]:
class WordPair(torchdata.Dataset):
    def __init__(self,dataset):
        self.dataset = dataset
        self.length = len(self.dataset)
        
    def __getitem__(self, index):
        # 인덱스에 해당하는 데이터셋 리턴
        return self.dataset[index]
        
    def __len__(self):
        # 데이터셋 수
        return self.length

In [22]:
train_data = WordPair(train_data)

In [23]:
train_loader = torchdata.DataLoader(dataset=train_data,
                                           batch_size=256, 
                                           shuffle=True)

### Build Unigram Distribution**0.75 

$$P(w)=U(w)^{3/4}/Z$$

In [24]:
Z = 0.001

In [25]:
word_count = Counter(flatten(tokenized)) # unigram distribution
num_total_words = sum([c for w, c in word_count.items() if w not in stopwords])

In [26]:
unigram_table = []

for vo in vocab:
    unigram_table.extend([vo] * int(((word_count[vo]/num_total_words)**0.75)/Z))

In [27]:
print(len(vocab), len(unigram_table))

344 3572


### Negative Sampling 

P(w)에서 K개만큼의 Negative samples을 뽑아주는 함수

In [28]:
def negative_sampling(targets, unigram_table, k):
    batch_size = targets.size(0)
    neg_samples = []
    for i in range(batch_size):
        nsample = []
        target_index = targets[i].tolist()[0]
        while len(nsample) < k: # num of sampling
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).view(1, -1))
    
    return torch.cat(neg_samples)

## Modeling 

In [29]:
class SkipgramNegSampling(nn.Module):
    
    def __init__(self, vocab_size, projection_dim):
        super(SkipgramNegSampling, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, projection_dim) # out embedding
        self.logsigmoid = nn.LogSigmoid()
                
        # xavier init
        self.embedding_v.weight.data = nn.init.xavier_uniform(self.embedding_v.weight.data)
        self.embedding_u.weight.data = nn.init.xavier_uniform(self.embedding_u.weight.data)
        
    def forward(self, center_words, target_words, negative_words):
        center_embeds = self.embedding_v(center_words) # B x 1 x D
        target_embeds = self.embedding_u(target_words) # B x 1 x D
        
        neg_embeds = -self.embedding_u(negative_words) # B x K x D
        
        positive_score = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2) # Bx1
        negative_score = torch.sum(neg_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2), 1).view(negs.size(0), -1) # BxK -> Bx1
        
        # loss function
        loss = self.logsigmoid(positive_score) + self.logsigmoid(negative_score)
        
        return -torch.mean(loss)
    
    def get_embedding(self, inputs):
        embeds_v = self.embedding_v(inputs)
        embeds_u = self.embedding_u(inputs)
        
        return (embeds_v+embeds_u)/2

## Train 

In [30]:
EMBEDDING_SIZE = 30 
BATCH_SIZE = 256
EPOCH = 100
NEG = 10 # Num of Negative Sampling

In [31]:
losses = []
model = SkipgramNegSampling(len(word2index), EMBEDDING_SIZE)
if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [32]:
for epoch in range(EPOCH):
    for i,(inputs,targets) in enumerate(train_loader):
        
        # P(w)로부터 Negative Sample NEG개만큼 뽑기
        negs = negative_sampling(targets, unigram_table, NEG)
        inputs = Variable(inputs) # B x 1
        targets = Variable(targets) # B x 1
        negs = Variable(negs) # B x K
        
        model.zero_grad()

        loss = model(inputs, targets, negs)
        
        loss.backward()
        optimizer.step()
    
        losses.append(loss.data.tolist()[0])
    if epoch % 10 == 0:
        print("Epoch : %d, mean_loss : %.02f" % (epoch, np.mean(losses)))
        losses = []

Epoch : 0, mean_loss : 1.32
Epoch : 10, mean_loss : 0.89
Epoch : 20, mean_loss : 0.79
Epoch : 30, mean_loss : 0.71
Epoch : 40, mean_loss : 0.65
Epoch : 50, mean_loss : 0.61
Epoch : 60, mean_loss : 0.58
Epoch : 70, mean_loss : 0.56
Epoch : 80, mean_loss : 0.54
Epoch : 90, mean_loss : 0.53


## Test 

In [33]:
# Cosine Similarity

def word_similarity(target,index2word,num=10):
    target_V = model.get_embedding(Variable(prepare_word(target, word2index))).view(1,-1)
    matrix = (model.embedding_u.weight.data + model.embedding_v.weight.data)/2
    cosine_sim = F.cosine_similarity(target_V.data, matrix,dim=1,eps=1e-6)
    v,i = cosine_sim.topk(num+1)
    
    return [[index2word[ii],vv] for ii,vv in zip(i.tolist()[1:],v.tolist()[1:])]

In [34]:
word_similarity("러닝",index2word)

[['딥', 0.9535467624664307],
 ['머신', 0.7704424262046814],
 ['뇌', 0.6196351051330566],
 ['씨', 0.6168031692504883],
 ['신경망', 0.5668169856071472],
 ['수업', 0.5373671650886536],
 ['게임', 0.49644383788108826],
 ['데', 0.4827418625354767],
 ['뭐', 0.4823964834213257],
 ['나요', 0.47976601123809814]]

## Visualize 

In [54]:
from tensorboardX import SummaryWriter
import pickle
import os
import shutil

In [61]:
# 텐서보드 포트 설정
port = pickle.load(open("port.info","rb"))[os.getcwd().split("/")[-2]]

# 텐서보드 데이터 파일 초기화
try:
    shutil.rmtree('runs/')
except:
    pass

In [62]:
writer = SummaryWriter(comment='-embedding')

In [63]:
matrix = (model.embedding_u.weight.data + model.embedding_v.weight.data)/2
label = [index2word[i] for i in range(len(index2word))]

In [64]:
matrix.size()

torch.Size([345, 30])

In [65]:
writer.add_embedding(matrix,metadata=label)
writer.close()

In [67]:
help(writer.add_embedding)

Help on method add_embedding in module tensorboardX.writer:

add_embedding(mat, metadata=None, label_img=None, global_step=None, tag='default') method of tensorboardX.writer.SummaryWriter instance
    Add embedding projector data to summary.
    
    Args:
        mat (torch.Tensor): A matrix which each row is the feature vector of the data point
        metadata (list): A list of labels, each element will be convert to string
        label_img (torch.Tensor): Images correspond to each data point
        global_step (int): Global step value to record
        tag (string): Name for the embedding
    Shape:
        mat: :math:`(N, D)`, where N is number of data and D is feature dimension
    
        label_img: :math:`(N, C, H, W)`
    
    Examples::
    
        import keyword
        import torch
        meta = []
        while len(meta)<100:
            meta = meta+keyword.kwlist # get some strings
        meta = meta[:100]
    
        for i, v in enumerate(meta):
            meta[i

In [66]:
!tensorboard --logdir runs --port 6006

TensorBoard 0.4.0rc3 at http://dsksd-tf:6006 (Press CTRL+C to quit)
^C


In [None]:
writer.add_embedding()

In [None]:
dataset = vdatasets.MNIST('../../data/MNIST/', train=False, download=True)
images = dataset.test_data[:100].float()
label = dataset.test_labels[:100]

features = images.view(100, 784)
writer.add_embedding(features, metadata=label, label_img=images.unsqueeze(1))