# Word Vector Representations: word2vec 

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture2.pdf
* https://arxiv.org/abs/1301.3781

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np

In [2]:
print(torch.__version__)
print(nltk.__version__)

0.2.0+1449c2c
3.2.4


# Skip-gram with naiive softmax 

## Load corpus : Gutenburg corpus

If you don't have gutenberg corpus, you can download it first using nltk.download()

In [3]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [107]:
corpus = list(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))[:5000]

### Extract Stopwords

불용어를 지정하는 방법은 여러가지가 있지만, 여기선 휴리스틱하게 빈도 분포 꼬리에서 0.01씩을 사용

In [108]:
from collections import Counter

In [111]:
word_count = Counter(corpus)
len(word_count)*0.01 # 16개씩

16.67

In [116]:
stopwords = word_count.most_common()[:15]+list(reversed(word_count.most_common()))[:15]

In [117]:
stopwords = [s[0] for s in stopwords]

In [118]:
stopwords

[',',
 'the',
 '.',
 'of',
 '"',
 '--',
 'and',
 '."',
 'a',
 'in',
 'to',
 "'",
 '-',
 'S',
 'is',
 'authors',
 'however',
 'WALLER',
 'petticoat',
 'southern',
 'ENGLISH',
 'shores',
 'Flounders',
 'enrolled',
 'PROFANE',
 'account',
 'attended',
 'demanded',
 'fuzzing',
 'ECKERMANN']

### Build vocab

In [119]:
vocab = list(set(corpus)-set(stopwords))
vocab.append('<UNK>')

In [120]:
print(len(set(corpus)),len(vocab)) # 40 차이

1667 1638


In [121]:
word2index = {'<UNK>' : 0}

for vo in vocab:
    if vo not in word2index.keys():
        word2index[vo]=len(word2index)

index2word = {v:k for k,v in word2index.items()} 

# 1. Skip-gram 

### Prepare train data 

먼저 양 옆의 몇 개까지의 단어를 예측할 것인지, WINDOW_SIZE를 정한다

In [122]:
WINDOW_SIZE = 3
windows =  list(nltk.ngrams(corpus,WINDOW_SIZE*2+1))

In [123]:
windows[0]

('[', 'Moby', 'Dick', 'by', 'Herman', 'Melville', '1851')

In [124]:
train_data = []

for window in windows:
    train_data.append((window[3],window[0]))
    train_data.append((window[3],window[1]))
    train_data.append((window[3],window[2]))
    train_data.append((window[3],window[4]))
    train_data.append((window[3],window[5]))
    train_data.append((window[3],window[6]))

print(train_data[:6])

[('by', '['), ('by', 'Moby'), ('by', 'Dick'), ('by', 'Herman'), ('by', 'Melville'), ('by', '1851')]


In [125]:
def getBatch(batch_size,train_data):
    random.shuffle(train_data)
    sindex=0
    eindex=batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex+batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

### Function for word -> tensor

만약 vocab에 없는 단어가 등장한다면 UNKNOWN 태그로 대체한다

In [126]:
def make_vocab_vector(context, word2index):
    idxs = list(map(lambda w: word2index[w] if w in word2index.keys() else word2index["<UNK>"], context))
    tensor = torch.LongTensor(idxs)
    return Variable(tensor)

def make_input_vector(input,word2index):
    tensor = torch.LongTensor([word2index[input]]) if input in word2index.keys() else torch.LongTensor([word2index["<UNK>"]])
    return Variable(tensor)

make_input_vector(train_data[0][1], word2index)

Variable containing:
 648
[torch.LongTensor of size 1]

In [127]:
X_p=[]
y_p=[]

In [128]:
for tr in train_data:
    temp = make_input_vector(tr[0],word2index)
    temp = temp.view(1,-1)
    X_p.append(temp)
    
    temp = make_input_vector(tr[1],word2index)
    temp = temp.view(1,-1)
    y_p.append(temp)

In [129]:
train_data = list(zip(X_p,y_p))

In [130]:
len(train_data)

29964

### Modeling

In [222]:
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size,projection_dim,window_size=2):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)
        self.window_size = window_size

        self.embedding_v.weight.data.uniform_(-0.1, 0.1) # init
        self.embedding_u.weight.data.uniform_(-0.0, 0.0) # init
    def forward(self, inputs,vocabs):
        embeds = self.embedding_v(inputs) # B x 1 x D
        all_embeds = self.embedding_u(vocabs) # B x V x D
        
        #embeds.expand_as(all_embeds)
        
        scores = embeds.bmm(all_embeds.transpose(1,2)) # Bx1xD * BxDxV => Bx1xV
        scores = F.log_softmax(scores)
        return scores.squeeze(1)

    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

### Train  

In [223]:
PROJECTION = 30 # 30차원으로 임베딩
BATCH_SIZE = 32
STEP_SIZE = 50

In [226]:
losses = []
loss_function = nn.NLLLoss(ignore_index=0)
model = Skipgram(len(word2index),PROJECTION)
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [227]:
for step in range(STEP_SIZE):
    for i,batch in enumerate(getBatch(BATCH_SIZE,train_data)):
        
        inputs, targets = zip(*batch)
        
        inputs = torch.cat(inputs) # B x 1
        targets = torch.cat(targets) # B x 1
        vocabs = make_vocab_vector(list(vocab),word2index).view(1,-1).expand(inputs.size(0),len(vocab))  # B x V`
        model.zero_grad()

        pred = model(inputs,vocabs)
        loss = loss_function(pred,targets.view(-1))
        
        
        loss.backward()
        optimizer.step()
    
        losses.append(loss.data.numpy()[0])
    if step % 10==0:
        print("Step : %d, mean_loss : %.02f" % (step,np.mean(losses)))
        losses=[]

Step : 0, mean_loss : 3.37
Step : 10, mean_loss : 2.05
Step : 20, mean_loss : 1.85
Step : 30, mean_loss : 1.84
Step : 40, mean_loss : 1.83


### Test

In [228]:
from scipy.spatial.distance import euclidean, cosine

In [254]:
def word_analogy(target,vocabs):
    target_idx = word2index[target]
    target_V = model.prediction(make_input_vector(target,word2index)).data.numpy()
    nearest_idx = -1
    minimum = 100
    similarity=[]
    for i in range(len(vocabs)):
        if i == target_idx: continue
        
        vector = model.prediction(make_input_vector(list(vocabs)[i],word2index)).data.numpy()
        
        similarity.append([vocabs[i],cosine(target_V,vector)])
        
        
        
    return sorted(similarity, key=lambda x: x[1], reverse=True)[:10]

In [257]:
test = random.choice(list(vocab))
test

'therein'

In [258]:
word_analogy(test,vocab)

[['snaps', 1.5047067878554012],
 ['at', 1.4935954840537407],
 ['everything', 1.4818283880800927],
 ['jaws', 1.4617364718637229],
 ['rushes', 1.4614953079681523],
 ['Whale', 1.4497811296634937],
 ['expanded', 1.4294372802666919],
 ['stern', 1.4235365625093919],
 ['around', 1.4217932425397053],
 [';', 1.404129894785769]]

## TODO

* embedding_u로부터 나온 BxVxD가 Parameter처럼 사용되어야 하는걸까?
* test 방법 자체를 좀 검증할 필요가 있음..