# Lec2. Word Vector Representations: word2vec 

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture2.pdf
* https://arxiv.org/abs/1301.3781

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np

In [2]:
print(torch.__version__)
print(nltk.__version__)

0.2.0+1449c2c
3.2.4


## Load corpus : Gutenburg corpus

If you don't have gutenberg corpus, you can download it first using nltk.download()

In [3]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [101]:
corpus = list(nltk.corpus.gutenberg.sents('shakespeare-hamlet.txt'))[:1000] # 300 문장
corpus = [[word.lower() for word in sent] for sent in corpus]

### Extract Stopwords

불용어를 지정하는 방법은 여러가지가 있지만, 여기선 휴리스틱하게 빈도 분포 꼬리에서 0.01씩을 사용

In [102]:
from collections import Counter
flatten = lambda l: [item for sublist in l for item in sublist]

In [103]:
word_count = Counter(flatten(corpus))
border =int(len(word_count)*0.01) 

In [104]:
stopwords = word_count.most_common()[:border]+list(reversed(word_count.most_common()))[:border]

In [105]:
stopwords = [s[0] for s in stopwords]

In [106]:
stopwords

[',',
 '.',
 'and',
 'the',
 'to',
 'of',
 "'",
 ':',
 'my',
 'i',
 'you',
 'it',
 'a',
 ';',
 'in',
 '?',
 'that',
 'not',
 'is',
 'his',
 'ham',
 'requite',
 'emulate',
 'perturbed',
 'hyes',
 'breach',
 'action',
 'hill',
 'enemy',
 'turn',
 'denote',
 'forgeries',
 'coldly',
 'wisely',
 'euents',
 'fingers',
 'liquid',
 'pledge',
 'fretfull',
 'vsurp',
 'easterne',
 'neerer']

### Build vocab

In [107]:
vocab = list(set(flatten(corpus))-set(stopwords))
vocab.append('<UNK>')

In [108]:
print(len(set(flatten(corpus))),len(vocab))

2178 2137


In [109]:
word2index = {'<UNK>' : 0}

for vo in vocab:
    if vo not in word2index.keys():
        word2index[vo]=len(word2index)

index2word = {v:k for k,v in word2index.items()} 

# 1. Skip-gram with naiive softmax 

### Prepare train data 

먼저 양 옆의 몇 개까지의 단어를 예측할 것인지, WINDOW_SIZE를 정한다

In [111]:
WINDOW_SIZE = 5
windows =  flatten([list(nltk.ngrams(['<DUMMY>']*WINDOW_SIZE+c+['<DUMMY>']*WINDOW_SIZE,WINDOW_SIZE*2+1)) for c in corpus])

In [112]:
windows[0]

('<DUMMY>',
 '<DUMMY>',
 '<DUMMY>',
 '<DUMMY>',
 '<DUMMY>',
 '[',
 'the',
 'tragedie',
 'of',
 'hamlet',
 'by')

In [114]:
train_data = []

for window in windows:
    for i in range(WINDOW_SIZE*2+1):
        if i==WINDOW_SIZE or window[i]=='<DUMMY>': continue
        train_data.append((window[WINDOW_SIZE],window[i]))

print(train_data[:WINDOW_SIZE*2])

[('[', 'the'), ('[', 'tragedie'), ('[', 'of'), ('[', 'hamlet'), ('[', 'by'), ('the', '['), ('the', 'tragedie'), ('the', 'of'), ('the', 'hamlet'), ('the', 'by')]


In [115]:
len(train_data)

97194

In [116]:
def getBatch(batch_size,train_data):
    random.shuffle(train_data)
    sindex=0
    eindex=batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex+batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

### Function for word -> tensor

만약 vocab에 없는 단어가 등장한다면 UNKNOWN 태그로 대체한다

In [117]:
def make_vocab_vector(context, word2index):
    idxs = list(map(lambda w: word2index[w] if w in word2index.keys() else word2index["<UNK>"], context))
    tensor = torch.LongTensor(idxs)
    return Variable(tensor)

def make_input_vector(input,word2index):
    tensor = torch.LongTensor([word2index[input]]) if input in word2index.keys() else torch.LongTensor([word2index["<UNK>"]])
    return Variable(tensor)

make_input_vector(train_data[0][1], word2index)

Variable containing:
 0
[torch.LongTensor of size 1]

In [118]:
X_p=[]
y_p=[]

In [123]:
train_data[0]

('[', 'the')

In [124]:
for tr in train_data:
    temp = make_input_vector(tr[0],word2index)
    temp = temp.view(1,-1)
    X_p.append(temp)
    
    temp = make_input_vector(tr[1],word2index)
    temp = temp.view(1,-1)
    y_p.append(temp)

In [125]:
train_data = list(zip(X_p,y_p))

In [126]:
len(train_data)

97194

### Modeling

In [127]:
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size,projection_dim):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)

        self.embedding_v.weight.data.uniform_(-0.1, 0.1) # init
        self.embedding_u.weight.data.uniform_(-0.0, 0.0) # init
        
    def forward(self, inputs,vocabs):
        center_embeds = self.embedding_v(inputs) # B x 1 x D
        outside_embeds = self.embedding_u(vocabs) # B x V x D
        
        #embeds.expand_as(all_embeds)
        
        scores = center_embeds.bmm(outside_embeds.transpose(1,2)) # Bx1xD * BxDxV => Bx1xV
        scores = F.log_softmax(scores)
        return scores.squeeze(1)

    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

### Train  

In [128]:
PROJECTION = 30 # Embedding size
BATCH_SIZE = 32
STEP_SIZE = 100

In [129]:
losses = []
loss_function = nn.NLLLoss(ignore_index=0)
model = Skipgram(len(word2index),PROJECTION)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [130]:
for step in range(STEP_SIZE):
    for i,batch in enumerate(getBatch(BATCH_SIZE,train_data)):
        
        inputs, targets = zip(*batch)
        
        inputs = torch.cat(inputs) # B x 1
        targets = torch.cat(targets) # B x 1
        vocabs = make_vocab_vector(list(vocab),word2index).view(1,-1).expand(inputs.size(0),len(vocab))  # B x V
        model.zero_grad()

        pred = model(inputs,vocabs)
        loss = loss_function(pred,targets.view(-1))
        
        
        loss.backward()
        optimizer.step()
    
        losses.append(loss.data.numpy()[0])
    if step % 10==0:
        print("Step : %d, mean_loss : %.02f" % (step,np.mean(losses)))
        losses=[]

Step : 0, mean_loss : 3.46
Step : 10, mean_loss : 2.88
Step : 20, mean_loss : 2.42
Step : 30, mean_loss : 2.32
Step : 40, mean_loss : 2.28
Step : 50, mean_loss : 2.25
Step : 60, mean_loss : 2.23
Step : 70, mean_loss : 2.22
Step : 80, mean_loss : 2.22
Step : 90, mean_loss : 2.21


### Test

In [131]:
from scipy.spatial.distance import euclidean, cosine

In [132]:
def word_similarity(target,vocab):
    target_V = model.prediction(make_input_vector(target,word2index)).data.numpy()

    similarities=[]
    for i in range(len(vocab)):
        if vocab[i] == target: continue
        
        vector = model.prediction(make_input_vector(list(vocab)[i],word2index)).data.numpy()
        
        #similarity.append([vocabs[i],np.dot(target_V,vector.T)[0][0]])
        similarities.append([vocab[i],cosine(target_V,vector)-1])
     
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]

In [171]:
test = random.choice(list(vocab))
test

'silence'

In [172]:
word_similarity(test,vocab)

[['so', 0.48434789194976635],
 ['he', 0.4832677321707064],
 ['heauen', 0.48213056295586987],
 ['out', 0.46229306132042414],
 ['we', 0.45845277426262432],
 ['for', 0.45829357946252558],
 ['there', 0.45660592796940325],
 ['hamlet', 0.45594074903624482],
 ['did', 0.45450833820945125],
 ['good', 0.43294835521489139]]

## TODO

* embedding_u로부터 나온 BxVxD가 Parameter처럼 사용되어야 하는걸까?
* test 방법 자체를 좀 검증할 필요가 있음..

In [44]:
import gensim

Using TensorFlow backend.


In [49]:
model2 = gensim.models.Word2Vec(corpus,window=3,size=30,min_count=1)

In [73]:
model2.most_similar(test)

[('goodly', 0.5525621175765991),
 ('heartily', 0.5233322381973267),
 ('o', 0.4946478605270386),
 ('weary', 0.480996310710907),
 ('mine', 0.48078885674476624),
 ('hyes', 0.4767354130744934),
 ('vp', 0.4576575756072998),
 ('gentlemen', 0.4382712244987488),
 ('fathers', 0.43626904487609863),
 ('abroad', 0.4329777956008911)]

# 2. Skip-gram with negative sampling