# Lec2. Word Vector Representations: word2vec 

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture2.pdf
* https://arxiv.org/abs/1301.3781

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np

In [2]:
print(torch.__version__)
print(nltk.__version__)

0.2.0+751198f
3.2.4


In [3]:
USE_CUDA = torch.cuda.is_available()

## Load corpus : Gutenburg corpus

If you don't have gutenberg corpus, you can download it first using nltk.download()

In [76]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [158]:
corpus = list(nltk.corpus.gutenberg.sents('austen-sense.txt'))[:100]
corpus = [[word.lower() for word in sent] for sent in corpus]

### Extract Stopwords

불용어를 지정하는 방법은 여러가지가 있지만, 여기선 휴리스틱하게 빈도 분포 꼬리에서 0.01씩을 사용

In [159]:
from collections import Counter
flatten = lambda l: [item for sublist in l for item in sublist]

In [160]:
word_count = Counter(flatten(corpus))
border =int(len(word_count)*0.01) 

In [340]:
stopwords = word_count.most_common()[:border]+list(reversed(word_count.most_common()))[:border]

In [341]:
stopwords = [s[0] for s in stopwords]

In [342]:
stopwords

[',',
 '.',
 'to',
 'of',
 'the',
 'and',
 'his',
 'imprudence',
 'income',
 'source',
 'quiet',
 'into',
 'legacies',
 'two']

### Build vocab

In [343]:
vocab = list(set(flatten(corpus))-set(stopwords))
vocab.append('<UNK>')

In [344]:
print(len(set(flatten(corpus))),len(vocab))

717 704


In [345]:
word2index = {'<UNK>' : 0}

for vo in vocab:
    if vo not in word2index.keys():
        word2index[vo]=len(word2index)

index2word = {v:k for k,v in word2index.items()} 

# 1. Skip-gram with naiive softmax 

### Prepare train data 

먼저 양 옆의 몇 개까지의 단어를 예측할 것인지, WINDOW_SIZE를 정한다

In [346]:
WINDOW_SIZE = 3
windows =  flatten([list(nltk.ngrams(['<DUMMY>']*WINDOW_SIZE+c+['<DUMMY>']*WINDOW_SIZE,WINDOW_SIZE*2+1)) for c in corpus])

In [347]:
windows[0]

('<DUMMY>', '<DUMMY>', '<DUMMY>', '[', 'sense', 'and', 'sensibility')

In [348]:
train_data = []

for window in windows:
    for i in range(WINDOW_SIZE*2+1):
        if i==WINDOW_SIZE or window[i]=='<DUMMY>': continue
        train_data.append((window[WINDOW_SIZE],window[i]))

print(train_data[:WINDOW_SIZE*2])

[('[', 'sense'), ('[', 'and'), ('[', 'sensibility'), ('sense', '['), ('sense', 'and'), ('sense', 'sensibility')]


In [349]:
len(train_data)

16582

In [350]:
def getBatch(batch_size,train_data):
    random.shuffle(train_data)
    sindex=0
    eindex=batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex+batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

### Function for word -> tensor

만약 vocab에 없는 단어가 등장한다면 UNKNOWN 태그로 대체한다

In [351]:
def make_vocab_vector(vocab, word2index):
    idxs = list(map(lambda w: word2index[w] if w in word2index.keys() else word2index["<UNK>"], vocab))
    tensor = Variable(torch.LongTensor(idxs)).cuda() if USE_CUDA else  Variable(torch.LongTensor(idxs))
    return tensor

def make_input_vector(input,word2index):
    tensor = Variable(torch.LongTensor([word2index[input]]) if input in word2index.keys() else torch.LongTensor([word2index["<UNK>"]]))
    if USE_CUDA:
        tensor = tensor.cuda()
    return tensor

make_input_vector(train_data[0][1], word2index)

Variable containing:
 555
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [352]:
X_p=[]
y_p=[]

In [353]:
train_data[0]

('[', 'sense')

In [354]:
for tr in train_data:
    temp = make_input_vector(tr[0],word2index)
    temp = temp.view(1,-1)
    X_p.append(temp)
    
    temp = make_input_vector(tr[1],word2index)
    temp = temp.view(1,-1)
    y_p.append(temp)

In [355]:
train_data = list(zip(X_p,y_p))

In [356]:
len(train_data)

16582

### Modeling

In [357]:
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size,projection_dim):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)

        self.embedding_v.weight.data.uniform_(-0.1, 0.1) # init
        self.embedding_u.weight.data.uniform_(-0.0, 0.0) # init
        #self.out = nn.Linear(projection_dim,vocab_size)
    def forward(self, inputs,vocabs):
        center_embeds = self.embedding_v(inputs) # B x 1 x D
        outside_embeds = self.embedding_v(vocabs) # B x V x D
        scores = center_embeds.bmm(outside_embeds.transpose(1,2)) # Bx1xD * BxDxV => Bx1xV
        #scores = self.out(center_embeds.squeeze(1))
        scores = F.log_softmax(scores)
        return scores.squeeze(1)

    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

### Train  

In [358]:
PROJECTION = 30 # Embedding size
BATCH_SIZE = 128
STEP_SIZE = 100

In [359]:
losses = []
loss_function = nn.NLLLoss(ignore_index=0)
model = Skipgram(len(word2index),PROJECTION)
if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [360]:
for step in range(STEP_SIZE):
    for i,batch in enumerate(getBatch(BATCH_SIZE,train_data)):
        
        inputs, targets = zip(*batch)
        
        inputs = torch.cat(inputs) # B x 1
        targets = torch.cat(targets) # B x 1
        vocabs = make_vocab_vector(list(vocab),word2index).view(1,-1).expand(inputs.size(0),len(vocab))  # B x V
        model.zero_grad()

        pred = model(inputs,vocabs)
        loss = loss_function(pred,targets.view(-1))
        
        
        loss.backward()
        optimizer.step()
    
        losses.append(loss.data.cpu().numpy()[0] if USE_CUDA else loss.data.numpy()[0])
    if step % 10==0:
        print("Step : %d, mean_loss : %.02f" % (step,np.mean(losses)))
        losses=[]

Step : 0, mean_loss : 4.85
Step : 10, mean_loss : 4.71
Step : 20, mean_loss : 4.39
Step : 30, mean_loss : 4.20
Step : 40, mean_loss : 4.09
Step : 50, mean_loss : 4.03
Step : 60, mean_loss : 4.00
Step : 70, mean_loss : 3.97
Step : 80, mean_loss : 3.96
Step : 90, mean_loss : 3.94


### Test

In [361]:
from scipy.spatial.distance import euclidean, cosine

In [362]:
def word_similarity(target,vocab):
    if USE_CUDA:
        target_V = model.prediction(make_input_vector(target,word2index)).data.cpu().numpy()
    else:
        target_V = model.prediction(make_input_vector(target,word2index)).data.numpy()
    similarities=[]
    for i in range(len(vocab)):
        if vocab[i] == target: continue
        
        if USE_CUDA:
            vector = model.prediction(make_input_vector(list(vocab)[i],word2index)).data.cpu().numpy()
        else:
            vector = model.prediction(make_input_vector(list(vocab)[i],word2index)).data.numpy()
        #similarity.append([vocabs[i],np.dot(target_V,vector.T)[0][0]])
        similarities.append([vocab[i],cosine(target_V,vector)-1])
     
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]

In [376]:
test = random.choice(list(vocab))
test

'would'

In [377]:
word_similarity(test,vocab)

[['concern', 0.50944383393801074],
 ['joys', 0.50510225300072475],
 ['other', 0.48002952535757171],
 ['settled', 0.47941933282325921],
 ['mind', 0.45196679130085027],
 ['giving', 0.44291185667856858],
 ['caricature', 0.43939598551992365],
 ['regret', 0.40719281077113112],
 ['margaret', 0.40126967298598726],
 ['sense', 0.38915098056465336]]

# 2. Skip-gram with negative sampling

In [415]:
import numpy as np

In [408]:
import random

In [379]:
vocab = list(set(flatten(corpus)))

In [402]:
word2index={}
for vo in vocab:
    if vo not in word2index.keys():
        word2index[vo]=len(word2index)

In [381]:
word_count = Counter(flatten(corpus))

In [317]:
num_of_total_words = len(flatten(corpus))

In [389]:
num_total_words =sum(list(word_count.values()))

In [427]:
vocab.index(",")

260

In [426]:
word_count[',']

210

In [428]:
unigram_table[260]

137.3617238445929

In [433]:
unigram_table=[]

for vo in vocab:
    unigram_table.extend([vo]*int(((word_count[vo]/num_total_words)**0.75)/0.001))

In [470]:
random.sample(unigram_table,5)

['was', ';', '.', 'married', 'sure']

In [475]:
word_count['sure']

5