# 2. Skip-gram with negative sampling

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture3.pdf
* http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
flatten = lambda l: [item for sublist in l for item in sublist]

In [2]:
print(torch.__version__)
print(nltk.__version__)

0.2.0+3a8feb7
3.2.2


In [3]:
USE_CUDA = torch.cuda.is_available()

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [4]:
def getBatch(batch_size,train_data):
    random.shuffle(train_data)
    sindex=0
    eindex=batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex+batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [5]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if w in word2index.keys() else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

def prepare_word(word,word2index):
    return Variable(LongTensor([word2index[word]]) if word in word2index.keys() else LongTensor([word2index["<UNK>"]]))

## Load corpus : Gutenburg corpus

If you don't have gutenberg corpus, you can download it first using nltk.download()

In [6]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [43]:
corpus = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:500]
corpus = [[word.lower() for word in sent] for sent in corpus]

### Exclude sparse words 

성능향상?을 위해 min_count 도입..

In [44]:
word_count = Counter(flatten(corpus))

In [55]:
MIN_COUNT=3
exclude=[]

In [56]:
for w,c in word_count.items():
    if c<MIN_COUNT:
        exclude.append(w)

### Prepare train data 

In [57]:
vocab = list(set(flatten(corpus))-set(exclude))

In [58]:
word2index={}
for vo in vocab:
    if vo not in word2index.keys():
        word2index[vo]=len(word2index)
        
index2word={v:k for k,v in word2index.items()}

In [59]:
WINDOW_SIZE = 5
windows =  flatten([list(nltk.ngrams(['<DUMMY>']*WINDOW_SIZE+c+['<DUMMY>']*WINDOW_SIZE,WINDOW_SIZE*2+1)) for c in corpus])

train_data = []

for window in windows:
    for i in range(WINDOW_SIZE*2+1):
        if window[i] in exclude or window[WINDOW_SIZE] in exclude: continue # min_count
        if i==WINDOW_SIZE or window[i]=='<DUMMY>': continue
        train_data.append((window[WINDOW_SIZE],window[i]))

X_p=[]
y_p=[]

for tr in train_data:
    X_p.append(prepare_word(tr[0],word2index).view(1,-1))
    y_p.append(prepare_word(tr[1],word2index).view(1,-1))
    
train_data = list(zip(X_p,y_p))

In [60]:
len(train_data)

50242

### Build Unigram Distribution**0.75 

In [61]:
Z = 0.001

In [62]:
word_count = Counter(flatten(corpus))
num_total_words = sum([c for w,c in word_count.items() if w not in exclude])

In [63]:
unigram_table=[]

for vo in vocab:
    unigram_table.extend([vo]*int(((word_count[vo]/num_total_words)**0.75)/Z))

In [64]:
print(len(vocab),len(unigram_table))

478 3500


### Negative Sampling 

In [65]:
def negative_sampling(targets,unigram_table,k):
    batch_size = targets.size(0)
    neg_samples=[]
    for i in range(batch_size):
        nsample=[]
        target_index = targets[i].data.cpu().tolist()[0] if USE_CUDA else targets[i].data.tolist()[0]
        while len(nsample)<k: # num of sampling
            neg = random.choice(unigram_table)
            if word2index[neg]==target_index:continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample,word2index).view(1,-1))
    
    return torch.cat(neg_samples)

### Modeling 

In [66]:
class SkipgramNegSampling(nn.Module):
    
    def __init__(self, vocab_size,projection_dim):
        super(SkipgramNegSampling,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, projection_dim) # out embedding
        self.logsigmoid = nn.LogSigmoid()
                
        initrange = (2.0 / (vocab_size+projection_dim))**0.5 # Xavier init
        self.embedding_v.weight.data.uniform_(-initrange, initrange) # init
        self.embedding_u.weight.data.uniform_(-0.0, 0.0) # init
        
    def forward(self, center_words,target_words,negative_words):
        center_embeds = self.embedding_v(center_words) # B x 1 x D
        target_embeds = self.embedding_u(target_words) # B x 1 x D
        
        neg_embeds = -self.embedding_u(negative_words) # B x K x D
        
        positive_score = target_embeds.bmm(center_embeds.transpose(1,2)).squeeze(2) # Bx1
        negative_score = torch.sum(neg_embeds.bmm(center_embeds.transpose(1,2)).squeeze(2),1).view(negs.size(0),-1) # BxK -> Bx1
        
        loss = self.logsigmoid(positive_score) + self.logsigmoid(negative_score)
        
        return -torch.mean(loss)
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

### Train 

In [67]:
PROJECTION = 30 # Embedding size
BATCH_SIZE = 128
STEP_SIZE = 50
NEG=10 # Num of Negative Sampling

In [68]:
losses = []
model = SkipgramNegSampling(len(word2index),PROJECTION)
if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [69]:
for step in range(STEP_SIZE):
    for i,batch in enumerate(getBatch(BATCH_SIZE,train_data)):
        
        inputs, targets = zip(*batch)
        
        inputs = torch.cat(inputs) # B x 1
        targets = torch.cat(targets) # B x 1
        negs = negative_sampling(targets,unigram_table,NEG)
        model.zero_grad()

        loss = model(inputs,targets,negs)
        
        loss.backward()
        optimizer.step()
    
        losses.append(loss.data.cpu().numpy()[0] if USE_CUDA else loss.data.numpy()[0])
    if step % 10==0:
        print("Step : %d, mean_loss : %.02f" % (step,np.mean(losses)))
        losses=[]

Step : 0, mean_loss : 0.99
Step : 10, mean_loss : 0.84
Step : 20, mean_loss : 0.76
Step : 30, mean_loss : 0.71
Step : 40, mean_loss : 0.68


### Test 

In [70]:
def word_similarity(target,vocab):
    if USE_CUDA:
        target_V = model.prediction(prepare_word(target,word2index))
    else:
        target_V = model.prediction(prepare_word(target,word2index))
    similarities=[]
    for i in range(len(vocab)):
        if vocab[i] == target: continue
        
        if USE_CUDA:
            vector = model.prediction(prepare_word(list(vocab)[i],word2index))
        else:
            vector = model.prediction(prepare_word(list(vocab)[i],word2index))
        
        cosine_sim = F.cosine_similarity(target_V,vector).data.cpu().tolist()[0] if USE_CUDA \
        else F.cosine_similarity(target_V,vector).data.tolist()[0] 
        similarities.append([vocab[i],cosine_sim])
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]

In [71]:
test = random.choice(list(vocab))
test

'them'

In [72]:
word_similarity(test,vocab)

[['around', 0.6804569959640503],
 ['...', 0.6669948101043701],
 ['their', 0.6606265902519226],
 ['hands', 0.6202448010444641],
 ['jaws', 0.5952643752098083],
 ['stand', 0.5869953632354736],
 ['run', 0.5584756135940552],
 ['they', 0.5546565651893616],
 ['many', 0.5495254397392273],
 ['those', 0.5292192697525024]]