# 2. Skip-gram with negative sampling

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture3.pdf
* http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf

In [5]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
flatten = lambda l: [item for sublist in l for item in sublist]

In [2]:
print(torch.__version__)
print(nltk.__version__)

0.2.0+1449c2c
3.2.4


In [3]:
USE_CUDA = torch.cuda.is_available()

In [6]:
def getBatch(batch_size,train_data):
    random.shuffle(train_data)
    sindex=0
    eindex=batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex+batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [8]:
def make_vocab_vector(vocab, word2index):
    idxs = list(map(lambda w: word2index[w] if w in word2index.keys() else word2index["<UNK>"], vocab))
    tensor = Variable(torch.LongTensor(idxs)).cuda() if USE_CUDA else  Variable(torch.LongTensor(idxs))
    return tensor

def make_input_vector(input,word2index):
    tensor = Variable(torch.LongTensor([word2index[input]]) if input in word2index.keys() else torch.LongTensor([word2index["<UNK>"]]))
    if USE_CUDA:
        tensor = tensor.cuda()
    return tensor

#make_input_vector(train_data[0][1], word2index)

## Load corpus : Gutenburg corpus

If you don't have gutenberg corpus, you can download it first using nltk.download()

In [4]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [12]:
corpus = list(nltk.corpus.gutenberg.sents('austen-sense.txt'))[:1000]
corpus = [[word.lower() for word in sent] for sent in corpus]

### Stop words 

In [258]:
word_count = Counter(flatten(corpus))
border =int(len(word_count)*0.05) 

In [263]:
stopwords = word_count.most_common()[:border]

### Exclude sparse words 

성능향상?을 위해 min_count 도입..

In [262]:
MIN_COUNT=3
exclude=[]

In [264]:
for w,c in word_count.items():
    if c<MIN_COUNT:
        exclude.append(w)

### Prepare train data 

In [280]:
vocab = list(set(flatten(corpus))-set(exclude))

In [281]:
word2index={}
for vo in vocab:
    if vo not in word2index.keys():
        word2index[vo]=len(word2index)
        
index2word={v:k for k,v in word2index.items()}

In [282]:
WINDOW_SIZE = 5
windows =  flatten([list(nltk.ngrams(['<DUMMY>']*WINDOW_SIZE+c+['<DUMMY>']*WINDOW_SIZE,WINDOW_SIZE*2+1)) for c in corpus])

train_data = []

for window in windows:
    for i in range(WINDOW_SIZE*2+1):
        if window[i] in exclude or window[WINDOW_SIZE] in exclude: continue # min_count
        if i==WINDOW_SIZE or window[i]=='<DUMMY>': continue
        if window[i] in stopwords: continue # stopwords 자체를 트레이닝셋에서 제외
        train_data.append((window[WINDOW_SIZE],window[i]))

X_p=[]
y_p=[]

for tr in train_data:
    temp = make_input_vector(tr[0],word2index)
    temp = temp.view(1,-1)
    X_p.append(temp)
    
    temp = make_input_vector(tr[1],word2index)
    temp = temp.view(1,-1)
    y_p.append(temp)
    
train_data = list(zip(X_p,y_p))

### Build Unigram Distribution**0.75 

In [283]:
Z = 0.001

In [284]:
word_count = Counter(flatten(corpus))
num_total_words = sum([c for w,c in word_count.items() if w not in exclude])
#num_total_words =sum(list(word_count.values()))

In [285]:
unigram_table=[]

for vo in vocab:
    unigram_table.extend([vo]*int(((word_count[vo]/num_total_words)**0.75)/Z))

### Negative Sampling 

In [286]:
def negative_sampling(targets,unigram_table,k):
    batch_size = targets.size(0)
    negs=[]
    for i in range(batch_size):
        temp=[]
        target_index = targets[i].data.cpu().tolist()[0] if USE_CUDA else targets[i].data.tolist()[0]
        while len(temp)<k: # num of sampling
            neg = random.choice(unigram_table)
            if word2index[neg]==target_index:continue
            temp.append(neg)
        negs.append(make_vocab_vector(temp,word2index).view(1,-1))
    
    return torch.cat(negs)

### Modeling 

In [287]:
class SkipgramNegSampling(nn.Module):
    
    def __init__(self, vocab_size,projection_dim):
        super(SkipgramNegSampling,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, projection_dim) # out embedding
        self.logsigmoid = nn.LogSigmoid()
                
        initrange = (2.0 / (vocab_size+projection_dim))**0.5 # Xavier init
        self.embedding_v.weight.data.uniform_(-initrange, initrange) # init
        self.embedding_u.weight.data.uniform_(-0.0, 0.0) # init
        
    def forward(self, inputs,targets,negs):
        center_embeds = self.embedding_v(inputs) # B x 1 x D
        target_embeds = self.embedding_u(targets) # B x 1 x D
        
        neg_embeds = -self.embedding_u(negs) # B x K x D
        
        positive_score = center_embeds.bmm(target_embeds.transpose(1,2)).squeeze(1) # Bx1
        negative_score = torch.sum(center_embeds.bmm(neg_embeds.transpose(1,2)).squeeze(1),1).view(negs.size(0),-1) # BxK -> Bx1
        
        loss = self.logsigmoid(positive_score) + self.logsigmoid(negative_score)
        
        return -torch.mean(loss)
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

### Train 

In [288]:
PROJECTION = 30 # Embedding size
BATCH_SIZE = 128
STEP_SIZE = 50
NEG=10 # Num of Negative Sampling

In [289]:
losses = []
model = SkipgramNegSampling(len(word2index),PROJECTION)
if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [290]:
for step in range(STEP_SIZE):
    for i,batch in enumerate(getBatch(BATCH_SIZE,train_data)):
        
        inputs, targets = zip(*batch)
        
        inputs = torch.cat(inputs) # B x 1
        targets = torch.cat(targets) # B x 1
        negs = negative_sampling(targets,unigram_table,NEG)
        model.zero_grad()

        loss = model(inputs,targets,negs)
        
        loss.backward()
        optimizer.step()
    
        losses.append(loss.data.cpu().numpy()[0] if USE_CUDA else loss.data.numpy()[0])
    if step % 10==0:
        print("Step : %d, mean_loss : %.02f" % (step,np.mean(losses)))
        losses=[]

Step : 0, mean_loss : 0.92
Step : 10, mean_loss : 0.81
Step : 20, mean_loss : 0.74
Step : 30, mean_loss : 0.71
Step : 40, mean_loss : 0.70


### Test 

In [1]:
from scipy.spatial.distance import euclidean, cosine

In [2]:
def word_similarity(target,vocab):
    if USE_CUDA:
        target_V = model.prediction(make_input_vector(target,word2index)).data.cpu().numpy()
    else:
        target_V = model.prediction(make_input_vector(target,word2index)).data.numpy()
    similarities=[]
    for i in range(len(vocab)):
        if vocab[i] == target: continue
        
        if USE_CUDA:
            vector = model.prediction(make_input_vector(list(vocab)[i],word2index)).data.cpu().numpy()
        else:
            vector = model.prediction(make_input_vector(list(vocab)[i],word2index)).data.numpy()
        #similarity.append([vocabs[i],np.dot(target_V,vector.T)[0][0]])
        similarities.append([vocab[i],cosine(target_V,vector)-1])
     
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]

In [333]:
test = random.choice(list(vocab))
test

'nature'

In [334]:
word_similarity(test,vocab)

[['dear', 0.30483197839770826],
 ['!"', 0.29741551949629375],
 ['.', 0.28911258825788511],
 ['necessary', 0.28688188673591553],
 ['so', 0.28375271682590752],
 ['imagine', 0.27965097926123716],
 ['we', 0.27242578016737884],
 ['"', 0.26859377163610665],
 [',"', 0.2647046349401565],
 ['think', 0.25386463676892568]]

# TODO 

* 이게 제대로 짠건지 검증을 못하겠네... 어케하지?
* 자주 등장하는 단어는 트레이닝 셋에서 제외하자 아예