# 1. Skip-gram with naiive softmax 

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture2.pdf
* https://arxiv.org/abs/1301.3781

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
flatten = lambda l: [item for sublist in l for item in sublist]

In [2]:
print(torch.__version__)
print(nltk.__version__)

0.2.0+3a8feb7
3.2.2


In [3]:
USE_CUDA = torch.cuda.is_available()

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [4]:
def getBatch(batch_size,train_data):
    random.shuffle(train_data)
    sindex=0
    eindex=batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex+batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [36]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if w in word2index.keys() else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

def prepare_word(word,word2index):
    return Variable(LongTensor([word2index[word]]) if word in word2index.keys() else LongTensor([word2index["<UNK>"]]))

## Load corpus : Gutenburg corpus

If you don't have gutenberg corpus, you can download it first using nltk.download()

In [37]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [38]:
corpus = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:100] # sampling sentences for test
corpus = [[word.lower() for word in sent] for sent in corpus]

### Extract Stopwords

불용어를 지정하는 방법은 여러가지가 있지만, 여기선 휴리스틱하게 빈도 분포 꼬리에서 0.01씩을 사용

In [39]:
word_count = Counter(flatten(corpus))
border =int(len(word_count)*0.01) 

In [40]:
stopwords = word_count.most_common()[:border]+list(reversed(word_count.most_common()))[:border]

In [41]:
stopwords = [s[0] for s in stopwords]

In [42]:
stopwords

[',', '.', 'the', 'of', 'and', 'lowly', 'can', 'eight', 'michael', 'saxon']

### Build vocab

In [43]:
vocab = list(set(flatten(corpus))-set(stopwords))
vocab.append('<UNK>')

In [44]:
print(len(set(flatten(corpus))),len(vocab))

592 583


In [45]:
word2index = {'<UNK>' : 0} # for masking

for vo in vocab:
    if vo not in word2index.keys():
        word2index[vo]=len(word2index)

index2word = {v:k for k,v in word2index.items()} 

### Prepare train data 

먼저 양 옆의 몇 개까지의 단어를 예측할 것인지, WINDOW_SIZE를 정한다

In [46]:
WINDOW_SIZE = 3
windows =  flatten([list(nltk.ngrams(['<DUMMY>']*WINDOW_SIZE+c+['<DUMMY>']*WINDOW_SIZE,WINDOW_SIZE*2+1)) for c in corpus])

In [47]:
windows[0]

('<DUMMY>', '<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by')

In [48]:
train_data = []

for window in windows:
    for i in range(WINDOW_SIZE*2+1):
        if i==WINDOW_SIZE or window[i]=='<DUMMY>': continue
        train_data.append((window[WINDOW_SIZE],window[i]))

print(train_data[:WINDOW_SIZE*2])

[('[', 'moby'), ('[', 'dick'), ('[', 'by'), ('moby', '['), ('moby', 'dick'), ('moby', 'by')]


In [49]:
X_p=[]
y_p=[]

In [50]:
train_data[0]

('[', 'moby')

In [51]:
for tr in train_data:
    X_p.append(prepare_word(tr[0],word2index).view(1,-1))
    y_p.append(prepare_word(tr[1],word2index).view(1,-1))

In [52]:
train_data = list(zip(X_p,y_p))

In [53]:
len(train_data)

7606

### Modeling

In [54]:
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size,projection_dim):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)

        self.embedding_v.weight.data.uniform_(-0.1, 0.1) # init
        self.embedding_u.weight.data.uniform_(-0.0, 0.0) # init
        #self.out = nn.Linear(projection_dim,vocab_size)
    def forward(self, center_words, outer_words):
        center_embeds = self.embedding_v(center_words) # B x 1 x D
        outer_embeds = self.embedding_v(outer_words) # B x V x D
        scores = outer_embeds.bmm(center_embeds.transpose(1,2)) # BxVxD * BxDx1 => BxVx1
        
        return F.log_softmax(scores.squeeze(2))

    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds 

### Train  

In [55]:
PROJECTION = 30 # Embedding size
BATCH_SIZE = 128
STEP_SIZE = 100

In [56]:
losses = []
loss_function = nn.NLLLoss(ignore_index=0)
model = Skipgram(len(word2index),PROJECTION)
if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [57]:
for step in range(STEP_SIZE):
    for i,batch in enumerate(getBatch(BATCH_SIZE,train_data)):
        
        inputs, targets = zip(*batch)
        
        inputs = torch.cat(inputs) # B x 1
        targets = torch.cat(targets) # B x 1
        vocabs = prepare_sequence(list(vocab),word2index).expand(inputs.size(0),len(vocab))  # B x V
        model.zero_grad()

        pred = model(inputs,vocabs)
        loss = loss_function(pred,targets.view(-1))
        
        
        loss.backward()
        optimizer.step()
    
        losses.append(loss.data.cpu().numpy()[0] if USE_CUDA else loss.data.numpy()[0])
    if step % 10==0:
        print("Step : %d, mean_loss : %.02f" % (step,np.mean(losses)))
        losses=[]

Step : 0, mean_loss : 6.37
Step : 10, mean_loss : 6.21
Step : 20, mean_loss : 5.81
Step : 30, mean_loss : 5.46
Step : 40, mean_loss : 5.19
Step : 50, mean_loss : 5.02
Step : 60, mean_loss : 4.91
Step : 70, mean_loss : 4.83
Step : 80, mean_loss : 4.78
Step : 90, mean_loss : 4.75


### Test

In [58]:
def word_similarity(target,vocab):
    if USE_CUDA:
        target_V = model.prediction(prepare_word(target,word2index))
    else:
        target_V = model.prediction(prepare_word(target,word2index))
    similarities=[]
    for i in range(len(vocab)):
        if vocab[i] == target: continue
        
        if USE_CUDA:
            vector = model.prediction(prepare_word(list(vocab)[i],word2index))
        else:
            vector = model.prediction(prepare_word(list(vocab)[i],word2index))
        cosine_sim = F.cosine_similarity(target_V,vector).data.cpu().tolist()[0] if USE_CUDA \
        else F.cosine_similarity(target_V,vector).data.tolist()[0] 
        similarities.append([vocab[i],cosine_sim])
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]

In [144]:
test = random.choice(list(vocab))
test

'therefore'

In [145]:
word_similarity(test,vocab)

[['us', 0.8296162486076355],
 ['let', 0.8022689819335938],
 ['swallowed', 0.7256121039390564],
 ['random', 0.6612909436225891],
 ['forty', 0.6319326758384705],
 ['!', 0.5833513736724854],
 ['fly', 0.5817139148712158],
 ['"', 0.544613778591156],
 ['insomuch', 0.5213627815246582],
 ['is', 0.5161886215209961]]