# Word2Vec (Negative Sampling)

Let's work on negative-sampling based implementation of word2vec.

In [6]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.nn.functional as F
import nltk
from nltk.corpus import reuters
import time

In [7]:
# NLTK data is downloade

nltk.download("reuters")
nltk.download("punkt_tab")
nltk.data.path.append('C:/Users/Arunya Senadeera/AppData/Roaming/nltk_data/tokenizers')  

[nltk_data] Downloading package reuters to C:\Users\Arunya
[nltk_data]     Senadeera\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Arunya
[nltk_data]     Senadeera\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## 1. Define corpus data 

In [8]:

# Prepare real-world corpus
# Corpus Creation
def prepare_corpus():
    corpus = [nltk.word_tokenize(reuters.raw(fileid)) for fileid in reuters.fileids()[:500]]
    return corpus

corpus = prepare_corpus()
print("Prepared Corpus Sample:", corpus[:5])



In [9]:
# Flatten function
flatten = lambda l: [item for sublist in l for item in sublist]

In [10]:
#numericalization
corpus = prepare_corpus()
vocab = list(set(flatten(corpus)))

vocab.insert(0, "<UNK>")
word2index = {w: i for i, w in enumerate(vocab)}
index2word = {i: w for w, i in word2index.items()}
voc_size = len(vocab)


In [11]:
word2index = {w: i for i, w in enumerate(vocab)}
print(word2index)



In [12]:
#vocab size
print(voc_size)

10052


## 2. Prepare train data

In [17]:
# Dynamic window size function
def random_batch(batch_size, word_sequence, window_size=2):
    skip_grams = []
    for sent in word_sequence:
        for i in range(len(sent)):
            target = word2index.get(sent[i], word2index["<UNK>"])
            context_indices = list(range(max(0, i - window_size), min(len(sent), i + window_size + 1)))
            context_indices.remove(i)
            context = [word2index.get(sent[j], word2index["<UNK>"]) for j in context_indices]
            for w in context:
                skip_grams.append([target, w])

    random_inputs = []
    random_labels = []
    random_index = np.random.choice(len(skip_grams), batch_size, replace=False)
    for i in random_index:
        random_inputs.append([skip_grams[i][0]])
        random_labels.append([skip_grams[i][1]])

    return np.array(random_inputs), np.array(random_labels)

### Testing the method

In [18]:
#testing the method
batch_size = 2 # mini-batch size
input_batch, target_batch = random_batch(batch_size, corpus)

print("Input: ",  input_batch)
print("Target: ", target_batch)
input_batch.shape, target_batch.shape

Input:  [[2455]
 [6649]]
Target:  [[3082]
 [3013]]


((2, 1), (2, 1))

## 3. Negative Sampling

### Unigram distribution

$$P(w)=U(w)^{3/4}/Z$$

In [19]:
Z = 0.001
from collections import Counter

word_count = Counter(flatten(corpus))
num_total_words = sum([c for w, c in word_count.items()])

In [20]:
word_count[',']

2614

In [21]:
num_total_words

73442

In [22]:
unigram_table = []

for vo in vocab:
    unigram_table.extend([vo] * int(((word_count[vo]/num_total_words)**0.75)/Z))

In [23]:
Counter(unigram_table)

Counter({'the': 87,
         ',': 81,
         '.': 78,
         'of': 61,
         'to': 56,
         'in': 47,
         'said': 46,
         'and': 44,
         'a': 40,
         'mln': 33,
         'vs': 26,
         'for': 25,
         'The': 24,
         "'s": 23,
         'dlrs': 22,
         'on': 20,
         'it': 19,
         '>': 19,
         'is': 19,
         'lt': 19,
         ';': 19,
         '&': 19,
         'pct': 19,
         'that': 18,
         'its': 18,
         "''": 17,
         '``': 17,
         'at': 17,
         'from': 17,
         'by': 17,
         'cts': 17,
         'was': 17,
         'be': 16,
         'year': 15,
         'will': 14,
         'billion': 14,
         'with': 14,
         'has': 14,
         'not': 12,
         'U.S.': 12,
         'as': 12,
         'an': 12,
         'company': 12,
         '1986': 11,
         'would': 11,
         'are': 11,
         '(': 10,
         'have': 10,
         'which': 10,
         ')': 10,
         '

### Negative Sampling

In [24]:
import random

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.size(0)
    neg_samples = []
    for i in range(batch_size):
        nsample = []
        target_index = targets[i].item()
        while len(nsample) < k: # num of sampling
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).view(1, -1))
    return torch.cat(neg_samples)

### Testing the negative sampling

In [25]:
input_batch  = torch.Tensor(input_batch)
target_batch = torch.LongTensor(target_batch)

In [26]:
target_batch.shape

torch.Size([2, 1])

In [27]:
input_batch

tensor([[2455.],
        [6649.]])

In [28]:
num_neg = 3
negative_sampling(target_batch, unigram_table, num_neg)

tensor([[9280, 7766, 4794],
        [2255, 3414, 1637]])

In [29]:
target_batch[1]

tensor([3013])

## 4. Model

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [30]:
class SkipgramNegSampling(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(SkipgramNegSampling, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, emb_size) # out embedding
        self.logsigmoid = nn.LogSigmoid()
                    
    def forward(self, center_words, target_words, negative_words):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        neg_embeds    = -self.embedding_u(negative_words) # [batch_size, num_neg, emb_size]
        
        positive_score = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]
        
        negative_score = neg_embeds.bmm(center_embeds.transpose(1, 2))
        #[batch_size, k, emb_size] @ [batch_size, emb_size, 1] = [batch_size, k, 1]
        
        loss = self.logsigmoid(positive_score) + torch.sum(self.logsigmoid(negative_score), 1)
                
        return -torch.mean(loss)
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

## 5. Training

In [35]:
batch_size     = 256 
embedding_size = 100
model          = SkipgramNegSampling(voc_size, embedding_size)
num_neg        = 10 # num of negative sampling

optimizer = optim.Adam(model.parameters(), lr=0.001)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

import time

# Training
num_epochs = 1000
for epoch in range(num_epochs):
    
    start = time.time()
    
    input_batch, target_batch = random_batch(batch_size, corpus)
    
    #input_batch: [batch_size, 1]
    input_batch = torch.LongTensor(input_batch)
    
    #target_batch: [batch_size, 1]
    target_batch = torch.LongTensor(target_batch)
    
    #negs_batch:   [batch_size, num_neg]
    negs_batch = negative_sampling(target_batch, unigram_table, num_neg)
    
    optimizer.zero_grad()
        
    loss = model(input_batch, target_batch, negs_batch)
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)
    
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | Time: {end - start:.2f}s")


Epoch: 100 | cost: 44.137939 | Time: 0.39s
Epoch: 200 | cost: 40.610195 | Time: 0.40s
Epoch: 300 | cost: 38.706493 | Time: 0.28s
Epoch: 400 | cost: 36.061687 | Time: 0.65s
Epoch: 500 | cost: 34.731659 | Time: 0.51s
Epoch: 600 | cost: 30.346203 | Time: 0.51s
Epoch: 700 | cost: 33.298504 | Time: 0.35s
Epoch: 800 | cost: 33.286488 | Time: 0.47s
Epoch: 900 | cost: 28.263241 | Time: 0.37s
Epoch: 1000 | cost: 27.501404 | Time: 0.41s
