# Word2Vec

Let's work on skipgram-based implementation of word2vec.

In [26]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import reuters
import time

In [41]:
# NLTK data is downloade

nltk.download("reuters")
nltk.download("punkt_tab")
nltk.data.path.append('C:/Users/Arunya Senadeera/AppData/Roaming/nltk_data/tokenizers')  

[nltk_data] Downloading package reuters to C:\Users\Arunya
[nltk_data]     Senadeera\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Arunya
[nltk_data]     Senadeera\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


## 1. Define corpus data 

In [44]:


# Prepare real-world corpus
# Corpus Creation
def prepare_corpus():
    corpus = [nltk.word_tokenize(reuters.raw(fileid)) for fileid in reuters.fileids()[:500]]
    return corpus

corpus = prepare_corpus()
print("Prepared Corpus Sample:", corpus[:5])



In [52]:
# Flatten function
flatten = lambda l: [item for sublist in l for item in sublist]


['takes',
 'broadly',
 'Morris',
 'be',
 'Anhui',
 '12664',
 'STABILITY',
 'alternate',
 'car',
 'weakness',
 'Results',
 'ACQUISITION',
 'counterparts',
 'US',
 'surpluses',
 '20',
 '19.4p',
 'are',
 'Croesus',
 'january',
 '200.6',
 'obvious',
 'signficant',
 '1.26',
 'thrust',
 'Santa',
 'FFOM',
 'owners',
 '14.54',
 'thirds',
 'FOODGRAIN',
 'spokeswoman',
 'stability',
 'dispute',
 'THIS',
 '23.8',
 'policies',
 'Martin',
 '74.2',
 'WEDNESDAY',
 'inadequate',
 'daily',
 'streamlining',
 'feet',
 'extend',
 'premiums',
 'decreases',
 '2.26',
 '3.75',
 'Coffee',
 'guidelines',
 'second',
 'HIGH',
 'GHANA',
 'theme',
 'JAN',
 'Outlook',
 'Taiwan',
 '481p',
 'deficit',
 'assure',
 'supplemental',
 'Heinken',
 'kl',
 'Rains',
 'proposed',
 'attributed',
 'sapped',
 'imbalance',
 'optic',
 'stabilising',
 'ECONOMY',
 'green',
 'intermittently',
 'tissue',
 'proving',
 'BE',
 'stand',
 '165',
 'end-of-fiscal-year',
 'seem',
 '261',
 'lapse',
 'ITALY',
 'builders',
 'Employers',
 'without'

In [53]:
# Prepare vocabulary
#numericalization
corpus = prepare_corpus()
vocab = list(set(flatten(corpus)))
vocab.append("<UNK>")
word2index = {w: i for i, w in enumerate(vocab)}
index2word = {i: w for w, i in word2index.items()}
voc_size = len(vocab)


In [46]:

word2index = {w: i for i, w in enumerate(vocab)}
print(word2index)



In [55]:
#vocab size
print(voc_size)

10052


In [48]:
#append UNK
vocab.append('<UNK>')

In [49]:
vocab

['takes',
 'broadly',
 'Morris',
 'be',
 'Anhui',
 '12664',
 'STABILITY',
 'alternate',
 'car',
 'weakness',
 'Results',
 'ACQUISITION',
 'counterparts',
 'US',
 'surpluses',
 '20',
 '19.4p',
 'are',
 'Croesus',
 'january',
 '200.6',
 'obvious',
 'signficant',
 '1.26',
 'thrust',
 'Santa',
 'FFOM',
 'owners',
 '14.54',
 'thirds',
 'FOODGRAIN',
 'spokeswoman',
 'stability',
 'dispute',
 'THIS',
 '23.8',
 'policies',
 'Martin',
 '74.2',
 'WEDNESDAY',
 'inadequate',
 'daily',
 'streamlining',
 'feet',
 'extend',
 'premiums',
 'decreases',
 '2.26',
 '3.75',
 'Coffee',
 'guidelines',
 'second',
 'HIGH',
 'GHANA',
 'theme',
 'JAN',
 'Outlook',
 'Taiwan',
 '481p',
 'deficit',
 'assure',
 'supplemental',
 'Heinken',
 'kl',
 'Rains',
 'proposed',
 'attributed',
 'sapped',
 'imbalance',
 'optic',
 'stabilising',
 'ECONOMY',
 'green',
 'intermittently',
 'tissue',
 'proving',
 'BE',
 'stand',
 '165',
 'end-of-fiscal-year',
 'seem',
 '261',
 'lapse',
 'ITALY',
 'builders',
 'Employers',
 'without'

In [50]:
word2index['<UNK>'] = 0

In [51]:
#just in case we need to use
index2word = {v:k for k, v in word2index.items()} 

## 2. Prepare train data

In [None]:
for c in corpus:
    print(c)

In [20]:
def random_batch(batch_size, word_sequence):
    
    # Make skip gram of one size window
    skip_grams = []
    # loop each word sequence
    # we starts from 1 because 0 has no context
    # we stop at second last for the same reason
    for sent in corpus:
        for i in range(1, len(sent) - 1):
            target = word2index[sent[i]]
            context = [word2index[sent[i - 1]], word2index[sent[i + 1]]]
            for w in context:
                skip_grams.append([target, w])
    
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False) #randomly pick without replacement
        
    for i in random_index:
        random_inputs.append([skip_grams[i][0]])  # target, e.g., 2
        random_labels.append([skip_grams[i][1]])  # context word, e.g., 3
            
    return np.array(random_inputs), np.array(random_labels)

### Testing the method

In [None]:
#testing the method
batch_size = 2 # mini-batch size
input_batch, target_batch = random_batch(batch_size, corpus)

print("Input: ", input_batch)
print("Target: ", target_batch)

#we will convert them to tensor during training, so don't worry...

## 3. Model

$$J(\theta) = -\frac{1}{T}\sum_{t=1}^{T}\sum_{\substack{-m \leq j \leq m \\ j \neq 0}}\log P(w_{t+j} | w_t; \theta)$$

where $P(w_{t+j} | w_t; \theta) = $

$$P(o|c)=\frac{\exp(\mathbf{u_o^{\top}v_c})}{\sum_{w=1}^V\exp(\mathbf{u_w^{\top}v_c})}$$

where $o$ is the outside words and $c$ is the center word

In [22]:
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)
    
    def forward(self, center_words, target_words, all_vocabs):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        all_embeds    = self.embedding_u(all_vocabs) #   [batch_size, voc_size, emb_size]
        
        scores      = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]

        norm_scores = all_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, voc_size, emb_size] @ [batch_size, emb_size, 1] = [batch_size, voc_size, 1] = [batch_size, voc_size]

        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores), 1).unsqueeze(1))) # log-softmax
        # scalar (loss must be scalar)    
            
        return nll # negative log likelihood

## 4. Training

In [23]:
batch_size     = 2 # mini-batch size
embedding_size = 2 #so we can later plot
model          = Skipgram(voc_size, embedding_size)

optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

#use for the normalized term in the probability calculation
all_vocabs = prepare_sequence(list(vocab), word2index).expand(batch_size, len(vocab))  # [batch_size, voc_size]
all_vocabs.shape

In [25]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
import time

# Training
num_epochs = 5000
for epoch in range(num_epochs):
    
    start = time.time()
    
    input_batch, target_batch = random_batch(batch_size, corpus)
    input_batch  = torch.LongTensor(input_batch)  #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch) #[batch_size, 1]

    optimizer.zero_grad()
    loss = model(input_batch, target_batch, all_vocabs)
    
    loss.backward()
    optimizer.step()
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)

    if (epoch + 1) % 1000 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")


## 5. Plotting the embeddings

In [None]:
#list of vocabs
vocab[:10]

In [28]:
word = vocab[0]

In [None]:
#numericalization
id = word2index[word]
id

In [None]:
id_tensor = torch.LongTensor([id])
id_tensor

In [None]:
#get the embedding by averaging
v_embed = model.embedding_v(id_tensor)
u_embed = model.embedding_u(id_tensor)

v_embed, u_embed

In [None]:
#average to get the word embedding
word_embed = (v_embed + u_embed) / 2
word_embed[0][1]

In [33]:
#let's write a function to get embedding given a word
def get_embed(word):
    id_tensor = torch.LongTensor([word2index[word]])
    v_embed = model.embedding_v(id_tensor)
    u_embed = model.embedding_u(id_tensor) 
    word_embed = (v_embed + u_embed) / 2 
    x, y = word_embed[0][0].item(), word_embed[0][1].item()

    return x, y

In [None]:
plt.figure(figsize=(6,3))
for i, word in enumerate(vocab[:20]): #loop each unique vocab
    x, y = get_embed(word)
    plt.scatter(x, y)
    plt.annotate(word, xy=(x, y), xytext=(5, 2), textcoords='offset points')
plt.show()

## 6. Cosine similarity

Formally the [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) $s$ between two vectors $p$ and $q$ is defined as:

$$s = \frac{p \cdot q}{||p|| ||q||}, \textrm{ where } s \in [-1, 1] $$ 

If $p$ and $q$ is super similar, the result is 1 otherwise 0.

In [None]:
vocab

In [36]:
#let's try similarity between first and second, and second and third
cat          = get_embed('cat')
fruit        = get_embed('fruit')
animal       = get_embed('animal')

In [None]:
#numpy version
from numpy import dot
from numpy.linalg import norm

def cos_sim(a, b):
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim
    
print(f"cat vs. fruit: ",        cos_sim(cat, fruit))
print(f"cat vs. animal: ",       cos_sim(cat, animal))
print(f"cat vs. cat: ",          cos_sim(cat, cat))

In [None]:
#scipy version
from scipy import spatial

def cos_sim(a, b):
    cos_sim = 1 - spatial.distance.cosine(a, b)  #distance = 1 - similarlity, because scipy only gives distance
    return cos_sim

print(f"cat vs. fruit: ",        cos_sim(cat, fruit))
print(f"cat vs. animal: ",       cos_sim(cat, animal))
print(f"cat vs. cat: ",          cos_sim(cat, cat))