In [154]:
import nltk
import numpy as np
from numpy import dot
from numpy.linalg import norm
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import math
import time
import random
from nltk.corpus import brown
from collections import Counter
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Bidhan\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

# Corpus setup

In [155]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [156]:
np.random.seed(42)

In [157]:
# choosing news as suggested
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [158]:
news_corpus = brown.sents(categories='news')
news_corpus

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [159]:
news_corpus = news_corpus[:300] # taking small subset for faster training

In [160]:
flatten = lambda l: [item for sublist in l for item in sublist]
news_flatten = flatten(news_corpus)
len(news_flatten)

6642

In [161]:
vocab = list(set(news_flatten))
vocab.append('<UNK>')
vocab[:5]

['needed', "Daniel's", 'outmoded', 'process', 'experienced']

In [162]:
word2index = {w:i for (i, w) in enumerate(vocab)}

In [163]:
index2word = {v:k for k, v in word2index.items()} 

In [164]:
vocab_size = len(vocab)
vocab_size

1939

# Prepare training data

In [165]:
def random_batch(batch_size, corpus, window_size=2):
    # Make skip gram of custom size window
    skip_grams = []

    for sent in corpus:
        for target_index in range(window_size, len(sent) - window_size):
            target = word2index[sent[target_index]]
            context = []
            count = window_size # count of context words to pick on the left and right
            while count > 0:
                # for default window, it will get the left most and right most word
                # then the second left most and second right most word
                context.append(word2index[sent[target_index - count]])
                context.append(word2index[sent[target_index + count]])
                count -= 1

            for word in context:
                skip_grams.append([target, word])
    
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False) #randomly pick without replacement
        
    for i in random_index:
        random_inputs.append([skip_grams[i][0]])  # target, e.g., 2
        random_labels.append([skip_grams[i][1]])  # context word, e.g., 3
            
    return np.array(random_inputs), np.array(random_labels)

In [166]:
#testing the method
batch_size = 2 # mini-batch size
input_batch, target_batch = random_batch(batch_size, news_corpus)

print("Input: ", input_batch)
print("Target: ", target_batch)

Input:  [[1081]
 [ 552]]
Target:  [[ 156]
 [1291]]


# Skipgram

In [167]:
class Skipgram(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super(Skipgram, self).__init__()

        self.embedding_center   = nn.Embedding(vocab_size, emb_size)
        self.embedding_outside  = nn.Embedding(vocab_size, emb_size)

    def forward(self, center, outside, all_vocab):
        center_embedding    = self.embedding_center(center)     # as seen in above example, size: (batch_size, 1, embedding_size)
        outside_embedding   = self.embedding_outside(outside)   # (batch_size, 1, embedding_size)
        all_vocab_embedding = self.embedding_outside(all_vocab) # (batch_size, vocab_size, embedding_size)

        numerator   = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        # (b_size, 1, emb_size) @ (b_size, emb_size, 1) = (b_size, 1, 1) -> (b_size, 1)


        denominator = all_vocab_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        # (b_size, vocab_size, emb_size) @ (b_size, emb_size, 1) = (b_size, vocab_size, 1) -> (b_size, vocab_size)

        denominator_sum = torch.sum(torch.exp(denominator), 1)

        loss = -torch.mean(torch.log(numerator / denominator_sum)) # scalar

        return loss

In [168]:
batch_size     = 2 # mini-batch size
embedding_size = 2
model          = Skipgram(vocab_size, embedding_size)
model_skipgram = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)

In [169]:
def test(w):
    if word2index.get(w) is not None:
        return word2index[w]
    else:
        return word2index['<UNK>']

In [170]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

#use for the normalized term in the probability calculation
all_vocabs = prepare_sequence(list(vocab), word2index).expand(batch_size, len(vocab))  # [batch_size, voc_size]
all_vocabs.shape

torch.Size([2, 1939])

In [171]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

## Training

In [172]:
# Training
num_epochs = 5000
for epoch in range(num_epochs):
    
    start = time.time()
    
    input_batch, target_batch = random_batch(batch_size, news_corpus)
    input_batch  = torch.LongTensor(input_batch)  #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch) #[batch_size, 1]

    # changing to cuda
    input_batch  = input_batch.to(device)
    target_batch = target_batch.to(device)
    all_vocabs   = all_vocabs.to(device)

    optimizer.zero_grad()
    loss = model_skipgram(input_batch, target_batch, all_vocabs)
    
    loss.backward()
    optimizer.step()
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)

    if (epoch + 1) % 1000 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")


Epoch: 1000 | cost: 7.214520 | time: 0m 0s
Epoch: 2000 | cost: 7.755567 | time: 0m 0s
Epoch: 3000 | cost: 6.652462 | time: 0m 0s
Epoch: 4000 | cost: 7.302613 | time: 0m 0s
Epoch: 5000 | cost: 7.604928 | time: 0m 0s


In [173]:
# Save the model
torch.save(model_skipgram.state_dict(), 'model/skipgram_model.pth')

In [174]:
import pickle
pickle.dump(model_skipgram, open('model/skipgram.pkl', 'wb'))

In [175]:
word = vocab[0]

In [176]:
#numericalization
id = word2index[word]
id

0

In [177]:
id_tensor = torch.LongTensor([id])
id_tensor = id_tensor.to(device)

In [178]:
#get the embedding by averaging
v_embed = model_skipgram.embedding_center(id_tensor)
u_embed = model_skipgram.embedding_outside(id_tensor)

v_embed, u_embed

(tensor([[0.4518, 0.2702]], device='cuda:0', grad_fn=<EmbeddingBackward0>),
 tensor([[ 0.0529, -0.0310]], device='cuda:0', grad_fn=<EmbeddingBackward0>))

In [179]:
#average to get the word embedding
word_embed = (v_embed + u_embed) / 2
word_embed[0][1]

tensor(0.1196, device='cuda:0', grad_fn=<SelectBackward0>)

In [180]:
def get_embed_skip_gram(word):
    id_tensor = torch.LongTensor([word2index[word]])
    id_tensor = id_tensor.to(device)
    v_embed = model_skipgram.embedding_center(id_tensor)
    u_embed = model_skipgram.embedding_outside(id_tensor) 
    word_embed = (v_embed + u_embed) / 2 
    x, y = word_embed[0][0].item(), word_embed[0][1].item()

    return x, y

In [181]:
def cos_sim(a, b):
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim

In [182]:
government = get_embed_skip_gram('government')
officials = get_embed_skip_gram('officials')
administration = get_embed_skip_gram('administration')

In [183]:
print(f"government vs officials: {cos_sim(government, officials):.4f}")
print(f"government vs administration: {cos_sim(government, administration):.4f}")
print(f"government vs government: {cos_sim(government, government):.4f}")

government vs officials: 0.7794
government vs administration: -0.9817
government vs government: 1.0000


# Skipgram (Negative Sampling)

In [184]:
Z = 0.001

In [185]:
word_count = Counter(news_flatten)
num_total_words = sum([c for w, c in word_count.items()])

In [186]:
num_total_words

6642

In [187]:
unigram_table = []

for vo in vocab:
    uw = word_count[vo] / num_total_words
    uw_alpha = int((uw ** 0.75) / Z)
    unigram_table.extend([vo] * uw_alpha)

In [188]:
Counter(unigram_table)

Counter({'the': 113,
         ',': 93,
         '.': 87,
         'of': 77,
         'to': 64,
         'a': 53,
         'and': 45,
         'in': 45,
         "''": 36,
         '``': 36,
         'for': 36,
         'The': 31,
         'would': 27,
         'said': 27,
         'by': 25,
         'that': 25,
         'was': 22,
         'on': 22,
         'be': 22,
         'is': 19,
         'as': 17,
         'it': 15,
         'he': 15,
         'will': 15,
         'which': 15,
         'his': 14,
         'at': 13,
         'who': 13,
         'Texas': 13,
         'jury': 13,
         'year': 13,
         'an': 12,
         '--': 12,
         'not': 12,
         'election': 11,
         'bill': 11,
         'with': 11,
         'has': 11,
         'from': 11,
         'million': 11,
         'House': 11,
         'plan': 11,
         'this': 10,
         'are': 10,
         'schools': 10,
         'medical': 10,
         'or': 10,
         'Dallas': 9,
         'pay': 9,
     

## Negative Sampling

In [189]:
def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):  #(1, k)
        target_index = targets[i].item()
        nsample      = []
        while (len(nsample) < k):
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))
        
    return torch.cat(neg_samples) #batch_size, k

In [190]:
batch_size = 2
x, y = random_batch(batch_size, news_corpus)
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)
x_tensor = x_tensor.to(device)
y_tensor = y_tensor.to(device)

In [191]:
k = 5
neg_samples = negative_sampling(y_tensor, unigram_table, k)

In [192]:
y_tensor[0]

tensor([463], device='cuda:0')

In [193]:
neg_samples[0]

tensor([ 356, 1140,  688, 1520,  701])

In [194]:
class SkipgramNeg(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    
    def forward(self, center, outside, negative):
        #center, outside:  (bs, 1)
        #negative       :  (bs, k)
        
        center_embed   = self.embedding_center(center) #(bs, 1, emb_size)
        outside_embed  = self.embedding_outside(outside) #(bs, 1, emb_size)
        negative_embed = self.embedding_outside(negative) #(bs, k, emb_size)

        
        uovc           = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, 1)
        ukvc           = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, k)
        ukvc_sum       = torch.sum(ukvc, 1).reshape(-1, 1) #(bs, 1)
        
        loss           = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)
        
        return -torch.mean(loss)

## Testing

In [195]:
emb_size = 2
voc_size = len(vocab)
model = SkipgramNeg(voc_size, emb_size)
model_neg = model.to(device)
neg_samples = neg_samples.to(device)

In [196]:
loss = model_neg(x_tensor, y_tensor, neg_samples)
loss

tensor(0.7988, device='cuda:0', grad_fn=<NegBackward0>)

In [197]:
optimizer = optim.Adam(model_neg.parameters(), lr=0.001)

In [198]:
num_epochs = 5000

for epoch in range(num_epochs):
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, news_corpus)
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)

    #move to cuda
    input_tensor = input_tensor.to(device)
    label_tensor = label_tensor.to(device)
    
    #predict
    neg_samples = negative_sampling(label_tensor, unigram_table, k)
    neg_samples = neg_samples.to(device)

    loss = model_neg(input_tensor, label_tensor, neg_samples)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    
    #update alpha
    optimizer.step()
    
    #print the loss
    if (epoch + 1) % 1000 == 0:
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss:2.6f}")

Epoch   1000 | Loss: 2.428288
Epoch   2000 | Loss: 0.939606
Epoch   3000 | Loss: 1.476746
Epoch   4000 | Loss: 1.084400
Epoch   5000 | Loss: 1.893235


In [199]:
# Save the model
torch.save(model_neg.state_dict(), 'model/skipgram_neg_model.pth')

In [200]:
pickle.dump(model_neg, open('model/skipgram_neg.pkl', 'wb'))

In [201]:
def get_embed_neg_sample(word):
    id_tensor = torch.LongTensor([word2index[word]])
    id_tensor = id_tensor.to(device)
    v_embed = model_neg.embedding_center(id_tensor)
    u_embed = model_neg.embedding_outside(id_tensor) 
    word_embed = (v_embed + u_embed) / 2 
    x, y = word_embed[0][0].item(), word_embed[0][1].item()

    return x, y

In [202]:
government = get_embed_neg_sample('government')
officials = get_embed_neg_sample('officials')
administration = get_embed_neg_sample('administration')

In [203]:
print(f"government vs officials: {cos_sim(government, officials):.4f}")
print(f"government vs administration: {cos_sim(government, administration):.4f}")
print(f"government vs government: {cos_sim(government, government):.4f}")

government vs officials: -0.4719
government vs administration: 0.7468
government vs government: 1.0000


# Glove

In [204]:
def get_skipgram(window_size = 2):
    # Make skip gram of custom size window
    skip_grams = []

    for sent in news_corpus:
        for target_index in range(window_size, len(sent) - window_size):
            target = sent[target_index]
            context = []
            count = window_size # count of context words to pick on the left and right
            while count > 0:
                # for default window, it will get the left most and right most word
                # then the second left most and second right most word
                context.append(sent[target_index - count])
                context.append(sent[target_index + count])
                count -= 1

            for word in context:
                skip_grams.append((target, word))
    return skip_grams

In [205]:
skip_grams = get_skipgram(2)
skip_grams[:8]

[('County', 'The'),
 ('County', 'Jury'),
 ('County', 'Fulton'),
 ('County', 'Grand'),
 ('Grand', 'Fulton'),
 ('Grand', 'said'),
 ('Grand', 'County'),
 ('Grand', 'Jury')]

In [206]:
X_ik_skipgram = Counter(skip_grams)

In [207]:
#simply a normalized function...don't worry too much
def weighting(w_i, w_j, X_ik):
        
    #check whether the co-occurrences exist between these two words
    try:
        x_ij = X_ik[(w_i, w_j)]
    except:
        x_ij = 1  #if does not exist, set it to 1
                
    x_max = 100 #100 # fixed in paper  #cannot exceed 100 counts
    alpha = 0.75
    
    #if co-occurrence does not exceed 100, scale it based on some alpha
    if x_ij < x_max:
        result = (x_ij/x_max)**alpha  #scale it
    else:
        result = 1  #if is greater than max, set it to 1 maximum
    
    return result

In [208]:
from itertools import combinations_with_replacement

X_ik = {}  #for keeping the co-occurences
weighting_dic = {} #scaling the percentage of sampling

for bigram in combinations_with_replacement(vocab, 2):
    if X_ik_skipgram.get(bigram) is not None:  #matches 
        co_occer = X_ik_skipgram[bigram]  #get the count from what we already counted
        X_ik[bigram] = co_occer + 1 # + 1 for stability issue
        X_ik[(bigram[1],bigram[0])] = co_occer+1   #count also for the opposite
    else:
        pass
        
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

# print(f"{X_ik=}")
# print(f"{weighting_dic=}")

In [209]:
def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    #convert to id since our skip_grams is word, not yet id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    random_inputs = []
    random_labels = []
    random_coocs  = []
    random_weightings = []
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False) #randomly pick without replacement
        
    for i in random_index:
        random_inputs.append([skip_grams_id[i][0]])  # target, e.g., 2
        random_labels.append([skip_grams_id[i][1]])  # context word, e.g., 3
        
        #get cooc
        pair = skip_grams[i]
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
        
        #get weighting
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
                    
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

In [210]:
#testing the method
batch_size = 2 # mini-batch size
input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, news_corpus, skip_grams, X_ik, weighting_dic)

print("Input: ", input_batch)
print("Target: ", target_batch)
print("Cooc: ", cooc_batch)
print("Weighting: ", weighting_batch)


Input:  [[317]
 [390]]
Target:  [[1698]
 [1301]]
Cooc:  [[1.38629436]
 [0.69314718]]
Weighting:  [[0.08944272]
 [0.05318296]]


In [211]:
class GloVe(nn.Module):
    
    def __init__(self, vocab_size,embed_size):
        super(GloVe,self).__init__()
        self.embedding_center = nn.Embedding(vocab_size, embed_size) # center embedding
        self.embedding_outside = nn.Embedding(vocab_size, embed_size) # out embedding
        
        self.v_bias = nn.Embedding(vocab_size, 1)
        self.u_bias = nn.Embedding(vocab_size, 1)
        
    def forward(self, center_words, target_words, coocs, weighting):
        center_embeds = self.embedding_center(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_outside(target_words) # [batch_size, 1, emb_size]
        
        center_bias = self.v_bias(center_words).squeeze(1)
        target_bias = self.u_bias(target_words).squeeze(1)
        
        inner_product = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]
        
        #note that coocs already got log
        loss = weighting*torch.pow(inner_product +center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

## Testing

In [212]:
batch_size     = 10 # mini-batch size
embedding_size = 2 #so we can later plot
model_glove    = GloVe(voc_size, embedding_size)
model_glove    = model_glove.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_glove.parameters(), lr=0.001)

In [213]:
num_epochs = 5000
for epoch in range(num_epochs):
    
    start = time.time()
    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, news_corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch) #[batch_size, 1]

    # to cuda
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)
    cooc_batch = cooc_batch.to(device)
    weighting_batch = weighting_batch.to(device)
    
    optimizer.zero_grad()
    loss = model_glove(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)

    if (epoch + 1) % 1000 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")


Epoch: 1000 | cost: 13.994851 | time: 0m 0s
Epoch: 2000 | cost: 9.683171 | time: 0m 0s
Epoch: 3000 | cost: 0.930418 | time: 0m 0s
Epoch: 4000 | cost: 1.735038 | time: 0m 0s
Epoch: 5000 | cost: 2.337914 | time: 0m 0s


In [214]:
# save the model
torch.save(model_glove.state_dict(), 'model/glove_model.pth')

In [215]:
# save the model using pickle
pickle.dump(model_glove, open('model/glove.pkl', 'wb'))

In [216]:
def get_embed_glove(word):
    id_tensor = torch.LongTensor([word2index[word]])
    id_tensor = id_tensor.to(device)
    v_embed = model_glove.embedding_center(id_tensor)
    u_embed = model_glove.embedding_outside(id_tensor) 
    word_embed = (v_embed + u_embed) / 2 
    x, y = word_embed[0][0].item(), word_embed[0][1].item()

    return x, y

In [217]:
government = get_embed_glove('government')
officials = get_embed_glove('officials')
administration = get_embed_glove('administration')

In [218]:
print(f"government vs officials: {cos_sim(government, officials):.4f}")
print(f"government vs administration: {cos_sim(government, administration):.4f}")
print(f"government vs government: {cos_sim(government, government):.4f}")

government vs officials: 0.9994
government vs administration: 0.3368
government vs government: 1.0000


# Glove (Genism)
Source credit: https://nlp.stanford.edu/projects/glove/

In [219]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors

#you have to put this file in some python/gensim directory; just run it and it will inform where to put....
glove_file = datapath('glove.6B.100d.txt')
model_genism = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

In [220]:
# Example: Word similarity
similarity = model_genism.similarity('king', 'queen')
print(f"Similarity between 'king' and 'queen': {similarity:.4f}")

# Example: Word analogy
result = model_genism.most_similar(positive=['king', 'woman'], negative=['man'])
print("King - Man + Woman = ", result[0][0])

Similarity between 'king' and 'queen': 0.7508
King - Man + Woman =  queen


# Task-2

## Evaluating methods

In [221]:
def predict_word(word1, word2, word3, embeddings, word_to_index, index_to_word):
    # Get vectors for w1, w2, w3
    vec1 = np.array(embeddings(word1))
    vec2 = np.array(embeddings(word2))
    vec3 = np.array(embeddings(word3))

    # Calculate the predicted vector
    predicted_vec = vec1 - vec2 + vec3

    # Find the closest word by cosine similarity
    max_similarity = -1
    best_word = None
    for word, index in word_to_index.items():
        if word in [word1, word2, word3]:  # Skip the input words
            continue
        similarity = cos_sim(predicted_vec, embeddings(word))
        if similarity > max_similarity:
            max_similarity = similarity
            best_word = word

    return best_word

In [222]:
# Evaluate accuracy
def evaluate_analogies(analogy_dataset, embeddings, word_to_index):
    correct = 0
    total = 0

    for analogy in analogy_dataset:
        word1, word2, word3, word4 = analogy
        if word1 not in word_to_index or word2 not in word_to_index or word3 not in word_to_index or word4 not in word_to_index:
            continue  # Skip if any word is not in the vocabulary
        predicted_word = predict_word(word1, word2, word3, embeddings, word_to_index, {v: k for k, v in word_to_index.items()})
        if predicted_word == word4:
            correct += 1
        total += 1

    return correct / total if total > 0 else 0

## Dataset
Source credit: https://www.fit.vut.cz/person/imikolov/public/rnnlm/word-test.v1.txt

In [224]:
with open("capital-common-countries.txt", "r") as file:
    lines = file.readlines()

semantic_dataset = []
for line in lines:
    # Split the line into words
    words = line.strip().split()
    if len(words) == 4:
        semantic_dataset.append([words[0], words[1], words[2], words[3]])

In [225]:
with open("past-tense.txt", "r") as file:
    lines = file.readlines()

past_tense_dataset = []
for line in lines:
    words = line.strip().split()
    if len(words) == 4:
        past_tense_dataset.append([words[0], words[1], words[2], words[3]])

In [226]:
# quick check
semantic_dataset[:5]

[['Athens', 'Greece', 'Baghdad', 'Iraq'],
 ['Athens', 'Greece', 'Bangkok', 'Thailand'],
 ['Athens', 'Greece', 'Beijing', 'China'],
 ['Athens', 'Greece', 'Berlin', 'Germany'],
 ['Athens', 'Greece', 'Bern', 'Switzerland']]

In [227]:
past_tense_dataset[:5]

[['dancing', 'danced', 'decreasing', 'decreased'],
 ['dancing', 'danced', 'describing', 'described'],
 ['dancing', 'danced', 'enhancing', 'enhanced'],
 ['dancing', 'danced', 'falling', 'fell'],
 ['dancing', 'danced', 'feeding', 'fed']]

## Syntactic Accuracy

In [228]:
accuracy = evaluate_analogies(past_tense_dataset, get_embed_skip_gram, word2index)
print(f"Syntactic Accuracy - skipgram: {accuracy * 100:.2f}%")

Syntactic Accuracy - skipgram: 0.00%


In [229]:
accuracy = evaluate_analogies(past_tense_dataset, get_embed_neg_sample, word2index)
print(f"Syntactic Accuracy - negative sample: {accuracy * 100:.2f}%")

Syntactic Accuracy - negative sample: 0.00%


In [230]:
accuracy = evaluate_analogies(past_tense_dataset, get_embed_glove, word2index)
print(f"Syntactic Accuracy - glove: {accuracy * 100:.2f}%")

Syntactic Accuracy - glove: 0.00%


In [257]:
accuracy = model_genism.evaluate_word_analogies("past_tense_lines.txt")[0]
print(f"Syntactic Accuracy - gensim: {accuracy * 100:.2f}%")

Syntactic Accuracy - gensim: 55.45%


## Semantic Accuracy 

In [232]:
accuracy = evaluate_analogies(semantic_dataset, get_embed_skip_gram, word2index)
print(f"Semantic Accuracy - skipgram: {accuracy * 100:.2f}%")

Semantic Accuracy - skipgram: 0.00%


In [233]:
accuracy = evaluate_analogies(semantic_dataset, get_embed_neg_sample, word2index)
print(f"Semantic Accuracy - negative sample: {accuracy * 100:.2f}%")

Semantic Accuracy - negative sample: 0.00%


In [234]:
accuracy = evaluate_analogies(semantic_dataset, get_embed_glove, word2index)
print(f"Semantic Accuracy - glove: {accuracy * 100:.2f}%")

Semantic Accuracy - glove: 0.00%


In [258]:
accuracy = model_genism.evaluate_word_analogies("capital.txt")[0]
print(f"Semantic Accuracy - gensim: {accuracy * 100:.2f}%")

Semantic Accuracy - gensim: 93.87%


| Model | Window Size | Training Loss | Training Time (sec) | Syntactic Accuracy (%) | Semantic Accuracy (%) |
|----------|----------|----------|----------|----------|----------|
| Skipgram    | 2     | 7.60     | 310     | 0.0     | 0.0   |
| Skipgram (Neg)   | 2     | 1.89     | 171     | 0.0     | 0.0     |
| Glove    | 2     | 2.33    | 34     | 0.0    | 0.0     |
| Glove (Genism)    | -     | -     | -     | 55.45     | 93.87     |

## Similarity Test
Source credit: http://alfonseca.org/eng/research/wordsim353.html

In [237]:
import pandas as pd

columns = ['Word 1', 'Word 2', 'Similarity Index']

df = pd.read_csv('wordsim_relatedness_goldstandard.txt', sep='\t', header=None, names=columns)

df

Unnamed: 0,Word 1,Word 2,Similarity Index
0,computer,keyboard,7.62
1,Jerusalem,Israel,8.46
2,planet,galaxy,8.11
3,canyon,landscape,7.53
4,OPEC,country,5.63
...,...,...,...
247,rooster,voyage,0.62
248,noon,string,0.54
249,chord,smile,0.54
250,professor,cucumber,0.31


In [238]:
for index, row in df.iterrows():
    word_1 = row['Word 1']
    word_2 = row['Word 2']

    try:
        embed_1_neg_samp    = get_embed_neg_sample(word_1)
        embed_2_neg_samp    = get_embed_neg_sample(word_2)
        embed_1_skip_gram   = get_embed_skip_gram(word_1)
        embed_2_skip_gram   = get_embed_skip_gram(word_2)
        embed_1_glove       = get_embed_glove(word_1)
        embed_2_glove       = get_embed_glove(word_2)

    except KeyError:
        # Replacing missing embeddings with the embedding of '<UNK>'
        embed_1_neg_samp    = get_embed_neg_sample('<UNK>')
        embed_2_neg_samp    = get_embed_neg_sample('<UNK>')
        embed_1_skip_gram   = get_embed_skip_gram('<UNK>')
        embed_2_skip_gram   = get_embed_skip_gram('<UNK>')
        embed_1_glove       = get_embed_glove('<UNK>')
        embed_2_glove       = get_embed_glove('<UNK>')

    # Computing dot product
    df.at[index, 'dot_product_neg_samp'] = np.dot(embed_1_neg_samp, embed_2_neg_samp)
    df.at[index, 'dot_product_skip_gram'] = np.dot(embed_1_skip_gram, embed_2_skip_gram)
    df.at[index, 'dot_product_glove'] = np.dot(embed_1_glove, embed_2_glove)

In [239]:
df.head(10)

Unnamed: 0,Word 1,Word 2,Similarity Index,dot_product_neg_samp,dot_product_skip_gram,dot_product_glove
0,computer,keyboard,7.62,0.942009,3.773402,0.143185
1,Jerusalem,Israel,8.46,0.942009,3.773402,0.143185
2,planet,galaxy,8.11,0.942009,3.773402,0.143185
3,canyon,landscape,7.53,0.942009,3.773402,0.143185
4,OPEC,country,5.63,0.942009,3.773402,0.143185
5,day,summer,3.94,0.123261,0.049107,-0.134245
6,day,dawn,7.53,0.942009,3.773402,0.143185
7,country,citizen,7.31,0.942009,3.773402,0.143185
8,planet,people,5.75,0.942009,3.773402,0.143185
9,environment,ecology,8.81,0.942009,3.773402,0.143185


In [240]:
from scipy.stats import spearmanr

# Computing the Spearman correlation
correlation_pos, _ = spearmanr(df['Similarity Index'], df['dot_product_skip_gram'])
correlation_neg, _ = spearmanr(df['Similarity Index'], df['dot_product_neg_samp'])
correlation_glove, _ = spearmanr(df['Similarity Index'], df['dot_product_glove'])

In [241]:
print(f"Spearman Correlation Coefficient of Skipgram: {correlation_pos:.4f}")
print(f"Spearman Correlation Coefficient of Skipgram with Negative Sampling: {correlation_neg:.4f}")
print(f"Spearman Correlation Coefficient of Glove: {correlation_glove:.4f}")

Spearman Correlation Coefficient of Skipgram: 0.0110
Spearman Correlation Coefficient of Skipgram with Negative Sampling: 0.0385
Spearman Correlation Coefficient of Glove: -0.0356


In [242]:
# Finding y_true based on the mean of similarity index in the df
y_true = df['Similarity Index'].mean()

print(f"y_true: {y_true:.2f}")

y_true: 5.29


In [243]:
correlation_coefficient = model_genism.evaluate_word_pairs('wordsim_relatedness_goldstandard.txt')
print(f"Spearman Correlation Correlation coefficient of Glove (genism): {correlation_coefficient[1][0]:.2f}")

Spearman Correlation Correlation coefficient of Glove (genism): 0.50


| Model | Skipgram | NEG | GloVe | GloVe (genism) | Y_true |
|----------|----------|----------|----------|----------|----------|
| MSE    | 0.011     | 0.0385     | -0.03     | 0.5     | 5.29   |

In [259]:
def get_embed_for_corpus(model, words):
    embeddings = {}

    for word in words:
        try:
            index = word2index[word]
        except KeyError:
            index = word2index['<UNK>']

        word_tensor = torch.LongTensor([index])
        word_tensor = word_tensor.to(device)

        embed_c = model.embedding_center(word_tensor)
        embed_o = model.embedding_outside(word_tensor)
        embed = (embed_c + embed_o) / 2

        # return as dictionary with key as the word and value as the array of its embedding
        embeddings[word] = np.array([embed[0][0].item(), embed[0][1].item()])

    return embeddings

In [260]:
embed_whole_glove = get_embed_for_corpus(model_glove, vocab)
embed_whole_neg_skg = get_embed_for_corpus(model_neg, vocab)
embed_whole_skg = get_embed_for_corpus(model_glove, vocab)

In [264]:
with open('model/model_gensim.pkl', 'wb') as model_file:
    pickle.dump(model_genism, model_file)

In [261]:
with open('model/embed_skipgram_negative.pkl', 'wb') as pickle_file:
    pickle.dump(embed_whole_neg_skg, pickle_file)

print(f"File saved.")

File saved.


In [262]:
with open('model/embed_skipgram.pkl', 'wb') as pickle_file:
    pickle.dump(embed_whole_skg, pickle_file)

print(f"File saved.")

File saved.


In [263]:
with open('model/embed_glove.pkl', 'wb') as pickle_file:
    pickle.dump(embed_whole_glove, pickle_file)

print(f"File saved.")

File saved.


# Observation

As observed, for window size of 2 - skipgram had the highest (7.60) training loss while Negative skipgram had the lowest (1.89). Glove also performed much better compared to skipgram with loss of 2.33.

In case of Training time each model were trained for 5000 epoch. Skipgram took the longest time with 310s and Glove took the least amount of time with 34s. Negative sampling took 171s. 

All three models coded from scratch performed bad compared to Genism. This is because of the small corpus size and window size. All 3 models (Skipgram, Skipgram with negative sampling, Glove) had syantactic and semantic accuracy of 0%. This was expected because of the limitations of our corpus. Glove (Genism) on the other achieved Syntactic accuracy of 55.45% 
and Semantic Accuracy of 93.87%.

Furthermore, for Spearman Correlation Coefficient - Genism outperforms other models with correlation score of `0.5`. The other 3 models showed poor correlation which suggests that predicted rankings do not closely match with ground truth. So our embeddings has poor correlation with human judgement.

In conclusion, given the small corpus size, window size and embedding dimension - our 'made from scratch' models performed poorly. Given better hyperparameter tunings like embeddig dimensions, learning rate and even number of epochs, the models can be refined.