In [None]:
import sys
import time

import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F


In [None]:
def get_default_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)
# Resolve the compute device once; later cells read this module-level `device`.
device = get_default_device()
print(device)


def to_device(data, device):
    """Transfer a tensor, or every tensor inside a list/tuple, to ``device``.

    Lists and tuples are walked recursively and always come back as plain
    lists. Anything else is moved with a non-blocking ``.to`` call.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)

    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

In [None]:
import re

# Matches every character that is not an ASCII letter; used to strip
# punctuation and digits from tokens.
al_regex = re.compile(r"[^a-zA-Z]")


class MBCorpus:
    """A re-iterable corpus that yields one sentence (list of str) per line.

    Each line of the corpus file is split on whitespace, every token is
    stripped of non-letter characters, and tokens that end up empty (pure
    punctuation or digits) are dropped so that no empty-string "word" reaches
    the vocabulary. A line with no remaining tokens yields an empty list.

    Args:
        corpus_path: Path of the text file to read, one sentence per line.
        encoding: Text encoding used to open the file.
    """

    DEFAULT_CORPUS_PATH = "data/mahabharat_gutenberg_lemmatized_sents.txt"

    def __init__(self, corpus_path=DEFAULT_CORPUS_PATH, encoding="utf-8"):
        self.corpus_path = corpus_path
        self.encoding = encoding

    def __iter__(self):
        with open(self.corpus_path, encoding=self.encoding) as fp:
            # Iterate the file object directly so the corpus is streamed
            # line by line instead of being loaded into memory at once.
            for line in fp:
                cleaned = (al_regex.sub('', token) for token in line.split())
                yield [token for token in cleaned if token]



In [None]:

mb_sents = MBCorpus()

# Collect the unique tokens in a set: membership checks on a list would make
# vocabulary construction quadratic in the vocabulary size.
vocabulary = sorted({token for sentence in mb_sents for token in sentence})
word2idx = {w: idx for idx, w in enumerate(vocabulary)}
idx2word = dict(enumerate(vocabulary))

vocabulary_size = len(vocabulary)
print(f"vocabulary_size:{vocabulary_size}")

# Encode every sentence as a 1-D LongTensor of vocabulary indices.
# The tensors go to the notebook-wide `device` (not a hard-coded .cuda()),
# so this cell also runs on machines without a GPU.
s_time = time.time()
VECTORIZED_SENTENCES = [
    to_device(torch.tensor([word2idx[word] for word in sentence], dtype=torch.long), device)
    for sentence in mb_sents
]
print(f"total sentences: {len(VECTORIZED_SENTENCES)}")
print(f"loading of sentence tensor took: {time.time()-s_time} secs")

In [None]:
import time

def timer_func(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapper forwards all positional and keyword arguments, returns the
    wrapped function's result unchanged, and keeps its name and docstring
    (via ``functools.wraps``). Nothing is printed if ``func`` raises.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same call signature as ``func``.
    """
    import functools

    @functools.wraps(func)
    def wrap_func(*args, **kwargs):
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments the way time.time() can.
        t1 = time.perf_counter()
        result = func(*args, **kwargs)
        t2 = time.perf_counter()
        print(f'Function {func.__name__!r} executed in {(t2-t1):.4f}s')
        return result
    return wrap_func
  
# @timer_func
def get_input_layer(word_idx):
    """Build the one-hot input vector for a vocabulary index.

    Args:
        word_idx: Index of the word in the vocabulary (anything usable as a
            tensor index).

    Returns:
        A float32 tensor of length ``vocabulary_size`` on the global
        ``device``, all zeros except for a 1.0 at ``word_idx``.
    """
    # Allocate directly on the target device instead of building the vector
    # on the CPU and copying it over on every call.
    x = torch.zeros(vocabulary_size, dtype=torch.float32, device=device)
    x[word_idx] = 1.0
    return x
        
    
# Smoke test: build the one-hot vector for index 10000
# (indexing fails unless vocabulary_size > 10000).
get_input_layer(10000)


# skipgram

In [None]:
import time
import sys
embedding_dims = 100
# Target-word matrix: column j is the embedding of vocabulary index j.
W1 = torch.randn(embedding_dims, vocabulary_size, device=device, requires_grad=True)

# Context-word matrix: row j is the context vector of vocabulary index j.
W2 = torch.randn(vocabulary_size, embedding_dims, device=device, requires_grad=True)

num_epochs = 101
learning_rate = 0.01
window_size = 2


for epo in range(num_epochs):
    epoch_start = time.time()
    loss_val = 0

    shuffle_start = time.time()
    np.random.shuffle(VECTORIZED_SENTENCES)
    print(f"shuffling of sentence tensor took: {time.time()-shuffle_start} secs")

    for sent_idx, sentence in enumerate(VECTORIZED_SENTENCES):

        if sent_idx != 0 and sent_idx % 100000 == 0:
            print(f"processing {sent_idx}th sentence")

        # The context window must be built from the word's POSITION in the
        # sentence. The vocabulary id (target_idx) only selects the embedding.
        for target_position, target_idx in enumerate(sentence):
            window_start = max(0, target_position - window_size)
            window_end = min(target_position + window_size + 1, len(sentence))
            context_indices = [
                sentence[pos]
                for pos in range(window_start, window_end)
                if pos != target_position
            ]

            if not context_indices:
                continue

            # The one-hot input depends only on the target word, so build it
            # once per target instead of once per (target, context) pair.
            x = get_input_layer(target_idx)

            for context_idx in context_indices:
                # nll_loss expects a 1-D tensor of class indices.
                y_true = to_device(context_idx.reshape(1), device)

                # Forward pass: select the target embedding through the
                # one-hot vector, then score it against every context vector.
                z1 = torch.matmul(W1, x)
                z2_scores = torch.matmul(W2, z1)
                log_softmax = F.log_softmax(z2_scores, dim=0)

                # Loss calculation.
                loss = F.nll_loss(log_softmax.view(1, -1), y_true)
                loss_val += loss.item()
                loss.backward()

                # Plain SGD step. The update itself must not be tracked by
                # autograd, and gradients are cleared for the next pair.
                with torch.no_grad():
                    W1 -= learning_rate * W1.grad
                    W2 -= learning_rate * W2.grad
                    W1.grad.zero_()
                    W2.grad.zero_()
    print(f'Loss at epo {epo}: {loss_val}')
    print(f"epoch#{epo} took {time.time()-epoch_start} secs")

# save weights
torch.save(W1, "W1_sg_simple.pt")
torch.save(W2, "W2_sg_simple.pt")

# skipgram with negative sampling

In [None]:
# Create a list of words for negative sampling
from collections import Counter
s_time = time.time()
# Count vocabulary ids as plain Python ints. Iterating a tensor yields 0-dim
# tensors, which hash by identity, so counting them directly gives every
# occurrence its own key and no word would ever be counted more than once.
word_counts = Counter(
    token_id
    for sentence in VECTORIZED_SENTENCES
    for token_id in sentence.tolist()
)
# Look counts up by vocabulary id (the Counter's key type), in vocabulary
# order, so that word_freqs[i] is the frequency of idx2word[i].
word_freqs = np.array([word_counts[idx] for idx in word2idx.values()], dtype=np.float32)
word_probs = word_freqs / word_freqs.sum()
neg_sampling_candidates = list(word2idx.values())
print(neg_sampling_candidates[:5])
print(f"negative sampling candidates generation took: {time.time() - s_time} secs")


In [None]:
# skipgram with negative sampling

import time
import sys
import random
embedding_dims = 100
num_epochs = 101
learning_rate = 0.01
window_size = 2
negative_samples = 4

loss_fn = torch.nn.BCEWithLogitsLoss()


# Target-word matrix: column j is the embedding of vocabulary index j.
W1 = torch.randn(embedding_dims, vocabulary_size, device=device, requires_grad=True)
# Context-word matrix: row j is the context vector of vocabulary index j.
W2 = torch.randn(vocabulary_size, embedding_dims, device=device, requires_grad=True)

# Desired outputs: 1 for the true context word followed by one 0 per negative
# sample. They never change, so build them once instead of once per pair.
y_true = to_device(torch.cat((torch.ones(1), torch.zeros(negative_samples))), device)


for epo in range(num_epochs):
    epoch_start = time.time()
    # Reset per epoch so the value printed below is this epoch's loss rather
    # than a running total over all epochs so far.
    loss_val = 0

    shuffle_start = time.time()
    np.random.shuffle(VECTORIZED_SENTENCES)
    print(f"shuffling of sentence tensor took: {time.time()-shuffle_start} secs")

    for sent_idx, sentence in enumerate(VECTORIZED_SENTENCES):

        if sent_idx != 0 and sent_idx % 100000 == 0:
            print(f"processing {sent_idx}th sentence")

        for target_position, target_idx in enumerate(sentence):
            window_start = max(0, target_position - window_size)
            window_end = min(target_position + window_size + 1, len(sentence))
            context_indices = [
                sentence[pos]
                for pos in range(window_start, window_end)
                if pos != target_position
            ]
            if not context_indices:
                continue

            # The one-hot input depends only on the target word.
            x = get_input_layer(target_idx)

            for context_idx in context_indices:
                # Target embedding. Recomputed per pair because W1 is updated
                # after every pair.
                target_embedding = torch.matmul(W1, x)

                # Score of the true (target, context) pair.
                p_context_embedding = W2[context_idx]
                positive_score = torch.sum(target_embedding * p_context_embedding).unsqueeze(0)

                # Draw the negative context words. No weights are passed, so
                # they are sampled uniformly from the vocabulary.
                negative_samples_idx = torch.tensor(
                    random.choices(neg_sampling_candidates, k=negative_samples),
                    device=device,
                )
                negative_context_vectors = W2[negative_samples_idx]

                # One score per negative sample.
                negative_sample_scores = torch.matmul(negative_context_vectors, target_embedding)

                # Positive score first, to line up with y_true.
                observed_scores = torch.cat((positive_score, negative_sample_scores))

                # Loss calculation.
                loss = loss_fn(observed_scores, y_true)
                loss_val += loss.item()
                loss.backward()

                # Plain SGD step. The update itself must not be tracked by
                # autograd, and gradients are cleared for the next pair.
                with torch.no_grad():
                    W1 -= learning_rate * W1.grad
                    W2 -= learning_rate * W2.grad
                    W1.grad.zero_()
                    W2.grad.zero_()
    print(f'Loss at epo {epo}: {loss_val}')
    print(f"epoch#{epo} took {time.time()-epoch_start} secs")

# save weights
torch.save(W1, "W1_sg_ns.pt")
torch.save(W2, "W2_sg_ns.pt")

# cbow with negative sampling

In [None]:
# cbow with negative sampling

import time
import sys
import random
embedding_dims = 100
num_epochs = 5
learning_rate = 0.025
window_size = 5
negative_samples = 5


# Target-word matrix: column j is the embedding of vocabulary index j.
W1 = torch.randn(embedding_dims, vocabulary_size, device=device, requires_grad=True)
# Context-word matrix: row j is the context vector of vocabulary index j.
W2 = torch.randn(vocabulary_size, embedding_dims, device=device, requires_grad=True)

loss_fn = torch.nn.BCEWithLogitsLoss()

optimizer = torch.optim.SGD([W1, W2], lr=learning_rate)

# Desired outputs: 1 for the averaged true context, 0 for the averaged
# negative samples. Constant, so built once.
y_true = torch.tensor([1.0, 0.0], device=device)


for epo in range(num_epochs):
    epoch_start = time.time()
    # Reset per epoch so the value printed below is this epoch's loss.
    loss_val = 0

    np.random.shuffle(VECTORIZED_SENTENCES)

    # Separate timer for the periodic progress report, so resetting it does
    # not corrupt the epoch duration printed at the end.
    chunk_start = time.time()
    for sent_idx, sentence in enumerate(VECTORIZED_SENTENCES):

        if sent_idx % 1000 == 0:
            print(f"processed {sent_idx}th sentence")

        for target_position, target_idx in enumerate(sentence):
            window_start = max(0, target_position - window_size)
            window_end = min(target_position + window_size + 1, len(sentence))
            # Vocabulary ids of the words around the target, as one 1-D index
            # tensor. An explicit tensor index avoids relying on nested-list
            # indexing, whose interpretation is deprecated in PyTorch.
            context_indices = torch.cat(
                (sentence[window_start:target_position], sentence[target_position + 1:window_end])
            )
            if context_indices.numel() == 0:
                continue

            # Target embedding, selected through the one-hot vector.
            x = get_input_layer(target_idx)
            target_embedding = torch.matmul(W1, x)

            # Average of the context word vectors: (k, dims) -> (dims,).
            context_embeds = W2[context_indices]
            context_mean = torch.mean(context_embeds, dim=0)

            # Positive score.
            positive_score = torch.sum(target_embedding * context_mean).unsqueeze(0)

            # Draw the negative context words. No weights are passed, so they
            # are sampled uniformly from the vocabulary.
            negative_samples_idx = torch.tensor(
                random.choices(neg_sampling_candidates, k=negative_samples),
                device=device,
            )

            # Negative score from the average of the negative context vectors.
            negative_context_vectors = W2[negative_samples_idx]
            negative_context_mean = torch.mean(negative_context_vectors, dim=0)
            negative_score = torch.sum(target_embedding * negative_context_mean).unsqueeze(0)

            # Positive score first, to line up with y_true.
            observed_scores = torch.cat((positive_score, negative_score))

            # Loss calculation.
            loss = loss_fn(observed_scores, y_true)
            loss_val += loss.item()

            # Clear the gradients of the previous step before backpropagating.
            # Without this every step would apply the sum of all gradients
            # seen so far.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if sent_idx % 1000 == 0:
            print(f"sentence#{sent_idx} took: {time.time()- chunk_start} secs")
            chunk_start = time.time()

    print(f'Loss at epo {epo}: {loss_val}')
    print(f"epoch#{epo} took {time.time()-epoch_start} secs")


In [None]:

# W1 = torch.load("W1.pt")

def similarity(w1, w2):
    """Cosine similarity between the W1 embeddings of two vocabulary words.

    Both words are looked up in ``word2idx`` (a missing word raises
    ``KeyError``), turned into one-hot vectors, and projected through the
    global ``W1`` matrix to obtain their embeddings.
    """
    one_hots = [get_input_layer(word2idx[word]) for word in (w1, w2)]
    first, second = (torch.matmul(W1, one_hot) for one_hot in one_hots)

    norm_product = torch.norm(first) * torch.norm(second)
    return torch.dot(first, second) / norm_product


In [None]:
# Pairwise cosine similarity between all word embeddings (columns of W1),
# matching the measure used by `similarity`. W1 is detached first because it
# requires grad; otherwise this vocabulary_size x vocabulary_size result would
# carry an autograd graph that is never used.
_unit_embeddings = torch.nn.functional.normalize(W1.detach(), dim=0)
similarity_matrix = torch.matmul(_unit_embeddings.T, _unit_embeddings)
# print(idx2word)

def get_top_similar_words(word, similarity_matrix, top_n=10):
    """Return the ``top_n`` words most similar to ``word``, best match first.

    Args:
        word: Query word; must be a key of ``word2idx`` (``KeyError`` if not).
        similarity_matrix: Square tensor whose row ``word2idx[word]`` holds
            the similarity of ``word`` to every vocabulary index.
        top_n: Maximum number of words to return.

    Returns:
        A list of at most ``top_n`` words in descending similarity order.
        The query word itself is never included. A non-positive ``top_n``
        yields an empty list.
    """
    word_index = word2idx[word]
    word_similarity = similarity_matrix[word_index]
    if top_n <= 0:
        return []

    # Fetch one extra candidate so that dropping the query word still leaves
    # top_n results. Clamp to the vocabulary size for small vocabularies.
    k = min(top_n + 1, word_similarity.numel())
    ranked = torch.topk(word_similarity, k).indices.tolist()
    return [idx2word[idx] for idx in ranked if idx != word_index][:top_n]


In [None]:
# Example query: list the words ranked most similar to "Draupadi".
get_top_similar_words("Draupadi", similarity_matrix)

In [None]:
# questions
# 1. Why are two matrices required: one for target words and another for context words?