In [None]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F

In [None]:
def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')
device = get_default_device()
print(device)


def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

In [13]:
import re

al_regex = re.compile(r"[^a-zA-Z]")


class MBCorpus:
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        corpus_path = "data/mahabharat_gutenberg_lemmatized_sents.txt"
        with open(corpus_path) as fp:
            for line in fp.readlines():
                tokens = line.split()
                tokens = [al_regex.sub('', token) for token in tokens]
                yield tokens



In [None]:

mb_sents = MBCorpus()
vocabulary = []
for sentence in mb_sents:
    for token in sentence:
        if token not in vocabulary:
            vocabulary.append(token)

word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

vocabulary_size = len(vocabulary)
print(f"vocabulary_size:{vocabulary_size}")

In [None]:
def get_input_layer(word_idx):
    x = torch.zeros(vocabulary_size).type(torch.cuda.FloatTensor)
    x[word_idx] = 1.0
    
    return to_device(x, device)

In [None]:
# Example of target with class indices
input = torch.randn(3, 5, requires_grad=True)
target = torch.randint(5, (3,), dtype=torch.int64)
loss = F.cross_entropy(input, target)
loss.backward()
# Example of target with class probabilities
input = torch.randn(3, 5, requires_grad=True)
print(input)
print(target)
target = torch.randn(3, 5).softmax(dim=1)
loss = F.cross_entropy(input, target)
loss.backward()

In [None]:
import time
embedding_dims = 100
W1 = Variable(torch.randn(embedding_dims, vocabulary_size).type(torch.cuda.FloatTensor), requires_grad=True)
# to_device(W1, device)
W2 = Variable(torch.randn(vocabulary_size, embedding_dims).type(torch.cuda.FloatTensor), requires_grad=True)
# to_device(W2, device)
num_epochs = 101
learning_rate = 0.1
window_size = 2

for epo in range(num_epochs):
    s_time = time.time()
    loss_val = 0
    # for each sentence
    for sent_idx, sentence in enumerate(mb_sents):
        if sent_idx!=0 and sent_idx%100000==0:            
            print(f"processing {sent_idx}th sentence")
            # break
            
        for idx, target_word in enumerate(sentence):
            target_idx = word2idx[target_word]
            context = [sentence[idx] for idx in range(max(0, target_idx - window_size), min(target_idx + window_size + 1, len(sentence))) if idx != target_idx]
                # print(f"target_word:{target_word}, context:{context}")            
            if not context:
                continue
            context_indices = [word2idx[word] for word in context]
            
            for context_idx in context_indices:                
                x = Variable(get_input_layer(target_idx))
                # y_true = Variable(torch.from_numpy(np.array([context_idx])).long())
                y_true = Variable(get_input_layer(context_idx))
                # print(y_true)
                # print(f"W1*x: {W1.shape}*{x.shape}")
                z1 = torch.matmul(W1, x)
                # print(f"W2*z1: {W2.shape}*{z1.shape}")
                z2 = torch.matmul(W2, z1)
                # make z2 look like one-hot via softmax and then setting max probability to 1
                # z2_soft_max = F.softmax(z2, dim=0)
                # z2_one_hot = torch.zeros_like(z2_soft_max).type(torch.cuda.FloatTensor)
                # z2_one_hot[torch.argmax(z2_soft_max)] = 1
                
                
                # calculate softmax of z2
                sf_z2 = F.softmax(z2, dim=0)
                # print(f"sf_z2.shape: {sf_z2.shape}")
                # print(f"y_true.shape: {y_true.shape}")
                # apply cross entropy loss
                # print()
                loss = F.cross_entropy(sf_z2, y_true) #
                
                
                # print(f"z2: {z2}")
                # print(f"z2.shape: {z2.shape}")
                # print(f"z2: {z2}")
                # log_softmax = F.log_softmax(z2, dim=0)
                # print(f"log_softmax: {log_softmax}")
                # loss = F.nll_loss(log_softmax.view(1,-1), y_true)
                loss_val += loss.item()
                loss.backward()
                W1.data -= learning_rate * W1.grad.data
                W2.data -= learning_rate * W2.grad.data

                W1.grad.data.zero_()
                W2.grad.data.zero_()
    print(f'Loss at epo {epo}: {loss_val}')
    print(f"epoch#{epo} took {time.time()-s_time} secs")
        # get corresponding context embeddings
        # for each target, context pair train
    

In [None]:
# save weights 
torch.save(W1, "W1.pt")
torch.save(W2, "W2.pt")

In [10]:

W1 = torch.load("W1.pt")

def similarity(w1, w2):
    # find one hot vectors of w1 and w2
    w1_one_hot = get_input_layer(word2idx[w1])
    w2_one_hot = get_input_layer(word2idx[w2])
    # get actual embeddings of words by multiplying one_hot with weight matrix
    w1_embedding = torch.matmul(W1, w1_one_hot)
    w2_embedding = torch.matmul(W1, w2_one_hot)
    # find similarity between embeddings
    return torch.dot(w1_embedding, w2_embedding) / (torch.norm(w1_embedding) * torch.norm(w2_embedding))


In [11]:
similarity_matrix = torch.matmul(W1.T, W1)
# print(idx2word)

def get_top_similar_words(word, similarity_matrix, top_n=10):
    word_index = word2idx[word]
    word_similarity = similarity_matrix[word_index]
    top_n_similar_words = torch.argsort(word_similarity)[::][-top_n+1:]    
    # omit the word itself
    # return [vectorizer.get_feature_names_out()[i] for i in top_n_similar_words[1:]]
    # print(top_n_similar_words)
    return [idx2word[idx.item()] for idx in top_n_similar_words]


In [12]:
get_top_similar_words("Draupadi", similarity_matrix)

['Tryambaka',
 'tribulation',
 'perpetuator',
 'boat',
 'lucky',
 'tamas',
 'echo',
 'Dhirga',
 'Draupadi']

In [None]:
# questions
# 1. What if I replace softmaxed probabilities with 1-0 vector
# a. Is it possible?
# b. How is learning effected?