In [None]:
import torch


In [None]:
%load_ext line_profiler

In [None]:
def get_default_device():
    """Return the CUDA device when CUDA is usable, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)


# Resolve the device once; later cells read this module-level name.
device = get_default_device()
print(device)


def to_device(data, device):
    """Recursively move a tensor, or a (nested) list/tuple of tensors, to `device`.

    Lists and tuples are both returned as lists; anything else is expected to
    expose a `.to(device, non_blocking=...)` method.
    """
    if not isinstance(data, (list, tuple)):
        # Leaf value: hand it over to the object's own transfer method.
        return data.to(device, non_blocking=True)
    return list(map(lambda item: to_device(item, device), data))

In [None]:
import re

# Matches every character that is not an ASCII letter; substituting it away
# strips digits, punctuation and any other symbols from a raw token.
al_regex = re.compile(r"[^a-zA-Z]")


class MBCorpus:
    """Re-iterable corpus that yields one sentence (list of str) per file line.

    Each line is split on whitespace and every token is reduced to its ASCII
    letters. Tokens that end up empty (pure digits/punctuation) are dropped so
    the empty string never becomes a vocabulary entry. A line without any
    usable token still yields an empty list, so the number of yielded
    sentences equals the number of lines in the file.

    Args:
        corpus_path: path of the text file to read, one sentence per line.
        encoding: text encoding used to open the file.
    """

    def __init__(self, corpus_path="data/mahabharat_gutenberg_lemmatized_sents.txt", encoding="utf-8"):
        self.corpus_path = corpus_path
        self.encoding = encoding

    def __iter__(self):
        # The file is reopened on every iteration, so the corpus can be
        # traversed repeatedly (vocabulary pass, then one pass per epoch).
        with open(self.corpus_path, encoding=self.encoding) as fp:
            # Stream line by line instead of loading the whole corpus at once.
            for line in fp:
                cleaned = (al_regex.sub('', token) for token in line.split())
                yield [token for token in cleaned if token]



In [None]:
# vocabulary management
#1. prepare word-idx dictionary, reverse dictionary
#2. probability table for negative sampling

from collections import Counter, defaultdict

mb_sents = MBCorpus()

# Tally how often every token occurs across the whole corpus.
word_freq = defaultdict(int)
for sentence in mb_sents:
    for token in sentence:
        word_freq[token] += 1

# Sorting the distinct tokens gives every word a stable integer id.
vocabulary = sorted(word_freq)
idx2word = dict(enumerate(vocabulary))
word2idx = {word: idx for idx, word in idx2word.items()}
vocabulary_size = len(vocabulary)

# Unigram distribution: relative frequency of each word, ordered by word id.
total_count = sum(word_freq.values())
word_probabilities = [word_freq[word] / total_count for word in vocabulary]
print(f"vocabulary_size:{vocabulary_size}")

# TODO 0: 
"""
calculate TF_IDF matrix by looping from the corpus here. This matrix will be the lookup matric that takes you from 
word index to input vector
"""

In [None]:
class SkipGramBatchModel(torch.nn.Module):
    """Skip-gram scorer: center word as input, context words as targets.

    Two embedding tables are learned, one for center (target) words and one
    for context words. The forward pass returns the dot product between each
    center embedding and each of its candidate context embeddings; training
    is expected to push the score of the true context up.
    """

    def __init__(self, device, vocabulary_size, embedding_dim, neg_num=5, word_count=None):
        """
        Args:
            device: stored on the instance only; the module is not moved here.
            vocabulary_size: number of rows in each embedding table.
            embedding_dim: width of every embedding vector.
            neg_num: number of negative samples per positive pair (stored only).
            word_count: currently unused; kept for the disabled sample-table code.
        """
        super().__init__()
        self.device = device
        self.neg_num = neg_num
        # TODO1:
        # The current setup uses an Embedding layer, which takes word indices as
        # input and returns the matching rows of the embedding matrix. This works
        # well when you start from one-hot vectors (see the first line of
        # `forward`, which maps centers to their word vectors).
        #
        # The requirement is to start from TF-IDF vectors of the words. Those are
        # no longer one-hot, so an Embedding layer won't work. Replace
        # target_embeddings (Embedding layer) with a Linear layer of the same
        # dimensions.
        # Refer: https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
        self.target_embeddings = torch.nn.Embedding(vocabulary_size, embedding_dim)
        self.context_embeddings = torch.nn.Embedding(vocabulary_size, embedding_dim)
        # Small symmetric uniform init, scaled down as the embedding gets wider.
        initrange = 0.5 / embedding_dim
        torch.nn.init.uniform_(self.target_embeddings.weight, -initrange, initrange)
        torch.nn.init.uniform_(self.context_embeddings.weight, -initrange, initrange)
        # if self.neg_num > 0:
        #     self.table = create_sample_table(word_count)

    def forward(self, centers, contexts):
        """Score every candidate context word against its center word.

        Args:
            centers: integer index tensor, expected shape (batch, 1).
            contexts: integer index tensor, expected shape (batch, n_contexts).

        Returns:
            Float tensor of shape (batch, n_contexts) holding raw dot-product
            scores (logits).
        """
        # TODO2:
        # Instead of passing just the target indices, pass the actual TF-IDF
        # vectors here. The forward pass will then look like
        #     u_embed = self.linear(batch of TF-IDF input_vectors)
        u_embeds = self.target_embeddings(centers)
        v_embeds = self.context_embeddings(contexts)
        # Transpose turns the context row vectors into columns so bmm yields
        # (batch, 1, n_contexts). Only the singleton center axis is squeezed:
        # a bare squeeze() would also drop the batch axis for a batch of one
        # (or the context axis for a single context) and break the loss shapes.
        scores = torch.bmm(u_embeds, v_embeds.transpose(1, 2)).squeeze(1)
        return scores

    def get_embeddings(self):
        """Return the center-word embedding matrix, detached from autograd."""
        return self.target_embeddings.weight.detach()

In [None]:
# training
from tqdm import tqdm
# --- model / optimisation hyper-parameters ---
embedding_dim = 100
learning_rate = 0.025
num_epochs = 5

# --- sampling hyper-parameters ---
window_size = 5
neg_num = 5
batch_size = 128
word_count = None

# Device-resident helpers: one index per vocabulary entry, and the unigram
# distribution that negative samples are drawn from.
vocabulary_tensor = torch.arange(len(vocabulary), device=device)
word_probabilities_tensor = torch.tensor(word_probabilities, dtype=torch.float32, device=device)



def generate_batch(data, batch_size=128, neg_num=4, total_sentences=305796):
    """
    Yield skip-gram training batches with negative sampling.

    Every (center, context) pair inside the module-level `window_size` window
    contributes one row: the center index, followed in the contexts row by the
    true context index and `neg_num` indices drawn (with replacement) from
    `word_probabilities_tensor`.

    Args:
        data: iterable of sentences, each a list of words present in `word2idx`.
        batch_size: number of (center, context) pairs per yielded batch.
        neg_num: number of negative samples attached to each positive pair.
        total_sentences: expected sentence count, used only as the progress-bar
            total (defaults to the value previously hard-coded here).

    Yields:
        (centers, contexts): LongTensors on `device` with shapes
        (batch_size, 1) and (batch_size, 1 + neg_num).

    Note:
        Only full batches are yielded; pairs left over at the end of `data`
        (fewer than `batch_size`) are discarded.
    """
    with tqdm(total=total_sentences, position=0, leave=True) as pbar:
        centers, contexts = [], []
        for sent_idx, sentence in enumerate(data):
            sentence = [word2idx[word] for word in sentence]

            if sent_idx != 0 and sent_idx % 100000 == 0:
                print(f"processing {sent_idx}th sentence")

            for target_position, target_idx in enumerate(sentence):
                window_start = max(0, target_position - window_size)
                window_end = min(target_position + window_size + 1, len(sentence))
                context_indices = [
                    sentence[idx]
                    for idx in range(window_start, window_end)
                    if idx != target_position
                ]
                if not context_indices:
                    continue

                # Draw the negatives for all contexts of this center word in a
                # single call: one sampling op and one device-to-host copy per
                # center instead of one per (center, context) pair.
                if neg_num > 0:
                    negatives = torch.multinomial(
                        word_probabilities_tensor,
                        len(context_indices) * neg_num,
                        replacement=True,
                    ).reshape(len(context_indices), neg_num).cpu().tolist()
                else:
                    # No negatives requested: rows hold the positive context only.
                    negatives = [[] for _ in context_indices]

                for context_idx, n_contexts in zip(context_indices, negatives):
                    centers.append([target_idx])
                    # Column 0 is always the positive context.
                    contexts.append([context_idx] + n_contexts)

                    if len(centers) == batch_size:
                        yield torch.LongTensor(centers).to(device), torch.LongTensor(contexts).to(device)
                        centers, contexts = [], []
            pbar.update(1)


def train_batch(clip=1.0, batch_size=128, neg_num=4):
    """Train a SkipGramBatchModel on `mb_sents` and return its center embeddings.

    Args:
        clip: maximum gradient norm passed to `clip_grad_norm_`.
        batch_size: number of (center, context) pairs per optimisation step.
        neg_num: number of negative samples per positive pair.

    Returns:
        The tensor returned by `SkipGramBatchModel.get_embeddings()`.
    """
    batch_model = SkipGramBatchModel(device=device, vocabulary_size=vocabulary_size, embedding_dim=embedding_dim, neg_num=neg_num)
    batch_model.to(device)
    optimizer = torch.optim.Adam(batch_model.parameters(), lr=learning_rate)
    loss_fn = torch.nn.BCEWithLogitsLoss()

    # Labels are the same for every batch (column 0 is the positive context,
    # the remaining columns are negatives), so build them once.
    y_true = torch.zeros(batch_size, 1 + neg_num, device=device)
    y_true[:, 0] = 1.0

    for i in tqdm(range(num_epochs)):
        loss_val = 0.0
        num_batches = 0
        for centers, contexts in generate_batch(mb_sents, batch_size=batch_size, neg_num=neg_num):
            # Clear gradients left over from the previous step; without this
            # they accumulate across batches and every update is wrong.
            optimizer.zero_grad()
            y_pred = batch_model(centers, contexts)
            loss = loss_fn(y_pred, y_true)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(batch_model.parameters(), clip)
            optimizer.step()
            # Count each batch loss exactly once.
            loss_val += loss.item()
            num_batches += 1

        print(f"Total loss at step {i}: {loss_val}")
        if num_batches:
            print(f"Average loss per batch at step {i}: {loss_val / num_batches}")
    return batch_model.get_embeddings()

# x = train()
# Train with the hyper-parameters declared in the configuration block above
# instead of a repeated literal and the function's default neg_num.
x = train_batch(batch_size=batch_size, neg_num=neg_num)

# %lprun -f generate_batch list(generate_batch(mb_sents,batch_size=2))
# %lprun -f train_batch train_batch(batch_size=2)

# todo: fix average loss
# think batch variant: done
# profile batch variant

In [None]:
# saving embeddings
import pickle
import numpy as np

def save_embeddings(filename="embeddings.bin", embeddings=None, dictionary=None):
    """Pickle the embeddings and their dictionary into a single file.

    Args:
        filename: destination path; an existing file is overwritten.
        embeddings: object stored under the 'emb' key.
        dictionary: object stored under the 'dict' key.
    """
    data = {
        'emb': embeddings,
        'dict': dictionary
    }
    print("Saving embeddings to file:", filename)
    # The context manager guarantees the handle is flushed and closed, even if
    # pickling raises part-way through.
    with open(filename, 'wb') as file:
        pickle.dump(data, file)

# Persist the trained vectors together with the word -> index mapping.
save_embeddings(
    dictionary=word2idx,
    embeddings=x,
    filename="batchwise_embeddings.bin",
)

In [None]:
# inference
import pickle
import numpy as np

class Word2Vec(object):
    """Inference interface over trained word embeddings.

    Data must be loaded with `from_file` or `from_object` before any query
    method is called. All similarity scores are raw dot products; no length
    normalisation is applied.

    Words missing from the dictionary fall back to index 0.
    NOTE(review): the original comments call index 0 "UNK", but the vocabulary
    built in this notebook has no dedicated UNK entry, so index 0 is simply
    the first vocabulary word - confirm this fallback is intended.
    """

    def __init__(self):
        self.embeddings = None
        self.dictionary = None
        self.reverse_dictionary = None

    def from_file(self, filename):
        """Load embeddings and dictionary from a file written by `save_embeddings`."""
        # NOTE(security): pickle.load can execute arbitrary code; only open
        # files that come from a trusted source.
        with open(filename, 'rb') as file:
            data = pickle.load(file)
        self.from_object(data['emb'], data['dict'])

    def from_object(self, embeddings, dictionary):
        """Use in-memory embeddings and a word -> index dictionary."""
        self.embeddings = embeddings
        self.dictionary = dictionary
        # Build the index -> word map here too; most_similar and analogy need
        # it, and previously it was only created by from_file.
        self.reverse_dictionary = (
            {idx: word for word, idx in dictionary.items()}
            if dictionary is not None else None
        )

    def _check_loaded(self):
        """Fail early when no embeddings/dictionary have been loaded."""
        assert self.embeddings is not None and self.dictionary is not None, \
            'Embeddings not initialized, use from_file or from_object to load data.'

    def _word_index(self, word):
        """Return the row index of `word`, or 0 when the word is unknown."""
        word_idx = self.dictionary.get(word)
        return 0 if word_idx is None else word_idx

    def _vector(self, word):
        """Return the embedding of `word` as a NumPy array."""
        return self.embeddings[self._word_index(word)].cpu().numpy()

    def _top_matches(self, query_vector, topk):
        """Return the `topk` (word, score) pairs with the largest dot product."""
        similarity = np.dot(self.embeddings.cpu().numpy(), query_vector)
        sorted_idx = np.argsort(similarity)[::-1]
        return [(self.reverse_dictionary.get(int(idx), ""), similarity[idx]) for idx in sorted_idx[:topk]]

    def inference(self, word):
        """Return the stored embedding row for `word` (row 0 if unknown)."""
        self._check_loaded()
        return self.embeddings[self._word_index(word)]

    def similarity(self, word1, word2):
        """Return the dot product of the two word embeddings."""
        v1 = self.inference(word1).cpu().numpy()
        v2 = self.inference(word2).cpu().numpy()
        # Plain dot product; cosine normalisation is intentionally left out:
        # / (np.linalg.norm(v1) * np.linalg.norm(v2))
        return np.dot(v1, v2)

    def most_similar(self, word, topk=10):
        """Return the `topk` (word, score) pairs closest to `word`.

        The query word itself is not excluded from the result.
        """
        self._check_loaded()
        return self._top_matches(self._vector(word), topk)

    def analogy(self, word1, word2, word3, topk=10):
        """Return the `topk` (word, score) pairs closest to word2 - word1 + word3."""
        self._check_loaded()
        word4_emb = self._vector(word2) - self._vector(word1) + self._vector(word3)
        return self._top_matches(word4_emb, topk)

    
# Load the saved embeddings and print the nearest neighbours of a few probe words.
wv = Word2Vec()
wv.from_file("batchwise_embeddings.bin")
probe_words = (
    "Arjuna", "Drona", "Bhishma", "Krishna", "mace",
    "Karna", "Pandu", "Kunti", "Yudhishthira",
)
for word in probe_words:
    print(f"{word}: {wv.most_similar(word)}")
# wv.most_similar("Arjuna")