In [None]:
# !pip install wandb



In [1]:
# NumPy is used for numerical operations
import numpy as np  

# PyTorch is a deep learning framework
import torch  
from torch.utils.data import Dataset, DataLoader  # DataLoader for batching data in PyTorch

# PyTorch's neural network module
import torch.nn as nn  
# PyTorch's functional interface
import torch.nn.functional as F  
# PyTorch's initialization functions
from torch.nn import init  

# tqdm for progress bars
from tqdm.auto import tqdm, trange  

# Counter for counting occurrences
from collections import Counter  

# Random module for generating random numbers
import random  

# PyTorch's optimization module
from torch import optim  

# gzip for reading/writing gzipped files
import gzip  

# Weights & Biases for experiment tracking
import wandb  

# SciPy for scientific computing
from scipy.spatial.distance import cosine  

# argparse for command-line argument parsing
import argparse  

# NLTK for natural language processing tasks
from nltk.tokenize import RegexpTokenizer  

# Gensim for topic modeling and document similarity
from gensim.models import KeyedVectors  

# Warnings module for handling warnings
import warnings  

# Pickle for serializing Python objects
import pickle  

# DataLoader and Dataset for handling data in PyTorch
from torch.utils.tensorboard import SummaryWriter  
from torch.utils.data import DataLoader, Dataset  
# pad_sequence for padding sequences in PyTorch
from torch.nn.utils.rnn import pad_sequence  


2024-03-01 09:39:53.117562: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-01 09:39:53.117656: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-01 09:39:53.243942: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
class RandomNumberGenerator:
    '''
    A wrapper class for a random number generator that holds buffers of pre-generated random numbers
    for faster access.
    '''

    def __init__(self, buffer_size, seed=12345):
        '''
        Initializes the random number generator with a seed and a buffer size of random numbers to use

        Args:
            buffer_size: The number of random numbers to pre-generate.
            seed: The seed for the random number generator
        '''
        np.random.seed(seed)
        self.float_buffer = np.random.rand(buffer_size)
        self.int_buffer = np.random.randint(0, 1000000, size=buffer_size)  # Adjust the range as needed
        self.buffer_size = buffer_size
        self.buffer_index = 0

    def random(self):
        '''
        Returns a random float value between 0 and 1
        '''
        value = self.float_buffer[self.buffer_index]
        self.buffer_index = (self.buffer_index + 1) % self.buffer_size
        return value

    def set_max_val(self, max_val):
        '''
        Sets the maximum integer value for randint and creates a buffer of random integers
        '''
        self.max_val = max_val
        self.int_buffer = np.random.randint(0, max_val, size=self.buffer_size)
        self.buffer_index = 0

    def randint(self):
        '''
        Returns a random int value between 0 and self.max_val (inclusive)
        '''
        if self.max_val is None:
            raise ValueError("Need to call set_max_val before calling randint")

        value = self.int_buffer[self.buffer_index]
        self.buffer_index = (self.buffer_index + 1) % self.buffer_size
        return value


In [4]:
class Corpus:
    def __init__(self, rng: RandomNumberGenerator):
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.rng = rng
        self.word_to_index = {}
        self.index_to_word = {}
        self.word_counts = Counter()
        self.negative_sampling_table = []
        self.full_token_sequence_as_ids = None

    def tokenize(self, text):
        '''
        Tokenize the document and returns a list of the tokens
        '''
        return self.tokenizer.tokenize(text)

    def load_data(self, file_name, min_token_freq):
        with open(file_name, 'r') as file:
            text = file.read().lower()
            tokens = self.tokenize(text)

        self.word_counts = Counter(tokens)
        unk_token = '<UNK>'
        tokens = [token if self.word_counts[token] >= min_token_freq else unk_token for token in tokens]
        self.word_counts = Counter(tokens)
        self.word_to_index = {word: i for i, (word, _) in enumerate(self.word_counts.items())}
        self.index_to_word = {i: word for word, i in self.word_to_index.items()}
        total_tokens = sum(self.word_counts.values())
        word_to_sample_prob = {word: 1 - np.sqrt(1e-5 / (count / total_tokens)) for word, count in self.word_counts.items()}
        self.full_token_sequence_as_ids = [self.word_to_index[token] if random.random() < word_to_sample_prob[token] else self.word_to_index[unk_token] for token in tokens]

    def generate_negative_sampling_table(self, exp_power=0.75, table_size=1e6):
        word_counts = list(self.word_counts.values())
        total_word_count = sum(word_counts)
        word_freqs = np.array(word_counts) / total_word_count
        word_weights = np.power(word_freqs, exp_power)
        self.negative_sampling_table = np.zeros(int(table_size), dtype=np.int32)
        idx = 0
        for word_id, weight in enumerate(word_weights):
            num_entries = int(weight * table_size)
            self.negative_sampling_table[idx:idx+num_entries] = word_id
            idx += num_entries

    def generate_negative_samples(self, cur_context_word_id, num_samples):
        results = []
        used_negative_samples = set()
        while len(results) < num_samples:
            negative_sample = self.negative_sampling_table[self.rng.randint() % len(self.negative_sampling_table)]
            if negative_sample != cur_context_word_id and negative_sample not in used_negative_samples:
                results.append(negative_sample)
                used_negative_samples.add(negative_sample)
        return results


In [7]:
corpus = Corpus(RandomNumberGenerator(buffer_size=100000))  # Initialize Corpus with RNG

FILE = 'reviews-word2vec.large.txt'
TYPE = FILE.split('.')[-2]
PATH = '/kaggle/input/word2vec-large/reviews-word2vec.large.txt'
corpus.load_data(PATH, min_token_freq=2)
corpus.generate_negative_sampling_table()


In [8]:
window_size = 2
num_negative_samples_per_target = 2

training_data = []
corpus.rng.set_max_val(len(corpus.negative_sampling_table))

max_context_words = 2 * window_size

for i, target_word_id in tqdm(enumerate(corpus.full_token_sequence_as_ids), total=len(corpus.full_token_sequence_as_ids)):
    context_start = max(0, i - window_size)
    context_end = min(i + window_size + 1, len(corpus.full_token_sequence_as_ids))
    context_words = corpus.full_token_sequence_as_ids[context_start:i] + corpus.full_token_sequence_as_ids[i+1:context_end]

    num_negative_samples = num_negative_samples_per_target * len(context_words)
    negative_samples = corpus.generate_negative_samples(target_word_id, num_negative_samples)

    labels = [1] * len(context_words) + [0] * num_negative_samples

    context_words.extend(negative_samples)

    num_samples_needed = max_context_words * num_negative_samples_per_target
    context_words += corpus.generate_negative_samples(target_word_id, num_samples_needed - len(context_words))

    training_data.append((np.array([target_word_id]), np.array(context_words), np.array(labels)))


  0%|          | 0/21240259 [00:00<?, ?it/s]

In [9]:
class Word2VecDataset(Dataset):
    def __init__(self, training_data):
        self.training_data = training_data

    def __len__(self):
        return len(self.training_data)

    def __getitem__(self, idx):
        target_word_id, context_words, labels = self.training_data[idx]
        return torch.tensor(target_word_id, dtype=torch.long), torch.tensor(context_words, dtype=torch.long), torch.tensor(labels, dtype=torch.float)

def collate_fn(batch):
    target_ids, context_ids, labels = zip(*batch)

    target_ids = torch.stack(target_ids)

    # Pad context_ids and labels to have the same length within each batch
    context_ids_padded = pad_sequence(context_ids, batch_first=True, padding_value=0)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=0)

    return target_ids, context_ids_padded, labels_padded

dataset = Word2VecDataset(training_data)
data_loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

In [10]:
class Word2Vec(nn.Module):

    def __init__(self, vocab_size, embedding_size):
        super(Word2Vec, self).__init__()
        self.embedding_size = embedding_size
        self.target_embeddings = nn.Embedding(vocab_size, embedding_size)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_size)
        self.init_emb(init_range=0.5/vocab_size)

    def init_emb(self, init_range):
        nn.init.uniform_(self.target_embeddings.weight, -init_range, init_range)
        nn.init.uniform_(self.context_embeddings.weight, -init_range, init_range)

    def forward(self, target_word_id, context_word_ids):
        '''
        Predicts whether each context word was actually in the context of the target word.
        '''
        target_emb = self.target_embeddings(target_word_id)
        context_emb = self.context_embeddings(context_word_ids)
        target_emb = target_emb.view(-1, 1, self.embedding_size)
        context_emb = context_emb.permute(0, 2, 1)
        dot_product = torch.bmm(target_emb, context_emb)
        predictions = torch.sigmoid(dot_product.squeeze(1))
        return predictions


In [11]:
batch_size = 16
embedding_size = 10
learning_rate = 5e-5
window_size = 2
min_token_freq = 5
epochs = 1
optimizer_type = optim.AdamW
vocab_size = len(corpus.word_to_index)

model = Word2Vec(vocab_size=vocab_size, embedding_size=embedding_size)

criterion = nn.BCELoss()
optimizer = optimizer_type(model.parameters(), lr=learning_rate)

writer = SummaryWriter()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

total_batches = len(data_loader)

for epoch in range(epochs):
    model.train()
    loss_sum = 0

    epoch_progress = tqdm(enumerate(data_loader), total=total_batches, desc=f"Epoch {epoch + 1}")

    for step, data in epoch_progress:
        target_ids, context_ids, labels = data

        target_ids = target_ids.to(device)
        context_ids = context_ids.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        predictions = model(target_ids, context_ids)

        loss = criterion(predictions, labels.float())
        loss_sum += loss.item()

        loss.backward()
        optimizer.step()

        if step % 100 == 0:
            writer.add_scalar('Loss/Running_Sum', loss_sum, epoch * total_batches + step)
            loss_sum = 0

        epoch_progress.set_postfix({'Epoch Progress': f'{100 * (step + 1) / total_batches:.2f}%'})

model.eval()


Epoch 1:   0%|          | 0/1327517 [00:00<?, ?it/s]

Word2Vec(
  (target_embeddings): Embedding(72366, 10)
  (context_embeddings): Embedding(72366, 10)
)

In [12]:
def get_neighbors(model, word_to_index, target_word):
    """
    Finds the top 10 most similar words to a target word
    """
    outputs = []
    for word, index in tqdm(word_to_index.items(), total=len(word_to_index)):
        similarity = compute_cosine_similarity(model, word_to_index, target_word, word)
        result = {"word": word, "score": similarity}
        outputs.append(result)

    neighbors = sorted(outputs, key=lambda o: o['score'], reverse=True)
    return neighbors[1:11]

def compute_cosine_similarity(model, word_to_index, word_one, word_two):
    try:
        word_one_index = word_to_index[word_one]
        word_two_index = word_to_index[word_two]
    except KeyError:
        return 0

    device = next(model.parameters()).device
    
    embedding_one = model.target_embeddings(torch.LongTensor([word_one_index]).to(device))
    embedding_two = model.target_embeddings(torch.LongTensor([word_two_index]).to(device))
    
    embedding_one = embedding_one.detach().squeeze().cpu().numpy()
    embedding_two = embedding_two.detach().squeeze().cpu().numpy()
    
    similarity = 1 - abs(float(cosine(embedding_one, embedding_two)))
    return similarity


In [13]:
get_neighbors(model, corpus.word_to_index, "recommend")

  0%|          | 0/72366 [00:00<?, ?it/s]

[{'word': 'bought', 'score': 0.9855127517065054},
 {'word': 'purchased', 'score': 0.9743140568960408},
 {'word': 'ordered', 'score': 0.9651900713224616},
 {'word': 'highly', 'score': 0.9589681598832654},
 {'word': 'finished', 'score': 0.951435950381768},
 {'word': 'received', 'score': 0.9503970747982254},
 {'word': 'picked', 'score': 0.944669909705924},
 {'word': 'fare', 'score': 0.9437674089321846},
 {'word': 'finish', 'score': 0.9341909659028254},
 {'word': 'buy', 'score': 0.9275848835286713}]

In [14]:
get_neighbors(model, corpus.word_to_index, "son")

  0%|          | 0/72366 [00:00<?, ?it/s]

[{'word': 'daughter', 'score': 0.9978689607721258},
 {'word': 'hold', 'score': 0.9968446925337442},
 {'word': 'taste', 'score': 0.9864854193885922},
 {'word': 'husband', 'score': 0.9853780789845797},
 {'word': 'year', 'score': 0.9792884536815938},
 {'word': 'person', 'score': 0.9758235343259654},
 {'word': 'girl', 'score': 0.9746600945897244},
 {'word': 'attention', 'score': 0.9724571609096164},
 {'word': 'eyes', 'score': 0.9710125503014785},
 {'word': 'shame', 'score': 0.9702706487771607}]

In [18]:
get_neighbors(model, corpus.word_to_index, "purchase")

  0%|          | 0/72366 [00:00<?, ?it/s]

[{'word': 'item', 'score': 0.9849877762546954},
 {'word': 'skip', 'score': 0.9744265526270527},
 {'word': 'product', 'score': 0.9742208201060402},
 {'word': 'sophmoric', 'score': 0.9720293958661856},
 {'word': 'wrote', 'score': 0.9715892528353488},
 {'word': 'buying', 'score': 0.9714442704471521},
 {'word': 'rating', 'score': 0.9712061048570415},
 {'word': 'cookbook', 'score': 0.9704730329823473},
 {'word': 'than', 'score': 0.9692135764330851},
 {'word': 'after', 'score': 0.9664517013723515}]

In [19]:
get_neighbors(model, corpus.word_to_index, "book")

  0%|          | 0/72366 [00:00<?, ?it/s]

[{'word': 'horseman', 'score': 0.9761644347846146},
 {'word': 'mio', 'score': 0.9532793327232035},
 {'word': 'meek', 'score': 0.9330475123129532},
 {'word': 'alliteration', 'score': 0.9279052861267372},
 {'word': 'voldemort', 'score': 0.9274831512594193},
 {'word': 'unheard', 'score': 0.9213583407786088},
 {'word': 'brotherton', 'score': 0.9191594586721591},
 {'word': 'glycemic', 'score': 0.917155667817038},
 {'word': 'cheryl', 'score': 0.9159567151419128},
 {'word': 'organizations', 'score': 0.9158994768048214}]

In [23]:
get_neighbors(model, corpus.word_to_index, "man")

  0%|          | 0/72366 [00:00<?, ?it/s]

[{'word': 'person', 'score': 0.9925608662188807},
 {'word': 'normal', 'score': 0.9917423795458259},
 {'word': 'girl', 'score': 0.9904438431262258},
 {'word': 'woman', 'score': 0.9904035973558548},
 {'word': 'student', 'score': 0.9892947756255813},
 {'word': 'data', 'score': 0.9883560551034535},
 {'word': 'mystery', 'score': 0.9875488828201026},
 {'word': 'month', 'score': 0.9867243225373346},
 {'word': 'classic', 'score': 0.986413254529686},
 {'word': 'trip', 'score': 0.9851067149313025}]

In [24]:
get_neighbors(model, corpus.word_to_index, "woman")

  0%|          | 0/72366 [00:00<?, ?it/s]

[{'word': 'data', 'score': 0.9921887545382136},
 {'word': 'man', 'score': 0.9904035973558548},
 {'word': 'classic', 'score': 0.9903699869506286},
 {'word': 'young', 'score': 0.9859635186711917},
 {'word': 'child', 'score': 0.9857379306218793},
 {'word': 'student', 'score': 0.9853039653686705},
 {'word': 'normal', 'score': 0.9837775564776217},
 {'word': 'single', 'score': 0.9813626897566151},
 {'word': 'link', 'score': 0.9805324074781475},
 {'word': 'person', 'score': 0.980074651322763}]

In [25]:
get_neighbors(model, corpus.word_to_index, "late")

  0%|          | 0/72366 [00:00<?, ?it/s]

[{'word': 'conflict', 'score': 0.9992055019456896},
 {'word': 'biggest', 'score': 0.9978390896322081},
 {'word': 'west', 'score': 0.9978140354678537},
 {'word': 'civil', 'score': 0.9977691208711382},
 {'word': 'web', 'score': 0.9967901064430117},
 {'word': 'meat', 'score': 0.9967816517915443},
 {'word': 'outcome', 'score': 0.9967808761203321},
 {'word': 'artwork', 'score': 0.9967717935313682},
 {'word': 'rules', 'score': 0.9967149714329959},
 {'word': 'projects', 'score': 0.9967116170019259}]

In [26]:
get_neighbors(model, corpus.word_to_index, "useless")

  0%|          | 0/72366 [00:00<?, ?it/s]

[{'word': 'delivered', 'score': 0.9956216665369004},
 {'word': 'beginners', 'score': 0.9952016601227688},
 {'word': 'misleading', 'score': 0.9949247662091528},
 {'word': 'kinda', 'score': 0.994560495596948},
 {'word': 'patterson', 'score': 0.9945461061320225},
 {'word': 'entirely', 'score': 0.9942399468941984},
 {'word': 'expensive', 'score': 0.9942101545453287},
 {'word': 'disturbing', 'score': 0.9941996407179896},
 {'word': 'inspirational', 'score': 0.9938875056825057},
 {'word': 'damaged', 'score': 0.9938701686947284}]

In [15]:
def save(model, corpus, filename):
    '''
    Saves the model to the specified filename as a gensim KeyedVectors in the
    text format so you can load it separately.
    '''
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = model.to(device)
    
    kv = KeyedVectors(vector_size=model.embedding_size)
    vectors = []
    words = []
    for index in trange(model.target_embeddings.num_embeddings):
        word = corpus.index_to_word[index]
        vector = model.target_embeddings(torch.LongTensor([index]).to(device)).detach().cpu().numpy()[0]
        vectors.append(vector)
        words.append(word)

    kv.add_vectors(words, vectors)
    kv.save_word2vec_format(filename, binary=False)


In [16]:
MODEL_NAME = TYPE + "test-5e.kv"
save(model, corpus, MODEL_NAME)

  0%|          | 0/72366 [00:00<?, ?it/s]

In [17]:
CORPUS_NAME = TYPE + "corpus.pkl"
with open(CORPUS_NAME, 'wb') as f:
    pickle.dump(corpus, f)