# Mounting Google Drive and importing packages



In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the parquet files read later in this
# notebook live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install pyarrow



In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import torch
from torch.utils.data import Dataset
from torch.utils.data import IterableDataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Data loading

In [None]:
# Load the WikiText-2 and WikiText-103 parquet splits from Google Drive.
DATA_ROOT = '/content/drive/MyDrive/002_Sem_2/758O - AI'
WIKITEXT_2_DIR = f'{DATA_ROOT}/wikitext-2-raw-v1'
WIKITEXT_103_DIR = f'{DATA_ROOT}/wikitext-103-raw-v1'

# WikiText-2: one parquet file per split.
df_train_wikitext_2 = pd.read_parquet(f'{WIKITEXT_2_DIR}/train-00000-of-00001.parquet')
df_test_wikitext_2 = pd.read_parquet(f'{WIKITEXT_2_DIR}/test-00000-of-00001.parquet')
df_valid_wikitext_2 = pd.read_parquet(f'{WIKITEXT_2_DIR}/validation-00000-of-00001.parquet')

# WikiText-103: the training split is stored as two shards.
# FIX: the first training shard used to be read from the *test* parquet file,
# which silently mixed the test split into the training data.
# NOTE(review): assumes the first shard sits next to the second one under the
# name 'train-00000-of-00002.parquet' - confirm the file exists on Drive.
df_train_wikitext103_1 = pd.read_parquet(f'{WIKITEXT_103_DIR}/train-00000-of-00002.parquet')
df_train_wikitext103_2 = pd.read_parquet(f'{WIKITEXT_103_DIR}/train-00001-of-00002.parquet')
df_test_wikitext103 = pd.read_parquet(f'{WIKITEXT_103_DIR}/test-00000-of-00001.parquet')
df_valid_wikitext103 = pd.read_parquet(f'{WIKITEXT_103_DIR}/validation-00000-of-00001.parquet')

In [None]:
# Registry of every loaded split, keyed by the name of the variable holding it
dataframes = dict(
    df_train_wikitext_2=df_train_wikitext_2,
    df_test_wikitext_2=df_test_wikitext_2,
    df_valid_wikitext_2=df_valid_wikitext_2,
    df_train_wikitext103_1=df_train_wikitext103_1,
    df_train_wikitext103_2=df_train_wikitext103_2,
    df_test_wikitext103=df_test_wikitext103,
    df_valid_wikitext103=df_valid_wikitext103,
)

# For each split: print its name, its column names and its first 5 rows,
# followed by a 50-character rule so the sections are easy to tell apart.
separator = "=" * 50
for name, df in dataframes.items():
    print(f"\n--- {name} ---")
    print("Columns:", list(df.columns))
    print(df.head(5))
    print(f"\n{separator}")



--- df_train_wikitext_2 ---
Columns: ['text']
                                                text
0                                                   
1                     = Valkyria Chronicles III = \n
2                                                   
3   Senjō no Valkyria 3 : Unrecorded Chronicles (...
4   The game began development in 2010 , carrying...


--- df_test_wikitext_2 ---
Columns: ['text']
                                                text
0                                                   
1                              = Robert Boulter = \n
2                                                   
3   Robert Boulter is an English film , televisio...
4   In 2006 , Boulter starred alongside Whishaw i...


--- df_valid_wikitext_2 ---
Columns: ['text']
                                                text
0                                                   
1                            = Homarus gammarus = \n
2                                                   
3   Homaru

In [None]:
# Stack the per-corpus frames into one frame per split (row index renumbered from 0)
train_parts = [df_train_wikitext_2, df_train_wikitext103_1, df_train_wikitext103_2]
test_parts = [df_test_wikitext_2, df_test_wikitext103]
valid_parts = [df_valid_wikitext_2, df_valid_wikitext103]

df_train_1 = pd.concat(train_parts, ignore_index=True)
df_test = pd.concat(test_parts, ignore_index=True)
df_valid = pd.concat(valid_parts, ignore_index=True)

merged_splits = {"Train": df_train_1, "Test": df_test, "Validation": df_valid}

# Row/column counts of the merged frames, as a consistency check
for split_label, split_frame in merged_splits.items():
    print(f"{split_label} Data Shape:", split_frame.shape)

# First rows of each merged frame
for split_label, split_frame in merged_splits.items():
    print(f"\n--- {split_label} Data Sample ---\n", split_frame.head())


Train Data Shape: (941751, 1)
Test Data Shape: (8716, 1)
Validation Data Shape: (7520, 1)

--- Train Data Sample ---
                                                 text
0                                                   
1                     = Valkyria Chronicles III = \n
2                                                   
3   Senjō no Valkyria 3 : Unrecorded Chronicles (...
4   The game began development in 2010 , carrying...

--- Test Data Sample ---
                                                 text
0                                                   
1                              = Robert Boulter = \n
2                                                   
3   Robert Boulter is an English film , televisio...
4   In 2006 , Boulter starred alongside Whishaw i...

--- Validation Data Sample ---
                                                 text
0                                                   
1                            = Homarus gammarus = \n
2                          

In [None]:
# Draw a reproducible random subset of 200,000 training rows (seed 42),
# then renumber the index from 0.
sampled_rows = df_train_1.sample(n=200000, random_state=42)
df_train = sampled_rows.reset_index(drop=True)


# Data Cleaning

In [None]:
def text_clean(text):
    """Normalise one line of raw text.

    Steps, in order:
      1. convert to lowercase;
      2. delete every character that is neither a-z nor whitespace;
      3. collapse each run of whitespace into a single space;
      4. strip leading and trailing whitespace.

    Parameters:
        text (str): raw text.

    Returns:
        str: the cleaned text (possibly empty).
    """
    letters_and_spaces = re.sub(r'[^a-z\s]', '', text.lower())
    return re.sub(r'\s+', ' ', letters_and_spaces).strip()

# Add a 'clean_text' column (text_clean applied to 'text') to every split
for split_frame in (df_train, df_test, df_valid):
    split_frame['clean_text'] = split_frame['text'].apply(text_clean)

# Show raw and cleaned text side by side for a quick visual check
print("Cleaned Train Data Sample:")
print(df_train[['text', 'clean_text']].head())

Cleaned Train Data Sample:
                                                text  \
0                           = = = Stalemate = = = \n   
1   On 6 April 1320 the Scottish Parliament met a...   
2   Following his return from Algeria and his com...   
3   = = = Roman occupation to Medieval period = =...   
4                                                      

                                          clean_text  
0                                          stalemate  
1  on april the scottish parliament met at arbroa...  
2  following his return from algeria and his comp...  
3                roman occupation to medieval period  
4                                                     


In [None]:
# Remove the raw 'text' column in place so that only 'clean_text' remains
for split_frame in (df_train, df_test, df_valid):
    split_frame.drop(columns=['text'], inplace=True)

In [None]:
# Keep only the rows whose cleaned text is non-empty
df_train = df_train.loc[df_train['clean_text'].str.strip().ne('')]
df_test = df_test.loc[df_test['clean_text'].str.strip().ne('')]
df_valid = df_valid.loc[df_valid['clean_text'].str.strip().ne('')]

In [None]:
# Visual confirmation: display the first 100 remaining training rows
df_train.iloc[:100]

Unnamed: 0,clean_text
0,stalemate
1,on april the scottish parliament met at arbroa...
2,following his return from algeria and his comp...
3,roman occupation to medieval period
6,nebraska was widely lauded by television comme...
...,...
147,on june mccormack returned to duty with the wa...
149,falkland islands
150,my head cool bedded in the flowery grass
152,in line with christian interpretation of the d...


In [None]:
# Report the shape of each split after the blank rows were removed
for split_label, split_frame in (("Train", df_train), ("Test", df_test), ("Validation", df_valid)):
    print(f"{split_label} Data Shape:", split_frame.shape)

Train Data Shape: (129029, 1)
Test Data Shape: (5778, 1)
Validation Data Shape: (4908, 1)


# Building Vocabulary

In [None]:
def build_vocab_from_train(df, min_freq=1):
    """Count word frequencies over the 'clean_text' column of a DataFrame.

    Every whitespace-separated word is treated as one token, which is what the
    skip-gram model in this notebook is trained on.

    Parameters:
        df (pandas.DataFrame): frame with a 'clean_text' column of strings.
        min_freq (int): words occurring fewer than this many times are dropped.

    Returns:
        collections.Counter: word -> frequency, in order of first appearance.
    """
    word_counts = Counter(
        token
        for line in df['clean_text']
        for token in line.split()
    )

    # Copy over only the words that are frequent enough, preserving their order.
    frequent_words = Counter()
    for token, occurrences in word_counts.items():
        if occurrences >= min_freq:
            frequent_words[token] = occurrences
    return frequent_words

# Rank every training word by frequency (most frequent first) and keep the
# 11,000 most frequent ones as the model vocabulary.
vocab_counter = build_vocab_from_train(df_train, min_freq=1)
total_occurrences = sum(vocab_counter.values())
sorted_words = sorted(vocab_counter.items(), key=lambda pair: pair[1], reverse=True)
top_k = 11000
top_words = sorted_words[:top_k]

# Share of all word occurrences in the corpus that the kept words account for
top_occurrences = sum(pair[1] for pair in top_words)
coverage = top_occurrences / total_occurrences
coverage_percentage = coverage * 100

print(f"Top {top_k} words cover {coverage_percentage:.2f}% of the corpus.")

# Two-way lookup tables: a word's index is its frequency rank (0 = most frequent)
idx2word = dict(enumerate(pair[0] for pair in top_words))
word2idx = {word: idx for idx, word in idx2word.items()}

print("Vocabulary size for Word2Vec model:", len(word2idx))


Top 11000 words cover 90.34% of the corpus.
Vocabulary size for Word2Vec model: 11000


# Custom Data Loader

In [None]:
def generate_training_pairs_stream(texts, word2idx, window_size=2):
    """Lazily produce skip-gram (center, context) index pairs with a sliding window.

    Words missing from ``word2idx`` are dropped before the window is applied, so
    the window spans the remaining in-vocabulary words of each text.

    Parameters:
        texts (iterable): cleaned text strings, one per item.
        word2idx (dict): word -> integer index.
        window_size (int): how many neighbours on each side count as context.

    Yields:
        tuple: (center_word_index, context_word_index); for each center word the
        left neighbours come first, then the right neighbours, in text order.
    """
    for line in texts:
        token_ids = [word2idx[token] for token in line.split() if token in word2idx]
        n_tokens = len(token_ids)

        for position, center_id in enumerate(token_ids):
            first = max(position - window_size, 0)
            stop = min(position + window_size + 1, n_tokens)

            # Neighbours to the left of the center word ...
            for neighbour in range(first, position):
                yield center_id, token_ids[neighbour]
            # ... then neighbours to its right; the center itself is never paired.
            for neighbour in range(position + 1, stop):
                yield center_id, token_ids[neighbour]

class SkipGramIterableDataset(IterableDataset):
    """Stream skip-gram (center, context) index pairs without materialising them.

    The pairs are generated on the fly by ``generate_training_pairs_stream``;
    this approach was chosen because building the complete list of pairs up
    front ran into RAM constraints.

    Parameters:
        texts (iterable): cleaned text strings; it is iterated again on every
            pass, so it must be re-iterable (e.g. a list or a pandas Series).
        word2idx (dict): word -> integer index.
        window_size (int): context window radius.
    """

    def __init__(self, texts, word2idx, window_size=2):
        self.texts = texts
        self.word2idx = word2idx
        self.window_size = window_size

    def __iter__(self):
        # Imported locally so the class does not depend on an extra file-level import.
        from torch.utils.data import get_worker_info

        texts = self.texts
        worker_info = get_worker_info()
        if worker_info is not None:
            # With DataLoader(num_workers > 0) every worker process iterates its
            # own copy of this dataset; without sharding each pair would be
            # emitted once per worker. Give worker w every num_workers-th text.
            num_workers = worker_info.num_workers
            worker_id = worker_info.id
            texts = (
                text
                for position, text in enumerate(self.texts)
                if position % num_workers == worker_id
            )
        # Single-process loading (worker_info is None) streams all texts, as before.
        return generate_training_pairs_stream(texts, self.word2idx, self.window_size)

def collate_fn(batch):
    """Turn a list of (center, context) index tuples into one int64 tensor.

    Needed because the pair generator yields plain Python tuples, not tensors.

    Parameters:
        batch (list[tuple[int, int]]): the pairs collected for one mini-batch.

    Returns:
        torch.Tensor: tensor of dtype ``torch.long`` with one row per pair.
    """
    pair_tensor = torch.tensor(batch, dtype=torch.long)
    return pair_tensor

# Streaming dataset over the cleaned training text (context window of 2 words per side)
window_size = 2
dataset = SkipGramIterableDataset(
    texts=df_train['clean_text'],
    word2idx=word2idx,
    window_size=window_size,
)

# Mini-batches of 1024 pairs are assembled on the fly; collate_fn turns each
# list of tuples into a single tensor.
dataloader = DataLoader(dataset, batch_size=1024, collate_fn=collate_fn)

# Smoke test: print the first batch only
for batch in dataloader:
    print(batch)
    break


tensor([[  8, 184],
        [  8,   0],
        [184,   8],
        ...,
        [162, 106],
        [162, 283],
        [106,  25]])


# Defining the model

In [None]:
class Word2VecSkipGram(nn.Module):
    """Skip-gram model with separate embedding tables for center and context words."""

    def __init__(self, vocab_size, embed_dim):
        """Create both embedding tables and initialise them uniformly.

        Parameters:
            vocab_size (int): number of rows in each embedding table.
            embed_dim (int): length of every embedding vector.
        """
        super().__init__()
        self.center_embeddings = nn.Embedding(vocab_size, embed_dim)   # table looked up for center words
        self.context_embeddings = nn.Embedding(vocab_size, embed_dim)  # table looked up for context words

        # Small uniform initial weights in [-0.5/embed_dim, 0.5/embed_dim) to stabilise training
        initrange = 0.5 / embed_dim
        for table in (self.center_embeddings, self.context_embeddings):
            nn.init.uniform_(table.weight, -initrange, initrange)

    def forward(self, center_words, context_words):
        """Score (center, context) pairs by the dot product of their embeddings.

        Parameters:
            center_words (torch.Tensor): indices into the center table.
            context_words (torch.Tensor): indices into the context table.

        Returns:
            torch.Tensor: element-wise product of the two looked-up embeddings,
            summed over dimension 1 (one score per pair for 1-D index inputs).
        """
        center_vectors = self.center_embeddings(center_words)
        context_vectors = self.context_embeddings(context_words)
        return (center_vectors * context_vectors).sum(dim=1)

# Example: build a model sized to the vocabulary, with 100-dimensional embeddings
embed_dim = 100
vocab_size = len(word2idx)

model = Word2VecSkipGram(vocab_size=vocab_size, embed_dim=embed_dim)


# Training the model

In [None]:
# Hyperparameters
embed_dim = 100               # Dimension of word embeddings
vocab_size = len(word2idx)
neg_sample_size = 5           # Number of negative samples per positive pair
learning_rate = 0.001
num_epochs = 1                # A single epoch, to measure how long one epoch takes

# Model, optimizer and loss.
# BCEWithLogitsLoss takes the raw dot-product scores, applies a sigmoid
# internally and compares against the targets (1 = positive pair, 0 = negative
# pair). With its default reduction it returns the mean over all elements,
# i.e. a scalar.
model = Word2VecSkipGram(vocab_size, embed_dim)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_function = nn.BCEWithLogitsLoss()

model.train()
update_count = 0  # Total number of parameter updates across all epochs

for epoch in range(num_epochs):
    total_loss = 0.0
    epoch_updates = 0  # Updates performed in THIS epoch (denominator of the epoch average)

    for batch in dataloader:
        update_count += 1
        epoch_updates += 1

        # Each batch has shape (batch_size, 2): column 0 = center, column 1 = context
        center_words = batch[:, 0]         # shape: (batch_size,)
        pos_context_words = batch[:, 1]    # shape: (batch_size,)
        batch_size = center_words.size(0)

        # Positive examples: observed (center, context) pairs, target label 1
        pos_score = model(center_words, pos_context_words)  # shape: (batch_size,)
        pos_labels = torch.ones_like(pos_score)

        # Negative examples: neg_sample_size context indices per center word,
        # drawn uniformly from the whole vocabulary, target label 0
        neg_context_words = torch.randint(low=0, high=vocab_size, size=(batch_size, neg_sample_size),
                                          device=center_words.device)
        center_embed = model.center_embeddings(center_words)             # (batch_size, embed_dim)
        neg_context_embed = model.context_embeddings(neg_context_words)  # (batch_size, neg_sample_size, embed_dim)
        # Batched dot product of every negative context vector with its center vector
        neg_score = torch.bmm(neg_context_embed, center_embed.unsqueeze(2)).squeeze(2)  # (batch_size, neg_sample_size)
        neg_labels = torch.zeros_like(neg_score)

        # Loss: both terms are already scalar means, so they are simply added
        loss_pos = loss_function(pos_score, pos_labels)
        loss_neg = loss_function(neg_score, neg_labels)
        loss = loss_pos + loss_neg

        # Backpropagation & optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Print loss for the current mini-batch update
        print(f"Update {update_count}: Loss = {loss.item():.4f}")

    # FIX: average over the updates of this epoch only. Dividing by the
    # cumulative update_count under-reported the loss of every epoch after the
    # first; max(..., 1) also avoids a ZeroDivisionError on an empty dataloader.
    avg_loss = total_loss / max(epoch_updates, 1)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

print(f"Total parameter updates: {update_count}")

Update 1: Loss = 1.3863
Update 2: Loss = 1.3863
Update 3: Loss = 1.3863
Update 4: Loss = 1.3863
Update 5: Loss = 1.3863
Update 6: Loss = 1.3862
Update 7: Loss = 1.3862
Update 8: Loss = 1.3862
Update 9: Loss = 1.3861
Update 10: Loss = 1.3861
Update 11: Loss = 1.3860
Update 12: Loss = 1.3860
Update 13: Loss = 1.3859
Update 14: Loss = 1.3859
Update 15: Loss = 1.3854
Update 16: Loss = 1.3857
Update 17: Loss = 1.3852
Update 18: Loss = 1.3851
Update 19: Loss = 1.3849
Update 20: Loss = 1.3845
Update 21: Loss = 1.3848
Update 22: Loss = 1.3837
Update 23: Loss = 1.3839
Update 24: Loss = 1.3829
Update 25: Loss = 1.3824
Update 26: Loss = 1.3829
Update 27: Loss = 1.3825
Update 28: Loss = 1.3814
Update 29: Loss = 1.3818
Update 30: Loss = 1.3818
Update 31: Loss = 1.3806
Update 32: Loss = 1.3802
Update 33: Loss = 1.3787
Update 34: Loss = 1.3784
Update 35: Loss = 1.3761
Update 36: Loss = 1.3742
Update 37: Loss = 1.3724
Update 38: Loss = 1.3729
Update 39: Loss = 1.3737
Update 40: Loss = 1.3707
Update 41

KeyboardInterrupt: 

In [None]:
# Save the current weights; the next cell reloads this file and trains for 4 more epochs.
checkpoint_path = "word2vec_model_after_1_training_epoch.pt"
torch.save(model.state_dict(), checkpoint_path)
print(f"Model saved as '{checkpoint_path}'")

Model saved as 'word2vec_model_after_1_training_epoch.pt'


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Hyperparameters for further training
embed_dim = 100              # Must match the dimension of the saved model
vocab_size = len(word2idx)   # e.g., 11000 from our vocabulary
neg_sample_size = 5          # Number of negative samples per positive pair
learning_rate = 0.001
num_epochs = 4               # Further train for 4 epochs

# Re-instantiate the model and restore the weights saved after the first run.
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state_dict holds, and avoids executing arbitrary pickled code.
model = Word2VecSkipGram(vocab_size, embed_dim)
model.load_state_dict(torch.load("word2vec_model_after_1_training_epoch.pt", weights_only=True))
model.train()

# Note: a fresh optimizer is created, so Adam's running statistics start from zero.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_function = nn.BCEWithLogitsLoss()  # default reduction: mean over all elements (scalar)

update_count = 0            # Parameter updates performed in this cell (all epochs)
interval_loss = 0.0         # Loss accumulated since the last progress print
interval_counter = 0        # Updates since the last progress print
interval_updates = 10000    # Print average loss every 10,000 updates

for epoch in range(num_epochs):
    total_loss_epoch = 0.0
    batch_count = 0         # Updates in this epoch (denominator of the epoch average)

    for batch in dataloader:
        update_count += 1
        interval_counter += 1
        batch_count += 1

        # Each batch has shape (batch_size, 2): column 0 = center, column 1 = context
        center_words = batch[:, 0]
        pos_context_words = batch[:, 1]
        batch_size = center_words.size(0)

        # Positive examples: observed pairs, target label 1
        pos_score = model(center_words, pos_context_words)
        pos_labels = torch.ones_like(pos_score)

        # Negative examples: uniformly sampled context indices, target label 0
        neg_context_words = torch.randint(low=0, high=vocab_size,
                                          size=(batch_size, neg_sample_size),
                                          device=center_words.device)
        center_embed = model.center_embeddings(center_words)             # (batch_size, embed_dim)
        neg_context_embed = model.context_embeddings(neg_context_words)  # (batch_size, neg_sample_size, embed_dim)
        neg_score = torch.bmm(neg_context_embed, center_embed.unsqueeze(2)).squeeze(2)  # (batch_size, neg_sample_size)
        neg_labels = torch.zeros_like(neg_score)

        # Loss: both terms are already scalar means, so they are simply added
        loss_pos = loss_function(pos_score, pos_labels)
        loss_neg = loss_function(neg_score, neg_labels)
        loss = loss_pos + loss_neg

        # Backpropagation & optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss_epoch += loss.item()
        interval_loss += loss.item()

        # Periodic progress report, then reset the interval accumulators
        if update_count % interval_updates == 0:
            avg_interval_loss = interval_loss / interval_counter
            print(f"After {update_count} updates: Average Loss (last {interval_updates} updates) = {avg_interval_loss:.4f}")
            interval_loss = 0.0
            interval_counter = 0

    # max(..., 1) avoids a ZeroDivisionError if the dataloader yields no batch
    avg_loss_epoch = total_loss_epoch / max(batch_count, 1)
    print(f"Epoch {epoch+1}/{num_epochs}: Average Loss = {avg_loss_epoch:.4f}")

print(f"Total parameter updates in further training: {update_count}")

torch.save(model.state_dict(), "Word2Vec_final_model.pt")
print("Final model saved as 'Word2Vec_final_model.pt'")

model.load_state_dict(torch.load("word2vec_model_after_1_training_epoch.pt"))
After 10000 updates: Average Loss (last 10000 updates) = 0.8028
After 20000 updates: Average Loss (last 10000 updates) = 0.7445
After 30000 updates: Average Loss (last 10000 updates) = 0.7199
Epoch 1/4: Average Loss = 0.7533
After 40000 updates: Average Loss (last 10000 updates) = 0.7082
After 50000 updates: Average Loss (last 10000 updates) = 0.6978
After 60000 updates: Average Loss (last 10000 updates) = 0.6929
Epoch 2/4: Average Loss = 0.6977
After 70000 updates: Average Loss (last 10000 updates) = 0.6886
After 80000 updates: Average Loss (last 10000 updates) = 0.6839
After 90000 updates: Average Loss (last 10000 updates) = 0.6810
Epoch 3/4: Average Loss = 0.6830
After 100000 updates: Average Loss (last 10000 updates) = 0.6786
After 110000 updates: Average Loss (last 10000 updates) = 0.6768
After 120000 updates: Average Loss (last 10000 updates) = 0.6745
Epoch 4/4: Average Loss = 0.6755
Total parameter updates in further training: 127596
Final model saved as 'Word2Vec_final_model.pt

# Implementation

In [None]:
import torch
import torch.nn.functional as F

def print_top_k_similar_words(input_word, word2idx, idx2word, model, top_k=10):
    """Print the words whose center embeddings are most cosine-similar to a word.

    Parameters:
        input_word (str): the query word; must be a key of ``word2idx``.
        word2idx (dict): word -> row index in the embedding table.
        idx2word (dict): row index -> word.
        model: object exposing ``center_embeddings.weight`` as a 2-D tensor
            with one embedding per row.
        top_k (int): maximum number of similar words to print. If the
            vocabulary holds fewer other words, all of them are printed.

    Returns:
        None. Output is printed: a "not in the vocabulary" message, or a header
        line followed by one "word: similarity" line per result.
    """
    if input_word not in word2idx:
        print(f"'{input_word}' is not in the vocabulary.")
        return

    input_idx = word2idx[input_word]

    # Pure inference: no autograd graph is needed for the similarity search.
    with torch.no_grad():
        all_embeddings = model.center_embeddings.weight
        normalized_embeddings = F.normalize(all_embeddings, dim=1)   # unit-length rows
        normalized_input = normalized_embeddings[input_idx]          # unit-length query vector

        # Dot products of unit vectors are cosine similarities.
        cosine_similarities = torch.mv(normalized_embeddings, normalized_input)

        # Ask for one extra result because the query word itself is among the
        # candidates. FIX: clamp to the vocabulary size; torch.topk raises when
        # k exceeds the number of elements.
        k = min(max(top_k, 0) + 1, cosine_similarities.size(0))
        top_similarities, top_indices = torch.topk(cosine_similarities, k)

    print(f"Top {top_k} words similar to '{input_word}':")
    shown = 0
    for idx, similarity in zip(top_indices.tolist(), top_similarities.tolist()):
        if shown >= top_k:
            break
        if idx == input_idx:
            continue  # Skip the input word itself
        print(f"{idx2word[idx]}: {similarity:.4f}")
        shown += 1

# Ask the user how many neighbours to show; fall back to 10 on unusable input.
try:
    top_k_input = int(input("Enter the number of similar words to display (K): "))
except ValueError:
    print("Invalid input for K. Please enter an integer.")
    top_k_input = 10  # default value

# A zero or negative K cannot produce any result, so use the default instead.
if top_k_input < 1:
    print("K must be a positive integer; using the default of 10.")
    top_k_input = 10

# The vocabulary was built from lower-cased, whitespace-split text, so normalise
# the query the same way; otherwise "Cat" or "cat " would be reported as unknown.
user_word = input("Enter the input word: ").strip().lower()

# Finally, run the similarity lookup with the user's input
print_top_k_similar_words(user_word, word2idx, idx2word, model, top_k=top_k_input)


Enter the number of similar words to display (K): 10
Enter the input word: cat
Top 10 words similar to 'cat':
rabbi: 0.4042
substances: 0.3567
forty: 0.3454
queen: 0.3351
reducing: 0.3346
southampton: 0.3344
mw: 0.3338
serbia: 0.3168
confirmed: 0.3127
producers: 0.3084
