In [37]:
import cupy as cp
import numpy as np
import pandas as pd
import time
import os

# Hyperparameters
embedding_dim = 100
learning_rate = 0.01
num_epochs = 5
batch_size = 64
context_size = 2  # Number of context words on each side
max_context_length = context_size * 2  # Maximum context size

# Load vocabulary and CBOW pairs
vocabulary_df = pd.read_csv('vocabulary_main.csv')  # The vocabulary CSV generated earlier
cbow_pairs_df = pd.read_csv('cbow_pairs.csv')  # The CBOW pairs CSV

# Get vocabulary size and map words to indices
vocab_size = len(vocabulary_df)
word_to_index = dict(zip(vocabulary_df['word'], vocabulary_df['index']))

# Initialize weights
W1 = cp.random.randn(vocab_size, embedding_dim) * 0.01  # Input weights
W2 = cp.random.randn(embedding_dim, vocab_size) * 0.01  # Output weights

In [38]:
cbow_pairs_df['Context']

0             [3561, 722, 3002, 33806, 3561, 0, 0, 0, 0, 0]
1           [3002, 722, 3002, 33806, 3561, 782, 0, 0, 0, 0]
2         [3002, 3561, 3002, 33806, 3561, 782, 308, 0, 0...
3         [3002, 3561, 722, 33806, 3561, 782, 308, 3002,...
4         [3002, 3561, 722, 3002, 3561, 782, 308, 3002, ...
                                ...                        
887767    [4954, 27, 17834, 8195, 41, 912, 73, 1309, 326...
887768    [27, 17834, 8195, 41, 10321, 73, 1309, 3266, 0...
887769    [17834, 8195, 41, 10321, 912, 1309, 3266, 0, 0...
887770         [8195, 41, 10321, 912, 73, 3266, 0, 0, 0, 0]
887771            [41, 10321, 912, 73, 1309, 0, 0, 0, 0, 0]
Name: Context, Length: 887772, dtype: object

In [39]:
def softmax(x):
    exp_x = cp.exp(x - cp.max(x))  # Subtract max for numerical stability
    return exp_x / cp.sum(exp_x, axis=1, keepdims=True)

def cbow_forward(context_indices):
    # Average the context word embeddings
    context_vectors = W1[context_indices]
    hidden_layer = cp.mean(context_vectors, axis=0)  # Average context vectors

    # Compute output layer scores
    output_scores = cp.dot(hidden_layer, W2)
    output_probabilities = softmax(output_scores.reshape(1, -1))

    return output_probabilities, hidden_layer

def cbow_backward(center_word_index, output_probabilities, hidden_layer):
    # Create one-hot encoding for the target word
    target = cp.zeros((1, vocab_size))
    target[0, center_word_index] = 1

    # Compute gradients
    output_error = output_probabilities - target
    dW2 = cp.dot(hidden_layer.reshape(-1, 1), output_error)  # Gradient for W2
    dW1 = cp.dot(output_error, W2.T) * (1 / max_context_length)  # Gradient for W1

    return dW1, dW2

# Training loop
for epoch in range(num_epochs):
    start_time = time.time()
    
    # Calculate the number of batches for 10% of the data
    num_batches = len(cbow_pairs_df) // batch_size + (1 if len(cbow_pairs_df) % batch_size != 0 else 0)
    num_batches_10_percent = num_batches // 10  # Get the number of batches for the first 10%

    for batch_num in range(num_batches_10_percent):
        batch_start = batch_num * batch_size
        batch = cbow_pairs_df.iloc[batch_start:batch_start + batch_size]
        
        for _, row in batch.iterrows():
            context_indices = eval(row['Context'])  # Convert string representation to list
            center_word_index = row['Center_Word']

            # Forward pass
            output_probabilities, hidden_layer = cbow_forward(context_indices)

            # Backward pass
            dW1, dW2 = cbow_backward(center_word_index, output_probabilities, hidden_layer)

            # Update weights
            W1[context_indices] -= learning_rate * dW1
            W2 -= learning_rate * dW2

        # Print progress every 10 batches
        if (batch_num + 1) % 500 == 0 or batch_num == num_batches_10_percent - 1:  # Print every 10 batches or the last batch
            print(f"Epoch {epoch + 1}/{num_epochs}, Batch {batch_num + 1}/{num_batches_10_percent}")

    elapsed_time = time.time() - start_time
    print(f"Epoch {epoch + 1}/{num_epochs} completed in {elapsed_time:.2f} seconds")

# Save the learned embeddings
embeddings = W1  # Final embeddings for words
np.save('word_embeddings.npy', cp.asnumpy(embeddings))  # Convert to NumPy array and save

print("Training completed and embeddings saved to 'word_embeddings.npy'")

Epoch 1/5, Batch 500/1387
Epoch 1/5, Batch 1000/1387
Epoch 1/5, Batch 1387/1387
Epoch 1/5 completed in 145.92 seconds
Epoch 2/5, Batch 500/1387
Epoch 2/5, Batch 1000/1387
Epoch 2/5, Batch 1387/1387
Epoch 2/5 completed in 145.92 seconds
Epoch 3/5, Batch 500/1387
Epoch 3/5, Batch 1000/1387
Epoch 3/5, Batch 1387/1387
Epoch 3/5 completed in 145.96 seconds
Epoch 4/5, Batch 500/1387
Epoch 4/5, Batch 1000/1387
Epoch 4/5, Batch 1387/1387
Epoch 4/5 completed in 145.96 seconds
Epoch 5/5, Batch 500/1387
Epoch 5/5, Batch 1000/1387
Epoch 5/5, Batch 1387/1387
Epoch 5/5 completed in 145.97 seconds
Training completed and embeddings saved to 'word_embeddings.npy'


In [58]:
import numpy as np
import pandas as pd
from numpy.linalg import norm

In [59]:
# Load the saved embeddings
word_embeddings = np.load('word_embeddings.npy')  # Update path as necessary
vocabulary_df = pd.read_csv('vocabulary_main.csv')  # Ensure you have the vocabulary
word_to_index = dict(zip(vocabulary_df['word'], vocabulary_df['index']))

# Load test pairs
test_pairs = pd.read_csv('pairs_test_2.csv')  # Ensure this file has 'Center_Word_Index' and 'Context_Word_Index'

In [60]:
# Select the first 20% of the test pairs
num_test_pairs = len(test_pairs)
first_20_percent_test_pairs = test_pairs.iloc[:num_test_pairs // 5]  # First 20%

In [61]:
def get_embedding(word_index):
    """Retrieve the embedding for a word index or return the embedding for <UNK> if unseen."""
    if word_index >= len(word_embeddings):  # If word_index is out of range
        return word_embeddings[word_to_index['<UNK>']]  # Replace with <UNK> embedding
    return word_embeddings[word_index]

def cosine_similarity(vec1, vec2):
    """Compute cosine similarity between two vectors."""
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))

def compute_rank(pred_embedding, true_word_idx):
    """Compute the rank of the true word in the sorted similarity list."""
    similarities = np.dot(word_embeddings, pred_embedding)
    sorted_indices = np.argsort(similarities)[::-1]  # Sort by descending order
    rank = np.where(sorted_indices == true_word_idx)[0][0] + 1  # +1 to make it 1-based
    return rank

def compute_mrr(test_pairs):
    """Compute Mean Reciprocal Rank (MRR) over all test pairs."""
    total_mrr = 0
    for i, row in test_pairs.iterrows():
        center_word_idx, context_word_idx = row['Center_Word_Index'], row['Context_Word_Index']

        # Get the target embedding (center word) using the index
        center_embedding = get_embedding(center_word_idx)  # Handle unseen words

        # Calculate the rank of the true context word using its index
        rank = compute_rank(center_embedding, context_word_idx)
        
        # Calculate reciprocal rank
        mrr_i = 1 / rank
        total_mrr += mrr_i
        
        # Print progress every 50000 iterations
        if i % 1000 == 0:
            print(f"Processed {i} pairs, total MRR: {total_mrr:.4f}")

    # Final MRR score
    avg_mrr = total_mrr / len(test_pairs)
    return avg_mrr

In [62]:
print(test_pairs.shape)

(403482, 2)


In [64]:
# Run MRR computation on the first 20% of test pairs
mrr_score = compute_mrr(first_20_percent_test_pairs)
print(f'MRR for the first 20% of the test data: {mrr_score:.4f}')

Processed 0 pairs, total MRR: 0.0014
Processed 1000 pairs, total MRR: 2.4936
Processed 2000 pairs, total MRR: 6.8308
Processed 3000 pairs, total MRR: 21.1767
Processed 4000 pairs, total MRR: 31.6735
Processed 5000 pairs, total MRR: 38.8989
Processed 6000 pairs, total MRR: 51.0123
Processed 7000 pairs, total MRR: 64.6896
Processed 8000 pairs, total MRR: 77.6309
Processed 9000 pairs, total MRR: 85.0523
Processed 10000 pairs, total MRR: 92.1855
Processed 11000 pairs, total MRR: 107.8881
Processed 12000 pairs, total MRR: 112.5430
Processed 13000 pairs, total MRR: 118.1966
Processed 14000 pairs, total MRR: 129.0416
Processed 15000 pairs, total MRR: 137.9220
Processed 16000 pairs, total MRR: 143.0388
Processed 17000 pairs, total MRR: 148.1727
Processed 18000 pairs, total MRR: 151.4654
Processed 19000 pairs, total MRR: 159.4704
Processed 20000 pairs, total MRR: 176.3019
Processed 21000 pairs, total MRR: 180.1070
Processed 22000 pairs, total MRR: 193.1161
Processed 23000 pairs, total MRR: 200.

In [67]:
test_pairs_window_4 = pd.read_csv('pairs_test_4.csv')  # Ensure this file has 'Center_Word_Index' and 'Context_Word_Index'

In [68]:
# Select the first 20% of the test pairs for window size 4
num_test_pairs_window_4 = len(test_pairs_window_4)
first_20_percent_test_pairs_window_4 = test_pairs_window_4.iloc[:num_test_pairs_window_4 // 5]  # First 20%

In [69]:
def get_embedding(word_index):
    """Retrieve the embedding for a word index or return the embedding for <UNK> if unseen."""
    if word_index >= len(word_embeddings):  # If word_index is out of range
        return word_embeddings[word_to_index['<UNK>']]  # Replace with <UNK> embedding
    return word_embeddings[word_index]

def compute_rank(pred_embedding, true_word_idx):
    """Compute the rank of the true word in the sorted similarity list."""
    similarities = np.dot(word_embeddings, pred_embedding)
    sorted_indices = np.argsort(similarities)[::-1]  # Sort by descending order
    rank = np.where(sorted_indices == true_word_idx)[0][0] + 1  # +1 to make it 1-based
    return rank

def compute_mrr(test_pairs):
    """Compute Mean Reciprocal Rank (MRR) over all test pairs."""
    total_mrr = 0
    for i, row in test_pairs.iterrows():
        center_word_idx, context_word_idx = row['Center_Word_Index'], row['Context_Word_Index']

        # Get the target embedding (center word) using the index
        center_embedding = get_embedding(center_word_idx)  # Handle unseen words

        # Calculate the rank of the true context word using its index
        rank = compute_rank(center_embedding, context_word_idx)
        
        # Calculate reciprocal rank
        mrr_i = 1 / rank
        total_mrr += mrr_i
        
        # Print progress every 50000 iterations
        if i % 10000 == 0:
            print(f"Processed {i} pairs, total MRR: {total_mrr:.4f}")

    # Final MRR score
    avg_mrr = total_mrr / len(test_pairs)
    return avg_mrr


In [70]:
# Run MRR computation on the first 20% of test pairs for window size 4
mrr_score_window_4 = compute_mrr(first_20_percent_test_pairs_window_4)
print(f'MRR for the first 20% of the test data (window size 4): {mrr_score_window_4:.4f}')

Processed 0 pairs, total MRR: 0.0014
Processed 10000 pairs, total MRR: 75.6442
Processed 20000 pairs, total MRR: 174.9479
Processed 30000 pairs, total MRR: 259.6337
Processed 40000 pairs, total MRR: 352.7514
Processed 50000 pairs, total MRR: 447.0202
Processed 60000 pairs, total MRR: 559.1271
Processed 70000 pairs, total MRR: 653.8185
Processed 80000 pairs, total MRR: 725.3170
Processed 90000 pairs, total MRR: 794.6552
Processed 100000 pairs, total MRR: 861.0494
Processed 110000 pairs, total MRR: 950.8984
Processed 120000 pairs, total MRR: 1013.3488
Processed 130000 pairs, total MRR: 1074.8235
Processed 140000 pairs, total MRR: 1150.1395
Processed 150000 pairs, total MRR: 1220.4922
MRR for the first 20% of the test data (window size 4): 0.0083


In [72]:
# Load the test pairs for window size 5
test_pairs_window_5 = pd.read_csv('pairs_test_5.csv')# Ensure this file has 'Center_Word_Index' and 'Context_Word_Index'
test_pairs_window_5.shape

(976012, 2)

In [73]:
# Select the first 20% of the test pairs for window size 5
num_test_pairs_window_5 = len(test_pairs_window_5)
first_20_percent_test_pairs_window_5 = test_pairs_window_5.iloc[:num_test_pairs_window_5 // 5]  # First 20%

In [75]:
def get_embedding(word_index):
    """Retrieve the embedding for a word index or return the embedding for <UNK> if unseen."""
    if word_index >= len(word_embeddings):  # If word_index is out of range
        return word_embeddings[word_to_index['<UNK>']]  # Replace with <UNK> embedding
    return word_embeddings[word_index]

def compute_rank(pred_embedding, true_word_idx):
    """Compute the rank of the true word in the sorted similarity list."""
    similarities = np.dot(word_embeddings, pred_embedding)
    sorted_indices = np.argsort(similarities)[::-1]  # Sort by descending order
    rank = np.where(sorted_indices == true_word_idx)[0][0] + 1  # +1 to make it 1-based
    return rank

def compute_mrr(test_pairs):
    """Compute Mean Reciprocal Rank (MRR) over all test pairs."""
    total_mrr = 0
    for i, row in test_pairs.iterrows():
        center_word_idx, context_word_idx = row['Center_Word_Index'], row['Context_Word_Index']

        # Get the target embedding (center word) using the index
        center_embedding = get_embedding(center_word_idx)  # Handle unseen words

        # Calculate the rank of the true context word using its index
        rank = compute_rank(center_embedding, context_word_idx)
        
        # Calculate reciprocal rank
        mrr_i = 1 / rank
        total_mrr += mrr_i
        
        # Print progress every 50000 iterations
        if i % 25000 == 0:
            print(f"Processed {i} pairs, total MRR: {total_mrr:.4f}")

    # Final MRR score
    avg_mrr = total_mrr / len(test_pairs)
    return avg_mrr

In [76]:
# Run MRR computation on the first 20% of test pairs for window size 5
mrr_score_window_5 = compute_mrr(first_20_percent_test_pairs_window_5)
print(f'MRR for the first 20% of the test data (window size 5): {mrr_score_window_5:.4f}')

Processed 0 pairs, total MRR: 0.0014
Processed 25000 pairs, total MRR: 222.6207
Processed 50000 pairs, total MRR: 440.0809
Processed 75000 pairs, total MRR: 698.4106
Processed 100000 pairs, total MRR: 912.9227
Processed 125000 pairs, total MRR: 1082.6599
Processed 150000 pairs, total MRR: 1281.1896
Processed 175000 pairs, total MRR: 1446.0686
MRR for the first 20% of the test data (window size 5): 0.0084
