In [2]:
import cupy as cp
import numpy as np
import pandas as pd
import time
import random

In [3]:
print(f"Using CuPy version: {cp.__version__}")

Using CuPy version: 13.3.0


In [4]:
# Hyperparameters
embedding_dim = 100
learning_rate = 0.01
num_epochs = 5
window_size = 5  # Context window size
num_negative_samples = 5  # Number of negative samples
batch_size = 64

In [5]:
# Load vocabulary from CSV
vocab_df = pd.read_csv('vocabulary_main.csv')
word_to_index = dict(zip(vocab_df['word'], vocab_df['index']))
index_to_word = dict(zip(vocab_df['index'], vocab_df['word']))

In [7]:
# Get the vocabulary size
vocab_size = len(vocab_df)
vocab_size

59419

In [8]:
# Load skip-gram pairs from CSV
pairs_df = pd.read_csv('pairs_main.csv')
pairs = [(row['Center_Word_Index'], row['Context_Word_Index']) for _, row in pairs_df.iterrows()]

In [9]:
# Use only the first 20% of the dataset
num_pairs = len(pairs)
pairs = pairs[:int(0.2 * num_pairs)]
print(f"Training on {len(pairs)} pairs (20% of the original dataset)")

Training on 1653814 pairs (20% of the original dataset)


In [10]:
# Assuming word frequencies are known for negative sampling
word_freqs = cp.random.rand(vocab_size)  # Replace with actual word frequencies
word_freqs = word_freqs / cp.sum(word_freqs)

# Negative sampling distribution (raising to 3/4 power for better performance)
neg_sample_probs = cp.power(word_freqs, 3/4)
neg_sample_probs = neg_sample_probs / cp.sum(neg_sample_probs)

In [11]:
# Skip-gram model definition
class SkipGramModel:
    def __init__(self, vocab_size, embedding_dim):
        self.in_embeddings = cp.random.uniform(-1, 1, (vocab_size, embedding_dim)).astype(cp.float32)
        self.out_embeddings = cp.random.uniform(-1, 1, (vocab_size, embedding_dim)).astype(cp.float32)

    def forward(self, center_word, context_word, negative_samples):
        center_embedding = self.in_embeddings[center_word]
        context_embedding = self.out_embeddings[context_word]
        negative_embedding = self.out_embeddings[negative_samples]

        positive_score = cp.sum(center_embedding * context_embedding, axis=1)
        positive_score = self.sigmoid(positive_score)

        negative_score = cp.einsum('bnd,bd->bn', negative_embedding, center_embedding)
        negative_score = self.sigmoid(-negative_score)

        loss = -cp.log(positive_score + 1e-9) - cp.sum(cp.log(negative_score + 1e-9), axis=1)

        return loss.mean()

    def backward(self, center_word, context_word, negative_samples, learning_rate):
        center_embedding = self.in_embeddings[center_word]
        context_embedding = self.out_embeddings[context_word]
        negative_embedding = self.out_embeddings[negative_samples]

        positive_score = cp.sum(center_embedding * context_embedding, axis=1)
        positive_score = self.sigmoid(positive_score)

        negative_score = cp.einsum('bnd,bd->bn', negative_embedding, center_embedding)
        negative_score = self.sigmoid(-negative_score)

        # Gradients
        grad_center = (positive_score - 1)[:, cp.newaxis] * context_embedding + \
                      cp.einsum('bn,bnd->bd', negative_score, negative_embedding)
        grad_context = (positive_score - 1)[:, cp.newaxis] * center_embedding
        grad_negative = cp.einsum('bn,bd->bnd', negative_score, center_embedding)

        # Update embeddings
        self.in_embeddings[center_word] -= learning_rate * grad_center
        self.out_embeddings[context_word] -= learning_rate * grad_context
        
        # Use a loop to update negative samples
        for i in range(negative_samples.shape[0]):
            for j in range(negative_samples.shape[1]):
                self.out_embeddings[negative_samples[i, j]] -= learning_rate * grad_negative[i, j]

    @staticmethod
    def sigmoid(x):
        return 1 / (1 + cp.exp(-x))

# Function to get negative samples
def get_negative_samples(batch_size, num_negative_samples, vocab_size, neg_sample_probs):
    return cp.random.choice(vocab_size, size=(batch_size, num_negative_samples), p=neg_sample_probs.get()).astype(cp.int32)

# Generate batches of data
def generate_batches(pairs, batch_size):
    random.shuffle(pairs)
    for i in range(0, len(pairs), batch_size):
        batch = pairs[i:i + batch_size]
        center_words = cp.array([pair[0] for pair in batch], dtype=cp.int32)
        context_words = cp.array([pair[1] for pair in batch], dtype=cp.int32)
        yield center_words, context_words


In [12]:
# Initialize the model
model = SkipGramModel(vocab_size, embedding_dim)

# Training loop with timing and progress tracking
total_batches = len(pairs) // batch_size + (1 if len(pairs) % batch_size != 0 else 0)

for epoch in range(num_epochs):
    start_time = time.time()  # Start time of the epoch
    total_loss = 0
    batches_processed = 0

    for center_words, context_words in generate_batches(pairs, batch_size):
        negative_samples = get_negative_samples(len(center_words), num_negative_samples, vocab_size, neg_sample_probs)

        loss = model.forward(center_words, context_words, negative_samples)
        model.backward(center_words, context_words, negative_samples, learning_rate)

        total_loss += loss.item()
        batches_processed += 1

        # Print progress
        progress = batches_processed / total_batches
        print(f"\rEpoch {epoch + 1}, Progress: [{('=' * int(50 * progress)):50s}] {progress:.1%}", end="")

    end_time = time.time()  # End time of the epoch
    epoch_time = end_time - start_time  # Time taken for this epoch

    print(f"\rEpoch {epoch + 1}, Loss: {total_loss:.4f}, Time Taken: {epoch_time:.2f} seconds")

# Save embeddings
cp.save('in_embeddings_skipgram_cupy.npy', model.in_embeddings)
cp.save('out_embeddings_skipgram_cupy.npy', model.out_embeddings)

print("Training completed and embeddings saved.")

Training completed and embeddings saved.


In [16]:
import cupy as cp
import pandas as pd
from numpy.linalg import norm

# Load saved embeddings from skip-gram model (as numpy, then convert to cupy)
in_embeddings = cp.asarray(np.load('in_embeddings_skipgram_cupy.npy'))
out_embeddings = cp.asarray(np.load('out_embeddings_skipgram_cupy.npy'))

# Function to compute cosine similarity between two vectors using cupy
def cosine_similarity(vec1, vec2):
    return cp.dot(vec1, vec2) / (cp.linalg.norm(vec1) * cp.linalg.norm(vec2))

# Function to compute the rank of the true word in the sorted similarity list using cupy
def compute_rank(pred_embedding, true_word_idx, out_embeddings):
    similarities = cp.dot(out_embeddings, pred_embedding)
    sorted_indices = cp.argsort(similarities)[::-1]  # Sort by descending similarity
    rank = cp.where(sorted_indices == true_word_idx)[0][0] + 1  # +1 to make it 1-based
    return rank

# Function to compute Mean Reciprocal Rank (MRR) using cupy
def compute_mrr(test_pairs, in_embeddings, out_embeddings):
    total_mrr = 0
    num_samples = len(test_pairs)

    for i, row in test_pairs.iterrows():
        center_word_idx = row['Center_Word_Index']
        context_word_idx = row['Context_Word_Index']

        # Get the embedding for the center word from the GPU
        center_embedding = in_embeddings[center_word_idx]

        # Compute the rank of the true context word
        rank = compute_rank(center_embedding, context_word_idx, out_embeddings)

        # Add reciprocal rank to total MRR
        total_mrr += 1 / rank

        # Print progress every 10,000 samples
        if i % 10000 == 0:
            print(f"Processed {i} test pairs, current MRR: {total_mrr / (i + 1):.4f}")

    avg_mrr = total_mrr / num_samples
    return avg_mrr

# Load the test pairs for window size 2, 4, and 5
pairs_test_2 = pd.read_csv('pairs_test_2.csv')
pairs_test_4 = pd.read_csv('pairs_test_4.csv')
pairs_test_5 = pd.read_csv('pairs_test_5.csv')

# Subset to 20% of the data (you can change this to load the full data if needed)
num_samples_2 = int(0.2 * len(pairs_test_2))
num_samples_4 = int(0.2 * len(pairs_test_4))
num_samples_5 = int(0.2 * len(pairs_test_5))

pairs_test_2_subset = pairs_test_2.iloc[:num_samples_2]
pairs_test_4_subset = pairs_test_4.iloc[:num_samples_4]
pairs_test_5_subset = pairs_test_5.iloc[:num_samples_5]

# Compute MRR for window size 2
print("Computing MRR for window size 2...")
mrr_2 = compute_mrr(pairs_test_2_subset, in_embeddings, out_embeddings)
print(f"MRR for window size 2: {mrr_2:.4f}")

# Compute MRR for window size 4
print("Computing MRR for window size 4...")
mrr_4 = compute_mrr(pairs_test_4_subset, in_embeddings, out_embeddings)
print(f"MRR for window size 4: {mrr_4:.4f}")

# Compute MRR for window size 5
print("Computing MRR for window size 5...")
mrr_5 = compute_mrr(pairs_test_5_subset, in_embeddings, out_embeddings)
print(f"MRR for window size 5: {mrr_5:.4f}")

print("MRR calculations completed for all window sizes.")

Computing MRR for window size 2...
Processed 0 test pairs, current MRR: 0.0004
Processed 10000 test pairs, current MRR: 0.0008
Processed 20000 test pairs, current MRR: 0.0007
Processed 30000 test pairs, current MRR: 0.0007
Processed 40000 test pairs, current MRR: 0.0008
Processed 50000 test pairs, current MRR: 0.0008
Processed 60000 test pairs, current MRR: 0.0009
Processed 70000 test pairs, current MRR: 0.0009
Processed 80000 test pairs, current MRR: 0.0008
MRR for window size 2: 0.0008
Computing MRR for window size 4...
Processed 0 test pairs, current MRR: 0.0004
Processed 10000 test pairs, current MRR: 0.0005
Processed 20000 test pairs, current MRR: 0.0008
Processed 30000 test pairs, current MRR: 0.0007
Processed 40000 test pairs, current MRR: 0.0007
Processed 50000 test pairs, current MRR: 0.0007
Processed 60000 test pairs, current MRR: 0.0007
Processed 70000 test pairs, current MRR: 0.0007
Processed 80000 test pairs, current MRR: 0.0007
Processed 90000 test pairs, current MRR: 0.0