In [None]:
import cupy as cp
import numpy as np
import pandas as pd
import time
import os  # Import os for directory handling
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

# Hyperparameters
embedding_dim = 100
learning_rate = 0.01
num_epochs = 10
batch_size = 64

# Load vocabulary and skipgram pairs
vocabulary_df = pd.read_csv('vocabulary_main.csv')
pairs_df = pd.read_csv('pairs_main.csv')

# Get vocabulary size and map words to indices
vocab_size = len(vocabulary_df)
word_to_index = dict(zip(vocabulary_df['word'], vocabulary_df['index']))

# Prepare skipgram pairs (center and context word indices)
center_words = cp.asarray(pairs_df['Center_Word_Index'].values)
context_words = cp.asarray(pairs_df['Context_Word_Index'].values)

# Initialize word embeddings
target_embeddings = (cp.random.randn(vocab_size, embedding_dim) * 0.01).astype(cp.float32)
context_embeddings = (cp.random.randn(vocab_size, embedding_dim) * 0.01).astype(cp.float32)

# Softmax function
def softmax(x):
    exp_x = cp.exp(x - cp.max(x, axis=1, keepdims=True))
    return exp_x / exp_x.sum(axis=1, keepdims=True)

# Cross-entropy loss
def cross_entropy_loss(predicted, true_index):
    return -cp.log(predicted[cp.arange(len(true_index)), true_index])

# Cosine similarity for evaluation
def cosine_similarity_gpu(a, b):
    a_norm = cp.linalg.norm(a, axis=1)
    b_norm = cp.linalg.norm(b, axis=1)
    return cp.dot(a, b.T) / cp.outer(a_norm, b_norm)

# Evaluation function
def evaluate_embeddings(embeddings, word_to_index, test_words):
    test_indices = [word_to_index[word] for word in test_words if word in word_to_index]
    test_embeddings = embeddings[test_indices]
    similarities = cosine_similarity_gpu(test_embeddings, embeddings)
    return similarities

# Create 'results' directory if it doesn't exist
results_dir = 'results'
if not os.path.exists(results_dir):
    os.makedirs(results_dir)

# Train the model
start_time = time.time()

for epoch in range(num_epochs):
    total_loss = 0
    indices = cp.arange(len(center_words))
    cp.random.shuffle(indices)
    center_words_shuffled = center_words[indices]
    context_words_shuffled = context_words[indices]
    
    progress_bar = tqdm(range(0, len(center_words), batch_size), desc=f"Epoch {epoch+1}/{num_epochs}")
    for i in progress_bar:
        batch_center_words = center_words_shuffled[i:i+batch_size]
        batch_context_words = context_words_shuffled[i:i+batch_size]
        
        # Forward pass
        target_embeddings_batch = target_embeddings[batch_center_words]
        context_scores = cp.dot(target_embeddings_batch, context_embeddings.T)
        predicted_probs = softmax(context_scores)
        
        # Compute loss
        loss = cross_entropy_loss(predicted_probs, batch_context_words)
        total_loss += cp.sum(loss)
        
        # Backward pass
        predicted_probs[cp.arange(len(batch_context_words)), batch_context_words] -= 1
        d_context_embeddings = cp.dot(predicted_probs.T, target_embeddings_batch)
        d_target_embeddings_batch = cp.dot(predicted_probs, context_embeddings)
        
        # Update embeddings
        target_embeddings[batch_center_words] -= learning_rate * d_target_embeddings_batch
        context_embeddings -= learning_rate * d_context_embeddings
        
        progress_bar.set_postfix({'loss': total_loss.get() / (i + batch_size)})
    
    # Learning rate decay
    learning_rate *= 0.9
    
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss.get()/len(center_words)}')
    
    # Save intermediate results in the 'results' directory
    if (epoch + 1) % 5 == 0:
        cp.save(os.path.join(results_dir, f'target_embeddings_epoch_{epoch+1}.npy'), target_embeddings)
        cp.save(os.path.join(results_dir, f'context_embeddings_epoch_{epoch+1}.npy'), context_embeddings)

total_time = time.time() - start_time
print(f'Total training time: {total_time:.2f} seconds')

# Normalize embeddings
target_embeddings /= cp.linalg.norm(target_embeddings, axis=1, keepdims=True)
context_embeddings /= cp.linalg.norm(context_embeddings, axis=1, keepdims=True)

# Save final embeddings in the 'results' directory
cp.save(os.path.join(results_dir, 'target_embeddings_final.npy'), target_embeddings)
cp.save(os.path.join(results_dir, 'context_embeddings_final.npy'), context_embeddings)

# Evaluate embeddings
test_words = ['king', 'queen', 'man', 'woman', 'paris', 'france']
similarities = evaluate_embeddings(target_embeddings, word_to_index, test_words)

print("Sample cosine similarities:")
for i, word in enumerate(test_words):
    most_similar = cp.argsort(similarities[i])[-5:][::-1]
    print(f"{word}: {[list(word_to_index.keys())[idx] for idx in most_similar.get()]}")

print("Training completed and embeddings saved.")

Epoch 1/10:  88%|████████▊ | 113454/129205 [03:23<00:27, 573.19it/s, loss=9.74]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Epoch 2/10:  62%|██████▏   | 80258/129205 [02:21<01:24, 577.07it/s, loss=8.89]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Epoch 3/10:  36%|███▌      | 46722/129205 [01:22<02:21, 581.97it/s, loss=8.67]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.

In [13]:
# Evaluate embeddings
test_words = ['king', 'queen', 'man', 'woman', 'light', 'kick']
similarities = evaluate_embeddings(target_embeddings, word_to_index, test_words)

print("Sample cosine similarities:")
for i, word in enumerate(test_words):
    # Get the number of words in the vocabulary (i.e., the size of similarities)
    num_words_in_vocab = similarities.shape[1]

    # Limit the number of most similar words to the available vocabulary size
    most_similar = cp.argsort(similarities[i])[-min(5, num_words_in_vocab):][::-1]

    # Get the actual words corresponding to the indices of the most similar ones
    similar_words = [list(word_to_index.keys())[idx] for idx in most_similar.get()]

    print(f"{word}: {similar_words}")


Sample cosine similarities:
king: ['king', 'Hairan', 'assyrian', 'lord', 'Esarhaddon']
queen: ['queen', 'Aribi', 'elkhunu', 'lord', 'Zabibe']
man: ['man', 'wound', 'dusk', 'heterosexuality', 'FN']
woman: ['woman', 'sexual', 'unmarried', 'Fornication', 'heterosexuality']
light: ['light', 'shade', 'bactericidal', 'HID', 'diode']
kick: ['kick', 'Kinzer', 'timeout', 'Brotzman', 'kicker']


In [2]:
import pandas as pd
test_pairs = pd.read_csv('pairs_test_5.csv')
test_pairs.head()

Unnamed: 0,Center_Word_Index,Context_Word_Index
0,1097,906
1,906,1097
2,1097,906
3,1097,2240
4,1097,6


In [9]:
import numpy as np
import pandas as pd
from numpy.linalg import norm

# Load saved embeddings
target_embeddings = np.load('results/target_embeddings_final.npy')
context_embeddings = np.load('results/context_embeddings_final.npy')

# Load test pairs (contains indices)
test_pairs = pd.read_csv('pairs_test_2.csv')

len(test_pairs)

403482

In [12]:
# Hyperparameters
window_size = 2  # Context size window
embedding_dim = 100  # Make sure this matches with the model
context_size = 2 * window_size  # Total context size

def cosine_similarity(vec1, vec2):
    """Compute cosine similarity between two vectors."""
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))

def compute_rank(pred_embedding, true_word_idx):
    """Compute the rank of the true word in the sorted similarity list."""
    similarities = np.dot(context_embeddings, pred_embedding)
    sorted_indices = np.argsort(similarities)[::-1]  # Sort by descending order
    rank = np.where(sorted_indices == true_word_idx)[0][0] + 1  # +1 to make it 1-based
    return rank

def compute_mrr(test_pairs, context_size=10):
    """Compute Mean Reciprocal Rank (MRR) over all test pairs."""
    total_mrr = 0
    for i, row in test_pairs.iterrows():
        center_word_idx, context_word_idx = row['Center_Word_Index'], row['Context_Word_Index']

        # Get the target embedding (center word) using the index
        center_embedding = target_embeddings[center_word_idx]

        # Calculate the rank of the true context word using its index
        rank = compute_rank(center_embedding, context_word_idx)
        
        # Calculate reciprocal rank
        mrr_i = 1 / rank
        total_mrr += mrr_i
        if(i%50000 == 0):
            print(f"Window size {window_size}, processed {i} pairs, total MRR: {total_mrr:.4f}")

    # Final MRR score
    avg_mrr = total_mrr / len(test_pairs)
    return avg_mrr

# Run MRR computation
mrr_score = compute_mrr(test_pairs, context_size=context_size)
print(f'MRR for the test data (window size 2): {mrr_score:.4f}')

Window size 2, processed 0 pairs, total MRR: 0.0026
Window size 2, processed 50000 pairs, total MRR: 88.6753
Window size 2, processed 100000 pairs, total MRR: 199.7818
Window size 2, processed 150000 pairs, total MRR: 278.3159
Window size 2, processed 200000 pairs, total MRR: 381.1548
Window size 2, processed 250000 pairs, total MRR: 476.8200
Window size 2, processed 300000 pairs, total MRR: 574.8599
Window size 2, processed 350000 pairs, total MRR: 686.6888
Window size 2, processed 400000 pairs, total MRR: 791.1328
MRR for the test data (window size 2): 0.0020


In [13]:
print(f'MRR for the test data (window size 2): {mrr_score:.4f}')

MRR for the test data (window size 2): 0.0020


In [14]:
# Load test pairs (contains indices)
test_pairs = pd.read_csv('pairs_test_4.csv')

len(test_pairs)

789074

In [15]:
# Hyperparameters
window_size = 4  # Context size window
embedding_dim = 100  # Make sure this matches with the model
context_size = 2 * window_size  # Total context size

def cosine_similarity(vec1, vec2):
    """Compute cosine similarity between two vectors."""
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))

def compute_rank(pred_embedding, true_word_idx):
    """Compute the rank of the true word in the sorted similarity list."""
    similarities = np.dot(context_embeddings, pred_embedding)
    sorted_indices = np.argsort(similarities)[::-1]  # Sort by descending order
    rank = np.where(sorted_indices == true_word_idx)[0][0] + 1  # +1 to make it 1-based
    return rank

def compute_mrr(test_pairs, context_size=10):
    """Compute Mean Reciprocal Rank (MRR) over all test pairs."""
    total_mrr = 0
    for i, row in test_pairs.iterrows():
        center_word_idx, context_word_idx = row['Center_Word_Index'], row['Context_Word_Index']

        # Get the target embedding (center word) using the index
        center_embedding = target_embeddings[center_word_idx]

        # Calculate the rank of the true context word using its index
        rank = compute_rank(center_embedding, context_word_idx)
        
        # Calculate reciprocal rank
        mrr_i = 1 / rank
        total_mrr += mrr_i
        if(i%50000 == 0):
            print(f"Window size {window_size}, processed {i} pairs, total MRR: {total_mrr:.4f}")

    # Final MRR score
    avg_mrr = total_mrr / len(test_pairs)
    return avg_mrr

# Run MRR computation
mrr_score = compute_mrr(test_pairs, context_size=context_size)
print(f'MRR for the test data (window size 4): {mrr_score:.4f}')

Window size 4, processed 0 pairs, total MRR: 0.0026
Window size 4, processed 50000 pairs, total MRR: 151.8821
Window size 4, processed 100000 pairs, total MRR: 232.6650
Window size 4, processed 150000 pairs, total MRR: 333.1237
Window size 4, processed 200000 pairs, total MRR: 523.3636
Window size 4, processed 250000 pairs, total MRR: 635.3796
Window size 4, processed 300000 pairs, total MRR: 728.5030
Window size 4, processed 350000 pairs, total MRR: 849.5308
Window size 4, processed 400000 pairs, total MRR: 1010.0404
Window size 4, processed 450000 pairs, total MRR: 1145.5435
Window size 4, processed 500000 pairs, total MRR: 1239.9683
Window size 4, processed 550000 pairs, total MRR: 1362.1226
Window size 4, processed 600000 pairs, total MRR: 1522.6606
Window size 4, processed 650000 pairs, total MRR: 1677.9166
Window size 4, processed 700000 pairs, total MRR: 1841.1805
Window size 4, processed 750000 pairs, total MRR: 1978.5968
MRR for the test data (window size 2): 0.0026


In [19]:
print(f'MRR for the test data (window size 4): 0.0026')

MRR for the test data (window size 4): 0.0026


In [16]:
# Load test pairs (contains indices)
test_pairs = pd.read_csv('pairs_test_5.csv')

len(test_pairs)

976012

In [17]:
# Hyperparameters
window_size = 5  # Context size window
embedding_dim = 100  # Make sure this matches with the model
context_size = 2 * window_size  # Total context size

def cosine_similarity(vec1, vec2):
    """Compute cosine similarity between two vectors."""
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))

def compute_rank(pred_embedding, true_word_idx):
    """Compute the rank of the true word in the sorted similarity list."""
    similarities = np.dot(context_embeddings, pred_embedding)
    sorted_indices = np.argsort(similarities)[::-1]  # Sort by descending order
    rank = np.where(sorted_indices == true_word_idx)[0][0] + 1  # +1 to make it 1-based
    return rank

def compute_mrr(test_pairs, context_size=10):
    """Compute Mean Reciprocal Rank (MRR) over all test pairs."""
    total_mrr = 0
    for i, row in test_pairs.iterrows():
        center_word_idx, context_word_idx = row['Center_Word_Index'], row['Context_Word_Index']

        # Get the target embedding (center word) using the index
        center_embedding = target_embeddings[center_word_idx]

        # Calculate the rank of the true context word using its index
        rank = compute_rank(center_embedding, context_word_idx)
        
        # Calculate reciprocal rank
        mrr_i = 1 / rank
        total_mrr += mrr_i
        if(i%50000 == 0):
            print(f"Window size {window_size}, processed {i} pairs, total MRR: {total_mrr:.4f}")

    # Final MRR score
    avg_mrr = total_mrr / len(test_pairs)
    return avg_mrr

# Run MRR computation
mrr_score = compute_mrr(test_pairs, context_size=context_size)
print(f'MRR for the test data (window size 5): {mrr_score:.4f}')

Window size 5, processed 0 pairs, total MRR: 0.0026
Window size 5, processed 50000 pairs, total MRR: 165.7831
Window size 5, processed 100000 pairs, total MRR: 269.6007
Window size 5, processed 150000 pairs, total MRR: 349.8160
Window size 5, processed 200000 pairs, total MRR: 498.7737
Window size 5, processed 250000 pairs, total MRR: 704.7958
Window size 5, processed 300000 pairs, total MRR: 837.4665
Window size 5, processed 350000 pairs, total MRR: 952.3520
Window size 5, processed 400000 pairs, total MRR: 1066.4647
Window size 5, processed 450000 pairs, total MRR: 1193.5025
Window size 5, processed 500000 pairs, total MRR: 1343.2588
Window size 5, processed 550000 pairs, total MRR: 1503.7496
Window size 5, processed 600000 pairs, total MRR: 1616.0950
Window size 5, processed 650000 pairs, total MRR: 1724.7557
Window size 5, processed 700000 pairs, total MRR: 1877.9371
Window size 5, processed 750000 pairs, total MRR: 2070.0777
Window size 5, processed 800000 pairs, total MRR: 2237.3

In [18]:
print(f'MRR for the test data (window size 5): {mrr_score:.4f}')

MRR for the test data (window size 5): 0.0029
