In [1]:
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from collections import defaultdict
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# Register tqdm's pandas integration
tqdm.pandas()

# Pick the compute device: CUDA when requested and present, CPU otherwise
use_gpu = True
if use_gpu and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'device: {device.type}')

# Seed NumPy and PyTorch (set `seed = None` to leave both unseeded)
seed = 1234
if seed is not None:
    print(f'random seed: {seed}')
    for seed_fn in (np.random.seed, torch.manual_seed):
        seed_fn(seed)

device: cuda
random seed: 1234


In [2]:
# Load the pretrained tokenizer and encoder named by `model_name`
model_name = "roberta-base"
print(f"Loading model: {model_name}...")

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Move the encoder to the selected device, then switch it to evaluation mode
model = AutoModel.from_pretrained(model_name)
model = model.to(device)
model.eval()

print("Model loaded.")

Loading model: roberta-base...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded.


In [4]:
# Per-token accumulators: running sum of hidden states and occurrence count
token_sums = defaultdict(lambda: np.zeros(model.config.hidden_size))
token_counts = defaultdict(int)

# Corpus file, read one text per line
dataset_path = 'assignment4-dataset.txt' 

print(f"Processing dataset from {dataset_path}...")

# Maximum number of corpus lines to read (blank lines count toward the limit)
LIMIT_LINES = 100_000

with open(dataset_path, 'r', encoding='utf-8', errors='ignore') as f:
    progress = tqdm(enumerate(f), total=LIMIT_LINES, desc="Computing Token Embeddings")
    for line_idx, raw_line in progress:

        # Stop once LIMIT_LINES lines have been consumed
        if line_idx >= LIMIT_LINES:
            break

        text = raw_line.strip()
        if text == "":
            continue

        # Tokenize a single line (truncated to 512 tokens) and move it to the device
        encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        # Forward pass without gradients; drop the leading batch axis of size 1
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
            token_vectors = hidden_states.squeeze(0).cpu().numpy()

        ids = encoded['input_ids'].squeeze(0).cpu().numpy()

        # Add each position's vector to the accumulator of the token at that position
        for pos in range(len(ids)):
            tid = ids[pos]
            token_sums[tid] += token_vectors[pos]
            token_counts[tid] += 1

# --- AVERAGING & SAVING ---
print("Computing final averages...")
# Static embedding of a token = mean of all contextual vectors observed for it
static_token_embeddings = {
    tid: summed / token_counts[tid] for tid, summed in token_sums.items()
}

print(f"Computed embeddings for {len(static_token_embeddings)} unique tokens.")

# Persist the {token_id: mean_vector} dict
save_path = 'token_embeddings.pt'
torch.save(static_token_embeddings, save_path)
print(f"Save complete! Embeddings saved to {save_path}")

Processing dataset from assignment4-dataset.txt...


Computing Token Embeddings:   0%|          | 0/100000 [00:00<?, ?it/s]

Computing final averages...
Computed embeddings for 39833 unique tokens.
Save complete! Embeddings saved to token_embeddings.pt


In [5]:
def get_word_embedding(word, tokenizer, token_embeddings, prefix_space=False):
    """
    Tokenizes a word and averages the static embeddings of its sub-word tokens.

    Args:
        word: The surface string to embed.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)`` that
            returns a sequence of token ids.
        token_embeddings: Mapping ``token_id -> vector`` of static token embeddings.
        prefix_space: When True, a single space is prepended to ``word`` before
            encoding. Default False keeps the original behaviour.
            NOTE(review): space-sensitive BPE tokenizers (RoBERTa's is documented
            as one) encode a bare word differently from the same word preceded
            by a space, so bare-word ids may not be the ids collected from
            running text. Confirm which lookup is intended.

    Returns:
        The mean of the vectors of all sub-word tokens present in
        ``token_embeddings``. If none is present, a zero vector with the same
        shape as the stored vectors (or length 768 when the mapping is empty).
    """
    text = f" {word}" if prefix_space else word

    # add_special_tokens=False keeps wrapper tokens out of the id list
    token_ids = tokenizer.encode(text, add_special_tokens=False)

    # Only tokens that were actually observed in the corpus contribute
    vectors = [token_embeddings[tid] for tid in token_ids if tid in token_embeddings]
    if vectors:
        # Average the sub-word vectors to get the whole-word vector
        return np.mean(vectors, axis=0)

    # No usable token: return zeros shaped like the stored vectors so every
    # word vector can be stacked into one matrix whatever the model's hidden size.
    reference = next(iter(token_embeddings.values()), None)
    if reference is None:
        return np.zeros(768)
    return np.zeros(np.shape(reference))

# Vocabulary file: one word per line
vocab_path = 'glove.6B.300d-vocabulary.txt'
word_embeddings = {}

print("Building word embeddings from vocabulary...")

# Read the vocabulary as plain text, skipping blank lines
with open(vocab_path, 'r', encoding='utf-8', errors='ignore') as f:
    stripped_lines = (raw.strip() for raw in tqdm(f, desc="Processing Vocab"))
    for word in stripped_lines:
        if word:
            word_embeddings[word] = get_word_embedding(word, tokenizer, static_token_embeddings)

print(f"Computed embeddings for {len(word_embeddings)} words.")

Building word embeddings from vocabulary...


Processing Vocab: 0it [00:00, ?it/s]

Computed embeddings for 400000 words.


In [6]:
# One-slot cache for the normalised vocabulary matrix, so repeated queries do
# not rebuild a (vocab_size, dim) array from the embedding dict on every call.
_VOCAB_MATRIX_CACHE = {}


def _unit_vocab_matrix(embeddings):
    """Return ``(words, unit_rows)`` for ``embeddings``, rebuilding only when needed.

    The cached entry is reused while the same dict object with the same number
    of entries is passed in. Vectors replaced in place without changing the
    dict's size are NOT detected; rebind the dict (or re-run this cell) then.
    """
    entry = _VOCAB_MATRIX_CACHE.get("entry")
    if entry is not None and entry[0] is embeddings and entry[1] == len(embeddings):
        return entry[2], entry[3]

    words = list(embeddings)
    # np.array copies, so the in-place normalisation below never touches the dict's vectors
    unit_rows = np.array([embeddings[w] for w in words], dtype=np.float64)
    norms = np.linalg.norm(unit_rows, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # all-zero rows stay zero, i.e. similarity 0 to everything
    unit_rows /= norms

    _VOCAB_MATRIX_CACHE["entry"] = (embeddings, len(embeddings), words, unit_rows)
    return words, unit_rows


def most_similar(word, k=5, embeddings=None):
    """
    Prints the k most similar words to the input word using cosine similarity.

    Args:
        word: Query word; must be a key of the embedding mapping.
        k: Number of neighbours to print. Values <= 0 print no neighbours.
        embeddings: Mapping ``word -> 1-D vector``. Defaults to the notebook's
            global ``word_embeddings``.

    Returns:
        None. Results are written to stdout.
    """
    if embeddings is None:
        embeddings = word_embeddings

    if word not in embeddings:
        print(f"Word '{word}' not found in vocabulary.")
        return

    query = np.asarray(embeddings[word], dtype=np.float64).ravel()
    query_norm = np.linalg.norm(query)
    if query_norm == 0:
        # A zero vector has cosine similarity 0 with everything, so any
        # "neighbours" printed for it would be arbitrary.
        print(f"Word '{word}' has an all-zero embedding; cannot rank neighbours.")
        return

    vocab_words, unit_rows = _unit_vocab_matrix(embeddings)

    # Rows and query are unit length, so the dot product is the cosine similarity
    sim_scores = unit_rows @ (query / query_norm)

    # Take k+1 candidates because the query word itself normally ranks among
    # them; other words can tie with it at 1.0, so it is skipped by name below
    # rather than by position.
    wanted = max(int(k), 0)
    n_candidates = min(wanted + 1, len(vocab_words))
    top_indices = np.argpartition(sim_scores, -n_candidates)[-n_candidates:]
    top_indices = top_indices[np.argsort(sim_scores[top_indices])[::-1]]

    print(f"Most similar words to '{word}':")
    shown = 0
    for idx in top_indices:
        if shown >= wanted:
            break

        sim_word = vocab_words[idx]

        # Skip the word itself
        if sim_word == word:
            continue

        print(f"  {sim_word}: {sim_scores[idx]:.4f}")
        shown += 1
    print("-" * 30)

# --- Run Examples ---
# Query words used as a quick qualitative check of the neighbour lists;
# replace them if a different set of test words is required.
test_words = ["apple", "king", "happy", "car", "university", "science"]

for query_word in test_words:
    most_similar(query_word)

Most similar words to 'apple':
  applecart: 1.0000
  applewood: 0.9745
  applejack: 0.9705
  appleby: 0.9697
  applebaum: 0.9695
------------------------------
Most similar words to 'king':
  kingdoms: 1.0000
  kingman: 0.9874
  kingma: 0.9869
  kingston: 0.9863
  peking: 0.9857
------------------------------
Most similar words to 'happy':
  unhappy: 0.9631
  happyland: 0.9628
  happyness: 0.9626
  trigger-happy: 0.9580
  witgood: 0.9207
------------------------------
Most similar words to 'car':
  microcar: 1.0000
  carloads: 1.0000
  truecar: 1.0000
  carreon: 1.0000
  carousel: 1.0000
------------------------------
Most similar words to 'university':
  interuniversity: 0.9931
  inter-university: 0.9875
  university-wide: 0.9871
  university-educated: 0.9870
  university-level: 0.9867
------------------------------
Most similar words to 'science':
  materialscience: 1.0000
  neuroscience: 0.9625
  e-science: 0.9546
  science-fiction: 0.9520
  science-related: 0.9515
-----------------