# Word Embeddings

In this notebook, we'll put into practice the concepts explored in the embedding directory:

- [Bag-of-Words (BoW)](./bow.md)
- [TF-IDF](./tfidf.md)
- [Word2Vec](./word2vec.md)
- [nn.Embedding](./nnembedding.md)

We will start from the simplest count-based methods and move towards learned dense embeddings. We'll use the brown corpus from NLTK at 10000 randomly-sampled sentences, with the subsampling thing we talked about in Word2Vec.


In [1]:
from collections import Counter
from nltk.corpus import brown
import torch.optim as optim
import torch.nn as nn
import numpy as np
import random
import torch
import math

# Set seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

In [None]:
text_data = [" ".join(s) for s in random.sample(sorted(brown.sents()), 10000)]

#  Count frequencies
tokens = []
for sentence in text_data:
    tokens.extend(sentence.lower().split())

word_counts = Counter(tokens)

# drop super infrequent words
min_count = 5
tokens = [t for t in tokens if word_counts[t] >= min_count]

# update these based on the filtered list
word_counts = Counter(tokens)
total_count = len(tokens)

# NOW build the vocab
vocab = {word: i for i, word in enumerate(set(tokens))}
idx2word = {i: word for word, i in vocab.items()}
VOCAB_SIZE = len(vocab)

# Subsampling Threshold
# standard values are usually between 1e-3 and 1e-5
threshold = 1e-5


def should_keep(word):
    count = word_counts[word]
    freq = count / total_count

    # The Word2Vec formula for keeping a word
    # P(keep) = sqrt(threshold / freq)
    # (If freq is small, this is > 1, so we always keep it)
    p_keep = math.sqrt(threshold / (freq + 1e-8))

    return random.random() < p_keep


# Generate skip-gram pairs
WINDOW_SIZE = 5
data = []

for sentence in text_data:
    sent_tokens = sentence.lower().split()

    # Filter and close the gaps, so "the cat sat" might become "cat sat" if "the" is dropped
    subsampled_tokens = [w for w in sent_tokens if w in vocab and should_keep(w)]

    # Generate pairs from the CLEANED list
    for i, word in enumerate(subsampled_tokens):
        center_idx = vocab[word]

        for w in range(-WINDOW_SIZE, WINDOW_SIZE + 1):
            context_pos = i + w
            # bounds check on the NEW length
            if w != 0 and 0 <= context_pos < len(subsampled_tokens):
                context_word = subsampled_tokens[context_pos]
                context_idx = vocab[context_word]
                data.append((center_idx, context_idx))

print(f"Vocab Size: {VOCAB_SIZE}")
print(f"Number of training pairs: {len(data)}")
print(f"Sample pairs: {[(idx2word[c], idx2word[t]) for c, t in data[:3]]}")

Vocab Size: 4244
Number of training pairs: 72558
Sample pairs: [('he', 'wanting'), ('wanting', 'he'), ('group', 'willing')]


## 1. Bag-of-Words (BoW)

As discussed in [bow.md](./bow.md), BoW represents text as a fixed-length vector of word counts. It ignores grammar and word order.

Let's implement a simple BoW vectorizer from scratch.


In [None]:
class BoWVectorizer:
    def __init__(self):
        self.vocab = {}
        self.idx2word = {}

    def build_vocab(
        self, corpus
    ):  # A rlly simple vocab builder, but since we alrd did this up there, we skip it
        # Tokenize and build vocabulary
        unique_words = set()
        for doc in corpus:
            # Simple tokenization: lowercase and split by space
            tokens = doc.lower().split()
            unique_words.update(tokens)

        # Assign indices
        self.vocab = {word: i for i, word in enumerate(sorted(list(unique_words)))}
        self.idx2word = {i: word for word, i in self.vocab.items()}
        print(f"Vocabulary Size: {len(self.vocab)}")
        print(f"Vocab: {self.vocab}")

    def transform(self, corpus):
        vectors = []
        for doc in corpus:
            tokens = doc.lower().split()
            # Create a vector of zeros
            vec = [0] * len(self.vocab)

            # Count frequencies
            for token in tokens:
                if token in self.vocab:
                    idx = self.vocab[token]
                    vec[idx] += 1
            vectors.append(vec)
        return np.array(vectors)


bow = BoWVectorizer()
bow.vocab = vocab
bow.idx2word = idx2word
bow_vectors = bow.transform(text_data)

print("\nBoW Vectors:")
print(bow_vectors)


BoW Vectors:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## 2. TF-IDF

BoW gives too much weight to common words like "the". [TF-IDF](./tfidf.md) addresses this by re-weighting counts based on how rare a word is across documents.

$$ \text{tfidf}(t, d) = \text{tf}(t,d)\times \text{idf}(t) $$


In [None]:
class TFIDFVectorizer(BoWVectorizer):
    def __init__(self):
        super().__init__()

        # keeping idf as an array aligned with vocab indices is way faster than a dict
        self.idf = None

    def fit(self, corpus):
        if not (self.idx2word or self.vocab):
            self.build_vocab(corpus)
        N = len(corpus)

        # get the raw counts first (n_docs, n_vocab)
        # this assumes super().transform() returns a numpy array
        tf_matrix = super().transform(corpus)

        # calculate doc freq (df) by checking where count > 0
        # axis=0 collapses the rows (docs), giving us count per word
        df = np.sum(tf_matrix > 0, axis=0)

        # vectorised idf calc. added small epsilon to avoid div by zero
        self.idf = np.log(N / (df + 1e-9))

    def transform(self, corpus):
        tf_vectors = super().transform(corpus)

        # numpy broadcasting automatically multiplies the idf vector
        return tf_vectors * self.idf


tfidf = TFIDFVectorizer()
tfidf.vocab = vocab
tfidf.idx2word = idx2word
tfidf.fit(text_data)
tfidf_vectors = tfidf.transform(text_data)

print("\nTF-IDF Vectors:")
print(np.round(tfidf_vectors, 3))


TF-IDF Vectors:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## 3. Word2Vec (Skip-Gram)

Moving to dense embeddings. As described in [word2vec.md](./word2vec.md), we'll implement an SGNS model. The goal is to predict context words given a center word. We will implement this using PyTorch.


In [None]:
# 1. Define Model (SGNS)
class SkipGramNSModel(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        # Two embeddings: one for when the word is 'center', one for 'context'
        self.center_embeddings = nn.Embedding(vocab_size, embed_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embed_dim)

    def forward(self, center_idxs, context_idxs):
        # center_idxs: [batch_size]
        # context_idxs: [batch_size]

        center_embeds = self.center_embeddings(center_idxs)  # [batch, dim]
        context_embeds = self.context_embeddings(context_idxs)  # [batch, dim]

        # Dot product between center and context vectors
        # Result shape: [batch]
        scores = torch.sum(center_embeds * context_embeds, dim=1)
        return scores


# Configuration
VOCAB_SIZE = len(vocab)
EMBED_DIM = 200
NUM_NEGATIVES = 5  # Number of negative samples per positive pair

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SkipGramNSModel(VOCAB_SIZE, EMBED_DIM).to(device)

# SGNS uses Binary Cross Entropy (with Logits for stability)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

# 3. Train
print("\nTraining Word2Vec (SGNS)...")

# 1. Calculate Unigram Distribution
# flatten the data to count word occurrences
all_indices = [idx for pair in data for idx in pair]
counts = Counter(all_indices)

# create a frequency tensor sorted by vocab index
freqs = torch.zeros(VOCAB_SIZE)
for idx in range(VOCAB_SIZE):
    freqs[idx] = counts.get(idx, 1)  # default to 1 to avoid zero division/errors

# apply the magic 3/4 power
unigram_weights = freqs.pow(0.75)
# normalize so it sums to 1 (probabilities)
unigram_weights = unigram_weights / unigram_weights.sum()

print("Unigram weights ready.")
print("\nTraining with Unigram Negatives...")
for epoch in range(201):
    random.shuffle(data)

    # positive samples
    pos_centers = torch.LongTensor([p[0] for p in data]).to(device)
    pos_contexts = torch.LongTensor([p[1] for p in data]).to(device)
    pos_labels = torch.ones(len(data)).to(device)

    # negative samples
    neg_centers = pos_centers.repeat(NUM_NEGATIVES)

    # sample from our calculated weights instead of uniform random
    # we need (batch_size * num_negatives) samples
    num_neg_samples = len(data) * NUM_NEGATIVES

    # torch.multinomial samples indices based on the probability distribution we made
    neg_contexts = torch.multinomial(
        unigram_weights, num_neg_samples, replacement=True
    ).to(device)

    neg_labels = torch.zeros(num_neg_samples).to(device)

    all_centers = torch.cat([pos_centers, neg_centers])
    all_contexts = torch.cat([pos_contexts, neg_contexts])
    all_labels = torch.cat([pos_labels, neg_labels])

    optimizer.zero_grad()
    output = model(all_centers, all_contexts)
    loss = criterion(output, all_labels)
    loss.backward()
    optimizer.step()

    if epoch % 50 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# 4. Inspect Embeddings
print("\nLearned Embeddings (first 3 words):")
with torch.no_grad():
    for i in range(3):
        vec = model.center_embeddings.weight[i]
        print(f"Word Index {i}: {vec.cpu().numpy().round(2)}")


Training Word2Vec (SGNS)...
Unigram weights ready.

Training with Unigram Negatives...
Epoch 0, Loss: 5.6791
Epoch 50, Loss: 0.0380
Epoch 100, Loss: 0.0306
Epoch 150, Loss: 0.0309
Epoch 200, Loss: 0.0322

Learned Embeddings (first 3 words):
Word Index 0: [ 0.58  0.05  1.22 -1.68 -0.19 -0.02  0.24  0.31  0.83  0.33  0.96 -0.72
 -0.54 -1.77 -1.25 -1.06  0.43  1.4  -0.41  0.04  0.03  0.01  0.3   1.18
  0.62  0.12  0.39  2.5   1.1  -0.74 -0.47  0.01 -0.71 -1.    0.53  0.62
 -0.68 -0.53  0.92  0.94 -1.32  1.47 -0.55  0.5  -0.72  0.38 -0.93 -1.65
 -0.81  0.21 -1.18  0.42 -0.75  0.28  0.5   0.43  0.19  0.74 -0.88  0.7
  1.14  0.61  0.18  0.74 -0.04 -0.22  0.17  0.31  0.39  1.19 -0.92  0.39
  0.93  0.5  -2.05  1.23  1.44  0.82 -0.39 -0.93  0.18  0.39 -1.34 -2.14
  0.8   0.33  0.33 -0.18  0.64 -1.66 -0.75 -0.31 -0.43 -0.71  1.43  0.03
 -0.62  0.4   0.79  0.21  0.37 -0.55 -0.23  0.13  0.09  1.06 -0.27  0.32
  0.94 -0.39  0.24  0.47 -1.31 -1.28 -2.79  0.76  0.6   0.38 -0.95  1.28
  0.07  1.48 -0

## 4. nn.Embedding & Hashed Embedding

As mentioned in [nnembedding.md](./nnembedding.md), standard `nn.Embedding` is just a lookup table.

Here, we'll try implementing the hash embedding concept we went over as the optimization.


In [None]:
class HashedEmbedding(nn.Module):
    def __init__(self, num_buckets, embed_dim, num_hashes=2):
        super().__init__()
        self.num_buckets = num_buckets
        self.num_hashes = num_hashes
        self.embed_dim = embed_dim

        self.table = nn.Embedding(num_buckets, embed_dim)
        self.importance = nn.Embedding(num_buckets, 1)

        self.register_buffer(
            "primes", torch.tensor([2654435761, 2246822507])[:num_hashes]
        )
        self.register_buffer(
            "weight_primes", torch.tensor([387420489, 982451653])[:num_hashes]
        )

    def forward(self, indices):
        batch_sum = torch.zeros(indices.shape[0], self.embed_dim, device=indices.device)
        for i in range(self.num_hashes):
            hashed_idx = (indices * self.primes[i] + i) % self.num_buckets
            weight_idx = (indices * self.weight_primes[i] + i) % self.num_buckets
            vec = self.table(hashed_idx)
            p_w = torch.sigmoid(self.importance(weight_idx))
            batch_sum += vec * p_w
        return batch_sum


# Define the Hashed SGNS Model
class HashedSkipGramNSModel(nn.Module):
    def __init__(self, vocab_size, num_buckets, embed_dim):
        super().__init__()
        # instead of allocating VOCAB_SIZE vectors, we allocate NUM_BUCKETS
        # for both center and context parts
        self.center_embeddings = HashedEmbedding(num_buckets, embed_dim)
        self.context_embeddings = HashedEmbedding(num_buckets, embed_dim)

    def forward(self, center_idxs, context_idxs):
        # the HashedEmbedding.forward() returns a vector [batch, dim]
        # just like nn.Embedding does
        center_embeds = self.center_embeddings(center_idxs)
        context_embeds = self.context_embeddings(context_idxs)

        # dot product
        scores = torch.sum(center_embeds * context_embeds, dim=1)
        return scores


# Configuration
# lets assume vocab is huge but we only want small memory usage
VOCAB_SIZE = len(vocab)
PHYSICAL_BUCKETS = VOCAB_SIZE // 8  # effectively compresses the vocab
EMBED_DIM = 200
NUM_NEGATIVES = 5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# init the new model
hashed_model = HashedSkipGramNSModel(VOCAB_SIZE, PHYSICAL_BUCKETS, EMBED_DIM).to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(hashed_model.parameters(), lr=0.01)

# Calculate Unigram Distribution
# flatten the data to count word occurrences
all_indices = [idx for pair in data for idx in pair]
counts = Counter(all_indices)

# create a frequency tensor sorted by vocab index
freqs = torch.zeros(VOCAB_SIZE)
for idx in range(VOCAB_SIZE):
    freqs[idx] = counts.get(idx, 1)  # default to 1 to avoid zero division/errors

# apply the magic 3/4 power
unigram_weights = freqs.pow(0.75)
# normalize so it sums to 1 (probabilities)
unigram_weights = unigram_weights / unigram_weights.sum()

print("Unigram weights ready.")
optimizer = optim.Adam(hashed_model.parameters(), lr=0.1)

print("\nTraining with Unigram Negatives...")
for epoch in range(201):
    random.shuffle(data)

    # positive samples
    pos_centers = torch.LongTensor([p[0] for p in data]).to(device)
    pos_contexts = torch.LongTensor([p[1] for p in data]).to(device)
    pos_labels = torch.ones(len(data)).to(device)

    # negative samples
    # repeat centers to match num_negatives
    neg_centers = pos_centers.repeat(NUM_NEGATIVES)

    # sample from our calculated weights instead of uniform random
    # we need (batch_size * num_negatives) samples
    num_neg_samples = len(data) * NUM_NEGATIVES

    # torch.multinomial samples indices based on the probability distribution we made
    neg_contexts = torch.multinomial(
        unigram_weights, num_neg_samples, replacement=True
    ).to(device)

    neg_labels = torch.zeros(num_neg_samples).to(device)

    all_centers = torch.cat([pos_centers, neg_centers])
    all_contexts = torch.cat([pos_contexts, neg_contexts])
    all_labels = torch.cat([pos_labels, neg_labels])

    optimizer.zero_grad()
    output = hashed_model(all_centers, all_contexts)
    loss = criterion(output, all_labels)
    loss.backward()
    optimizer.step()

    if epoch % 50 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

print("\nLearned Hashed Embeddings (SGNS):")

with torch.no_grad():
    for i in range(3):  # print first 3 words

        # make a tensor for the index (shape [1])
        idx_tensor = torch.tensor([i], dtype=torch.long).to(device)
        vec = model.center_embeddings(idx_tensor)

        print(f"Word {i}: {vec.squeeze().cpu().numpy().round(2)}")

Unigram weights ready.

Training with Unigram Negatives...
Epoch 0, Loss: 3.2161
Epoch 50, Loss: 0.3715
Epoch 100, Loss: 0.3179
Epoch 150, Loss: 0.2885
Epoch 200, Loss: 0.2781

Learned Hashed Embeddings (SGNS):
Word 0: [ 0.58  0.05  1.22 -1.68 -0.19 -0.02  0.24  0.31  0.83  0.33  0.96 -0.72
 -0.54 -1.77 -1.25 -1.06  0.43  1.4  -0.41  0.04  0.03  0.01  0.3   1.18
  0.62  0.12  0.39  2.5   1.1  -0.74 -0.47  0.01 -0.71 -1.    0.53  0.62
 -0.68 -0.53  0.92  0.94 -1.32  1.47 -0.55  0.5  -0.72  0.38 -0.93 -1.65
 -0.81  0.21 -1.18  0.42 -0.75  0.28  0.5   0.43  0.19  0.74 -0.88  0.7
  1.14  0.61  0.18  0.74 -0.04 -0.22  0.17  0.31  0.39  1.19 -0.92  0.39
  0.93  0.5  -2.05  1.23  1.44  0.82 -0.39 -0.93  0.18  0.39 -1.34 -2.14
  0.8   0.33  0.33 -0.18  0.64 -1.66 -0.75 -0.31 -0.43 -0.71  1.43  0.03
 -0.62  0.4   0.79  0.21  0.37 -0.55 -0.23  0.13  0.09  1.06 -0.27  0.32
  0.94 -0.39  0.24  0.47 -1.31 -1.28 -2.79  0.76  0.6   0.38 -0.95  1.28
  0.07  1.48 -0.46 -0.16  0.46 -0.09  1.03  0.41  0.

## Comparing the embeddings


In [None]:
from scipy.spatial.distance import cosine
import numpy as np


# helper function cuz scipy cosine gives "distance" (0 is identical)
def get_similarity(vec1, vec2):
    if np.all(vec1 == 0) or np.all(vec2 == 0):
        return 0.0
    return 1 - cosine(vec1, vec2)


pairs = [
    ("paris", "france"),
    ("king", "queen"),
    ("king", "government"),
    ("dog", "government"),
]

# filter pairs to make sure they exist in our vocab
valid_pairs = []
for w1, w2 in pairs:
    if w1 in vocab and w2 in vocab:
        valid_pairs.append((w1, w2))
    else:
        print(f"skipping ({w1}, {w2}) - not in vocab")

print(f"\n--- Comparing Embedding Spaces ---")
print(f"{'Pair':<20} | {'BoW':<8} | {'TF-IDF':<8} | {'SkipGram':<8} | {'Hashed':<8}")
print("-" * 65)

for w1, w2 in valid_pairs:
    idx1, idx2 = vocab[w1], vocab[w2]

    v_bow1 = bow_vectors[:, idx1]
    v_bow2 = bow_vectors[:, idx2]
    sim_bow = get_similarity(v_bow1, v_bow2)

    v_tfidf1 = tfidf_vectors[:, idx1]
    v_tfidf2 = tfidf_vectors[:, idx2]
    sim_tfidf = get_similarity(v_tfidf1, v_tfidf2)

    # Standard SkipGram
    with torch.no_grad():
        v_sg1 = model.center_embeddings(torch.tensor(idx1).to(device)).cpu().numpy()
        v_sg2 = model.center_embeddings(torch.tensor(idx2).to(device)).cpu().numpy()
    sim_sg = get_similarity(v_sg1, v_sg2)

    # Hashed SkipGram
    with torch.no_grad():
        # we gotta unsqueeze inputs cuz your hashed embedding expects batches
        v_hash1 = (
            hashed_model.center_embeddings(torch.tensor([idx1]).to(device))
            .cpu()
            .squeeze()
            .numpy()
        )
        v_hash2 = (
            hashed_model.center_embeddings(torch.tensor([idx2]).to(device))
            .cpu()
            .squeeze()
            .numpy()
        )
    sim_hash = get_similarity(v_hash1, v_hash2)

    print(
        f"{w1}-{w2:<14} | {sim_bow:.3f}    | {sim_tfidf:.3f}    | {sim_sg:.3f}    | {sim_hash:.3f}"
    )


--- Comparing Embedding Spaces ---
Pair                 | BoW      | TF-IDF   | SkipGram | Hashed  
-----------------------------------------------------------------
paris-france         | 0.000    | 0.000    | 0.129    | 0.088
king-queen          | 0.000    | 0.000    | 0.182    | 0.095
king-government     | 0.000    | 0.000    | 0.214    | 0.129
dog-government     | 0.000    | 0.000    | 0.194    | 0.155


As you can see, there's definitely some level of fidelity loss to the embedding quality when you reduce the number of "actual" vocabulary, but when you scale up your training, this problem eventually amortizes.
