# Skip-gram Word2Vec from Scratch
This notebook implements the **Skip-gram with Negative Sampling (SGNS)** architecture using only NumPy. We cover:
1. **Text Preprocessing**: Fetching and cleaning Gutenberg data.
2. **Subsampling**: Reducing the influence of frequent stop words.
3. **Negative Sampling**: Efficiently training against 'noise' words.
4. **Backpropagation**: Manual gradient derivations for embeddings.

In [None]:
import numpy as np
import string
import nltk
import random
import requests
import re
from typing import Union, List, Optional, Tuple, Dict

## 1. Data Acquisition & Cleaning
We fetch *Frankenstein* from Project Gutenberg and remove the metadata headers/footers to ensure we are only training on the literary prose.

In [None]:
def clean_gutenberg(text: str) -> List[str]:
    """
    Cleans Gutenberg ebook text by removing metadata and punctuation.

    The body between the "*** START OF THE PROJECT GUTENBERG EBOOK ... ***"
    and "*** END OF THE PROJECT GUTENBERG EBOOK ... ***" lines is extracted
    first, and punctuation is stripped afterwards. Doing it in this order
    matters: stripping punctuation shortens the string, so offsets computed
    on the raw text would no longer point at the right characters.

    Args:
        text: Raw string from Gutenberg URL.
    Returns:
        List of lowercase word tokens. If the markers cannot be located, a
        warning is printed and the whole text is tokenized instead.
    """
    # Accept any title/ID after "EBOOK" (e.g. "84" or the full book title),
    # and both the "THE" and the older "THIS" wording of the marker line.
    start_re = re.compile(
        r"\*\*\*\s*START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK[^*\n]*\*\*\*"
    )
    end_re = re.compile(
        r"\*\*\*\s*END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK[^*\n]*\*\*\*"
    )

    start_match = start_re.search(text)
    # Only look for the end marker after the start marker.
    end_match = end_re.search(text, start_match.end()) if start_match else None

    if start_match and end_match:
        text = text[start_match.end():end_match.start()]
    else:
        print("Warning: Markers not found. Using full text.")

    # Drop everything that is not a word character or whitespace. Note that
    # `\w` also matches digits and underscores, so those are kept.
    text = re.sub(r'[^\w\s]', '', text)
    return text.lower().split()

# Download the plain-text file for Project Gutenberg ebook #84 and tokenize it.
url = "https://www.gutenberg.org/files/84/84-0.txt"
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection; raise_for_status() stops an HTTP error page from being
# silently tokenized as if it were the corpus.
response = requests.get(url, timeout=30)
response.raise_for_status()
raw_data = response.text
text_tokens = clean_gutenberg(raw_data)
print(f"Corpus size: {len(text_tokens)} tokens.")

## 2. The Tokenizer
The Tokenizer maps words to unique integer IDs and calculates the global unigram frequency distribution.

In [None]:
class Tokenizer:
    """
    Maps words to IDs and maintains frequency distributions for sampling.

    IDs 0 and 1 are reserved for "<PAD>" and "<UNK>"; every other word gets
    the next free ID in order of first appearance in the corpus.

    Attributes:
        token_to_id: word -> integer ID.
        id_to_token: integer ID -> word.
        vocab_size: number of IDs, including the two reserved ones.
        tokens: the corpus encoded as an int32 array of IDs.
        frequencies: float32 array of length `vocab_size` holding each ID's
            relative frequency in the corpus (all zeros for an empty corpus).
    """
    def __init__(self, text: List[str]) -> None:
        self.token_to_id = {"<PAD>": 0, "<UNK>": 1}
        self.id_to_token = {0: "<PAD>", 1: "<UNK>"}

        # Build Vocab: the next free ID is always the current vocab size.
        for word in text:
            if word not in self.token_to_id:
                new_id = len(self.token_to_id)
                self.token_to_id[word] = new_id
                self.id_to_token[new_id] = word

        self.vocab_size = len(self.token_to_id)
        self.tokens = self.tokenize(text)

        # Compute frequencies: count occurrences per ID, then normalize.
        counts = np.bincount(self.tokens, minlength=self.vocab_size)
        self.frequencies = counts.astype(np.float32)
        total = np.sum(self.frequencies)
        # Guard against 0/0 -> NaN when the corpus is empty.
        if total > 0:
            self.frequencies /= total

    def tokenize(self, words: Union[str, List[str]]) -> Union[np.integer, np.ndarray]:
        """
        Convert a word or a list of words to IDs.

        Unknown words map to ID 1 ("<UNK>"). A single string yields a single
        int32 scalar; a list yields an int32 array.
        """
        single = isinstance(words, str)
        if single:
            words = [words]

        ids = np.array([self.token_to_id.get(w, 1) for w in words], dtype=np.int32)
        return ids[0] if single else ids

    def detokenize(self, tokens: Union[int, List[int]]) -> Union[str, List[str]]:
        """
        Convert an ID or a sequence of IDs back to words.

        IDs that are not in the vocabulary map to "<UNK>". A single integer
        yields a single string; a sequence yields a list of strings.
        """
        single = isinstance(tokens, (int, np.integer))
        if single:
            tokens = [tokens]

        words = [self.id_to_token.get(int(t), "<UNK>") for t in tokens]
        return words[0] if single else words

## 3. Dataset Optimizations
To make Word2Vec efficient, we use:

### Subsampling
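Each occurrence of a word $w_i$ is dropped with probability:
$$P(w_i) = 1 - \sqrt{\frac{t}{f(w_i)}}$$
where $f(w_i)$ is the word's relative frequency in the corpus and $t$ is the subsampling threshold. When this value is zero or negative (i.e. $f(w_i) \le t$), the word is always kept.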

### Negative Sampling
Instead of calculating probabilities for the entire vocabulary, we sample $K$ noise words from a smoothed distribution:
$$P_{ns}(w) = \frac{f(w)^{0.75}}{\sum_{v \in V} f(v)^{0.75}}$$

In [None]:
class SkipGramDataset:
    """
    Generates (center, context, negative) triplets for training.

    Construction subsamples the token stream, builds the smoothed negative
    sampling distribution and materializes every training example up front.

    Attributes:
        filtered_tokens: list of token IDs that survived subsampling.
        neg_table: per-ID sampling probability, proportional to
            frequency ** 0.75.
        centers: 1-D array of center IDs, one entry per example.
        contexts: 1-D array of context IDs, aligned with `centers`.
        negatives: array of shape (num_examples, num_negatives) with the
            noise IDs drawn for each example.
    """
    def __init__(
        self,
        tokens: np.ndarray,
        tokenizer: "Tokenizer",
        subsample_threshold: float = 1e-5,
        max_window_size: int = 5,
        num_negatives: int = 5
    ) -> None:
        """
        Args:
            tokens: corpus as a sequence of integer token IDs.
            tokenizer: object exposing `frequencies` (per-ID relative
                frequency) and `vocab_size`.
            subsample_threshold: the `t` in the drop probability
                1 - sqrt(t / f(w)).
            max_window_size: upper bound of the per-center random window.
            num_negatives: number of noise IDs drawn per example.
        """
        self.tokenizer = tokenizer
        self.max_window_size = max_window_size
        self.num_negatives = num_negatives

        # Subsampling Logic. Frequencies are clipped away from zero so the
        # division is defined for IDs that never occur in the corpus.
        freqs = np.clip(tokenizer.frequencies, 1e-10, None)
        drop_probs = 1 - np.sqrt(subsample_threshold / freqs)
        # A non-positive drop probability means the token is always kept.
        self.filtered_tokens = [t for t in tokens if random.random() >= drop_probs[t]]

        # Negative Sampling Distribution (Smoothed)
        prob = tokenizer.frequencies ** 0.75
        self.neg_table = prob / prob.sum()

        self.centers, self.contexts, self.negatives = self._generate_examples()

    def _generate_examples(self):
        """
        Build the aligned (centers, contexts, negatives) arrays.

        For every kept token a window radius is drawn uniformly from
        1..max_window_size, and each neighbour inside that window yields one
        (center, context) pair. Noise IDs are drawn from `neg_table`
        independently of the pair, so they may occasionally coincide with
        the center or the context ID.
        """
        centers, contexts = [], []
        n = len(self.filtered_tokens)

        for i, center in enumerate(self.filtered_tokens):
            # Dynamic Window
            window = np.random.randint(1, self.max_window_size + 1)
            start, end = max(0, i - window), min(n, i + window + 1)

            for j in range(start, end):
                if i == j:
                    continue
                centers.append(center)
                contexts.append(self.filtered_tokens[j])

        num_pairs = len(centers)
        if num_pairs == 0:
            # Keep the documented shapes even when there is nothing to train on.
            return (
                np.empty(0, dtype=np.int64),
                np.empty(0, dtype=np.int64),
                np.empty((0, self.num_negatives), dtype=np.int64),
            )

        # Negative Sampling: one batched draw for all pairs. Calling
        # np.random.choice with `p` once per pair redoes work proportional to
        # the vocabulary size on every call.
        negatives = np.random.choice(
            self.tokenizer.vocab_size,
            size=(num_pairs, self.num_negatives),
            p=self.neg_table,
        )

        return np.array(centers), np.array(contexts), negatives

    def get_batches(self, batch_size: int):
        """
        Return a freshly shuffled list of (centers, contexts, negatives)
        tuples, each holding at most `batch_size` examples.
        """
        idx = np.arange(len(self.centers))
        np.random.shuffle(idx)

        batches = []
        for i in range(0, len(idx), batch_size):
            b_idx = idx[i : i + batch_size]
            batches.append((self.centers[b_idx], self.contexts[b_idx], self.negatives[b_idx]))
        return batches

## 4. The Model (SGNS)
We maintain two sets of embeddings: $V$ (Input/Center) and $U$ (Output/Context).

### Objective Function
$$\mathcal{L} = -\log \sigma(u_{context}^\top v_{center}) - \sum_{i=1}^K \log \sigma(-u_{neg_i}^\top v_{center})$$

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + e^-x), with x clipped to [-15, 15] first.

    The clip bounds the argument passed to `np.exp`. Accepts scalars or
    arrays; arrays are processed element-wise.
    """
    bounded = np.clip(x, -15, 15)
    return 1 / (1 + np.exp(-bounded))

class EmbeddingLayer:
    """Embedding table whose gradients are collected per row and applied sparsely.

    `weights` has shape (vocab_size, dim). Gradients are queued with
    `accumulate` and only touch the table when `apply_grads` is called.
    """
    def __init__(self, vocab_size, dim):
        # Small random initial values (standard normal scaled by 0.01).
        self.weights = 0.01 * np.random.randn(vocab_size, dim)
        self.zero_grad()

    def zero_grad(self):
        """Discard any queued gradients."""
        self.grad_acc = None
        self.indices = None

    def accumulate(self, grad, indices):
        """Queue gradient rows for the given embedding rows.

        `grad` is flattened to (-1, dim) and `indices` to 1-D, so callers may
        pass higher-rank arrays as long as the two line up row for row.
        """
        embed_dim = self.weights.shape[1]
        flat_grad = np.asarray(grad).reshape(-1, embed_dim)
        flat_idx = np.asarray(indices).reshape(-1)

        if self.grad_acc is not None:
            # Append to whatever was queued earlier in this step.
            flat_grad = np.concatenate([self.grad_acc, flat_grad])
            flat_idx = np.concatenate([self.indices, flat_idx])

        self.grad_acc, self.indices = flat_grad, flat_idx

    def apply_grads(self, lr):
        """Take one gradient-descent step of size `lr`, then clear the queue.

        Does nothing if no gradients have been queued.
        """
        if self.grad_acc is None:
            return

        # Sum the gradients of rows that were queued more than once.
        rows, row_slot = np.unique(self.indices, return_inverse=True)
        summed = np.zeros((len(rows), self.weights.shape[1]))
        np.add.at(summed, row_slot, self.grad_acc)

        # `rows` holds no duplicates, so a plain fancy-index update is safe.
        self.weights[rows] -= lr * summed
        self.zero_grad()

class SkipGram:
    """Skip-gram model with negative sampling.

    Holds two embedding tables: `in_embed` is looked up for center words and
    `out_embed` for context and negative (noise) words.
    """
    def __init__(self, vocab_size, dim):
        self.in_embed = EmbeddingLayer(vocab_size, dim)
        self.out_embed = EmbeddingLayer(vocab_size, dim)

    def forward(self, centers, contexts, negatives):
        """Compute the batch loss and queue its gradients on both tables.

        Args:
            centers: center-word IDs, shape (B,).
            contexts: context-word IDs, shape (B,).
            negatives: noise-word IDs, shape (B, K).
        Returns:
            The negative-sampling loss averaged over the batch.

        Despite the name, this also performs the backward pass: gradients
        (divided by the batch size) are handed to `accumulate` on the
        embedding layers. The weights themselves are not modified here.
        """
        n_examples = len(centers)

        center_vecs = self.in_embed.weights[centers]      # (B, E)
        context_vecs = self.out_embed.weights[contexts]   # (B, E)
        noise_vecs = self.out_embed.weights[negatives]    # (B, K, E)
        center_bcast = center_vecs[:, np.newaxis, :]      # (B, 1, E)

        # sigma(u_context . v_center), kept as a (B, 1) column
        pos_prob = sigmoid((center_vecs * context_vecs).sum(axis=1, keepdims=True))

        # sigma(-u_neg . v_center) for each of the K noise words, (B, K)
        neg_prob = sigmoid(-(noise_vecs * center_bcast).sum(axis=2))

        # Loss; the 1e-9 keeps the logs finite.
        log_likelihood = np.log(pos_prob + 1e-9) + np.log(neg_prob + 1e-9).sum(
            axis=1, keepdims=True
        )
        loss = -log_likelihood.mean()

        # Derivatives of the loss with respect to the raw scores.
        d_pos = pos_prob - 1                          # (B, 1)
        d_neg = (1 - neg_prob)[:, :, np.newaxis]      # (B, K, 1)

        grad_center = d_pos * context_vecs + (d_neg * noise_vecs).sum(axis=1)
        grad_context = d_pos * center_vecs
        grad_noise = d_neg * center_bcast

        self.in_embed.accumulate(grad_center / n_examples, centers)
        self.out_embed.accumulate(grad_context / n_examples, contexts)
        self.out_embed.accumulate(grad_noise / n_examples, negatives)

        return loss

## 5. Training Loop
We iterate through the dataset and update the embeddings via SGD.

In [None]:
def train(model, dataset, epochs, lr, batch_size=256):
    """Train `model` on `dataset` with mini-batch SGD.

    Args:
        model: object exposing `forward(centers, contexts, negatives)` that
            returns the batch loss, plus `in_embed` / `out_embed` layers with
            an `apply_grads(lr)` method.
        dataset: object exposing `get_batches(batch_size=...)` returning a
            list of (centers, contexts, negatives) tuples.
        epochs: number of passes over the dataset.
        lr: learning rate passed to `apply_grads`.
        batch_size: number of examples per batch (defaults to 256).

    The average batch loss is printed after the first epoch and after every
    fifth epoch. Nothing is returned.
    """
    for epoch in range(epochs):
        batches = dataset.get_batches(batch_size=batch_size)
        num_batches = len(batches)
        total_loss = 0.0
        for centers, contexts, negatives in batches:
            total_loss += model.forward(centers, contexts, negatives)
            model.in_embed.apply_grads(lr)
            model.out_embed.apply_grads(lr)

        if epoch == 0 or (epoch + 1) % 5 == 0:
            # An empty dataset has no meaningful average; report NaN rather
            # than dividing by zero.
            avg_loss = total_loss / num_batches if num_batches else float("nan")
            print(f"Epoch {epoch+1} | Avg Loss: {avg_loss:.4f}")

# Run Training
# The vocabulary and training stream are built from the first 20,000 corpus
# tokens only.
tok = Tokenizer(text=text_tokens[:20000])
ds = SkipGramDataset(tokens=tok.tokens, tokenizer=tok, num_negatives=5)
sg_model = SkipGram(vocab_size=tok.vocab_size, dim=50)

# 50 passes over the dataset with a learning rate of 0.2.
train(model=sg_model, dataset=ds, epochs=50, lr=0.2)

## 6. Inference: Similarity Search
Using Cosine Similarity to find words that occupy similar semantic space.

In [None]:
def get_similar(word, model, tok, k=5):
    """Print the `k` words whose input embeddings are closest to `word`.

    Closeness is cosine similarity between rows of `model.in_embed.weights`.

    Args:
        word: query word.
        model: object exposing `in_embed.weights` (one row per token ID).
        tok: tokenizer exposing `token_to_id`, `tokenize` and `detokenize`.
        k: number of neighbours to print.
    Returns:
        The string "<word> not in vocab" when the word is unknown; otherwise
        None (results are printed).
    """
    if word not in tok.token_to_id:
        return f"{word} not in vocab"

    weights = model.in_embed.weights
    norm = np.linalg.norm(weights, axis=1, keepdims=True)
    # The epsilon avoids dividing by zero for all-zero rows.
    norm_weights = weights / (norm + 1e-9)

    query_id = int(tok.tokenize(word))
    sims = norm_weights @ norm_weights[query_id]

    # Exclude the query word explicitly. Assuming it is always the single
    # best match breaks when another row ties with it (e.g. identical
    # embeddings), which would list the query itself and drop a neighbour.
    ranked = sims.copy()
    ranked[query_id] = -np.inf
    num_results = max(0, min(k, len(ranked) - 1))
    # A stable sort on the negated scores gives descending similarity with
    # ties broken by lowest token ID, so the output is deterministic.
    top_idx = np.argsort(-ranked, kind="stable")[:num_results]

    print(f"Similar to '{word}':")
    for i in top_idx:
        print(f" - {tok.detokenize(i)} ({sims[i]:.4f})")

# Show the nearest neighbours of "monster" in the trained embedding space.
get_similar(word="monster", model=sg_model, tok=tok)