# Word2Vec from Scratch (Skip-Gram + Negative Sampling)
Pure NumPy implementation. Skip-gram with negative sampling (SGNS) trained on text8.

In [None]:
%pip install numpy matplotlib

## Smoke Test — Tiny Corpus
Trains on a 6-word sentence to verify the full pipeline works end-to-end.

In [None]:
import matplotlib.pyplot as plt
from main import train
from evaluate import cosine_similarity

# Smoke test: fit a deliberately tiny model so the whole pipeline
# (vocab -> training -> similarity -> plotting) runs in a moment.
tiny_config = dict(
    embedding_dim=5,
    window_size=1,
    num_negatives=2,
    learning_rate=0.1,
    num_epochs=100,
    min_count=1,
    batch_size=32,
)
W, w2i, i2w, losses = train("the cat sat on the mat", **tiny_config)

# With window_size=1, "cat" and "on" both neighbour {"the", "sat"}, while
# "cat" and "mat" only have "the" in common.
cat_vec = W[w2i['cat']]

sim_cat_on = cosine_similarity(cat_vec, W[w2i['on']])
print(f"cat ↔ on  : {sim_cat_on:.4f}  (share 2 contexts)")

sim_cat_mat = cosine_similarity(cat_vec, W[w2i['mat']])
print(f"cat ↔ mat : {sim_cat_mat:.4f}  (share 1 context)")

# One loss value per entry in `losses`, plotted against a 1-based epoch index.
epochs = range(1, len(losses) + 1)
fig, ax = plt.subplots(figsize=(7, 3))
ax.plot(epochs, losses)
ax.set_xlabel("Epoch")
ax.set_ylabel("Avg loss")
ax.set_title("Training loss — tiny corpus")
fig.tight_layout()
plt.show()

## Full Training on text8
~17M tokens, vocab ~71k words. Trains in ~15–20 min on CPU.  
If a saved model already exists under `model/`, it is loaded instead of retraining.

In [None]:
import os
import json
import matplotlib.pyplot as plt
from main import train, save_model, load_model

MODEL_PATH  = "model/model"      # path prefix: model/model.npy + model/model.json
LOSSES_PATH = "model/losses.json"
CORPUS_FILE = "text8"

os.makedirs("model", exist_ok=True)

if os.path.exists(f"{MODEL_PATH}.npy"):
    # A trained model is already on disk: reuse it instead of retraining.
    W, w2i, i2w = load_model(MODEL_PATH)
    # The loss history lives in a separate file that can be missing even when
    # the model exists (e.g. it was deleted, or the run stopped before it was
    # written). That should not prevent using the loaded embeddings.
    if os.path.exists(LOSSES_PATH):
        with open(LOSSES_PATH, encoding="utf-8") as f:
            losses = json.load(f)
    else:
        losses = []
        print(f"No loss history found at {LOSSES_PATH}; skipping the loss plot.")
else:
    # Explicit encoding so reading the corpus does not depend on the
    # platform's default locale.
    with open(CORPUS_FILE, encoding="utf-8") as f:
        corpus = f.read()
    W, w2i, i2w, losses = train(corpus, embedding_dim=100, window_size=5,
                                 num_negatives=5, learning_rate=0.025, num_epochs=5,
                                 batch_size=512)
    # Persist both the model and the loss curve so the next run takes the
    # cached branch above.
    save_model(W, w2i, MODEL_PATH)
    with open(LOSSES_PATH, "w", encoding="utf-8") as f:
        json.dump(losses, f)

# Plot one point per recorded loss value; nothing to draw without a history.
if losses:
    plt.figure(figsize=(7, 3))
    plt.plot(range(1, len(losses) + 1), losses, marker='o')
    plt.xlabel("Epoch")
    plt.ylabel("Avg loss")
    plt.title("Training loss — text8")
    plt.tight_layout()
    plt.show()

## Nearest Neighbours

In [None]:
from evaluate import find_nearest, normalize_embeddings

# Normalise once up front; the same matrix is passed to every lookup below.
W_normed = normalize_embeddings(W)

for query in ("king", "woman", "computer"):
    # Words that did not make it into the vocabulary are skipped silently.
    if query not in w2i:
        continue
    hits = find_nearest(query, w2i, i2w, W_normed)
    # Each hit unpacks as a pair; only its first element (the word) is shown.
    names = [name for name, _ in hits]
    print(f"{query}: {names}")

## Analogies  (a : b :: c : ?)

In [None]:
from evaluate import analogy

# Each triple (a, b, c) poses the question "a is to b as c is to ?".
tests = [
    ("man", "king", "woman"),
    ("france", "paris", "england"),
    ("good", "better", "bad"),
]

for a, b, c in tests:
    # Skip the query when any of its three words is out of vocabulary.
    if any(term not in w2i for term in (a, b, c)):
        continue
    answer = analogy(a, b, c, w2i, i2w, W_normed)
    print(f"{a} : {b}  ::  {c} : {answer}")

## Analogy Accuracy

In [None]:
from evaluate import eval_analogies

# Hand-picked (a, b, c, expected) quadruples, grouped by the relation they
# probe. Groups are flattened in insertion order to build ANALOGY_TESTS.
_ANALOGY_GROUPS = {
    "semantic: capitals": [
        ("france", "paris", "england", "london"),
        ("germany", "berlin", "france", "paris"),
        ("italy", "rome", "france", "paris"),
    ],
    "semantic: gender": [
        ("man", "king", "woman", "queen"),
        ("man", "actor", "woman", "actress"),
        ("man", "father", "woman", "mother"),
        ("man", "brother", "woman", "sister"),
    ],
    "syntactic: comparative": [
        ("good", "better", "bad", "worse"),
        ("great", "greater", "small", "smaller"),
        ("big", "bigger", "small", "smaller"),
    ],
    "syntactic: verb tense": [
        ("walk", "walked", "run", "ran"),
        ("go", "went", "buy", "bought"),
    ],
    "syntactic: plurals": [
        ("man", "men", "woman", "women"),
        ("child", "children", "dog", "dogs"),
    ],
}

ANALOGY_TESTS = [
    quad
    for group in _ANALOGY_GROUPS.values()
    for quad in group
]

# eval_analogies hands back three values, reported below as
# "<correct>/<total>" plus `acc` formatted as a percentage.
acc, correct, total = eval_analogies(ANALOGY_TESTS, w2i, i2w, W_normed)
print(f"Analogy accuracy: {correct}/{total}  ({acc:.1%})")