In [1]:
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'svg'")
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import re
seaborn.set_theme()


In [2]:
def tokenize(text, lowercase=True, keep_apostrophes=True, split_hyphens=False):
    """Split ``text`` into word and punctuation tokens.

    Args:
        text: The string to tokenize.
        lowercase: When true, ``text`` is lowercased before tokenizing.
        keep_apostrophes: When true (the default), apostrophe suffixes are
            left alone so contractions stay whole (``"don't"``). When false,
            an apostrophe followed by a single word character at a word
            boundary (``'s``, ``'t``) is removed before tokenizing.
        split_hyphens: When true, hyphenated words are split and the hyphen
            is emitted as its own token; when false they stay one token.

    Returns:
        The tokens in order of appearance. Words (optionally with internal
        apostrophes/hyphens) and single punctuation characters are tokens;
        whitespace and bare ``"'"`` tokens are dropped.
    """
    if lowercase:
        text = text.lower()

    # The suffix strip must run only when the caller asked NOT to keep
    # apostrophes; running it when the flag is true mangles "don't" -> "don".
    if not keep_apostrophes:
        text = re.sub(r"'\w\b", "", text)

    if split_hyphens:
        # Words may carry one apostrophe suffix; hyphens match as punctuation.
        pattern = r"\w+(?:'\w+)?|[^\w\s]"
    else:
        # Words may be chained with any number of hyphens/apostrophes.
        pattern = r"\w+(?:[-']\w+)*|[^\w\s]"

    return [t for t in re.findall(pattern, text) if t and t != "'"]


In [3]:
def create_id_to_word(tokens: list[str]) -> dict[int, str]:
    """Map consecutive integer ids (from 0) to the distinct tokens, sorted."""
    vocabulary = sorted(set(tokens))
    return dict(enumerate(vocabulary))

In [4]:
def one_hot_encode(id, vocab_size):
    """Return a list of ``vocab_size`` zeros with a 1 at position ``id``.

    Uses ordinary list indexing, so a negative ``id`` counts from the end and
    an ``id`` outside the list raises ``IndexError``.
    """
    vector = [0 for _ in range(vocab_size)]
    vector[id] = 1
    return vector

In [5]:
def create_word_to_id(tokens: list[str]) -> dict[str, int]:
    """Map each distinct token to its position in the sorted vocabulary."""
    vocabulary = sorted(set(tokens))
    return dict(zip(vocabulary, range(len(vocabulary))))

In [6]:
import gensim.downloader as api

# Load the "text8" corpus through the gensim downloader.
# NOTE(review): text8 is presumably a large corpus (millions of tokens) —
# confirm its size; everything downstream is built from the full token list.
dataset = api.load("text8")

# Flatten the dataset's sentences into one list of words.
tokens = [word for sentence in dataset for word in sentence]



# Build both vocabulary lookups from the same tokens. Each helper enumerates
# sorted(set(tokens)), so the ids in the two mappings agree.
word_to_id = create_word_to_id(tokens)
id_to_word = create_id_to_word(tokens)

In [7]:
def human_readable_vectors(centre_vec: list[int], context_vec: list[int],
                         id_to_word_map: dict[int, str]) -> str:
    """Render a (centre, context) vector pair as ``"centre → context"``.

    Each vector is decoded by looking up the index of its largest entry
    (``np.argmax``) in ``id_to_word_map``.
    """
    def _decode(vector):
        # Index of the maximum entry -> word.
        return id_to_word_map[np.argmax(vector)]

    source = _decode(centre_vec)
    target = _decode(context_vec)
    return f"{source} → {target}"


In [8]:
def generate_training_data(tokens, word_to_id, window=2):
    """Build one-hot (centre, context) training pairs for skip-gram.

    For every position ``i`` in ``tokens``, each token at most ``window``
    positions to the left or right of ``i`` (excluding ``i`` itself) forms
    one pair. Pairs are emitted in token order, left context before right.

    Args:
        tokens: Sequence of tokens (the corpus, in order).
        word_to_id: Mapping from token to integer id. Its length is used as
            the width of the one-hot vectors.
        window: Maximum distance between a centre word and a context word.

    Returns:
        ``(X, y)``: two integer arrays of shape ``(n_pairs, len(word_to_id))``.
        Row ``k`` of ``X`` is the one-hot centre word and row ``k`` of ``y``
        the one-hot context word of pair ``k``. With no pairs the arrays have
        shape ``(0, len(word_to_id))``.

    Raises:
        KeyError: If a token that takes part in a pair is not in ``word_to_id``.
        IndexError: If an id in ``word_to_id`` does not fit the vector width.
    """
    vocab_size = len(word_to_id)
    n_tokens = len(tokens)

    # Collect only the integer ids while walking the corpus; the one-hot rows
    # are written in one vectorised step below instead of building a Python
    # list of length vocab_size for every pair.
    centre_ids = []
    context_ids = []
    for i in range(n_tokens):
        first = max(0, i - window)
        last = min(n_tokens, i + window + 1)
        for j in range(first, last):
            if j == i:  # the centre word is not its own context
                continue
            centre_ids.append(word_to_id[tokens[i]])
            context_ids.append(word_to_id[tokens[j]])

    n_pairs = len(centre_ids)
    rows = np.arange(n_pairs)
    # Explicit integer dtype so an empty id list is still a valid index array.
    centre_ids = np.asarray(centre_ids, dtype=np.intp)
    context_ids = np.asarray(context_ids, dtype=np.intp)

    X = np.zeros((n_pairs, vocab_size), dtype=int)
    y = np.zeros((n_pairs, vocab_size), dtype=int)
    X[rows, centre_ids] = 1
    y[rows, context_ids] = 1
    return X, y


In [9]:
def init_network(vocab_size, n_embedding, seed=None):
    """Create randomly initialised weights for the two-layer network.

    Args:
        vocab_size: Number of words in the vocabulary (input/output width).
        n_embedding: Size of the embedding (hidden) layer.
        seed: Optional seed for reproducible initialisation. When ``None``
            (the default) the weights are drawn from NumPy's global random
            state, so behaviour matches earlier versions of this function
            and still honours any ``np.random.seed`` set by the caller.

    Returns:
        dict with two standard-normal arrays:
        ``"w1"`` of shape ``(vocab_size, n_embedding)`` and
        ``"w2"`` of shape ``(n_embedding, vocab_size)``.
    """
    if seed is None:
        # Legacy path: global RNG, w1 drawn before w2.
        w1 = np.random.randn(vocab_size, n_embedding)
        w2 = np.random.randn(n_embedding, vocab_size)
    else:
        # A dedicated generator keeps seeded runs independent of global state.
        rng = np.random.default_rng(seed)
        w1 = rng.standard_normal((vocab_size, n_embedding))
        w2 = rng.standard_normal((n_embedding, vocab_size))
    return {"w1": w1, "w2": w2}


In [None]:
# Build the one-hot (centre word, context word) training pairs, pairing each
# token with the tokens up to 2 positions to its left and right.
# NOTE(review): X and y are dense arrays with one row per pair and one column
# per vocabulary word; for a large corpus this may not fit in memory — confirm.
X, y = generate_training_data(tokens, word_to_id, window=2)

# Vocabulary size = number of distinct tokens; used to size the network.
vocab_size = len(word_to_id)

In [None]:
def softmax(x, epsilon=1e-10):
    """Row-wise softmax of a 2-D array of scores.

    The maximum of each row is subtracted before exponentiating so large
    scores do not overflow, and ``epsilon`` is added to each row's
    normalising sum.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    exp_x = np.exp(x - row_max)  # shifted for numerical stability
    normaliser = np.sum(exp_x, axis=1, keepdims=True) + epsilon
    return exp_x / normaliser


In [None]:
def cross_entropy(z, y):
    """Total cross-entropy between predictions ``z`` and targets ``y``.

    ``z`` is clipped to ``[1e-10, 1 - 1e-10]`` so the logarithm never sees 0,
    and the result is summed over every element (not averaged).
    """
    lower, upper = 1e-10, 1-1e-10
    log_probs = np.log(np.clip(z, lower, upper))  # clip avoids log(0)
    weighted = y * log_probs
    return -np.sum(weighted)

In [None]:
def backward(model, X, y, alpha, clip_value=5.0):
    """Run one gradient-descent step on the whole batch and return the loss.

    Args:
        model: dict holding the weight arrays ``"w1"`` and ``"w2"``; both are
            updated in place.
        X: Batch of input rows fed to ``forward``.
        y: Target rows, same shape as the network output.
        alpha: Learning rate.
        clip_value: Each gradient entry is clipped to
            ``[-clip_value, clip_value]`` before the update.

    Returns:
        Cross-entropy of the predictions computed *before* the weights were
        updated.
    """
    activations = forward(model, X)
    probs = activations["z"]

    # Gradients, summed over the batch by the matrix products.
    grad_logits = probs - y
    grad_w2 = activations["a1"].T @ grad_logits
    grad_hidden = grad_logits @ model["w2"].T
    grad_w1 = X.T @ grad_hidden

    # Both gradients are fully computed above, so clipping and applying them
    # one layer at a time gives the same result as clipping both first.
    for key, grad in (("w1", grad_w1), ("w2", grad_w2)):
        model[key] -= alpha * np.clip(grad, -clip_value, clip_value)

    return cross_entropy(probs, y)

In [None]:
def forward(model, X, return_cache=True):
    """Forward pass of the two-layer network.

    Computes ``a1 = X @ w1``, ``a2 = a1 @ w2`` and ``z = softmax(a2)``.

    Returns:
        When ``return_cache`` is truthy, a dict with keys ``"a1"``, ``"a2"``
        and ``"z"``; otherwise only ``z``.
    """
    hidden = X @ model["w1"]
    logits = hidden @ model["w2"]
    probs = softmax(logits)

    if not return_cache:
        return probs
    return {"a1": hidden, "a2": logits, "z": probs}

In [None]:
# Size of the embedding (hidden) layer.
n_embedding = 10
# NOTE(review): no random seed is set in the visible cells before this call,
# so the initial weights (and the results below) differ between runs.
model = init_network(vocab_size, n_embedding)

# Training parameters
n_iter = 50  # number of full-batch gradient steps
learning_rate = 0.05

# Loss recorded at every iteration; plotted in a later cell.
history = []
for i in range(n_iter):
    # One step over the whole of X/y: forward pass, in-place weight update,
    # and the cross-entropy measured before that update.
    loss = backward(model, X, y, learning_rate)
    history.append(loss)
    # Report progress every 10th iteration.
    if i % 10 == 0:
        print(f"Iteration {i}: Loss = {loss:.4f}")


In [None]:
# Plot the recorded training loss against the iteration index.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(range(len(history)), history, color="skyblue")
ax.set_title("Training Loss Over Iterations", pad=20)
ax.set_xlabel("Iteration")
ax.set_ylabel("Cross Entropy Loss")
ax.grid(True, alpha=0.3)
plt.show()


In [None]:
# One-hot encode the query word "deepmind".
# NOTE(review): raises KeyError if "deepmind" is not in the vocabulary —
# confirm the corpus contains it. The variable is called `learning` although
# the word looked up is "deepmind".
learning = one_hot_encode(word_to_id["deepmind"], len(word_to_id))
# Run the network on a single-row batch and keep that row's output
# probabilities.
result = forward(model, [learning], return_cache=False)[0]

# Print every vocabulary word, most probable context word first.
# NOTE(review): this prints the entire vocabulary, and the loop variable `id`
# shadows the builtin of the same name.
for word in (id_to_word[id] for id in np.argsort(result)[::-1]):
    print(word)
