In [None]:
import numpy as np
import torch
import random
from torch.nn import functional as F
import matplotlib.pyplot as plt
from typing import List, Tuple

In [None]:
# ---- Model shape ----
block_size = 3            # context length: characters fed in to predict the next one
B = 26                    # alphabet size (one extra index, B itself, is used for '.')
NE = 2                    # embedding dimensions per character
NH = 100                  # hidden-layer width

# ---- Optimisation ----
BATCH_SIZE = 100          # samples drawn per mini-batch
NUM_ITERATIONS = 10_000   # number of mini-batch gradient steps
learning_rate = 0.1       # initial step size

# ---- Plotting ----
window_size = 100         # half-width of the moving-average window for the loss curve

# Seed Python's RNG (used to shuffle the word list).
random.seed(0)

In [None]:
def getnum(x : str, default : int) -> int:
    """Map a single lowercase ASCII letter to its 0-based alphabet position.

    Args:
        x: The character to encode.
        default: Value returned for anything that is not exactly one
            character in the range 'a'..'z' (padding '.', uppercase,
            accented letters, digits, empty string, ...).

    Returns:
        ``ord(x) - ord('a')`` (0..25) for 'a'..'z', otherwise ``default``.
    """
    # str.isalpha() also accepts uppercase and non-ASCII letters, which would
    # yield indices outside 0..25; test the exact range instead.
    if len(x) == 1 and 'a' <= x <= 'z':
        return ord(x) - ord('a')
    return default

def build_dataset(words : List[str]) -> Tuple[torch.Tensor, torch.Tensor]:
    """Convert words into (context, target) index pairs for next-character prediction.

    Each word is padded with ``block_size`` leading '.' characters and one
    trailing '.', then a window of ``block_size`` characters slides across it:
    the window is the context and the character right after it is the target.
    Characters are encoded with ``getnum(ch, B)``, so whatever ``getnum`` does
    not recognise (including the '.' padding) gets index ``B``.

    Args:
        words: Words to encode. Words shorter than ``block_size`` are skipped.

    Returns:
        ``(X, Y)`` where ``X`` is an int64 tensor of shape ``(N, block_size)``
        holding contexts and ``Y`` is an int64 tensor of shape ``(N,)`` holding
        targets. For empty input ``N`` is 0 and the shapes/dtypes still hold.
    """
    X, Y = [], []
    for w in words:
        if len(w) < block_size:
            continue
        padded = "." * block_size + w + "."
        # Encode every character once instead of re-encoding it per window.
        codes = [getnum(ch, B) for ch in padded]
        for p in range(block_size, len(codes)):
            X.append(codes[p - block_size : p])
            Y.append(codes[p])
    # Explicit dtype and shape keep the result usable as indices even when no
    # samples were produced (torch.tensor([]) would be float with shape (0,)).
    return (
        torch.tensor(X, dtype=torch.long).reshape(len(X), block_size),
        torch.tensor(Y, dtype=torch.long),
    )

In [None]:
# Read one word per line and shuffle so the splits are random samples.
with open("input.txt", "r") as f:
    words = f.read().splitlines()
random.shuffle(words)

# 80% train / 10% validation / 10% test.
# (The cut points used to be 0.1 and 0.9, which left only 10% of the words for
# training and gave 80% to validation.)
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))

Xtr, Ytr = build_dataset(words[:n1])      # training set
Xvd, Yvd = build_dataset(words[n1:n2])    # validation set
Xte, Yte = build_dataset(words[n2:])      # test set

print(Xtr.shape, Ytr.shape)
print(Xvd.shape, Yvd.shape)
print(Xte.shape, Yte.shape)

In [None]:
# Dedicated, seeded generator so parameter initialisation is repeatable.
g = torch.Generator().manual_seed(0)

# NOTE: the tensors are drawn from g in this exact order; reordering the
# lines would change every initial value.
C = torch.randn(B + 1, NE, generator=g)                 # embedding table, one row per symbol
Wh = torch.randn(block_size * NE, NH, generator=g)      # hidden-layer weights
bh = torch.randn(NH, generator=g)                       # hidden-layer bias
Wt = torch.randn(NH, B + 1, generator=g)                # output-layer weights
bt = torch.randn(B + 1, generator=g)                    # output-layer bias

all_params = [C, Wh, bh, Wt, bt]
for tensor in all_params:
    # Turn on gradient tracking for every trainable tensor.
    tensor.requires_grad_(True)

num_params = sum(tensor.nelement() for tensor in all_params)
print(f"Total number of parameters: {num_params}")

In [None]:
# Per-step log10 of the mini-batch loss, filled in as training proceeds.
losses = [0.0] * NUM_ITERATIONS

for step in range(NUM_ITERATIONS):
    # Sample a mini-batch; drawing from the seeded generator g keeps runs reproducible.
    indices = torch.randint(0, Xtr.shape[0], (BATCH_SIZE,), generator=g)

    # Forward pass: embed -> flatten context -> tanh hidden layer -> logits.
    embeddings = C[Xtr[indices]]
    hidden = torch.tanh(embeddings.view(-1, block_size * NE) @ Wh + bh)
    logits = hidden @ Wt + bt
    loss = F.cross_entropy(logits, Ytr[indices])

    # Backward pass.
    for param in all_params:
        param.grad = None
    loss.backward()

    # Step decay: full rate for the first half, one tenth of it afterwards.
    # Computed from the untouched base rate each step -- dividing
    # learning_rate in place shrank it by 10x on *every* later step (driving
    # it to zero) and made the cell non-repeatable.
    lr = learning_rate if step < NUM_ITERATIONS / 2 else learning_rate / 10

    # Plain SGD update; no_grad keeps the in-place update out of autograd.
    with torch.no_grad():
        for param in all_params:
            param -= lr * param.grad

    losses[step] = loss.log10().item()

In [None]:
# Centered moving average: each point is the mean of the values within
# window_size steps on either side (the window is clipped at the ends).
smooth_losses = [
    np.mean(losses[max(0, p - window_size) : min(len(losses), p + window_size + 1)])
    for p in range(len(losses))
]
# Second name the original cell also bound to this list; kept so any
# existing reference to it still resolves.
smoothed_data = smooth_losses

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(range(len(losses)), losses, alpha=0.4, label="per batch")
ax.plot(range(len(smooth_losses)), smooth_losses,
        label=f"moving average (+/- {window_size} steps)")
ax.set_xlabel("training step")
ax.set_ylabel("log10(loss)")   # the values in `losses` are log10 of the batch loss
ax.set_title("Training loss")
ax.legend();