In [25]:
import random
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
g = torch.Generator().manual_seed(2147483647)

In [26]:
# load ~32000 names from a file, one in each line
with open("names.txt", "r") as infile:
    words = infile.read().splitlines() # ignores \n
chars = sorted(set("".join(words) + "."))
vocab_size = len(chars)
ctoi = {c: i for i, c in enumerate(chars)}
itoc = {i: c for c, i in ctoi.items()}

In [27]:
# building the dataset (with contexts)

# training split -> ~80% -> optimize parameters of the model using gradient descent
# dev/validation split -> ~10% -> hyperparameter tuning
# test split -> ~10% -> evaluate model's final performance

# updated so that we can also create dev/test splits

context_len = 4 # context length - how many chars to take into account to predict the next one?

def build_dataset(words):
    X, Y = [], [] # X - input to the neural net, Y - expected labels for each example inside X
    
    for w in words:
        # print(w)
        context = [0] * context_len # ...
        for c in w + ".":
            idx = ctoi[c]
            X.append(context)
            Y.append(idx)
            # print("".join(itoc[i] for i in context), "--->", itoc[idx])
            context = context[1:] + [idx]
    
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

random.seed(42)
random.shuffle(words)
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xtest, Ytest = build_dataset(words[n2:])

torch.Size([182625, 4]) torch.Size([182625])
torch.Size([22655, 4]) torch.Size([22655])
torch.Size([22866, 4]) torch.Size([22866])


In [28]:
class Linear:
    def __init__(self, fan_in, fan_out, bias=True):
        self.weight = torch.randn((fan_in, fan_out), generator=g) / fan_in**0.5
        self.bias = torch.zeros(fan_out) if bias else None # 1 dim but works because of broadcasting

    def __call__(self, x): # x are the incoming activations
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out

    def parameters(self):
        return [self.weight] + ([] if self.bias is None else [self.bias])

class BatchNorm1d:
    def __init__(self, dim, eps=1e-5, momentum=0.1): # dim -> number of hidden neurons
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # parameters (trained with backprop)
        self.gamma = torch.ones(dim) # bngain
        self.beta = torch.zeros(dim) # bnbias
        # buffers (trained with a running "momentum" update)
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        if self.training:
            xmean = x.mean(0, keepdim=True) # batch mean
            xvar = x.var(0, keepdim=True, unbiased=True)
        else:
            xmean = self.running_mean
            xvar = self.running_var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta
        if self.training:
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]

class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out

    def parameters(self):
        return []

In [29]:
n_embd = 10 # character embedding vector dimension
n_hidden = 100 # number of neurons in a hidden layer

C = torch.randn((vocab_size, n_embd))

layers = [
    Linear(n_embd * context_len, n_hidden), Tanh(),
    Linear(           n_hidden, n_hidden), Tanh(),
    Linear(           n_hidden, n_hidden), Tanh(),
    Linear(           n_hidden, n_hidden), Tanh(),
    Linear(           n_hidden, n_hidden), Tanh(),
    Linear(           n_hidden, vocab_size)
]

with torch.no_grad():
    layers[-1].weight *= 0.1 # to lower the initial loss (?)
    # apply gain
    for layer in layers[:-1]:
        if isinstance(layer, Linear):
            layer.weight *= (5/3)

parameters = [C] + [p for layer in layers for p in layer.parameters()]
print(sum(p.numel() for p in parameters))
for p in parameters:
    p.requires_grad = True

47497


In [32]:
epochs = 200_000
minibatch_size = 64
loss_i = []

for i in range(epochs):
    # create a minibatch
    idx = torch.randint(0, Xtr.shape[0], (minibatch_size,), generator=g)
    Xb, Yb = Xtr[idx], Ytr[idx]

    # forward pass
    emb = C[Xb]
    x = emb.view(emb.shape[0], -1)
    # Xb - (64, 4), emb - (64, 4, 10), x - (64, 40)
    for layer in layers:
        x = layer(x)
    loss = F.cross_entropy(x, Yb)

    # backward pass
    for layer in layers:
        layer.out.retain_grad()
    for p in parameters:
        p.grad = None
    loss.backward()

    #update
    lr = 0.1 if i < epochs / 2 else 0.01
    for p in parameters:
        p.data += -lr * p.grad
    
    # track stats
    if i % 10000 == 0:
        print(f"{i}/{epochs}: {loss.item():.4f}")
    loss_i.append(loss.log10().item())

0/200000: 3.2883
10000/200000: 2.0039
20000/200000: 2.0001
30000/200000: 2.0076
40000/200000: 2.0188
50000/200000: 1.9709
60000/200000: 2.1628
70000/200000: 2.0979
80000/200000: 1.9422
90000/200000: 1.8212
100000/200000: 1.5685
110000/200000: 1.9177
120000/200000: 1.7643
130000/200000: 1.8297
140000/200000: 1.7261
150000/200000: 1.8924
160000/200000: 1.6564
170000/200000: 1.8420
180000/200000: 1.9479
190000/200000: 1.6983


In [37]:
@torch.no_grad()
def split_loss(split):
    X, Y = {"train": (Xtr, Ytr), "dev": (Xdev, Ydev), "test": (Xtest, Ytest)}[split] #!!!
    emb = C[X]
    x = emb.view(-1, n_embd * context_len)
    for layer in layers:
        x = layer(x)
    loss = F.cross_entropy(x, Y) # more efficient, prevents inf and nan for big positive logits
    print(split, loss.item())

In [38]:
# find out the loss for a whole (TR!) data set, not just a minibatch
split_loss("train")

# tr loss ~= dev loss -> not overfitting (actually maybe underfitting!)
# <=> nn not able to memorize the values
# might mean that the network is too small (not enough parameters)

# find out the loss for a whole (DEV!) data set
split_loss("dev")

train 1.772779107093811
dev 2.079167366027832


In [42]:
# SAMPLING
gen = torch.Generator().manual_seed(2147483647)

for _ in range(20):
    name = []
    context = [0] * context_len # initialize with all ...
    while True: # create one name
        emb = C[torch.tensor([context])] # like C[X] but this time only 1 context
        # block_size x emb_dim
        x = emb.view(emb.shape[0], -1)
        for layer in layers:
            x = layer(x)
        probs = F.softmax(x, dim=1)
        idx = torch.multinomial(probs, num_samples=1, generator=gen).item()
        context = context[1:] + [idx]
        name.append(idx)
        if idx == 0:
            break
    print("".join(itoc[i] for i in name))

junide.
jakayah.
pressy.
adria.
jirritt.
sabrielle.
sameia.
yani.
evers.
deya.
newi.
taysean.
evy.
artez.
noudarsyn.
demmi.
poytsan.
houz.
jacoriana.
jocelynn.
