In [26]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [27]:
# read in all the words:
words = open('names.txt','r').read().splitlines()
# build the vocab of chars and mappings:
chars = sorted(set(''.join(words)))
stoi = {s : i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i : s for s, i in stoi.items()}
vocab_size = len(itos)

In [28]:
# build the dataset:
block_size = 3 # context length

def build_dataset(words):
    X, Y = [], []

    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]
    
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y

import random

random.shuffle(words)

n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

In [29]:
# compare manual gradients to pytorch gradients

def cmp(s, dt, t):
    ex = torch.all(dt == t.grad).item()
    app = torch.allclose(dt, t.grad)
    maxdiff = (dt - t.grad).abs().max().item()
    print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')

In [30]:
n_embd = 10
n_hidden = 64  

C = torch.randn((vocab_size, n_embd))
# layer 1:
W1 = torch.randn((n_embd * block_size, n_hidden)) * (5/3) / ((n_embd * block_size) ** 0.5)
b1 = torch.randn((1, n_hidden)) * 0.1
# layer 2:
W2 = torch.randn((n_hidden, vocab_size)) * 0.1
b2 = torch.randn((vocab_size)) * 0.1

bngain = torch.randn((1, n_hidden)) * 0.1 + 1.0
bnbias = torch.randn((1, n_hidden)) * 0.1

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters)) # number of parameters in total

for p in parameters:
    p.requires_grad = True

max_steps = 200000
batch_size = 32
n = batch_size

with torch.no_grad():
    for i in range(max_steps):

        # mini-batch construct:
        ix = torch.randint(0, Xtr.shape[0], (batch_size, ))
        Xb, Yb = Xtr[ix], Ytr[ix]

        # forward pass:
        emb = C[Xb] # embed chars into vectors
        embcat = emb.view(emb.shape[0], -1)  # concatenate the vectors
        # linear layer:
        hprebn = embcat @ W1 + b1 # hidden layer pre-activation
        # batch-norm layer:
        bnmean = hprebn.mean(0, keepdim=True)
        bnvar = hprebn.var(0, keepdim=True, unbiased=True)
        bnvar_inv = (bnvar + 1e-5)**-0.5
        bnraw = (hprebn - bnmean) * bnvar_inv
        hpreact = bngain * bnraw + bnbias
        # non linearity:
        h = torch.tanh(hpreact) # hidden layer
        logits = h @ W2 + b2 # output layer
        loss = F.cross_entropy(logits, Yb) 

        # backward pass:
        for p in parameters:
            p.grad = None

        #loss.backward()

        # manual backprop:
        dlogits = F.softmax(logits, 1)
        dlogits[range(n), Yb] -= 1
        dlogits /= n
        # 2nd layer backprop:
        dh = dlogits @ W2.T
        dW2 = h.T @ dlogits
        db2 = dlogits.sum(0)
        # tanh:
        dhpreact = (1.0 - h**2) * dh
        # batch-norm backprop:
        dbngain = (bnraw * dhpreact).sum(0, keepdim=True)
        dbnbias = dhpreact.sum(0, keepdim=True)
        dhprebn = bngain*bnvar_inv/n * (n*dhpreact - dhpreact.sum(0) - n/(n-1)*bnraw*(dhpreact*bnraw).sum(0))
        # first layer:
        dembcat = dhprebn @ W1.T
        dW1 = embcat.T @ dhprebn
        db1 = dhprebn.sum(0, keepdim=False)
        # embedding:
        demb = dembcat.view(emb.shape)
        dC = torch.zeros_like(C)
        for k in range(Xb.shape[0]):
            for j in range(Xb.shape[1]): 
                i = Xb[k][j]
                dC[i] += demb[k][j]
        
        grads = [dC, dW1, db1, dW2, db2, dbngain, dbnbias]

        # update:
        lr = 0.1 if i < 100000 else 0.01 # step learning rate decay
        for p, grad in zip(parameters, grads):
            #p.data += -lr * p.grad
            p.data += -lr * grad


4137


In [31]:
# calibrate the batch-norm at the end of training:

with torch.no_grad():
    # pass the training set through:
    emb = C[Xtr]
    embcat = emb.view(emb.shape[0], -1)
    hpreact = embcat @ W1 + b1
    # measure the mean/std over the entire training set:
    bnmean = hpreact.mean(0, keepdim=True)
    bnvar = hpreact.var(0, keepdim=True, unbiased=True)

In [34]:
# evaluate train and val loss:

# disable gradient tracking
@torch.no_grad()

def split_loss(split):
    x, y = {
        'train': (Xtr, Ytr),
        'val' : (Xdev, Ydev),
        'test' : (Xte, Yte)
    } [split]

    emb = C[x]
    embcat = emb.view(emb.shape[0], -1)
    hprebn = embcat @ W1 + b1
    bnraw = (hprebn - bnmean) * (bnvar + 1e-5)**-0.5
    hpreact = bngain * bnraw + bnbias
    h = torch.tanh(hpreact)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, y)
    print(split, loss.item())

split_loss('train')
split_loss('val')

train 2.1679837703704834
val 2.1976518630981445


In [35]:
# sample from the model:

with torch.no_grad():
    for _ in range(20):

        out = []
        context = [0] * block_size
        
        while True:
            emb = C[torch.tensor([context])]
            embcat = emb.view(emb.shape[0], -1)
            hprebn = embcat @ W1 + b1
            bnraw = (hprebn - bnmean) * (bnvar + 1e-5)**-0.5
            hpreact = bngain * bnraw + bnbias
            h = torch.tanh(hpreact)
            logits = h @ W2 + b2
            probs = F.softmax(logits, dim=-1)
            ix = torch.multinomial(probs, num_samples=1).item()
            context = context[1:] + [ix]
            out.append(ix)
            if ix == 0: break


        print(''.join(itos[i] for i in out)) 

riley.
anda.
maisiahnalatuhara.
mar.
adirah.
syanna.
tymenora.
keena.
raleune.
abmone.
zal.
karius.
tobde.
jomariyah.
yor.
markenia.
maralir.
emra.
racaklalyn.
atri.
