In [3]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [1]:
words = open("names.txt", "r").read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'daisuke']

In [2]:
# build the vocab of chars and mapping to/from int
chars = sorted(list(set(''.join(words))))
stoi = {ch: i + 1 for i, ch in enumerate(chars)}
stoi['.'] = 0
itos = {i: ch for ch, i in stoi.items()}
print(itos, len(chars))

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'} 26


In [4]:
block_size = 3 
def build_dataset(words):

    X_input, Y_label = [], []
    for word in words:
        context = [0] * block_size
        for ch in word + '.':
            ix = stoi[ch]
            X_input.append(context)
            Y_label.append(ix)
            context = context[1:] + [ix] # crop and append

    X_input = torch.tensor(X_input, dtype=torch.int64)
    Y_label = torch.tensor(Y_label, dtype=torch.int64)
    print(X_input.shape, Y_label.shape)
    return X_input, Y_label

import random
random.seed(42)
random.shuffle(words)
n1 = int(len(words) * 0.8)
n2 = int(len(words) * 0.9)

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

torch.Size([182453, 3]) torch.Size([182453])
torch.Size([22846, 3]) torch.Size([22846])
torch.Size([22855, 3]) torch.Size([22855])


In [6]:
# utility function we will use later when comparing manual and auto gradients
def cmp(s, dt, t):
    ex = torch.all(dt == t.grad).item()
    app = torch.allclose(dt, t.grad)
    maxdiff = (dt - t.grad).abs().max().item()
    print(f'{s: 15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | max diff: {maxdiff}')

In [8]:
# MLP revised 
n_emb = 10
hidden_layer = 200
vocab_size = len(chars) + 1

g = torch.Generator().manual_seed(214483647)
C = torch.randn((vocab_size, n_emb), generator=g)
W1 = torch.randn((n_emb * block_size, hidden_layer), generator=g) * 5/3 / (n_emb * block_size ** 0.5) # * 0.2
b1 = torch.randn((hidden_layer,), generator=g) * 0.1
W2 = torch.randn((hidden_layer, vocab_size), generator=g) * 0.1 # smaller weights, avoid 0
b2 = torch.randn((vocab_size,), generator=g) * 0.1

# batch norm parameters
bngain = torch.randn((1, hidden_layer)) * 0.1 + 1.0
bnbias = torch.randn((1, hidden_layer)) * 0.1

# all zero parameters could mask an incorrect implementation of the backward pass

parameters = [C, W1, b1, W2, b2, bngain, bnbias]

total_param_size = sum(p.nelement() for p in parameters) # the number of parameters
print(total_param_size)
for p in parameters:
    p.requires_grad = True

12297


In [10]:
batch_size = 32
n = batch_size # a shorter variable name for convenience
# conscruct a minibatch
ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y

In [12]:
# forward pass, with smaller steps so that it is possible to backward one step at a time

emb = C[Xb] # n x block_size x n_emb
embcat = emb.view(emb.shape[0], -1) # n x (block_size * n_emb)
# linear layer
hprebn = embcat @ W1 + b1 # hidden_layer pre-activation
# batch norm layer
bnmeani = 1/n * hprebn.mean(dim=0, keepdim=True) # 1 x hidden_layer
bndiff = hprebn - bnmeani # n x hidden_layer
bndiff2 = bndiff ** 2 # n x hidden_layer
bnvar = 1/n * (bndiff2).sum(dim=0, keepdim=True) # 1 x hidden_layer
bnvar_int = (bnvar + 1e-5) ** -0.5 # 1 x hidden_layer
bnraw = bndiff * bnvar_int # n x hidden_layer
hpreact = bngain * bnraw + bnbias # n x hidden_layer

# non linearity
h = torch.tanh(hpreact) # n x hidden_layer
# linear layer2
logits = h @ W2 + b2 # n x vocab_size
# cross entropy loss 
logits_maxes = logits.max(1, keepdim=True).values
norm_logits = logits - logits_maxes # n x vocab_size
counts = norm_logits.exp()
counts_sum = counts.sum(1, keepdim=True) # n x 1
counts_sum_inv = counts_sum**-1
probs = counts * counts_sum_inv # n x vocab_size
logprobs = probs.log() # n x vocab_size
loss = -logprobs[torch.arange(n), Yb].mean() # scalar

# pytorch backward pass
for p in parameters:
    p.grad = None
for t in [logprobs, probs, counts, counts_sum, counts_sum_inv, norm_logits,
        logits_maxes, logits, h, hpreact, bnraw, bnvar_int, 
        bndiff2, bndiff, bnvar, bnvar_int, hprebn, bnmeani, embcat, emb]:
    t.retain_grad()
loss.backward()
loss


tensor(3.8814, grad_fn=<NegBackward0>)

In [None]:
# exercise here: backprop through through the whole thing manually
# backproping through exactyl all of the variables 
# as they are defined in the forward pass

dlogprobs = ???

# 
loss_fast = F.cross_entropy(logits, Yb)
print(loss_fast.item(), "diff:", (loss_fast - loss).item())

In [None]:
# exercise here

# 
hpreact_fast = bngain * (hprebn - hprebn.mean(0, keepdim=True)) / torch.sqrt(hprebn.var(0, keepdim=True, unbiased=True) + 1e-5) + bnbias