In [40]:
import torch
import torch.nn.functional as F
import random
import matplotlib.pyplot as plt

In [1]:
# Read the dataset: one name per line. The context manager closes the file
# handle as soon as the contents have been read.
with open('data/names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
len(words)

32033

In [2]:
# Peek at the first five names.
words[0:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [4]:
# Vocabulary: '.' first (index 0), then every distinct character found in the
# names in sorted order. stoi maps character -> index, itos is the inverse.
chars = ['.'] + sorted(set("".join(words)))
stoi = dict(zip(chars, range(len(chars))))
itos = {idx: ch for ch, idx in stoi.items()}

In [45]:
# Build (context -> next character) examples from every name.
X = []
Y = []
block_size = 3  # number of preceding characters used as context

# Shuffle a copy with a fixed seed: the resulting train/val/test split is
# reproducible, and re-running this cell no longer re-shuffles `words` in place.
shuffled_words = list(words)
random.Random(42).shuffle(shuffled_words)
for w in shuffled_words:
    context = [0] * block_size  # index 0 is '.', used here as start padding
    for ch in w + '.':  # the trailing '.' marks the end of the name
        X.append(context)
        Y.append(stoi[ch])
        context = context[1:] + [stoi[ch]]  # slide the window by one character

X, Y = torch.tensor(X), torch.tensor(Y)

In [46]:
# Positional 80% / 10% / 10% split of the examples into train, validation and test.
n_examples = len(X)
train_split, val_split = int(.8 * n_examples), int(.9 * n_examples)

xtr, xval, xtest = X[:train_split], X[train_split:val_split], X[val_split:]
ytr, yval, ytest = Y[:train_split], Y[train_split:val_split], Y[val_split:]

In [492]:
n_dim = 10        # embedding size per character
layer_size = 400  # hidden layer width
vocab_size = 27

C = torch.randn((vocab_size, n_dim))
# Scale by (5/3) / sqrt(fan_in), with fan_in = n_dim * block_size.
w1 = torch.randn((n_dim * block_size, layer_size)) * (5/3)/((n_dim*block_size)**0.5)
# No hidden-layer bias: the batch normalisation applied to the hidden
# pre-activations subtracts the batch mean, which would cancel it; bnbias
# provides the learnable shift instead.
w2 = torch.randn((layer_size, vocab_size)) * 0.01
b2 = torch.zeros(vocab_size)

bngain = torch.ones((1, layer_size))
bnbias = torch.zeros((1, layer_size))

# Running statistics used at evaluation time. The std starts at one (not zero)
# so that dividing by it is well defined before any training step has run.
bnmean_running = torch.zeros(1, layer_size)
bnstd_running = torch.ones(1, layer_size)

parameters = [C, w1, w2, b2, bngain, bnbias]

In [446]:
# Turn on gradient tracking for every trainable tensor.
for param in parameters:
    param.requires_grad = True

In [484]:
batch_size = 32
num_epochs = 10000  # number of minibatch steps
lr = 0.1

for i in range(num_epochs):
    # Sample a random minibatch of training examples.
    ix = torch.randint(0, xtr.shape[0], (batch_size,))

    # Forward pass: embed -> linear -> batch norm -> tanh -> linear.
    x_enc = C[xtr[ix]]
    hpreact = torch.matmul(x_enc.view(-1, n_dim * block_size), w1)
    bnmeani = hpreact.mean(0, keepdim=True)
    bnstdi = hpreact.std(0, keepdim=True)
    hpreact = bngain * ((hpreact - bnmeani)/bnstdi) + bnbias

    # The running statistics are bookkeeping for evaluation only, so keep the
    # exponential moving average out of the autograd graph.
    with torch.no_grad():
        bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
        bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi

    h = torch.tanh(hpreact)
    logits = torch.matmul(h, w2) + b2
    loss = F.cross_entropy(logits, ytr[ix])

    # Backward pass: clear stale gradients, then backpropagate.
    for p in parameters:
        p.grad = None

    loss.backward()

    # Plain SGD update.
    for p in parameters:
        p.data -= lr * p.grad

    # Report ten times over the run; integer arithmetic keeps the check exact
    # even when num_epochs is not a multiple of 10.
    if i % max(1, num_epochs // 10) == 0:
        print(f"Epoch: {i} | Loss: {loss: .4f}")

Epoch: 0 | Loss:  2.0563
Epoch: 1000 | Loss:  1.9024
Epoch: 2000 | Loss:  1.8996
Epoch: 3000 | Loss:  2.0817
Epoch: 4000 | Loss:  1.8727
Epoch: 5000 | Loss:  2.0839
Epoch: 6000 | Loss:  1.8789
Epoch: 7000 | Loss:  1.7424
Epoch: 8000 | Loss:  2.0306
Epoch: 9000 | Loss:  2.0354


In [488]:
# Measure the batch-norm statistics over the whole training set in one pass.
with torch.no_grad():
    x_enc = C[xtr]
    # The hidden layer has no bias term (it is not among the trained
    # parameters), so the pre-activation is the matmul alone, exactly as in training.
    hpreact = torch.matmul(x_enc.view(-1, n_dim * block_size), w1)
    bnmean = hpreact.mean(0, keepdim=True)
    bnstd = hpreact.std(0, keepdim=True)

In [489]:
# Training-set loss, normalising with the running statistics.
with torch.no_grad():
    x_enc = C[xtr]
    # No hidden bias: the forward pass must match the one used during training.
    hpreact = torch.matmul(x_enc.view(-1, n_dim * block_size), w1)
    hpreact = bngain * ((hpreact - bnmean_running)/bnstd_running) + bnbias
    h = torch.tanh(hpreact)
    logits = torch.matmul(h, w2) + b2
    loss = F.cross_entropy(logits, ytr)
    print(f"train loss: {loss.item()}")

train loss: 2.0548861026763916


In [490]:
# Validation-set loss, normalising with the running statistics.
with torch.no_grad():
    x_enc = C[xval]
    # No hidden bias: the forward pass must match the one used during training.
    hpreact = torch.matmul(x_enc.view(-1, n_dim * block_size), w1)
    hpreact = bngain * ((hpreact - bnmean_running)/bnstd_running) + bnbias
    h = torch.tanh(hpreact)
    logits = torch.matmul(h, w2) + b2
    loss = F.cross_entropy(logits, yval)
    print(f"val loss: {loss.item()}")

val loss: 2.0843234062194824


In [641]:
class Linear():
    """Fully connected layer computing ``x @ w + b``.

    Args:
        in_features: size of the last dimension of the input.
        out_features: size of the last dimension of the output.
        Bias: when False, no bias tensor is created or added.
    """

    def __init__(self, in_features, out_features, Bias=True):
        # Standard-normal weights scaled by 1 / sqrt(in_features).
        self.w = torch.randn((in_features, out_features)) / in_features**.5
        self.b = torch.randn(out_features) if Bias else None

    def __call__(self, x: torch.Tensor):
        """Apply the layer to ``x`` and return the result."""
        out = torch.matmul(x, self.w)
        # Identity check: comparing a tensor to None with != / == goes through
        # tensor comparison operators rather than testing for absence.
        if self.b is not None:
            out += self.b
        return out

    def parameters(self):
        """Return the trainable tensors: the weight, plus the bias if present."""
        return [self.w] if self.b is None else [self.w, self.b]

In [642]:
class BatchNorm1D():
    """Batch normalisation over dimension 0 with a learnable gain and bias.

    While ``training`` is True the statistics of the current batch are used and
    folded into the running estimates; otherwise the running estimates are used.
    """

    def __init__(self, dim, eps=1e-05, momentum=0.1,):
        self.eps, self.momentum = eps, momentum
        self.training = True
        # Learnable scale and shift.
        self.gain, self.bias = torch.ones(dim), torch.zeros(dim)
        # Running estimates, updated only in training mode.
        self.running_mean, self.running_var = torch.zeros(dim), torch.ones(dim)

    def __call__(self, x: torch.Tensor):
        """Normalise ``x`` and apply gain and bias."""
        if not self.training:
            # Evaluation: normalise with the stored running statistics.
            normalized = (x - self.running_mean) / torch.sqrt(self.running_var + self.eps)
            return self.gain * normalized + self.bias

        batch_mean = x.mean(0, keepdim=True)
        batch_var = x.var(0, keepdim=True)
        normalized = (x - batch_mean) / torch.sqrt(batch_var + self.eps)
        result = self.gain * normalized + self.bias

        # Exponential moving average of the batch statistics, kept out of autograd.
        with torch.no_grad():
            self.running_mean = (1-self.momentum)*self.running_mean + self.momentum*batch_mean
            self.running_var = (1-self.momentum)*self.running_var + self.momentum*batch_var
        return result

    def parameters(self):
        """Return the trainable tensors (gain, bias)."""
        return [self.gain, self.bias]

In [643]:
class Tanh:
    """Parameter-free layer that applies tanh to its input."""

    def parameters(self):
        """This layer has nothing to train."""
        return list()

    def __call__(self, x: torch.Tensor):
        activated = torch.tanh(x)
        return activated

In [601]:
layer_size = 200

# Embedding table first, then the layers in forward order, so the random draws
# happen in the same sequence as the network is laid out.
C = torch.randn(vocab_size, n_dim)

# Four Linear+Tanh stages followed by the output projection.
layers = []
for fan_in in [n_dim*block_size] + [layer_size] * 3:
    layers += [Linear(fan_in, layer_size), Tanh()]
layers.append(Linear(layer_size, vocab_size))

with torch.no_grad():
    *earlier_layers, output_layer = layers
    # Shrink the output weights; scale every other Linear by 5/3.
    output_layer.w *= 0.1
    for layer in earlier_layers:
        if isinstance(layer, Linear):
            layer.w *= 5/3

# Flat list of everything the optimiser updates.
parameters = [C]
for layer in layers:
    parameters.extend(layer.parameters())
print(sum(p.nelement() for p in parameters))

132497


In [602]:
# Mark each tensor in the parameter list as requiring gradients.
for tensor in parameters:
    tensor.requires_grad = True

In [614]:
num_epochs = 50000
batch_size = 32
lr = 0.001

for i in range(num_epochs):
    # Random minibatch of training examples.
    batch_idx = torch.randint(0, xtr.shape[0], (batch_size,))

    # Forward: flatten the embedded context, then run it through each layer.
    activations = C[xtr[batch_idx]].view(-1, n_dim*block_size)
    for layer in layers:
        activations = layer(activations)
    loss = F.cross_entropy(activations, ytr[batch_idx])

    # Backward: drop old gradients before backpropagating.
    for param in parameters:
        param.grad = None
    loss.backward()

    # SGD step.
    for param in parameters:
        param.data -= lr * param.grad

    if i % 10000 == 0:
        print(f"Epoch: {i} | Loss: {loss: .4f}")

Epoch: 0 | Loss:  2.0393
Epoch: 10000 | Loss:  1.8025
Epoch: 20000 | Loss:  1.7611
Epoch: 30000 | Loss:  1.7831
Epoch: 40000 | Loss:  1.7340


In [615]:
# Loss over the full training split, without gradient tracking.
with torch.no_grad():
    activations = C[xtr].view(-1, n_dim*block_size)
    for layer in layers:
        activations = layer(activations)
    loss = F.cross_entropy(activations, ytr)

    print(loss.item())

1.9248042106628418


In [616]:
# Loss over the full validation split, without gradient tracking.
with torch.no_grad():
    activations = C[xval].view(-1, n_dim*block_size)
    for layer in layers:
        activations = layer(activations)
    loss = F.cross_entropy(activations, yval)

    print(loss.item())

2.064256429672241


In [654]:
# Embedding table first, then the layers in forward order, so the random draws
# happen in the same sequence as the network is laid out.
C = torch.randn(vocab_size, n_dim)

# Four bias-free Linear -> BatchNorm1D -> Tanh stages, then a batch-normalised
# output projection.
layers = []
for fan_in in [n_dim*block_size] + [layer_size] * 3:
    layers += [Linear(fan_in, layer_size, Bias=False), BatchNorm1D(layer_size), Tanh()]
layers += [Linear(layer_size, vocab_size, Bias=False), BatchNorm1D(vocab_size)]

with torch.no_grad():
    *earlier_layers, final_norm = layers
    # Shrink the gain of the last batch norm; scale every Linear before it by 5/3.
    final_norm.gain *= 0.1
    for layer in earlier_layers:
        if isinstance(layer, Linear):
            layer.w *= 5/3

# Flat list of everything the optimiser updates.
parameters = [C]
for layer in layers:
    parameters.extend(layer.parameters())
print(sum(p.nelement() for p in parameters))

133324


In [655]:
# Gradients are needed for every tensor the training loop updates.
for trainable in parameters:
    trainable.requires_grad = True

In [668]:
num_epochs = 100000
batch_size = 32
lr = 0.001

# Put every batch-norm layer into training mode (batch statistics + running updates).
for norm in (layer for layer in layers if isinstance(layer, BatchNorm1D)):
    norm.training = True

for i in range(num_epochs):
    # Random minibatch of training examples.
    batch_idx = torch.randint(0, xtr.shape[0], (batch_size,))

    # Forward: flatten the embedded context, then run it through each layer.
    activations = C[xtr[batch_idx]].view(-1, n_dim*block_size)
    for layer in layers:
        activations = layer(activations)
    loss = F.cross_entropy(activations, ytr[batch_idx])

    # Backward: drop old gradients before backpropagating.
    for param in parameters:
        param.grad = None
    loss.backward()

    # SGD step.
    for param in parameters:
        param.data -= lr * param.grad

    if i % 10000 == 0:
        print(f"Epoch: {i} | Loss: {loss: .4f}")

Epoch: 0 | Loss:  2.1070
Epoch: 10000 | Loss:  2.1247
Epoch: 20000 | Loss:  1.6084
Epoch: 30000 | Loss:  1.7507
Epoch: 40000 | Loss:  1.6699
Epoch: 50000 | Loss:  1.8256
Epoch: 60000 | Loss:  1.8996
Epoch: 70000 | Loss:  1.6073
Epoch: 80000 | Loss:  2.3179
Epoch: 90000 | Loss:  2.0406


In [671]:
# Training-split loss with batch norm switched to its running statistics.
with torch.no_grad():
    for layer in layers:
        if isinstance(layer, BatchNorm1D):
            layer.training = False

    activations = C[xtr].view(-1, n_dim*block_size)
    for layer in layers:
        activations = layer(activations)
    loss = F.cross_entropy(activations, ytr)

    print(loss.item())

1.9417372941970825


In [672]:
# Validation-split loss with batch norm switched to its running statistics.
with torch.no_grad():
    for layer in layers:
        if isinstance(layer, BatchNorm1D):
            layer.training = False

    activations = C[xval].view(-1, n_dim*block_size)
    for layer in layers:
        activations = layer(activations)
    loss = F.cross_entropy(activations, yval)

    print(loss.item())

2.0569443702697754
