In [1]:
import torch
import torch.nn.functional as F

import random

In [2]:
words = open('names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
len(words)

32033

In [4]:
chars = sorted(list(set(''.join(words))))
stoi = {s: i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}

In [5]:
stoi

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

In [6]:
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

In [7]:
block_size = 3


def build_dataset(words):
    X, Y = [], []
    for word in words:
        context = [0] * block_size
        for ch in word + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y


random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [8]:
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 10), generator=g)
W1 = torch.randn((30, 200), generator=g)
b1 = torch.randn(200, generator=g)
W2 = torch.randn((200, 27), generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]

In [9]:
sum(p.nelement() for p in parameters)

11897

In [10]:
for p in parameters:
    p.requires_grad = True

In [11]:
stepi, lossi = [], []

In [12]:
for i in range(200000):
    # mini-batch construction
    ix = torch.randint(0, Xtr.shape[0], (32,))
    emb = C[Xtr[ix]]
    # forward pass
    h = torch.tanh(emb.view(-1, 30) @ W1 + b1)  # shape: [32, 200]
    logits = h @ W2 + b2  # shape: [32, 27]
    loss = F.cross_entropy(logits, Ytr[ix])
    print(f'step: {i} loss: {loss.item():.6f}')
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    # update
    if i < 50000:
        lr = 0.8
    elif i < 125000:
        lr = 0.1
    else:
        lr = 0.01
    for p in parameters:
        p.data += -lr * p.grad
    # track status
    stepi.append(i)
    lossi.append(loss.log10().item())

step: 0 loss: 27.805714
step: 1 loss: 24.646944
step: 2 loss: 21.583502
step: 3 loss: 18.727634
step: 4 loss: 19.293615
step: 5 loss: 19.365305
step: 6 loss: 17.077229
step: 7 loss: 21.370569
step: 8 loss: 19.126797
step: 9 loss: 19.543945
step: 10 loss: 15.236225
step: 11 loss: 11.951473
step: 12 loss: 16.470245
step: 13 loss: 20.535086
step: 14 loss: 17.768721
step: 15 loss: 18.330017
step: 16 loss: 15.778012
step: 17 loss: 17.428566
step: 18 loss: 13.722249
step: 19 loss: 15.368627
step: 20 loss: 16.807545
step: 21 loss: 15.759993
step: 22 loss: 16.863905
step: 23 loss: 14.708574
step: 24 loss: 13.172266
step: 25 loss: 19.191444
step: 26 loss: 16.428427
step: 27 loss: 16.547319
step: 28 loss: 14.801894
step: 29 loss: 11.320836
step: 30 loss: 12.188525
step: 31 loss: 8.816339
step: 32 loss: 14.116547
step: 33 loss: 15.973302
step: 34 loss: 12.659248
step: 35 loss: 13.162020
step: 36 loss: 12.998364
step: 37 loss: 14.984596
step: 38 loss: 13.866426
step: 39 loss: 13.830735
step: 40 lo

In [13]:
emb = C[Xdev]
h = torch.tanh(emb.view(-1, 30) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Ydev)
loss

tensor(2.2083, grad_fn=<NllLossBackward0>)

In [14]:
emb = C[Xtr]
h = torch.tanh(emb.view(-1, 30) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Ytr)
loss

tensor(2.1664, grad_fn=<NllLossBackward0>)

In [15]:
g = torch.Generator().manual_seed(2147483647 + 10)

for _ in range(20):
    out = []
    context = [0] * block_size
    while True:
        emb = C[torch.tensor(context)]  # shape: [3, 10]
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)   # shape: [1, 200]
        logits = h @ W2 + b2    # shape: [1, 27]
        probs = F.softmax(logits, dim=1)
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        out.append(ix)
        context = context[1:] + [ix]
        if ix == 0:
            break
        
    print(''.join(itos[c] for c in out))

mora.
mayah.
see.
mad.
ryla.
renyrujendraegridered.
elii.
shyonelleigh.
est.
nar.
elynn.
hokelin.
shubergiam.
jer.
kin.
rendy.
penterofku.
zence.
ryy.
julieh.
