In [6]:
import random

import torch
import torch.nn.functional as F

# Read the corpus, one entry per line. The context manager closes the file
# handle as soon as its contents have been read instead of leaving that to the
# garbage collector.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
len(words)

32033

In [7]:
# Vocabulary: every distinct character that occurs in the corpus, sorted.
chars = sorted(set(''.join(words)))
# Characters are numbered 1..len(chars); id 0 is reserved for the '.' token.
str_to_int = dict(zip(chars, range(1, len(chars) + 1)))
str_to_int['.'] = 0
# Inverse table, used to decode ids back into characters.
int_to_str = dict(zip(str_to_int.values(), str_to_int.keys()))

In [8]:
# Context length: how many preceding characters are fed in per prediction.
block_size = 3
# One id per distinct character plus one for the '.' token (27 per the original note).
vocab_size = 1 + len(chars)

def build_dataset(words):
    """Turn a list of words into (context, next-character) example pairs.

    Each word is terminated with '.', and a window holding the ids of the
    previous ``block_size`` characters (left-padded with 0) is slid over it;
    every position contributes one example.

    Args:
        words: iterable of strings whose characters are all keys of the
            module-level ``str_to_int`` mapping.

    Returns:
        A tuple ``(X, Y)``: ``X`` is an int64 tensor of shape
        (N, block_size) with the context ids, ``Y`` an int64 tensor of shape
        (N,) with the id of the character that follows each context. When
        ``words`` is empty the shapes are (0, block_size) and (0,).

    Raises:
        KeyError: if a character is not present in ``str_to_int``.
    """
    X, Y = [], []

    for w in words:
        context = [0] * block_size  # start-of-word padding

        for char in w + '.':
            ix = str_to_int[char]

            X.append(context)
            Y.append(ix)

            # Builds a new list, so the window already stored in X is not mutated
            context = context[1:] + [ix]

    # Explicit dtype and shape keep an empty split usable as an index tensor:
    # a bare torch.tensor([]) would be float32 with shape (0,).
    X = torch.tensor(X, dtype=torch.long).reshape(len(X), block_size)
    Y = torch.tensor(Y, dtype=torch.long)

    return X, Y

# Split dataset into training (80%), dev (10%), and test (10%) sets.
# A dedicated, seeded RNG makes the shuffle reproducible on a fresh kernel
# without touching the global `random` state. The shuffle is in place, so
# re-running only this cell reshuffles the already shuffled list.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

In [9]:
emb_size = 10 # Length of the vector that represents each character
hidden_layer_neurons = 200

# Seed torch's global RNG so the random initialisation below (and any later
# draw from the global RNG) is reproducible on a fresh kernel.
torch.manual_seed(2147483647)

C = torch.randn((vocab_size, emb_size)) # Embedding table, one row per character id
# Kaiming-style init: tanh gain (5/3) over sqrt(fan_in), to avoid saturated tanh / dead neurons
W1 = torch.randn((emb_size*block_size, hidden_layer_neurons)) * (5/3)/((emb_size*block_size)**0.5)
# No b1: a hidden-layer bias would be removed again by the batch-norm mean subtraction; bnbias takes its place
W2 = torch.randn((hidden_layer_neurons, vocab_size)) * 0.01 # Small W2 and zero b2 equalize the logits and give a better starting loss
b2 = torch.zeros(vocab_size)

# Batch-norm gain/bias are trained; the running statistics are not parameters
bngain = torch.ones((1, hidden_layer_neurons))
bnbias = torch.zeros((1, hidden_layer_neurons))
bnmean_running = torch.zeros((1, hidden_layer_neurons))
bnstd_running = torch.ones((1, hidden_layer_neurons))

parameters = [C, W1, W2, b2, bngain, bnbias]
for p in parameters:
    p.requires_grad = True

f'Num parameters: {sum(p.nelement() for p in parameters)}'

'Num parameters: 12097'

In [10]:
# Training
# NOTE(review): the rate list below presumably records manual re-runs of this
# cell with a decayed learning rate - confirm; a fresh run uses 0.1 only.
lr = 0.1 # 0.1 -> 0.01 -> 0.005 -> 0.001
steps = 50000
minibatch_size = 32

for i in range(steps):

    # Mini-batch: row indices drawn independently, so repeats are possible
    ix = torch.randint(0, Xtr.shape[0], (minibatch_size,))

    # Forward pass
    emb = C[Xtr[ix]] # (minibatch_size, block_size, emb_size)
    # No "+ b1" here: a bias would be cancelled by the mean subtraction below.
    # view(-1, ...) infers the batch dimension -> (minibatch_size, hidden_layer_neurons) after the matmul
    hpreact = emb.view(-1, emb_size*block_size) @ W1

    # Per-batch statistics of every hidden unit
    bnmeani = hpreact.mean(0, keepdim=True)
    bnstdi = hpreact.std(0, keepdim=True)

    # Exponential moving averages used by the evaluation/sampling cells.
    # They are bookkeeping only, so they are kept out of the autograd graph.
    with torch.no_grad():
        bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
        bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi

    hpreact = bngain * (hpreact - bnmeani) / bnstdi + bnbias # Batch normalization
    h = torch.tanh(hpreact)
    logits = h @ W2 + b2 # (minibatch_size, vocab_size)

    # Equivalent to softmax followed by the mean negative log-likelihood of
    # the targets, computed in one call
    loss = F.cross_entropy(logits, Ytr[ix])

    # Backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # Update: plain SGD step, done in place under no_grad instead of via .data
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

f'Loss: {loss.item()}'

'Loss: 2.3114173412323'

In [11]:
# Dev loss

# Evaluation only: no_grad avoids building an autograd graph over the whole split
with torch.no_grad():
    emb = C[Xdev] # (Xdev.shape[0], block_size, emb_size)
    hpreact = emb.view(-1, emb_size*block_size) @ W1 # -1 means infer - (Xdev.shape[0], hidden_layer_neurons)
    hpreact = bngain * (hpreact - bnmean_running) / bnstd_running + bnbias # Batch normalization with running means and stds
    h = torch.tanh(hpreact)
    logits = h @ W2 + b2 # (Xdev.shape[0], vocab_size)
    loss = F.cross_entropy(logits, Ydev)
loss.item()

2.2044789791107178

In [12]:
# Test loss

# Evaluation only: no_grad avoids building an autograd graph over the whole split
with torch.no_grad():
    emb = C[Xte] # (Xte.shape[0], block_size, emb_size)
    hpreact = emb.view(-1, emb_size*block_size) @ W1 # -1 means infer - (Xte.shape[0], hidden_layer_neurons)
    hpreact = bngain * (hpreact - bnmean_running) / bnstd_running + bnbias # Batch normalization with running means and stds
    h = torch.tanh(hpreact)
    logits = h @ W2 + b2 # (Xte.shape[0], vocab_size)
    loss = F.cross_entropy(logits, Yte)
loss.item()

2.2004661560058594

In [13]:
# Sampling

names = 20 # How many samples to generate
# Dedicated, seeded generator (now actually passed to multinomial) so the
# samples are reproducible
g = torch.Generator().manual_seed(2147483647)

# Inference only: no autograd graph needed
with torch.no_grad():
    for _ in range(names):
        out = ''
        context = [0] * block_size # Start from all-padding context (id 0 is '.')

        while True:
            emb = C[context] # (block_size, emb_size)
            hpreact = emb.view(-1, emb_size*block_size) @ W1 # -1 means infer - (1, hidden_layer_neurons)
            hpreact = bngain * (hpreact - bnmean_running) / bnstd_running + bnbias # Batch normalization with running means and stds
            h = torch.tanh(hpreact)
            logits = h @ W2 + b2 # (1, vocab_size)

            probs = F.softmax(logits, dim=1)
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()

            # Id 0 is the '.' token: the sample is complete
            if ix == 0:
                break

            out += int_to_str[ix]
            context = context[1:] + [ix]

        print(out)

everekelte
zoe
teston
ana
adylinelle
saholvy
sye
ahonad
katel
katton
romeye
jole
con
sra
beoni
shyvi
riem
dezhilyn
tre
raylexel
