In [1]:
import torch
import torch.nn.functional as F

# Load the dataset: one name per line. The context manager closes the file
# handle as soon as the read finishes instead of leaving it to the garbage collector.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
len(words)

32033

In [2]:
# Vocabulary: every distinct character that appears in the dataset, sorted
chars = sorted(set(''.join(words)))
# Characters get ids 1..len(chars); id 0 is reserved for the '.' token
str_to_int = dict(zip(chars, range(1, len(chars) + 1)))
str_to_int['.'] = 0
# Reverse lookup: id -> character
int_to_str = dict((idx, ch) for ch, idx in str_to_int.items())

In [3]:
# Context length: how many preceding characters are fed in to predict the next one
block_size = 3
# One id per distinct character plus one for the '.' token (27 for this dataset)
vocab_size = 1 + len(chars)

def build_dataset(words):
    """Turn a list of words into (context, next-character) training pairs.

    Each word is extended with a trailing '.' and scanned character by
    character. For every character, the ids of the `block_size` preceding
    characters (padded with id 0 at the start of a word) become one input row
    and the id of the character itself becomes the matching target.

    Args:
        words: iterable of strings whose characters are all keys of `str_to_int`.

    Returns:
        (X, Y): `X` is a tensor of context rows, one per character, each of
        length `block_size`; `Y` is a tensor with the target id for each row.
    """
    contexts = []
    targets = []

    for word in words:
        # Every word starts from a window filled with id 0
        window = [0] * block_size

        for ch in word + '.':
            target = str_to_int[ch]

            contexts.append(window)
            targets.append(target)

            # Slide the window: drop the oldest id, append the newest.
            # This builds a new list, so rows already stored are not mutated.
            window = window[1:] + [target]

    return torch.tensor(contexts), torch.tensor(targets)

# Split dataset into training (80%), dev (10%), and test (10%) sets
import random

# Shuffle with a dedicated seeded RNG so that a fresh Restart & Run All always
# produces the same split. `words` is still shuffled in place; re-running this
# cell without restarting the kernel shuffles the already-shuffled list again.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(words)
n1 = int(0.8*len(words))  # end of the training slice
n2 = int(0.9*len(words))  # end of the dev slice

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

In [4]:
emb_size = 10 # Length of the embedding vector that represents each character
hidden_layer_neurons = 200

# Dedicated seeded generator so the initial weights are the same on every fresh run
init_gen = torch.Generator().manual_seed(42)

# Embedding table: one row of size emb_size per vocabulary entry
C = torch.randn((vocab_size, emb_size), generator=init_gen)
# Hidden layer; scaled down to avoid saturated tanh and dead neurons
W1 = torch.randn((emb_size*block_size, hidden_layer_neurons), generator=init_gen) * 0.1
b1 = torch.randn(hidden_layer_neurons, generator=init_gen) * 0.01
# Output layer; small W2 and zero b2 equalize the initial logits and give a better starting loss
W2 = torch.randn((hidden_layer_neurons, vocab_size), generator=init_gen) * 0.01
b2 = torch.zeros(vocab_size) # Zeros directly instead of drawing random numbers and multiplying by 0

parameters = [C, W1, b1, W2, b2]
for p in parameters:
    p.requires_grad = True

f'Num parameters: {sum(p.nelement() for p in parameters)}'

'Num parameters: 11897'

In [5]:
# Training
# Learning rate used by this run of the loop. The original note lists the values
# 0.1 -> 0.01 -> 0.005 -> 0.001.
# NOTE(review): presumably `lr` was edited by hand between re-runs of this cell;
# a fresh Restart & Run All only trains at the value below - confirm the intended schedule.
lr = 0.1
steps = 50000
minibatch_size = 32

for i in range(steps):

    # Mini-batch: random row indices into the training set (sampled with replacement)
    ix = torch.randint(0, Xtr.shape[0], (minibatch_size,))

    # Forward pass
    emb = C[Xtr[ix]] # (minibatch_size, block_size, emb_size)
    h = torch.tanh(emb.view(-1, emb_size*block_size) @ W1 + b1) # -1 means infer - (minibatch_size, hidden_layer_neurons)
    logits = h @ W2 + b2 # (minibatch_size, vocab_size)

    # F.cross_entropy is the efficient equivalent of the manual computation:
    #   counts = logits.exp()
    #   prob = counts / counts.sum(1, keepdim=True)
    #   loss = -prob[torch.arange(minibatch_size), Ytr[ix]].log().mean()
    loss = F.cross_entropy(logits, Ytr[ix])

    # Backward pass: clear stale gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # Update: plain SGD step. Done under no_grad so the in-place update is not
    # recorded by autograd (replaces writing through `p.data`).
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

# Loss of the last mini-batch only
f'Loss: {loss.item()}'

'Loss: 2.1790928840637207'

In [6]:
# Dev loss

# Evaluation only: no_grad avoids building an autograd graph over the whole split
with torch.no_grad():
    emb = C[Xdev] # (Xdev.shape[0], block_size, emb_size)
    h = torch.tanh(emb.view(-1, emb_size*block_size) @ W1 + b1) # -1 means infer - (Xdev.shape[0], hidden_layer_neurons)
    logits = h @ W2 + b2 # (Xdev.shape[0], vocab_size)
    loss = F.cross_entropy(logits, Ydev)
loss.item()

2.1905815601348877

In [7]:
# Test loss

# Evaluation only: no_grad avoids building an autograd graph over the whole split
with torch.no_grad():
    emb = C[Xte] # (Xte.shape[0], block_size, emb_size)
    h = torch.tanh(emb.view(-1, emb_size*block_size) @ W1 + b1) # -1 means infer - (Xte.shape[0], hidden_layer_neurons)
    logits = h @ W2 + b2 # (Xte.shape[0], vocab_size)
    loss = F.cross_entropy(logits, Yte)
loss.item()

2.1901607513427734

In [8]:
# Sampling

g = torch.Generator().manual_seed(42) # Seeded and passed to multinomial below so the samples are reproducible
names = 20 # How many names to generate

# Inference only: no gradients are needed while sampling
with torch.no_grad():
    for _ in range(names):
        out = ''
        context = [0] * block_size # Start from a window filled with the '.' id

        while True:
            emb = C[context] # (block_size, emb_size)
            h = torch.tanh(emb.view(-1, emb_size*block_size) @ W1 + b1) # -1 means infer - (1, hidden_layer_neurons)
            logits = h @ W2 + b2 # (1, vocab_size)

            probs = F.softmax(logits, dim=1)
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()

            # Id 0 is the '.' token: the name is finished
            if ix == 0:
                break

            out += int_to_str[ix]
            context = context[1:] + [ix] # Slide the window forward by one character

        print(out)

cylianngstonnah
lelia
kipias
ail
julia
tyershrnael
rehlynne
nazir
tamadehbuski
stanah
kiook
artel
elev
abriel
maxivydan
taiden
eda
thadavero
daerin
melia
