In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#reading in all words
words = open('names.txt','r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
#Building the vocabulary of characters and mappings to integrers
chars = sorted(list(set(''.join(words))))
stoi = {s:i for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
print(itos)

{0: '.', 1: 'b', 2: 'c', 3: 'd', 4: 'e', 5: 'f', 6: 'g', 7: 'h', 8: 'i', 9: 'j', 10: 'k', 11: 'l', 12: 'm', 13: 'n', 14: 'o', 15: 'p', 16: 'q', 17: 'r', 18: 's', 19: 't', 20: 'u', 21: 'v', 22: 'w', 23: 'x', 24: 'y', 25: 'z'}


In [32]:
#building the dataset

block_size = 3    #context length : characters taken to predict the next one
X, Y = [], []

for w in words:

    #print(w)
    context = [0] * block_size
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        #print(''.join(itos[i] for i in context), '---->', itos[ix])
        context = context[1:] + [ix]      #crop and append

X = torch.tensor(X)
Y = torch.tensor(Y)

In [33]:
X.shape, Y.shape

(torch.Size([228146, 3]), torch.Size([228146]))

In [23]:
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27,2), generator=g)
W1 = torch.randn((6,100), generator=g)
b1 = torch.rand(100, generator=g)
W2 = torch.randn((100,27), generator=g)
b2 = torch.rand(27, generator=g)
parameters = [C, W1, b1, W2, b2]

In [25]:
sum(p.nelement() for p in parameters)

3481

In [28]:
for p in parameters:
    p.requires_grad = True

In [None]:
for _ in range(100):
    #Forward pass
    emb = C[X]          #[32,3,2]
    h = torch.tanh(emb.view(-1,6) @ W1 + b1)            #(32,100)
    logits = h @ W2 + b2                                #(32,27)
    # counts = logits.exp()
    # prob = counts / counts.sum(1, keepdims=True)
    # loss = -prob[torch.arange(32), Y].log().mean()
    loss = F.cross_entropy(logits, Y)
    #print(loss.item())

    #backward pass
    for p in parameters:
        p.grad = None

    loss.backward()
    #update
    for p in parameters:
        p.data += -0.1 * p.grad

5.098272323608398
4.584640026092529
4.128853797912598
3.7277801036834717
3.3761425018310547
3.0636894702911377
2.7858705520629883
2.541958808898926
2.3299779891967773
2.1440341472625732
1.9787911176681519
1.8316885232925415
1.702017903327942
1.5883747339248657
1.4876667261123657
1.3966777324676514
1.3133565187454224
1.2367783784866333
1.166650414466858
1.102752447128296
1.044610619544983
0.9915212392807007
0.9426817297935486
0.8972891569137573
0.8546355962753296
0.8141732811927795
0.7755289673805237
0.738495945930481
0.7030269503593445
0.66923987865448
0.6374204754829407
0.6079904437065125
0.5813909769058228
0.5578778386116028
0.5373654365539551
0.5194960236549377
0.5038795471191406
0.4902423024177551
0.4784000515937805
0.4681757390499115
0.4593650698661804
0.4517451524734497
0.4451042711734772
0.4392598271369934
0.43406492471694946
0.42940449714660645
0.4251899719238281
0.42135217785835266
0.41783690452575684
0.4146006107330322
0.41160839796066284
0.40883103013038635
0.406244874000549