In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Load every name (one per line) into a list. The context manager closes the
# file handle as soon as the contents have been read, instead of leaving it
# open for the lifetime of the kernel.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [3]:
# character <-> integer index lookup tables
allletters = sorted(set("".join(words)))  # every distinct character in the names, sorted

# letters are numbered 1..N in sorted order
stoi = dict(zip(allletters, range(1, len(allletters) + 1)))
stoi['.'] = 0  # index 0 is reserved for the '.' token

# inverse table: index -> character
itos = {index: char for char, index in stoi.items()}

In [4]:
block_size = 3  # can be reset to whatever you like
X, Y = [], []

for name in words[:5]:
    # e.g. 'emma'
    print(name)
    window = [0] * block_size  # indices of the context letters, starts as all '.'
    for ch in name + '.':
        target = stoi[ch]  # 'e' -> 5
        X.append(window)
        Y.append(target)  # 5 is the target
        shown = "".join(itos[i] for i in window)
        print(f"{shown} -------> {ch}")
        # slide the window: drop the oldest index, append the newest
        window = window[1:] + [target]

X, Y = torch.tensor(X), torch.tensor(Y)


emma
... -------> e
..e -------> m
.em -------> m
emm -------> a
mma -------> .
olivia
... -------> o
..o -------> l
.ol -------> i
oli -------> v
liv -------> i
ivi -------> a
via -------> .
ava
... -------> a
..a -------> v
.av -------> a
ava -------> .
isabella
... -------> i
..i -------> s
.is -------> a
isa -------> b
sab -------> e
abe -------> l
bel -------> l
ell -------> a
lla -------> .
sophia
... -------> s
..s -------> o
.so -------> p
sop -------> h
oph -------> i
phi -------> a
hia -------> .


In [30]:
# Seeded generator so every run starts from the same random weights.
g = torch.Generator().manual_seed(214748367)

# NOTE: the values depend on the draw order (C, W1, b1, W2, b2) -- keep it.
C = torch.randn(27, 2, generator=g)     # lookup matrix: one 2-d row per index
W1 = torch.randn(6, 100, generator=g)   # hidden layer - 100 neurons
b1 = torch.randn(100, generator=g)
W2 = torch.randn(100, 27, generator=g)  # output layer
b2 = torch.randn(27, generator=g)

parameters = [C, W1, b1, W2, b2]

In [31]:
# total number of parameters in the network
sum(tensor.nelement() for tensor in parameters)

3481

I have combined many steps from the previous notebook, so please refer to it in case of confusion. 

`X.shape = (32, 3)`; 32 is the number of datapoints in the first 5 words. 

using `F.cross_entropy`:

In [32]:
# forward pass, scored with F.cross_entropy for efficiency

emb = C[X]                             # embedding row for every context index
emb_flat = emb.view(emb.size(0), -1)   # one flat row per example
H = torch.tanh(emb_flat @ W1 + b1)     # 32,100
logits = H @ W2 + b2
loss = F.cross_entropy(logits, Y)
loss.item()

20.672719955444336

Why should we prefer `F.cross_entropy` over a manual softmax implementation?

- intermediate variables are not created, which frees up memory
- `F.cross_entropy` can handle corner cases in a defined way (very low or very high logits -- prevents blowing up to inf)

In [27]:
# turn on gradient tracking for every tensor of the network
for param in parameters:
    param.requires_grad_(True)

In [28]:
for _ in range(1000):
    # forward
    emb = C[X]
    H = torch.tanh(emb.view(len(emb), -1) @ W1 + b1)  # 32,100
    logits = H @ W2 + b2
    loss = F.cross_entropy(logits, Y)

    # backward: drop the gradients of the previous step, then backpropagate
    for param in parameters:
        param.grad = None
    loss.backward()

    # gradient-descent step with step size 0.1
    for param in parameters:
        param.data -= 0.1 * param.grad

# loss computed in the last forward pass
print(loss.item())

0.2553942799568176


__Initial loss__: 20.672719955444336<br>
__final loss__: 0.2553942799568176

Such a huge drop over 1000 iterations is seen because the model is overfitting on the first 5 words, which are all we have used for training so far. <br>
Let's check the predicted labels vs. the actual labels for these 5 words. 

In [34]:
# index of the largest logit in each row
torch.max(logits, dim=1).indices

tensor([ 6,  6, 18,  6,  7,  6,  1,  2,  9,  7,  6,  7,  6,  6,  7,  7,  6,  6,
         2,  6,  7, 10,  3,  6, 18,  6,  6,  1,  2, 18,  6,  8])

In [35]:
# actual target indices, for comparison with the predicted indices above
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

Fundamentally, we are not able to bring the loss down to `0` because, if we retrace the dataset, `...` is followed by `a`, `e`, `i` and a lot of other options, i.e. the __same input is mapped to multiple outputs__. So there will always be some non-zero loss on the dataset!


## Run on full dataset

In [38]:
def build_dataset(names, char_to_ix, block_size):
    """Turn names into (context, next-character) training pairs.

    Every name is scanned left to right with a sliding window of
    ``block_size`` character indices. The window starts as all zeros and
    each name is terminated with '.', so a name of length n contributes
    n + 1 examples.

    Args:
        names: iterable of strings.
        char_to_ix: mapping from each character occurring in ``names``
            (and from '.') to its integer index.
        block_size: number of context indices per example.

    Returns:
        Tuple ``(X, Y)`` of tensors: ``X`` holds one context window per row,
        ``Y`` holds the index of the character that follows each window.

    Raises:
        KeyError: if a character is missing from ``char_to_ix``.
    """
    contexts, targets = [], []
    for name in names:
        context = [0] * block_size  # indices of the context letters
        for ch in name + '.':
            ix = char_to_ix[ch]
            contexts.append(context)
            targets.append(ix)
            # a new list is built each time, so rows already stored are never mutated
            context = context[1:] + [ix]
    return torch.tensor(contexts), torch.tensor(targets)


block_size = 3  # can be reset to whatever you like
X, Y = build_dataset(words, stoi, block_size)


In [45]:
# sizes of the context tensor and of the target tensor
X.size(), Y.size()

(torch.Size([228146, 3]), torch.Size([228146]))

In [39]:
# Same seed as before, so the network starts from the same random weights.
g = torch.Generator().manual_seed(214748367)

# Tensors are drawn strictly in this order from the shared generator:
#   C  - lookup matrix
#   W1 - hidden layer weights (100 neurons), b1 - hidden layer bias
#   W2 - output layer weights,               b2 - output layer bias
C, W1, b1, W2, b2 = (
    torch.randn(shape, generator=g)
    for shape in [(27, 2), (6, 100), (100,), (100, 27), (27,)]
)

parameters = [C, W1, b1, W2, b2]

In [40]:
# total number of parameters in the network
sum(tensor.nelement() for tensor in parameters)

3481

In [None]:
# loss on the full dataset
emb = C[X]                             # (228146,3,2)
emb_flat = emb.view(emb.size(0), -1)   # one flat row per example
H = torch.tanh(emb_flat @ W1 + b1)     # 228146,100
logits = H @ W2 + b2
loss = F.cross_entropy(logits, Y)
loss.item()

18.43378448486328

In [42]:
# turn on gradient tracking for every tensor of the network
for param in parameters:
    param.requires_grad_(True)

In [None]:
# Gradient descent on the whole dataset: 50 steps.
# The counter is called `step` rather than `iter` so the builtin iter() is not
# shadowed for the rest of the kernel session.
for step in range(50):
    # Forward pass:
    emb = C[X]
    H = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1) # 228146,100
    logits = H @ W2 + b2
    loss = F.cross_entropy(logits, target=Y)
    # Back pass: clear the gradients of the previous step, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()
    # update: gradient-descent step with step size 0.1
    for p in parameters:
        p.data -= 0.1 * p.grad

    # report every 5th step; the value is the loss computed before this step's update
    if step % 5 == 0:
        print(loss.item())

# loss from the last forward pass (computed before the final update)
print(loss.item())

16.52655601501465
11.840333938598633
9.973389625549316
8.731470108032227
7.844998836517334
7.036542892456055
6.318970203399658
5.647314548492432
5.120351791381836
4.7293548583984375
4.479941368103027
