In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [3]:
# read in all the words; the context manager closes the file handle once the text is loaded
with open('names.txt', 'r') as f:
    words = f.read().splitlines()

In [4]:
len(words)

32033

In [5]:
# character vocabulary: every distinct character that occurs in the words, in sorted order
chars = sorted(set(''.join(words)))
# characters are numbered from 1; index 0 is kept for the '.' token, which is added last
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# inverse lookup: integer index -> character
itos = dict((i, s) for s, i in stoi.items())
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [107]:
# build the dataset: one (context, next-character) pair for every position of every word
block_size = 3 # context length: number of preceding characters used to predict the next one
X, Y = [], []
for w in words:
    context = [0] * block_size # every word starts from an all-'.' context (index 0)
    for ch in w + '.': # the trailing '.' marks the end of the word
        ix = stoi[ch]
        X += [context]
        Y += [ix]
        context = [*context[1:], ix] # slide the window: drop the oldest index, append the newest

X, Y = torch.tensor(X), torch.tensor(Y)

In [108]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [109]:
C = torch.randn(27, 2) # embedding table: one 2-dimensional vector for each of the 27 characters

In [110]:
C[X].shape # indexing C with X gathers one 2-d row of C per integer in X, so the result has shape X.shape + (2,)

torch.Size([228146, 3, 2])

In [111]:
emb = C[X] # embedded contexts: the row of C for every character index stored in X

In [11]:
# hidden layer parameters
W1 = torch.randn(6, 100) # weights: 6 inputs (3 context characters x 2 embedding dims) -> 100 neurons
b1 = torch.randn((100,)) # one bias per hidden neuron

In [12]:
# emb is 3-dimensional (examples, 3, 2): its last dimension is 2 while W1 has 6 input rows,
# so this matmul is expected to fail. The error is the point of the cell, so it is caught
# and displayed instead of aborting a Restart & Run All.
try:
    emb @ W1 + b1
except RuntimeError as err:
    print(type(err).__name__ + ':', err)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (96x2 and 6x100)

In [13]:
emb

tensor([[[-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01]],

        [[-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01],
         [-6.4305e-01,  9.2540e-01]],

        [[-9.1079e-02, -1.2058e-01],
         [-6.4305e-01,  9.2540e-01],
         [-2.5104e-01,  1.5556e+00]],

        [[-6.4305e-01,  9.2540e-01],
         [-2.5104e-01,  1.5556e+00],
         [-2.5104e-01,  1.5556e+00]],

        [[-2.5104e-01,  1.5556e+00],
         [-2.5104e-01,  1.5556e+00],
         [-1.1086e+00, -2.3352e+00]],

        [[-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01]],

        [[-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01],
         [ 2.3154e+00,  5.1048e-01]],

        [[-9.1079e-02, -1.2058e-01],
         [ 2.3154e+00,  5.1048e-01],
         [-1.2148e+00, -1.1060e-01]],

        [[ 2.3154e+00,  5.1048e-01],
         [-1.2148e+00, -1.1060e-01],
         [-2.1194e-01,

In [24]:
# compare rows of C with entries of emb to confirm that emb[i][j] is C[X[i][j]]
# X[0] is all zeros, so each of its three embeddings is the row C[0]
print('C[0]', C[0])
print('X[0]', X[0])
print('emb[0][0]', emb[0][0])
print('emb[0][1]', emb[0][1])
print('emb[0][2]', emb[0][2])
print('-----------------------------------------------------------')
# X[1] ends with index 5, so the last row of emb[1] is the row C[5]
print('C[5]',C[5])
print('X[1]', X[1])
print('emb[1]',emb[1])

C[0] tensor([-0.0911, -0.1206])
X[0] tensor([0, 0, 0])
emb[0][0] tensor([-0.0911, -0.1206])
emb[0][1] tensor([-0.0911, -0.1206])
emb[0][2] tensor([-0.0911, -0.1206])
-----------------------------------------------------------
C[5] tensor([-0.6430,  0.9254])
X[1] tensor([0, 0, 5])
emb[1] tensor([[-0.0911, -0.1206],
        [-0.0911, -0.1206],
        [-0.6430,  0.9254]])


In [25]:
emb[:, 0, :]

tensor([[-9.1079e-02, -1.2058e-01],
        [-9.1079e-02, -1.2058e-01],
        [-9.1079e-02, -1.2058e-01],
        [-6.4305e-01,  9.2540e-01],
        [-2.5104e-01,  1.5556e+00],
        [-9.1079e-02, -1.2058e-01],
        [-9.1079e-02, -1.2058e-01],
        [-9.1079e-02, -1.2058e-01],
        [ 2.3154e+00,  5.1048e-01],
        [-1.2148e+00, -1.1060e-01],
        [-2.1194e-01,  6.2278e-01],
        [-1.6346e-01,  1.3759e+00],
        [-9.1079e-02, -1.2058e-01],
        [-9.1079e-02, -1.2058e-01],
        [-9.1079e-02, -1.2058e-01],
        [-1.1086e+00, -2.3352e+00],
        [-9.1079e-02, -1.2058e-01],
        [-9.1079e-02, -1.2058e-01],
        [-9.1079e-02, -1.2058e-01],
        [-2.1194e-01,  6.2278e-01],
        [ 5.5838e-01,  1.7255e-01],
        [-1.1086e+00, -2.3352e+00],
        [ 8.4964e-01, -2.1192e-03],
        [-6.4305e-01,  9.2540e-01],
        [-1.2148e+00, -1.1060e-01],
        [-9.1079e-02, -1.2058e-01],
        [-9.1079e-02, -1.2058e-01],
        [-9.1079e-02, -1.205

In [26]:
torch.cat([emb[:, 0, :],emb[:, 1, :],emb[:, 2, :]], 1).shape # concatenate along dim 1 the embeddings of the first, second and third context characters

torch.Size([32, 6])

In [33]:
torch.unbind(emb,1) # tuple of the slices of emb taken along dim 1, i.e. one tensor per context position

(tensor([[-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01],
         [-6.4305e-01,  9.2540e-01],
         [-2.5104e-01,  1.5556e+00],
         [-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01],
         [ 2.3154e+00,  5.1048e-01],
         [-1.2148e+00, -1.1060e-01],
         [-2.1194e-01,  6.2278e-01],
         [-1.6346e-01,  1.3759e+00],
         [-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01],
         [-1.1086e+00, -2.3352e+00],
         [-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01],
         [-2.1194e-01,  6.2278e-01],
         [ 5.5838e-01,  1.7255e-01],
         [-1.1086e+00, -2.3352e+00],
         [ 8.4964e-01, -2.1192e-03],
         [-6.4305e-01,  9.2540e-01],
         [-1.2148e+00, -1.1060e-01],
         [-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01],
 

In [34]:
torch.cat(torch.unbind(emb,1), 1) # same concatenation along dim 1, without writing out each context position by hand

tensor([[-9.1079e-02, -1.2058e-01, -9.1079e-02, -1.2058e-01, -9.1079e-02,
         -1.2058e-01],
        [-9.1079e-02, -1.2058e-01, -9.1079e-02, -1.2058e-01, -6.4305e-01,
          9.2540e-01],
        [-9.1079e-02, -1.2058e-01, -6.4305e-01,  9.2540e-01, -2.5104e-01,
          1.5556e+00],
        [-6.4305e-01,  9.2540e-01, -2.5104e-01,  1.5556e+00, -2.5104e-01,
          1.5556e+00],
        [-2.5104e-01,  1.5556e+00, -2.5104e-01,  1.5556e+00, -1.1086e+00,
         -2.3352e+00],
        [-9.1079e-02, -1.2058e-01, -9.1079e-02, -1.2058e-01, -9.1079e-02,
         -1.2058e-01],
        [-9.1079e-02, -1.2058e-01, -9.1079e-02, -1.2058e-01,  2.3154e+00,
          5.1048e-01],
        [-9.1079e-02, -1.2058e-01,  2.3154e+00,  5.1048e-01, -1.2148e+00,
         -1.1060e-01],
        [ 2.3154e+00,  5.1048e-01, -1.2148e+00, -1.1060e-01, -2.1194e-01,
          6.2278e-01],
        [-1.2148e+00, -1.1060e-01, -2.1194e-01,  6.2278e-01, -1.6346e-01,
          1.3759e+00],
        [-2.1194e-01,  6.2278e

In [35]:
a = torch.arange(0, 18) # the integers 0..17, used below to experiment with view()

In [36]:
a.shape

torch.Size([18])

In [37]:
a.view(3,3,2)

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

In [40]:
emb

tensor([[[-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01]],

        [[-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01],
         [-6.4305e-01,  9.2540e-01]],

        [[-9.1079e-02, -1.2058e-01],
         [-6.4305e-01,  9.2540e-01],
         [-2.5104e-01,  1.5556e+00]],

        [[-6.4305e-01,  9.2540e-01],
         [-2.5104e-01,  1.5556e+00],
         [-2.5104e-01,  1.5556e+00]],

        [[-2.5104e-01,  1.5556e+00],
         [-2.5104e-01,  1.5556e+00],
         [-1.1086e+00, -2.3352e+00]],

        [[-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01]],

        [[-9.1079e-02, -1.2058e-01],
         [-9.1079e-02, -1.2058e-01],
         [ 2.3154e+00,  5.1048e-01]],

        [[-9.1079e-02, -1.2058e-01],
         [ 2.3154e+00,  5.1048e-01],
         [-1.2148e+00, -1.1060e-01]],

        [[ 2.3154e+00,  5.1048e-01],
         [-1.2148e+00, -1.1060e-01],
         [-2.1194e-01,

In [42]:
emb.view(emb.shape[0], 6) # this does the concatenation that we have done previously but it is more efficient because
# view does not copy memory; it only interprets what is already in memory with a different shape.
# The row count is read from emb rather than hard-coded as 32, so the cell works for any number of examples.

tensor([[-9.1079e-02, -1.2058e-01, -9.1079e-02, -1.2058e-01, -9.1079e-02,
         -1.2058e-01],
        [-9.1079e-02, -1.2058e-01, -9.1079e-02, -1.2058e-01, -6.4305e-01,
          9.2540e-01],
        [-9.1079e-02, -1.2058e-01, -6.4305e-01,  9.2540e-01, -2.5104e-01,
          1.5556e+00],
        [-6.4305e-01,  9.2540e-01, -2.5104e-01,  1.5556e+00, -2.5104e-01,
          1.5556e+00],
        [-2.5104e-01,  1.5556e+00, -2.5104e-01,  1.5556e+00, -1.1086e+00,
         -2.3352e+00],
        [-9.1079e-02, -1.2058e-01, -9.1079e-02, -1.2058e-01, -9.1079e-02,
         -1.2058e-01],
        [-9.1079e-02, -1.2058e-01, -9.1079e-02, -1.2058e-01,  2.3154e+00,
          5.1048e-01],
        [-9.1079e-02, -1.2058e-01,  2.3154e+00,  5.1048e-01, -1.2148e+00,
         -1.1060e-01],
        [ 2.3154e+00,  5.1048e-01, -1.2148e+00, -1.1060e-01, -2.1194e-01,
          6.2278e-01],
        [-1.2148e+00, -1.1060e-01, -2.1194e-01,  6.2278e-01, -1.6346e-01,
          1.3759e+00],
        [-2.1194e-01,  6.2278e

In [49]:
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # for -1 here pytorch infers what should go into that dimension

In [50]:
h.shape

torch.Size([32, 100])

In [51]:
(emb.view(-1, 6) @ W1).shape

torch.Size([32, 100])

In [52]:
b1.shape

torch.Size([100])

In [53]:
# will broadcasting allow emb.view(-1, 6) @ W1 + b1 ?? 
# 32, 100
#   1, 100 --> when adding this to 32,100 tensor this will get copied 32 times and add to all the rows
# so this is correct


In [56]:
W2 = torch.randn((100,27)) # 27 for 27 possible characters that will come next
b2 = torch.randn(27)

In [57]:
logits = h @ W2 + b2 # raw scores for the next character: one column for each of the 27 possible characters

In [58]:
logits.shape

torch.Size([32, 27])

In [59]:
counts = logits.exp() # exponentiate so that every score becomes positive

In [71]:
prob = counts / counts.sum(1, keepdims=True) # divide each row by its own sum so that every row sums to 1

In [72]:
prob.shape

torch.Size([32, 27])

In [73]:
prob[0].sum()

tensor(1.)

In [79]:
# average negative log-likelihood: for each row pick the probability assigned to the correct next character.
# The row indices are derived from Y's length rather than a hard-coded 32, so this works for any number of examples.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(18.6424)

In [68]:
torch.arange(32)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [80]:
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [81]:
# ---------------- now made respectable :) ----------------

In [82]:
X.shape, Y.shape # dataset

(torch.Size([32, 3]), torch.Size([32]))

In [83]:
# seeded generator so that the random initialisation is reproducible
g = torch.Generator()
g.manual_seed(2147483647)
C = torch.randn(27, 2, generator=g)     # character embedding table
W1 = torch.randn(6, 100, generator=g)   # hidden layer weights
b1 = torch.randn((100,), generator=g)   # hidden layer biases
W2 = torch.randn(100, 27, generator=g)  # output layer weights
b2 = torch.randn((27,), generator=g)    # output layer biases
parameters = [C, W1, b1, W2, b2]

In [84]:
sum(p.nelement() for p in parameters) # number of parameters in total

3481

In [90]:
# forward pass over every row of X, followed by the classification loss against Y
emb = C[X] # (32, 3, 2)
h = torch.tanh(emb.view(-1,6) @ W1 + b1) # (32, 100)
logits = h @ W2 + b2 # (32, 27)
# The three commented lines below compute the same loss by hand; never do this in production, use F.cross_entropy() instead
# counts = logits.exp()
# prob = counts / counts.sum(1, keepdims=True)
# loss = -prob[torch.arange(32), Y].log().mean()
loss = F.cross_entropy(logits, Y)
loss

tensor(17.7697)

In [88]:
# 1.
# counts = logits.exp()
# prob = counts / counts.sum(1, keepdims=True)
# loss = -prob[torch.arange(32), Y].log().mean()

# 2.
# F.cross_entropy(logits, Y)
#
# 1. and 2. are equivalent. 2. is more efficient under the hood because, instead of creating a separate tensor
# for each intermediate step, it uses a fused kernel.
# a. the forward pass is much more efficient for the reason stated above
# b. the backward pass is much more efficient because the derivative simplifies
# c. the numbers are better behaved: logits.exp() can cause nans, but pytorch internally solves this
#    by subtracting the max number from the logits, which still gives an equivalent output.

tensor(17.7697)

In [92]:
# turn on gradient tracking for every parameter tensor
for p in parameters:
    p.requires_grad_(True)

In [94]:
 for _ in range(1000):
        
    # forward pass
    emb = C[X] # (32, 3, 2)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
    logits = h @ W2 + b2 # (32, 27)
    loss = F.cross_entropy(logits, Y)
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    # update
    for p in parameters:
        p.data += -0.1 * p.grad
print(loss.item())

0.25361168384552


In [101]:
# playing with the tensor.max() for my understanding
x = torch.tensor([[2, 1], [3, 4]])

# reducing over dim 0 compares the entries down each column
down_columns = x.max(dim=0)
max_values, max_indices = down_columns.values, down_columns.indices
print("Max values along dim 0:", max_values)
print("Indices of max values along dim 0:", max_indices)

# reducing over dim 1 compares the entries across each row
across_rows = x.max(dim=1)
max_values, max_indices = across_rows.values, across_rows.indices
print("Max values along dim 1:", max_values)
print("Indices of max values along dim 1:", max_indices)

Max values along dim 0: tensor([3, 4])
Indices of max values along dim 0: tensor([1, 1])
Max values along dim 1: tensor([2, 4])
Indices of max values along dim 1: tensor([0, 1])


In [104]:
# look at logits.max and compare against Y to see why loss can't be zero
# loss can't be zero because identical contexts occur with different targets: the all-'.' start context
# is followed by a different first letter in most words, and the model can only predict one character for it.
logits.max(1)


torch.return_types.max(
values=tensor([13.8819, 18.9210, 21.1106, 21.3680, 17.5255, 13.8819, 16.7710, 14.8131,
        16.5614, 19.2740, 16.8058, 21.7451, 13.8819, 18.0764, 18.0122, 20.9834,
        13.8819, 17.3934, 16.1381, 18.0022, 19.2822, 16.8741, 11.6843, 11.3966,
        16.0696, 13.8819, 16.8989, 17.7183, 13.3022, 16.8211, 20.0219, 17.1322],
       grad_fn=<MaxBackward0>),
indices=tensor([19, 13, 13,  1,  0, 19, 12,  9, 22,  9,  1,  0, 19, 22,  1,  0, 19, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0]))

In [112]:
Y

tensor([ 5, 13, 13,  ..., 26, 24,  0])

In [113]:
# ---------------- now made respectable :) ---------------- cleaned up version

In [135]:
X.shape, Y.shape # dataset

(torch.Size([228146, 3]), torch.Size([228146]))

In [146]:
# seeded generator so that the random initialisation is reproducible
g = torch.Generator()
g.manual_seed(2147483647)
C = torch.randn(27, 2, generator=g)     # character embedding table
W1 = torch.randn(6, 100, generator=g)   # hidden layer weights
b1 = torch.randn((100,), generator=g)   # hidden layer biases
W2 = torch.randn(100, 27, generator=g)  # output layer weights
b2 = torch.randn((27,), generator=g)    # output layer biases
parameters = [C, W1, b1, W2, b2]

In [143]:
sum(p.nelement() for p in parameters) # number of parameters in total

3481

In [147]:
# turn on gradient tracking for every parameter tensor
for p in parameters:
    p.requires_grad_(True)

In [152]:
# candidate learning rates, evenly spaced in log space
lre = torch.linspace(start=-3, end=0, steps=1000) # learning rate exponent
lrs = 10 ** lre # 1000 rates running from 10^-3 up to 10^0

In [145]:
for _ in range(100):

    # minibatch construct: 32 random row indices, drawn from the seeded generator g so that
    # a fresh Restart & Run All samples the same batches every time
    ix = torch.randint(0, X.shape[0], (32,), generator=g)

    # forward pass
    emb = C[X[ix]] # (num_of_examples, 3, 2) -> (32, 3, 2) b/c of minibatching
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
    logits = h @ W2 + b2 # (32, 27)
    loss = F.cross_entropy(logits, Y[ix])
    print(loss.item())

    # backward pass: clear stale gradients first, since backward() accumulates into .grad
    for p in parameters:
        p.grad = None
    loss.backward()

    # update: gradient descent step with step size 1, done under no_grad so that the in-place
    # change is not recorded by autograd (preferred over writing through p.data)
    with torch.no_grad():
        for p in parameters:
            p += -1 * p.grad

        

17.668642044067383
19.547134399414062
20.24677085876465
21.31930160522461
16.994714736938477
16.058204650878906
19.999771118164062
18.43269157409668
22.727069854736328
18.27863311767578
21.136085510253906
15.871482849121094
20.224401473999023
18.966400146484375
19.36940574645996
21.601890563964844
21.991422653198242
19.3601016998291
18.9153995513916
17.814815521240234
17.472797393798828
22.65011978149414
18.823829650878906
20.27212142944336
20.813501358032227
17.798551559448242
19.318870544433594
19.346134185791016
17.018573760986328
17.049057006835938
18.64921760559082
20.202402114868164
20.557998657226562
18.127859115600586
19.86014747619629
19.513843536376953
19.1051025390625
17.123342514038086
21.553592681884766
20.52910041809082
17.29924774169922
18.37529754638672
17.90870475769043
17.13455581665039
15.718107223510742
16.671146392822266
18.89324188232422
17.906776428222656
18.897478103637695
16.95439910888672
18.371116638183594
20.275789260864258
18.368051528930664
18.177402496337

In [134]:
# evaluate loss for all of x and all of y
emb = C[X] # (num_of_examples, 3, 2)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (num_of_examples, 100)
logits = h @ W2 + b2 # (num_of_examples, 27)
loss = F.cross_entropy(logits, Y)
loss

tensor(2.5576, grad_fn=<NllLossBackward0>)

In [154]:
C.shape 

torch.Size([27, 2])