<a href="https://colab.research.google.com/github/DrAlexSanz/micrograd/blob/main/02_Makemore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Goal of this notebook

* Following Karpathy's 3rd lecture on Makemore

https://youtu.be/TCH_1BHY58I?list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&t=2279

In [127]:
import torch # for later
import torch.nn.functional as F # for laterhttps://youtu.be/TCH_1BHY58I?list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&t=2279
import matplotlib.pyplot as plt
%matplotlib inline

In [128]:
!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

--2024-01-05 10:44:59--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt.6’


2024-01-05 10:44:59 (11.3 MB/s) - ‘names.txt.6’ saved [228145/228145]



In [152]:
words = open("names.txt", "r").read().splitlines()
#words[:8]

In [153]:
len(words)

32033

In [154]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [None]:
# Build the dataset for the NN

block_size = 3 # This is the context lenght. How many characters do I take to predict the next one.

X, Y = [], []

for w in words:
    #print(w)
    context = [0] * block_size

    for ch in w + ".":
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        #print("".join(itos[i] for i in context), "----->", itos[ix])
        context = context[1:] + [ix] # Crop and append

X = torch.tensor(X)
Y = torch.tensor(Y)

In [157]:
# And to see the shapes of what I will input in the NN (X) and the labels (Y)

X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [158]:
# Now I build the embedding layer (Remember it's a matrix multiplication in the end, that acts as a lookup table).
# This is a neural network layer with no non-linearities.
# I have 27 characters (rows) and I will embed them in a 2D vector space (columns). 2 is arbitrary

# Initialize randomly
C = torch.randn((27, 2))
C

tensor([[ 0.4879,  0.6253],
        [ 0.2169, -1.0615],
        [-0.7011, -0.5842],
        [ 0.2227,  1.0159],
        [ 0.1758, -1.7483],
        [ 0.7596,  0.3377],
        [-0.8905, -0.4763],
        [-0.1649,  0.9313],
        [ 0.0654,  0.2417],
        [ 0.1689,  0.0327],
        [-2.0943,  1.4789],
        [-0.8040, -0.6336],
        [ 0.0190, -0.6924],
        [-2.3341,  0.8161],
        [-0.5792,  0.8404],
        [-0.8375,  0.4751],
        [-0.7118, -1.9520],
        [ 0.8602,  0.6499],
        [-1.7072,  0.8506],
        [ 1.5254,  0.9495],
        [-1.8308, -0.2287],
        [ 0.3776,  0.9663],
        [-0.1177,  0.4388],
        [-0.0327,  1.2796],
        [-0.9155,  1.3253],
        [ 0.4963, -0.7943],
        [-0.1347, -0.8412]])

In [159]:
# To embed the characters in this space (transform, no training yet) I can just index or matrix multiply them

emb = C[X]
print(emb.shape)
# This will give me the 4th row of the C tensor
print(C[X][3, 2])
print(X[3, 2])
print(C[13])

torch.Size([228146, 3, 2])
tensor([-2.3341,  0.8161])
tensor(13)
tensor([-2.3341,  0.8161])


In [160]:
# Now we do the linear layer

W1 = torch.randn(6, 100) # Weights 3 by 2 inputs, 100 neurons
b1 = torch.randn(100)

In [161]:
# However, I can't multiply these matrices just like that. I need first to concatenate the 3D tensor along the second dimension

# What I want is this, but it doesn't scale with the dimension
# torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], dim = 1) #Shape should be 32x6

concat = torch.cat((torch.unbind(emb, 1)), dim = 1) # Unbind produces a list as I was doing manually in the previous line. Then I concatenate them and the shape is the same.
# However this is very inefficient. It's copying and creating new object in memeory

In [162]:
# Additionally, there is a better way to do this in this case.

# I can just use view, which is very efficient

#concat_view = emb.view(32, 6) # (32, 3*2) And you can check that the results are the same

RuntimeError: shape '[32, 6]' is invalid for input of size 1368876

In [163]:
#concat_view == concat # It's always True

In [164]:
# So then I calculate the hidden states that I want to pass to the tanh

h = torch.tanh(emb.view(emb.shape[0], 6) @ W1 + b1) # And this will be 32 x 100

In [165]:
# and the softmax layer

W2 = torch.randn((100, 27))
b2 = torch.randn(27)

In [166]:
logits = h @ W2 + b2

In [167]:
counts = logits.exp()
prob = counts / counts.sum(1, keepdims = True)

In [169]:
# So now for each row of the output, I take the probabilities by indexing

loss = -prob[torch.arange(228146), Y].log().mean()
loss

tensor(14.7433)

### Let's make this presentable for recap

In [170]:
g = torch.Generator().manual_seed(2147483647) # For reproducibility
C = torch.randn((27, 2), generator = g)
W1 = torch.randn((6, 100), generator = g)
b1 = torch.randn((100), generator = g)
W2 = torch.randn((100, 27), generator = g)
b2 = torch.randn(27, generator = g)
parameters = [C, W1, b1, W2, b2]

In [171]:
sum(p.nelement() for p in parameters) # Count the number of parameters

3481

### So now, let's use pytorch's functions for optimizing on one batch.
### Later, go up and comment the line where I take the first 5 words of the dataset

In [147]:
# Forward pass
# emb = C[X] # shape is (32, 2, 2)
# h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # Shape is (32, 100)
# logits = h @ W2 + 2 # Shape is (32, 37)
# counts = logits.exp()
# prob = counts / counts.sum(1, keepdims = True)
# loss = -prob[torch.arange(32), Y].log().mean()
# loss = F.cross_entropy(logits, Y)
# loss

In [172]:
# Make sure everything that is a parameters accepts gradients
for p in parameters:
    p.requires_grad = True

In [173]:
# Now I add the backward pass and put it in the loop, to optimize only on one batch. I should overfit quickly.

for _ in range(10):
    # Forward pass
    emb = C[X] # shape is (32, 2, 2)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # Shape is (32, 100)
    logits = h @ W2 + 2 # Shape is (32, 37)
    loss = F.cross_entropy(logits, Y)
    print(loss.item())

    # Backward pass
    for p in parameters:
        p.grad = torch.zeros(p.shape, dtype=torch.float32) # Set them to 0. Apparently python doesn't like None insetad of 0
    loss.backward() # calculate gradients

    # Gradient update stage
    for p in parameters:
        p.data += -0.1 * p.grad # lr is 0.1 here



19.1424503326416
16.641582489013672
15.451205253601074
14.499948501586914
13.695572853088379
12.988574981689453
12.366576194763184
11.82927417755127
11.383988380432129
11.017670631408691
