<a href="https://colab.research.google.com/github/DrAlexSanz/micrograd/blob/main/02_Makemore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Goal of this notebook

* Following Karpathy's 3rd lecture on Makemore

https://youtu.be/TCH_1BHY58I?list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&t=2279

In [1]:
import torch # for later
import torch.nn.functional as F # for later
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

--2023-12-14 13:45:05--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt’


2023-12-14 13:45:05 (8.30 MB/s) - ‘names.txt’ saved [228145/228145]



In [3]:
words = open("names.txt", "r").read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [4]:
len(words)

32033

In [5]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [8]:
# Build the dataset for the NN

block_size = 3 # This is the context lenght. How many characters do I take to predict the next one.

X, Y = [], []

for w in words[:5]:
    print(w)
    context = [0] * block_size

    for ch in w + ".":
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print("".join(itos[i] for i in context), "----->", itos[ix])
        context = context[1:] + [ix] # Crop and append

X = torch.tensor(X)
Y = torch.tensor(Y)

emma
... -----> e
..e -----> m
.em -----> m
emm -----> a
mma -----> .
olivia
... -----> o
..o -----> l
.ol -----> i
oli -----> v
liv -----> i
ivi -----> a
via -----> .
ava
... -----> a
..a -----> v
.av -----> a
ava -----> .
isabella
... -----> i
..i -----> s
.is -----> a
isa -----> b
sab -----> e
abe -----> l
bel -----> l
ell -----> a
lla -----> .
sophia
... -----> s
..s -----> o
.so -----> p
sop -----> h
oph -----> i
phi -----> a
hia -----> .


In [9]:
# And to see the shapes of what I will input in the NN (X) and the labels (Y)

X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [13]:
# Now I build the embedding layer (Remember it's a matrix multiplication in the end, that acts as a lookup table).
# This is a neural network layer with no non-linearities.
# I have 27 characters (rows) and I will embed them in a 2D vector space (columns). 2 is arbitrary

# Initialize randomly
C = torch.randn((27, 2))
C

tensor([[ 1.2721,  1.4637],
        [ 0.4382,  0.1532],
        [-0.8177, -1.2940],
        [-0.8050,  0.6404],
        [-1.2150,  0.4946],
        [-1.6286, -0.8722],
        [-0.6927,  0.5824],
        [-0.1338, -0.8566],
        [ 0.9942, -0.1670],
        [ 2.9981,  0.8421],
        [-0.4551,  0.0597],
        [ 0.7267, -0.4338],
        [ 0.3857, -1.7375],
        [ 0.3362, -0.1526],
        [-0.0389,  0.7885],
        [ 0.2621, -0.9237],
        [ 0.6052, -1.3118],
        [ 1.2345, -0.3739],
        [ 0.2678, -0.2567],
        [ 0.1939,  0.7376],
        [ 1.2741, -0.9261],
        [ 0.6714, -0.3176],
        [ 0.0067,  1.6626],
        [ 1.1944,  0.7488],
        [ 0.8947, -0.3699],
        [-0.0654,  1.1566],
        [-0.9388,  0.0951]])

In [23]:
# To embed the characters in this space (transform, no training yet) I can just index or matrix multiply them

emb = C[X]
print(emb.shape)
# This will give me the 4th row of the C tensor
print(C[X][3, 2])
print(X[3, 2])
print(C[13])

torch.Size([32, 3, 2])
tensor([ 0.3362, -0.1526])
tensor(13)
tensor([ 0.3362, -0.1526])


In [38]:
# Now we do the linear layer

W1 = torch.randn(6, 100) # Weights 3 by 2 inputs, 100 neurons
b1 = torch.randn(100)

In [34]:
# However, I can't multiply these matrices just like that. I need first to concatenate the 3D tensor along the second dimension

# What I want is this, but it doesn't scale with the dimension
# torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], dim = 1) #Shape should be 32x6

concat = torch.cat((torch.unbind(emb, 1)), dim = 1) # Unbind produces a list as I was doing manually in the previous line. Then I concatenate them and the shape is the same.
# However this is very inefficient. It's copying and creating new object in memeory

In [35]:
# Additionally, there is a better way to do this in this case.

# I can just use view, which is very efficient

concat_view = emb.view(32, 6) # (32, 3*2) And you can check that the results are the same

In [36]:
concat_view == concat # It's always True

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, T

In [41]:
# So then I calculate the hidden states that I want to pass to the tanh

h = torch.tanh(emb.view(emb.shape[0], 6) @ W1 + b1) # And this will be 32 x 100

In [44]:
# and the softmax layer

W2 = torch.randn((100, 27))
b2 = torch.randn(27)

In [45]:
logits = h @ W2 + b2

In [47]:
counts = logits.exp()
prob = counts / counts.sum(1, keepdims = True)

In [51]:
# So now for each row of the output, I take the probabilities by indexing

loss = -prob[torch.arange(32), Y].log().mean()
loss

tensor(20.5057)

### Let's make this presentable for recap

In [52]:
g = torch.Generator().manual_seed(2147483647) # For reproducibility
C = torch.randn((27, 2), generator = g)
W1 = torch.randn((6, 100), generator = g)
b1 = torch.randn((100), generator = g)
W2 = torch.randn((100, 27), generator = g)
b2 = torch.randn(27, generator = g)
parameters = [C, W1, b1, W2, b2]

In [53]:
sum(p.nelement() for p in parameters) # Count the number of parameters

3481