In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

![](Capture.PNG)

Although we won't do the part where C's go directly into the final layer too because we're here for educational content, not minmaxing :flushed:

In [3]:
# read in words
with open('../2 - makemore/names.txt', 'r') as f: # context manager closes the file handle once the names are read
	words = f.read().splitlines() # one name per line
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [5]:
len(words)

32033

In [16]:
# Character vocabulary plus the two lookup tables (char -> int and int -> char)
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1))) # string to integer, letters start at 1
stoi['.'] = 0 # index 0 is the '.' token
itos = dict((idx, ch) for ch, idx in stoi.items()) # int to string
print(itos, '\n', stoi)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'} 
 {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}


In [138]:
# Building the dataset

def generate_dataset(words, block_size, doprint=False):
	"""Build next-character training examples from a list of words.

	Every word gets a trailing '.', and a window holding the previous
	`block_size` character indices (all 0 at the start of a word) slides over it.
	Each window becomes one row of X, and the index of the character that follows
	it becomes the matching entry of Y. Characters are looked up in the
	notebook-level `stoi` / `itos` tables.

	Args:
		words: iterable of strings whose characters are all keys of `stoi`.
		block_size: context length -- how many characters we take to predict the next one.
		doprint: when True, print each word and every `context ------> target` pair.

	Returns:
		(X, Y): int64 tensors of shape (num_examples, block_size) and (num_examples,).
		An empty `words` gives shapes (0, block_size) and (0,).
	"""
	X, Y = [], []

	for w in words:
		if doprint:
			print(w)
		context = [0] * block_size # [0, 0, ...]: block_size copies of index 0

		for ch in w + '.':
			ix = stoi[ch] # index of char
			X.append(context)
			Y.append(ix)

			if doprint:
				print(''.join(itos[i] for i in context), '------>', itos[ix])
			context = context[1:] + [ix] # [0,0,0] -> [0,0,ix]: a rolling window (builds a new list, so rows already in X are untouched)

	# Explicit dtype and shape: with no examples torch.tensor([]) would otherwise
	# come back as a float32 tensor of shape (0,), which cannot be used as indices.
	X = torch.tensor(X, dtype=torch.long).reshape(len(X), block_size)
	Y = torch.tensor(Y, dtype=torch.long)
	return X, Y
X, Y = generate_dataset(words[:5], 3, True)


emma
... ------> e
..e ------> m
.em ------> m
emm ------> a
mma ------> .
olivia
... ------> o
..o ------> l
.ol ------> i
oli ------> v
liv ------> i
ivi ------> a
via ------> .
ava
... ------> a
..a ------> v
.av ------> a
ava ------> .
isabella
... ------> i
..i ------> s
.is ------> a
isa ------> b
sab ------> e
abe ------> l
bel ------> l
ell ------> a
lla ------> .
sophia
... ------> s
..s ------> o
.so ------> p
sop ------> h
oph ------> i
phi ------> a
hia ------> .


In [21]:
print(X.shape, X.dtype, Y.shape, Y.dtype)
X, Y # Each X[n] maps to a Y[n], that's a training example. So you wanna take the X, predict the Y (via NN once again).

torch.Size([32, 3]) torch.int64 torch.Size([32]) torch.int64


(tensor([[ 0,  0,  0],
         [ 0,  0,  5],
         [ 0,  5, 13],
         [ 5, 13, 13],
         [13, 13,  1],
         [ 0,  0,  0],
         [ 0,  0, 15],
         [ 0, 15, 12],
         [15, 12,  9],
         [12,  9, 22],
         [ 9, 22,  9],
         [22,  9,  1],
         [ 0,  0,  0],
         [ 0,  0,  1],
         [ 0,  1, 22],
         [ 1, 22,  1],
         [ 0,  0,  0],
         [ 0,  0,  9],
         [ 0,  9, 19],
         [ 9, 19,  1],
         [19,  1,  2],
         [ 1,  2,  5],
         [ 2,  5, 12],
         [ 5, 12, 12],
         [12, 12,  1],
         [ 0,  0,  0],
         [ 0,  0, 19],
         [ 0, 19, 15],
         [19, 15, 16],
         [15, 16,  8],
         [16,  8,  9],
         [ 8,  9,  1]]),
 tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
          1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0]))

### Embedding our inputs
This is the first step of the NN -- basically, converting our indexes to some embedding.


In [23]:
C = torch.randn((27, 2)) # long chocolate bar 
C[5]

tensor([-0.8650,  1.1346])

In [27]:
F.one_hot(torch.tensor(5), num_classes=27).float() @ C # It's the same thing as indexing, because that's how matrix mult works, and how one hot encodes (remember, just, a vector of 0s  except for the 1 at index 5.) So there's multiple ways you can interpret this. For now, we'll use indexing because it's faster.

tensor([-0.8650,  1.1346])

In [30]:
C[[5,6,7]] # We can index multiple things at once, X here we come!!

tensor([[-0.8650,  1.1346],
        [ 1.4501, -1.1544],
        [ 0.3911, -1.0933]])

In [38]:
emb = C[X] # Can even do multidimensional indexing
print(emb.shape)
"""
This gives us the embedding for every example of X (of which, each example has 3 inputs).
"""
print(f"Ex: For example 13, the 2nd char index is {X[13, 2]}, and that has an embedding of {emb[13, 2]}, and we can see it's the same as {C[1]=}")

torch.Size([32, 3, 2])
For example 13, the 2nd char index is 1, and that has an embedding of tensor([ 0.4246, -0.0144]), and we can see it's the same as C[1]=tensor([ 0.4246, -0.0144])


### Weights and Layers
Look at the picture. This is the hidden layer that'll get tanh'd -- the embedding lookup into C that we did above gives us the embeddings, and those embeddings will be the inputs to this layer of the NN.

In [42]:
W1 = torch.randn((6, 100)) # as you can see from the embedding, we'll have 6 inputs per example. (3 chars and each char embedded to 2 things).
b1 = torch.randn(100)

# However!
emb @ W1 + b1 # Gives error since we can't multiply (32,3,2) by (6,100)! We have to convert the embed to a (32, 6) matrix.

RuntimeError: mat1 and mat2 shapes cannot be multiplied (96x2 and 6x100)

In [47]:
print(emb[:, 0, :].shape) # This'll get embedding of all the first character examples. 
""" Now, we'll get all of them in a sequence, and then concat their columns together."""

torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1).shape # And we want to concatenate along the '1' dimension (i.e column wise here). But it's not easily generalisable if we wanted more code blocks. So:
"""OH SHIT! I JUST REALISED WHAT WAY THE DIMENSION NUMBER WORKS. LOL. FUCK ME.
It's just, say you have a torch.Size([a,b,c,d]). a is dim 0, b is dim 1, c is dim 2...
"""

torch.Size([32, 2])


torch.Size([32, 6])

In [54]:
print(emb.shape) # we unbind along index 1 to combine the way we want to.
print(len(torch.unbind(emb, 1)))

torch.cat(torch.unbind(emb, 1), 1).shape # Unbind here does the same thing as our whole emb sequence above

torch.Size([32, 3, 2])
3


torch.Size([32, 6])

In [56]:
# ----- Education ------
# # However, we have more efficient way! :o view!
a = torch.arange(18)
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [62]:
a.view(18), a.view(2,9), a.view(9,2), a.view(3,3,2) # They all work, and super quick! Just gotta be a*b*... = num of elements total.

(tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17]),
 tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8],
         [ 9, 10, 11, 12, 13, 14, 15, 16, 17]]),
 tensor([[ 0,  1],
         [ 2,  3],
         [ 4,  5],
         [ 6,  7],
         [ 8,  9],
         [10, 11],
         [12, 13],
         [14, 15],
         [16, 17]]),
 tensor([[[ 0,  1],
          [ 2,  3],
          [ 4,  5]],
 
         [[ 6,  7],
          [ 8,  9],
          [10, 11]],
 
         [[12, 13],
          [14, 15],
          [16, 17]]]))

In [66]:
print(emb.shape)
print(emb.view(32, 6) ==  torch.cat(torch.unbind(emb, 1), 1)) # It 'views' it in the same way :o
h = emb.view(32, 6) @ W1 + b1
h

####### --------------- Back to main ------------- ############ 


torch.Size([32, 3, 2])
tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [Tr

tensor([[ 2.7268, -3.0981,  3.1197,  ...,  6.5318, -7.7704,  0.0571],
        [ 2.2707, -2.2964,  4.6629,  ...,  5.1119, -7.6003, -0.6592],
        [ 2.6195, -1.4434,  3.9820,  ...,  3.0401, -6.1684, -1.0681],
        ...,
        [-2.2184,  0.0373,  0.8352,  ...,  0.4229,  3.6632, -1.0233],
        [-1.2713,  1.4442,  5.6576,  ..., -2.1889, -1.2700, -1.3892],
        [ 1.2163,  0.8004,  1.0746,  ..., -2.6963,  4.0367, -1.6649]])

In [71]:
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # -1 same as emb.shape[0] (infers shit pytorch chad mode) just so we don't hardcode num of examples
print(h)  # nums between -1, 1
print(h.shape)

tensor([[ 0.9915, -0.9959,  0.9961,  ...,  1.0000, -1.0000,  0.0571],
        [ 0.9789, -0.9800,  0.9998,  ...,  0.9999, -1.0000, -0.5779],
        [ 0.9894, -0.8944,  0.9993,  ...,  0.9954, -1.0000, -0.7887],
        ...,
        [-0.9766,  0.0373,  0.6833,  ...,  0.3993,  0.9987, -0.7712],
        [-0.8541,  0.8945,  1.0000,  ..., -0.9752, -0.8538, -0.8830],
        [ 0.8386,  0.6642,  0.7912,  ..., -0.9909,  0.9994, -0.9309]])
torch.Size([32, 100])


----- Educational
Just to make sure we're broadcasting right! https://pytorch.org/docs/stable/notes/broadcasting.html

Two tensors are “broadcastable” if the following rules hold:

Each tensor has at least one dimension.

When iterating over the dimension sizes, starting at the trailing dimension, the dimension sizes must either be equal, one of them is 1, or one of them does not exist.

In [73]:
(emb.view(-1, 6) @ W1).shape, b1.shape
# 32, 100
#  1 , 100 so it'll make this a row vector (it puts the 1 there), and then element wise additions over all 32 examples. We want this in this case -- same bias vector added to all rows of the matrix.
# Good practice to make sure!!!!!

(torch.Size([32, 100]), torch.Size([100]))

 Back to main!!


In [81]:
# Second set of weights -- our final layer in this case!
W2 = torch.randn((100, 27)) # 100 inputs, converts to 27 outputs (27 chars that come next)
b2 = torch.randn(27)

In [82]:
logits = h @ W2 + b2 
logits.shape

torch.Size([32, 27])

In [94]:
counts = logits.exp() 
probs = counts / counts.sum(1, keepdim=True)
print(probs.shape,  probs[0].sum()) # Sanity check, sum of each row should equal 1 -- each row is prob dist of next word.

torch.Size([32, 27]) tensor(1.0000)


Now we'll get our loss function, simple simple

In [99]:
# Index probs with one row number per example and the matching label from Y: for every row we get the probability the model gave to the *correct* next character. Some are good like a 3.54e-01, but a lot are awful! Let's get the loss now.
rows = torch.arange(Y.shape[0]) # 0..num_examples-1, instead of a hardcoded 32
print(Y, '\n', probs[rows, Y])

loss = -probs[rows, Y].log().mean() # average negative log-likelihood of the correct characters
print(f'{loss=}')

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0]) 
 tensor([4.0566e-08, 8.0922e-04, 1.1215e-04, 3.2266e-08, 3.4230e-10, 3.5458e-04,
        7.3422e-02, 4.9846e-11, 1.0692e-06, 5.5262e-17, 3.8316e-07, 3.0864e-06,
        5.4867e-09, 3.0371e-17, 6.2880e-12, 1.4152e-09, 1.8369e-10, 2.4629e-08,
        5.3184e-08, 1.9197e-10, 1.4355e-12, 5.7702e-03, 2.1948e-11, 1.6727e-06,
        1.9351e-04, 4.9451e-08, 3.5400e-01, 2.4573e-21, 3.6991e-11, 9.8634e-17,
        1.7046e-05, 1.4627e-06])
loss=tensor(18.6535)


### Sum up of everything above, but respectably

In [254]:
X, Y = generate_dataset(words, 3)
X.shape, Y.shape # dataset

(torch.Size([228146, 3]), torch.Size([228146]))

In [273]:
g = torch.Generator().manual_seed(2147483647)
def generate_parameters():
	"""Sample a fresh set of network parameters from the notebook-level generator `g`.

	Returns:
		[C, W1, b1, W2, b2] with shapes (27, 2), (6, 100), (100,), (100, 27), (27,),
		each with requires_grad switched on.
	"""
	# The order is significant: each tensor draws its values from `g` in turn.
	shapes = [(27, 2), (6, 100), (100,), (100, 27), (27,)] # C (embedding table), W1, b1, W2, b2
	return [torch.randn(shape, generator=g).requires_grad_() for shape in shapes]
parameters = generate_parameters()

In [274]:
sum(p.nelement() for p in parameters) # num of parameters in total

3481

Educational moment!
So, with cross entropy, the forward pass can be much more efficient, the backward pass can be much more efficient, and can be numerically well behaved
```
logits = torch.tensor([-50, 2, 3, 100])
counts = logits.exp() # poggers time to convert logits to prob distrs!
probs = counts / counts.sum()
probs -> tensor([0., 0., 0., nan])
```
Uh oh! Basically, exp goes way over the largest float we can represent (it's literally doing e^100), so that count becomes inf, and inf / inf is nan. Very negative logits are fine (exp just gives ~0). And since you can add or subtract any arbitrary constant to all the logits and get the same prob outputs due to the normalisation, cross_entropy basically picks out the max value in the logits tensor and subtracts it from every logit, so the biggest exponent is e^0 = 1 and we don't get any nans.

Okay, I'm removing the code below and replacing it with F.cross_entropy now :( 
```
	counts = logits.exp() # poggers time to convert logits to prob distrs!
	probs = counts / counts.sum(1, keepdims=True)
	loss = -probs[torch.arange(32), Y].log().mean() # hardcoded 32 :(
```

In [275]:
def forward_pass(params, X, Y=None, batch_size=32):
	"""Run the network on one random minibatch and return its logits and loss.

	Args:
		params: [C, W1, b1, W2, b2], e.g. from generate_parameters().
		X: integer tensor of contexts, one row per example.
		Y: integer tensor of targets, one entry per row of X. If omitted, the
			notebook-level `Y` is used so older two-argument calls keep working.
		batch_size: how many examples to sample (with replacement) for the minibatch.

	Returns:
		(logits, loss): logits has one row per sampled example; loss is the mean
		cross-entropy of those rows against their targets.
	"""
	C, W1, b1, W2, b2 = params
	if Y is None:
		Y = globals()['Y'] # backward compatibility with forward_pass(params, X)

	# Construct minibatch: random row indices between 0 and the size of the training set
	ix = torch.randint(0, X.shape[0], (batch_size,))

	# forward pass
	emb = C[X[ix]] # (batch_size, block_size, emb_dim) -- (32, 3, 2) with the notebook defaults
	h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1) # flatten each context's embeddings, then the hidden layer
	logits = h @ W2 + b2 # one score per possible next character

	loss = F.cross_entropy(logits, Y[ix]) # softmax + negative log-likelihood, done in a numerically safe way
	return logits, loss

def train_model(parameters, X, Y, steps=10, lr=0.1):
	"""Train `parameters` in place with minibatch gradient descent.

	Each step samples a new minibatch via forward_pass, backpropagates its loss
	and moves every parameter against its gradient. The loss of the last
	minibatch is printed at the end.

	Args:
		parameters: [C, W1, b1, W2, b2] from generate_parameters().
		X: integer tensor of contexts, one row per example.
		Y: integer tensor of targets matching the rows of X.
		steps: number of minibatch updates to perform.
		lr: learning rate used for every update.

	Returns:
		The same `parameters` list (its tensors were updated in place).
	"""
	loss = None # stays None if steps == 0
	for _ in range(steps):

		# Forward pass -- this is minibatched, and uses the Y we were given
		_, loss = forward_pass(parameters, X, Y)

		# backward pass - zero grad, backprop
		for p in parameters:
			p.grad = None
		loss.backward()

		# update
		for p in parameters:
			p.data += -lr * p.grad
	print(f'{loss=}') # This is the loss for the last minibatch only.
	return parameters


In [276]:
parameters = train_model(parameters, X, Y)
C, W1, b1, W2, b2 = parameters # Just so we can get some fkin 'niceness' between karpathy's going through stuff lecture and us modularising it

loss=tensor(7.8733, grad_fn=<NllLossBackward0>)


In [277]:
# Loss over the whole training set (not just one minibatch), using the current C, W1, b1, W2, b2
emb = C[X] # (X.shape[0],3,2) here -- whole set in this case
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (X.shape[0], 100) -- every example, not a batch of 32
logits = h @ W2 + b2 # (X.shape[0], 27)

loss = F.cross_entropy(logits, Y) # mean cross-entropy over all examples
print(f'{loss=}')

loss=tensor(11.0778, grad_fn=<NllLossBackward0>)


#### Educational time! Minibatches
At the core -- even though a minibatch only approximates the gradient (i.e. it's not as reliable as using the full training set), you can take a lot more steps in the same time, and that's worth more than computing the exact gradient on the full training set but taking a lot fewer steps.

Basically, the way you minibatch is you draw a bunch of random indices (as many as your minibatch size, like 32) between 0 and the training set size. And then you run the gradient descent step on just those examples as your 'training set', basically. And you just keep doing that, with a fresh random batch every step.
~44:00 mins in.
```
ix = torch.randint(0, 5, (32,)) # generates 32 random ints from 0 to 4 (the upper bound 5 is exclusive). We'll use this idea for minibatches.
```

### Trying to find the best learning rate
This shit's gotta be so confusing for onlookers lmaooo. Okay, anyways. ~46 mins in. Wait. 46???? DUDE. IT'S BEEN LIKE HALF A FUCKING HOUR. GAAAAAAAH
We'll see what learning rates are the best. Time to dismantle the code again :o


In [301]:
parameters = generate_parameters()

lre = torch.linspace(-3, 0, 1000) # generates 1000 points equally distanced between -3, 0. But these will act as the exponentials of the actual things we're gonna plot.count
lrs = 10**lre # exponential spacing :o 

In [305]:
lri = []  # learning rates we used
lossi = [] # minibatch loss observed at each of those learning rates
for i in range(len(lrs)):
	# forward pass -- forward_pass draws its own random minibatch, so no need to build one here
	logits, loss = forward_pass(parameters, X)

	# backward pass - zero grad, backprop
	for p in parameters:
		p.grad = None
	loss.backward()

	# update: step *against* the gradient (a plus sign here would climb the loss instead)
	lr = lrs[i]
	for p in parameters:
		p.data += -lr * p.grad

	# track stats
	lri.append(lr)
	lossi.append(loss.item()) # losses go in lossi; lrs is a tensor and has no .append


# print(f'{loss=}') # This is the loss for the minibatch rn.

loss=tensor(2.6687, grad_fn=<NllLossBackward0>)
