# makemore: part 4



In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
#Data load. No changes here

#https://datos.gob.es/es/catalogo/a09002970-municipios-de-espana
# We will instead be using names of villages/cities in Spain. Only 8k data
import pandas as pd

# Read the CSV data
df = pd.read_csv("Municipis_d_Espanya.csv", sep=",")

# Function to clean the names
def clean_name(name):
    # If there's a slash, take the first part
    name = name.split('/')[0]
    # If it's in "Last, First" format, swap it to "First Last"
    if ',' in name:
        parts = name.split(', ')
        if len(parts) == 2:
            name = f"{parts[1]} {parts[0]}"
    return name

# Apply the function to clean names
df["Nom"] = df["Nom"].apply(clean_name)

# Extract only the 'Territorio' column as a list
words = df["Nom"].tolist()

print(f"{len(words)} words")

#Simplifying the problem (lowercase and no accents)
import unidecode
import re

print(words[:1])
words = [re.sub(r'[\(\)\'"]', '', unidecode.unidecode(word).lower()) for word in words]
print(words[:1])

8134 words
['Alegría-Dulantzi']
['alegria-dulantzi']


In [4]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: ' ', 2: '-', 3: 'a', 4: 'b', 5: 'c', 6: 'd', 7: 'e', 8: 'f', 9: 'g', 10: 'h', 11: 'i', 12: 'j', 13: 'k', 14: 'l', 15: 'm', 16: 'n', 17: 'o', 18: 'p', 19: 'q', 20: 'r', 21: 's', 22: 't', 23: 'u', 24: 'v', 25: 'w', 26: 'x', 27: 'y', 28: 'z', 0: '.'}
29


In [5]:
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):  
  X, Y = [], []
  
  for w in words:
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape, Y.shape)
  return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr,  Ytr  = build_dataset(words[:n1])     # 80%
Xdev, Ydev = build_dataset(words[n1:n2])   # 10%
Xte,  Yte  = build_dataset(words[n2:])     # 10%


torch.Size([85032, 3]) torch.Size([85032])
torch.Size([10606, 3]) torch.Size([10606])
torch.Size([10768, 3]) torch.Size([10768])


No changes until here
_____________________

In [6]:
# utility function we will use later when comparing manual gradients to PyTorch gradients
def compare_to_pytorch(label, manual_calculated, tensor):
    # Check if the values are exactly the same
    is_exact = torch.all(manual_calculated == tensor.grad).item()

    # Check if the values are approximately the same
    is_approximate = torch.allclose(manual_calculated, tensor.grad)

    # Calculate the maximum difference between the expected values and the gradients
    max_difference = (manual_calculated - tensor.grad).abs().max().item()

    # Print the results
    print(f'{label:15s} | exactly equal: {str(is_exact):5s} | approximatly equal: {str(is_approximate):5s} | larges difference: {max_difference}')

In [7]:
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 64 # the number of neurons in the hidden layer of the MLP

g = torch.Generator().manual_seed(2147483647) # for reproducibility
C  = torch.randn((vocab_size, n_embd),            generator=g)
# Layer 1
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5)
b1 = torch.randn(n_hidden,                        generator=g) * 0.1 # using b1 just for fun, it's useless because of BN
# Layer 2
W2 = torch.randn((n_hidden, vocab_size),          generator=g) * 0.1
b2 = torch.randn(vocab_size,                      generator=g) * 0.1
# BatchNorm parameters
bngain = torch.randn((1, n_hidden))*0.1 + 1.0
bnbias = torch.randn((1, n_hidden))*0.1

# Note: I am initializating many of these parameters in non-standard ways
# because sometimes initializating with e.g. all zeros could mask an incorrect
# implementation of the backward pass.

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True

4287


In [8]:
batch_size = 32
# construct a minibatch
ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y

In [66]:
# Step 1: Embedding the input characters into vectors
embeddings = C[Xb]  # C is the embedding matrix, Xb are the input indices
flattened_embeddings = embeddings.view(embeddings.shape[0], -1)  # Flatten embeddings to 2D: (batch_size, embedding_dim)

# Step 2: Linear Layer 1 (pre-activation)
hidden_pre_bn = flattened_embeddings @ W1 + b1  # Linear transformation (batch_size, hidden_size)

# Step 3: Batch Normalization (BN)
batch_mean = 1 / batch_size * hidden_pre_bn.sum(0, keepdim=True)  # Calculate batch mean (1, hidden_size)
centered_bn = hidden_pre_bn - batch_mean  # Subtract the mean from each hidden pre-activation value
centered_bn2=centered_bn**2
batch_variance = 1 / (batch_size - 1) * (centered_bn2).sum(0, keepdim=True)  # Calculate batch variance (1, hidden_size)
# batch_size-1= Bessel correction. 
batch_variance_inv = (batch_variance + 1e-5)**-0.5  # Inverse of the standard deviation (1, hidden_size)
normalized_bn = centered_bn * batch_variance_inv  # Normalize the batch (batch_size, hidden_size)

# Step 4: Apply Scale and Shift (Gamma and Beta) to Batch Normalization
hidden_pre_activation = bngain * normalized_bn + bnbias  # Apply learnable scaling (gamma) and shifting (beta)

# Step 5: Non-linearity (activation)
hidden = torch.tanh(hidden_pre_activation)  # Apply the tanh activation function

# Step 6: Linear Layer 2 (output layer)
logits = hidden @ W2 + b2  # Final linear transformation to produce logits (batch_size, output_dim)

# Step 7: Cross Entropy Loss (manual calculation)
logit_max = logits.max(1, keepdim=True).values  # For numerical stability, subtract the max logit
norm_logits = logits - logit_max  # Subtract max logits to prevent overflow during exponentiation

# Calculate probabilities (softmax)
counts_logits = norm_logits.exp()  # Exponentiate the logits to get unnormalized probabilities
counts_logits_sum = counts_logits.sum(1, keepdim=True)  # Sum of exponentiated logits (batch_size, 1)
count_logits_sum_inv = counts_logits_sum**-1  # Inverse of the sum for normalization (batch_size, 1)
probabilities = counts_logits * count_logits_sum_inv  # Normalize to get the actual probabilities (batch_size, output_dim)

# Compute log probabilities
log_probabilities = probabilities.log()  # Logarithm of probabilities (batch_size, output_dim)

# Calculate the cross entropy loss
loss = -log_probabilities[range(batch_size), Yb].mean()  # Average the negative log likelihood for the true labels

# Step 8: Backward Pass
for param in parameters:
    param.grad = None  # Clear previous gradients

# Retain gradients for intermediate variables for debugging or inspection
for tensor in [log_probabilities, probabilities, counts_logits, counts_logits_sum, count_logits_sum_inv,
               norm_logits, logit_max, logits, hidden, hidden_pre_activation, normalized_bn,
               batch_variance_inv, batch_variance, centered_bn,centered_bn2, batch_mean, hidden_pre_bn, flattened_embeddings, embeddings]:
    tensor.retain_grad()  # Retain gradients for inspection if needed

# Perform backpropagation to compute gradients
loss.backward()

# Return the final loss value
loss


tensor(3.3797, grad_fn=<NegBackward0>)

# Exercise 1
## backprop through the whole thing manually

In [10]:
'''
log_probabilities
probabilities
count_logits_sum_inv
counts_logits_sum
counts_logits
norm_logits
logit_max
logits
hidden
hidden_pre_activation
normalized_bn
batch_variance_inv
batch_variance
centered_bn
batch_mean
flattened_embeddings
embeddings
'''

'\nlog_probabilities\nprobabilities\ninv_logits_sum\nlogits_sum\nexp_logits\nstable_logits\nlogit_max\nlogits\nhidden\nhidden_pre_activation\nnormalized_bn\nbatch_variance_inv\nbatch_variance\ncentered_bn\nbatch_mean\nflattened_embeddings\nembeddings\n'

In [20]:
#loss = -logprobabilities[range(batch_size), Yb].mean()
#loss = -1*(sum(elements))/n_elements
#dloss= -1/n_elements

print(log_probabilities.shape)

d_log_probabilities=torch.zeros_like(log_probabilities)
d_log_probabilities[range(batch_size), Yb] = -1/batch_size

compare_to_pytorch('log_probabilities', d_log_probabilities, log_probabilities)

torch.Size([32, 29])
log_probabilities | exactly equal: True  | approximatly equal: True  | larges difference: 0.0


In [21]:
#log_probabilities = probabilities.log()
#d/dx(log(x)) = 1/x

d_probabilities=(1/probabilities)*d_log_probabilities #examples with low prob --> boost grad

compare_to_pytorch('probabilities', d_probabilities, probabilities)

probabilities   | exactly equal: True  | approximatly equal: True  | larges difference: 0.0


In [22]:
#probabilities = exp_logits * inv_logits_sum  # Normalize to get the actual probabilities (batch_size, output_dim)
#2 ops really
#   replicate column tensor
#   multiplication


print(counts_logits.shape)
print(count_logits_sum_inv.shape)

# c = a * b, but with tensors:
# a[3x3] * b[3,1]  ---->
# a11*b1  a12*b1  a13*b1
# a21*b2  a22*b2  a23*b2
# a31*b3  a32*b3  a33*b3
# c[3x3]


print(d_probabilities.shape)

d_count_logits_sum_inv=(counts_logits*d_probabilities).sum(1,keepdim=True)


compare_to_pytorch('Inv logits sum', d_count_logits_sum_inv, count_logits_sum_inv)

torch.Size([32, 29])
torch.Size([32, 1])
torch.Size([32, 29])
Inv logits sum  | exactly equal: True  | approximatly equal: True  | larges difference: 0.0


In [24]:
#Exp logits its used twice

print(count_logits_sum_inv.shape)
print(d_probabilities.shape)

d_counts_logits_firstcontrib=(count_logits_sum_inv*d_probabilities)

#counts_logits = norm_logits.exp()  # Exponentiate the logits to get unnormalized probabilities
#counts_logits_sum = counts_logits.sum(1, keepdim=True)  # Sum of exponentiated logits (batch_size, 1)
#count_logits_sum_inv = counts_logits_sum**-1  # Inverse of the sum for normalization (batch_size, 1)
#probabilities = counts_logits * count_logits_sum_inv  # Normalize to get the actual probabilities (batch_size, output_dim)

#Before counts, we must derivate counts_sum. "we cannot reach it yet"
#count_logits_sum_inv = counts_logits_sum**-1 
#d/dx(1/x) = -1/x^2
d_counts_logits_sum=-1/(counts_logits_sum**2)*d_count_logits_sum_inv
compare_to_pytorch('Exp sum', d_counts_logits_sum, counts_logits_sum)

torch.Size([32, 1])
torch.Size([32, 29])
Exp sum         | exactly equal: True  | approximatly equal: True  | larges difference: 0.0


In [27]:
#counts_logits_sum = counts_logits.sum(1, keepdim=True)  # Sum of exponentiated logits (batch_size, 1)
print(counts_logits.shape)
print(counts_logits_sum.shape)


# a11 a12 a13 ---> b1(= a11 + a12 + a13)
# a21 a22 a23 ---> b2(= a21 + a22 + a23)
# a31 a32 a33 ---> b3(= a31 + a32 + a33)

d_counts_logits_secondcontrib= torch.ones_like(counts_logits)*d_counts_logits_sum

d_counts_logits=d_counts_logits_firstcontrib+d_counts_logits_secondcontrib

compare_to_pytorch('counts', d_counts_logits, counts_logits)

torch.Size([32, 29])
torch.Size([32, 1])
counts          | exactly equal: True  | approximatly equal: True  | larges difference: 0.0


In [31]:
#counts_logits = norm_logits.exp()  # Exponentiate the logits to get unnormalized probabilities
#d/dx(e^x) = e^x

d_norm_logits=norm_logits.exp()*d_counts_logits

compare_to_pytorch('norm_logits', d_norm_logits, norm_logits)


norm_logits     | exactly equal: True  | approximatly equal: True  | larges difference: 0.0


In [35]:
#norm_logits = logits - logit_max  # Subtract max logits to prevent overflow during exponentiation

print(logit_max.shape)# broacast in the minus
print(logits.shape)


# c11 c12 c13 = |a11 a12 a13|   |b1|
# c21 c22 c23 = |a21 a22 a23| - |b2|
# c31 c32 c33 = |a31 a32 a33|   |b1|
# e.g c32=a23 -b3

d_logit_max=-1*d_norm_logits.sum(1, keepdim=True)


compare_to_pytorch('logit_max', d_logit_max, logit_max)


torch.Size([32, 1])
torch.Size([32, 29])
logit_max       | exactly equal: True  | approximatly equal: True  | larges difference: 0.0


In [37]:
d_logits = 1 * d_norm_logits.clone()
d_logits += F.one_hot(logits.max(1).indices, num_classes=logits.shape[1]) * d_logit_max

compare_to_pytorch('logits', d_logits, logits)


logits          | exactly equal: True  | approximatly equal: True  | larges difference: 0.0


In [38]:
#logits = hidden @ W2 + b2  # Final linear transformation to produce logits (batch_size, output_dim)

print(logits.shape)
print(hidden.shape)
print(W2.shape)
print(b2.shape)


# Matrix operation: d = a @ b + c
# Given:
# a = [[a11, a12],
#      [a21, a22]]
# 
# b = [[b11, b12],
#      [b21, b22]]
# 
# c = [[c1, c2],
#      [c1, c2]]
#
# The resulting matrix d is computed as:
# d11 = a11 * b11 + a12 * b21 + c1
# d12 = a11 * b12 + a12 * b22 + c2
# d21 = a21 * b11 + a22 * b21 + c1
# d22 = a21 * b12 + a22 * b22 + c2


#hidden=a, W2=b, b2=c



torch.Size([32, 29])
torch.Size([32, 64])
torch.Size([64, 29])
torch.Size([29])


![](dl_da.png)

In [40]:
d_hidden= d_logits @ W2.T 
compare_to_pytorch('hidden', d_hidden, hidden)


hidden          | exactly equal: True  | approximatly equal: True  | larges difference: 0.0


![](dl_db.png)

In [41]:
d_W2= hidden.T@d_logits

compare_to_pytorch('W2', d_W2, W2)


W2              | exactly equal: True  | approximatly equal: True  | larges difference: 0.0


![](dl_dc.png)

In [42]:
d_b2= d_logits.sum(0, keepdim=True)

compare_to_pytorch('b2',d_b2, b2)

b2              | exactly equal: True  | approximatly equal: True  | larges difference: 0.0


In [46]:
#hidden = torch.tanh(hidden_pre_activation)  # Apply the tanh activation function
#a=tanh(z)=(e^z-e^-z)/(e^z+e^-z)
#da/dz=1-a**2

d_hidden_pre_activation=(1-hidden**2)
d_hidden_pre_activation*=d_hidden#chain_rule

compare_to_pytorch("hidden_pre_activation", d_hidden_pre_activation, hidden_pre_activation)

hidden_pre_activation | exactly equal: False | approximatly equal: True  | larges difference: 4.656612873077393e-10


In [53]:
#hidden_pre_activation = bngain * normalized_bn + bnbias  # Apply learnable scaling (gamma) and shifting (beta)

d_normalized_bn= bngain*d_hidden_pre_activation

compare_to_pytorch("normalized_bn", d_normalized_bn, normalized_bn)

d_bngain= (normalized_bn*d_hidden_pre_activation).sum(0, keepdim=True)
compare_to_pytorch("bngain", d_bngain, bngain)

d_bnbias=d_hidden_pre_activation.sum(0, keepdim=True)
compare_to_pytorch("bnbias", d_bnbias, bnbias)


normalized_bn   | exactly equal: False | approximatly equal: True  | larges difference: 4.656612873077393e-10
bngain          | exactly equal: False | approximatly equal: True  | larges difference: 1.3969838619232178e-09
bnbias          | exactly equal: False | approximatly equal: True  | larges difference: 1.862645149230957e-09


In [57]:
#normalized_bn = centered_bn * batch_variance_inv  # Normalize the batch (batch_size, hidden_size)

print(normalized_bn.shape)
print(centered_bn.shape)
print(batch_variance_inv.shape)

d_batch_variance_inv=(centered_bn*d_normalized_bn).sum(0,keepdim=True)

compare_to_pytorch("batch_variance_inv", d_batch_variance_inv, batch_variance_inv)

torch.Size([32, 64])
torch.Size([32, 64])
torch.Size([1, 64])
batch_variance_inv | exactly equal: False | approximatly equal: True  | larges difference: 2.7939677238464355e-09


In [61]:
d_centered_bn_1st=batch_variance_inv*d_normalized_bn

#we follow until we find the other branch

In [59]:
#batch_variance_inv = (batch_variance + 1e-5)**-0.5  # Inverse of the standard deviation (1, hidden_size)
#d/dx(1/x^0.5) = -0.5/x^1.5

d_batch_variance = -0.5*(batch_variance+1e-5)**-1.5*d_batch_variance_inv

compare_to_pytorch("batch_variance", d_batch_variance, batch_variance)

batch_variance  | exactly equal: False | approximatly equal: True  | larges difference: 8.149072527885437e-10


In [69]:
#batch_variance = 1 / (batch_size - 1) * (centered_bn2).sum(0, keepdim=True)  # Calculate batch variance (1, hidden_size)
#batch_variance = k * (centered_bn2).sum(0, keepdim=True) 


k=1/(batch_size-1)
d_centered_bn2=k*torch.ones_like(centered_bn2)*d_batch_variance

compare_to_pytorch("centered_bn2", d_centered_bn2, centered_bn2)

centered_bn2    | exactly equal: False | approximatly equal: True  | larges difference: 2.546585164964199e-11


In [70]:
#centered_bn2=centered_bn**2

d_centered_bn_2nd=2*centered_bn*d_centered_bn2

d_centered_bn=d_centered_bn_1st+d_centered_bn_2nd

compare_to_pytorch("centered_bn", d_centered_bn, centered_bn)

centered_bn     | exactly equal: False | approximatly equal: True  | larges difference: 4.656612873077393e-10


In [78]:
#centered_bn = hidden_pre_bn - batch_mean  # Subtract the mean from each hidden pre-activation value
centered_bn.shape, hidden_pre_bn.shape, batch_mean.shape

d_hidden_pre_bn=d_centered_bn.clone()
d_batch_mean_1st=(-torch.ones_like(centered_bn)*d_centered_bn).sum(0)


compare_to_pytorch("hidden_pre_bn", d_hidden_pre_bn, hidden_pre_bn)


hidden_pre_bn   | exactly equal: False | approximatly equal: False | larges difference: 0.0007873533759266138
batch_mean      | exactly equal: False | approximatly equal: True  | larges difference: 3.725290298461914e-09
