# makemore: part 4



In [3]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Data load. No changes here.

# Source: https://datos.gob.es/es/catalogo/a09002970-municipios-de-espana
# This notebook instead uses names of villages/cities in Spain.
# The dataset is small: only about 8k entries.
import pandas as pd

# Read the municipality table from a CSV file given by a relative path.
df = pd.read_csv("Municipis_d_Espanya.csv", sep=",")

# Function to clean the names
def clean_name(name):
    """Return one naturally ordered name for a raw municipality entry.

    Two clean-ups are applied, in this order:

    1. If the entry contains a slash, only the part before the first slash
       is kept.
    2. If what remains has exactly one comma with text on both sides, the
       two parts are swapped ("Last, First" becomes "First Last").

    Surrounding whitespace is stripped from every part, so a missing or
    doubled space around '/' or ',' cannot leak stray spaces into the result.

    Args:
        name: Raw name string.

    Returns:
        The cleaned name. Entries with no slash and no comma are returned
        unchanged apart from leading/trailing whitespace.
    """
    # If there's a slash, take the first part
    name = name.split('/')[0].strip()
    # Split on the bare comma (not ', ') so the swap also fires when the
    # space after the comma is missing.
    parts = [part.strip() for part in name.split(',')]
    # Swap only the unambiguous case: exactly two non-empty parts.
    if len(parts) == 2 and all(parts):
        name = f"{parts[1]} {parts[0]}"
    return name

# Clean every entry of the 'Nom' column with clean_name
df["Nom"] = df["Nom"].apply(clean_name)

# Pull the cleaned 'Nom' column out as a plain Python list
words = df["Nom"].tolist()

print(f"{len(words)} words")

# Simplify the problem: transliterate to ASCII, lowercase, and drop
# parentheses and quote characters
import unidecode
import re

print(words[:1])
unwanted_chars = re.compile(r'[\(\)\'"]')
words = [unwanted_chars.sub('', unidecode.unidecode(raw_name).lower()) for raw_name in words]
print(words[:1])

8134 words
['Alegría-Dulantzi']
['alegria-dulantzi']


In [5]:
# Character vocabulary and the two lookup tables (char -> id, id -> char).
# The sorted distinct characters get ids 1..N; id 0 is assigned to '.'.
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
itos = {idx: ch for ch, idx in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: ' ', 2: '-', 3: 'a', 4: 'b', 5: 'c', 6: 'd', 7: 'e', 8: 'f', 9: 'g', 10: 'h', 11: 'i', 12: 'j', 13: 'k', 14: 'l', 15: 'm', 16: 'n', 17: 'o', 18: 'p', 19: 'q', 20: 'r', 21: 's', 22: 't', 23: 'u', 24: 'v', 25: 'w', 26: 'x', 27: 'y', 28: 'z', 0: '.'}
29


In [6]:
# Build the dataset of (context, next-character) examples.
block_size = 3 # context length: number of preceding characters used to predict the next one

def build_dataset(words):
  """Turn a list of words into (context, target) index tensors.

  Each word is extended with a trailing '.', and every character of the
  extended word yields one example: the ids of the previous `block_size`
  characters (padded with 0 at the start of the word) as the context, and
  the id of the character itself as the target. Ids come from the
  file-level `stoi` mapping.

  Prints the shapes of both tensors and returns them as (X, Y).
  """
  contexts, targets = [], []

  for word in words:
    window = [0] * block_size  # a fresh all-zero context for every word
    for token in (stoi[ch] for ch in word + '.'):
      contexts.append(window)
      targets.append(token)
      window = window[1:] + [token]  # slide: drop the oldest id, append the newest

  X = torch.tensor(contexts)
  Y = torch.tensor(targets)
  print(X.shape, Y.shape)
  return X, Y

# Shuffle with a fixed seed, then cut into train / dev / test = 80% / 10% / 10%.
import random
random.seed(42)
random.shuffle(words)  # in place: `words` itself is reordered

n_words = len(words)
n1 = int(0.8 * n_words)  # end of the training slice
n2 = int(0.9 * n_words)  # end of the dev slice

train_words, dev_words, test_words = words[:n1], words[n1:n2], words[n2:]
Xtr, Ytr = build_dataset(train_words)
Xdev, Ydev = build_dataset(dev_words)
Xte, Yte = build_dataset(test_words)


torch.Size([85032, 3]) torch.Size([85032])
torch.Size([10606, 3]) torch.Size([10606])
torch.Size([10768, 3]) torch.Size([10768])


No changes up to this point
_____________________

In [18]:
# utility function we will use later when comparing manual gradients to PyTorch gradients
def compare_to_pytorch(label, manual_calculated, tensor):
    """Print how closely a hand-derived gradient matches the autograd gradient.

    Args:
        label: Name printed at the start of the report line.
        manual_calculated: The manually computed gradient tensor.
        tensor: Tensor whose ``.grad`` attribute holds the reference gradient.

    Returns:
        None. The result is reported with ``print`` only.

    Raises:
        ValueError: If ``tensor.grad`` is None, i.e. no gradient has been
            stored on ``tensor`` yet.
    """
    reference = tensor.grad
    if reference is None:
        raise ValueError(
            f"{label}: tensor.grad is None; call retain_grad() on intermediate "
            "tensors and run backward() before comparing"
        )

    # ==, allclose and - all broadcast, so a wrongly shaped manual gradient
    # (e.g. a reduced dimension) could otherwise be reported as an exact match.
    if manual_calculated.shape != reference.shape:
        print(f'{label:15s} | shape mismatch: manual {tuple(manual_calculated.shape)} vs autograd {tuple(reference.shape)}')
        return

    # Check if the values are exactly the same
    is_exact = torch.all(manual_calculated == reference).item()

    # Check if the values are approximately the same
    is_approximate = torch.allclose(manual_calculated, reference)

    # Calculate the maximum absolute difference between the two gradients
    max_difference = (manual_calculated - reference).abs().max().item()

    # Print the results
    print(f'{label:15s} | exactly equal: {str(is_exact):5s} | approximatly equal: {str(is_approximate):5s} | larges difference: {max_difference}')

In [None]:
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 64 # the number of neurons in the hidden layer of the MLP

# Every random draw in this cell goes through `g`, so the initial parameters
# are identical on every run.
g = torch.Generator().manual_seed(2147483647) # for reproducibility
C  = torch.randn((vocab_size, n_embd),            generator=g)
# Layer 1
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5)
b1 = torch.randn(n_hidden,                        generator=g) * 0.1 # using b1 just for fun, it's useless because of BN
# Layer 2
W2 = torch.randn((n_hidden, vocab_size),          generator=g) * 0.1
b2 = torch.randn(vocab_size,                      generator=g) * 0.1
# BatchNorm parameters
# These must also draw from `g`: without generator=g they come from the
# global RNG, which is never seeded, and would differ on every run.
bngain = torch.randn((1, n_hidden),               generator=g)*0.1 + 1.0
bnbias = torch.randn((1, n_hidden),               generator=g)*0.1

# Note: many of these parameters are initialized in non-standard ways
# because initializing with e.g. all zeros could mask an incorrect
# implementation of the backward pass.

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True

18619


In [None]:
# Construct a minibatch: draw batch_size random row indices into the
# training set and gather the matching contexts and targets.
batch_size = 32
ix = torch.randint(low=0, high=Xtr.shape[0], size=(batch_size,), generator=g)
Xb = Xtr[ix]  # batch X
Yb = Ytr[ix]  # batch Y

In [10]:
# Forward pass written out one primitive operation at a time, so that every
# intermediate tensor has a name and can be back-propagated through by hand.

# Step 1: Embedding the input characters into vectors
embeddings = C[Xb]  # C is the embedding matrix, Xb are the input indices
flattened_embeddings = embeddings.view(embeddings.shape[0], -1)  # Flatten to 2D: keep the first dimension, merge all the others

# Step 2: Linear Layer 1 (pre-activation)
hidden_pre_bn = flattened_embeddings @ W1 + b1  # Linear transformation (batch_size, hidden_size)

# Step 3: Batch Normalization (BN)
batch_mean = 1 / batch_size * hidden_pre_bn.sum(0, keepdim=True)  # Calculate batch mean (1, hidden_size)
centered_bn = hidden_pre_bn - batch_mean  # Subtract the mean from each hidden pre-activation value

batch_variance = 1 / (batch_size - 1) * (centered_bn**2).sum(0, keepdim=True)  # Batch variance (1, hidden_size); note the division by batch_size - 1, not batch_size
batch_variance_inv = (batch_variance + 1e-5)**-0.5  # Inverse of the standard deviation (1, hidden_size); 1e-5 is added to the variance first
normalized_bn = centered_bn * batch_variance_inv  # Normalize the batch (batch_size, hidden_size)

# Step 4: Apply Scale and Shift (Gamma and Beta) to Batch Normalization
hidden_pre_activation = bngain * normalized_bn + bnbias  # Apply learnable scaling (gamma) and shifting (beta)

# Step 5: Non-linearity (activation)
hidden = torch.tanh(hidden_pre_activation)  # Apply the tanh activation function

# Step 6: Linear Layer 2 (output layer)
logits = hidden @ W2 + b2  # Final linear transformation to produce logits (batch_size, output_dim)

# Step 7: Cross Entropy Loss (manual calculation)
logit_max = logits.max(1, keepdim=True).values  # Row-wise maximum logit (batch_size, 1)
stable_logits = logits - logit_max  # Subtract max logits to prevent overflow during exponentiation

# Calculate probabilities (softmax)
exp_logits = stable_logits.exp()  # Exponentiate the logits to get unnormalized probabilities
logits_sum = exp_logits.sum(1, keepdim=True)  # Sum of exponentiated logits (batch_size, 1)
inv_logits_sum = logits_sum**-1  # Inverse of the sum for normalization (batch_size, 1)
probabilities = exp_logits * inv_logits_sum  # Normalize to get the actual probabilities (batch_size, output_dim)

# Compute log probabilities
log_probabilities = probabilities.log()  # Logarithm of probabilities (batch_size, output_dim)

# Calculate the cross entropy loss
loss = -log_probabilities[range(batch_size), Yb].mean()  # Average the negative log likelihood for the true labels

# Step 8: Backward Pass
for param in parameters:
    param.grad = None  # Clear previous gradients

# Ask autograd to keep .grad on these intermediate tensors, so the manual
# gradients derived later can be compared against them
for tensor in [log_probabilities, probabilities, exp_logits, logits_sum, inv_logits_sum,
               stable_logits, logit_max, logits, hidden, hidden_pre_activation, normalized_bn,
               batch_variance_inv, batch_variance, centered_bn, batch_mean, hidden_pre_bn, flattened_embeddings, embeddings]:
    tensor.retain_grad()  # Retain gradients for inspection if needed

# Perform backpropagation to compute gradients
loss.backward()

# Display the final loss value (last expression of the cell)
loss


tensor(2.0208, grad_fn=<NegBackward0>)

# Exercise 1
## backprop through the whole thing manually

In [None]:
# Checklist of the intermediate tensors to back-propagate through, listed
# from the loss back toward the inputs.
# NOTE(review): hidden_pre_bn is retained in the forward pass but is missing from this list.
'''
log_probabilities
probabilities
inv_logits_sum
logits_sum
exp_logits
stable_logits
logit_max
logits
hidden
hidden_pre_activation
normalized_bn
batch_variance_inv
batch_variance
centered_bn
batch_mean
flattened_embeddings
embeddings
'''

In [22]:
# Forward:  loss = -log_probabilities[range(batch_size), Yb].mean()
# i.e.      loss = -(sum of the batch_size picked entries) / batch_size
# so each picked entry gets gradient -1/batch_size and every other entry gets 0.

print(log_probabilities.shape)

picked_rows = range(batch_size)
d_log_probabilities = torch.zeros_like(log_probabilities)
d_log_probabilities[picked_rows, Yb] = -1.0 / batch_size

compare_to_pytorch('log_probabilities', d_log_probabilities, log_probabilities)

torch.Size([64, 29])
log_probabilities | exactly equal: True  | approximatly equal: True  | larges difference: 0.0


In [21]:
# Forward: log_probabilities = probabilities.log(), and d/dx log(x) = 1/x.
# Chain rule: local derivative 1/p times the incoming gradient, so entries
# with a low probability get their gradient boosted.
inv_probabilities = 1/probabilities
d_probabilities = d_log_probabilities * inv_probabilities

compare_to_pytorch('probabilities', d_probabilities, probabilities)

probabilities   | exactly equal: True  | approximatly equal: True  | larges difference: 0.0


In [36]:
# Forward: probabilities = exp_logits * inv_logits_sum
# This is really two operations:
#   1. the column tensor inv_logits_sum is replicated across the columns
#   2. an element-wise multiplication

print(exp_logits.shape)
print(inv_logits_sum.shape)

# c = a * b with a[3x3] and b[3x1] gives c[3x3]:
#   a11*b1  a12*b1  a13*b1
#   a21*b2  a22*b2  a23*b2
#   a31*b3  a32*b3  a33*b3
# Each b_i is reused by every entry of row i, so its gradient is the sum
# over that row of (a_ij * incoming gradient of c_ij).

print(d_probabilities.shape)

per_entry_contribution = exp_logits * d_probabilities
d_inv_logis_sum = per_entry_contribution.sum(dim=1, keepdim=True)

compare_to_pytorch('Inv logits sum', d_inv_logis_sum, inv_logits_sum)

torch.Size([64, 29])
torch.Size([64, 1])
torch.Size([64, 29])
Inv logits sum  | exactly equal: True  | approximatly equal: True  | larges difference: 0.0


In [None]:
# Forward (for reference):
#   exp_logits     = stable_logits.exp()
#   logits_sum     = exp_logits.sum(1, keepdim=True)
#   inv_logits_sum = logits_sum**-1
#   probabilities  = exp_logits * inv_logits_sum
#
# exp_logits is used twice, so its gradient is the sum of two branches:
#   1. directly through probabilities = exp_logits * inv_logits_sum
#   2. indirectly through logits_sum -> inv_logits_sum

# Branch 2, first hop: inv_logits_sum = logits_sum**-1 and d/dx x**-1 = -x**-2
d_logits_sum = (-logits_sum**-2) * d_inv_logis_sum
compare_to_pytorch('logits_sum', d_logits_sum, logits_sum)

# Branch 2, second hop: a row sum hands its gradient unchanged to every
# element of the row, hence the ones_like replication. Branch 1 is the
# plain product rule with inv_logits_sum broadcast across the columns.
d_exp_logitcs = (inv_logits_sum*d_probabilities) + torch.ones_like(exp_logits) * d_logits_sum
compare_to_pytorch('exp_logits', d_exp_logitcs, exp_logits)
