#### Ex1 -  I did not get around to seeing what happens when you initialize all weights and biases to zero. Inspect the gradients and activations to figure out what is happening and why the network is only partially training, and what part is being trained exactly. 

In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures

# read in all the words
words = open('names.txt', 'r').read().splitlines()
words[:8]

# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
vocab_size = len(itos)

# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):  
  X, Y = [], []
  
  for w in words:
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr,  Ytr  = build_dataset(words[:n1])     # 80%
Xdev, Ydev = build_dataset(words[n1:n2])   # 10%
Xte,  Yte  = build_dataset(words[n2:])     # 10%

# MLP revisited
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 200 # the number of neurons in the hidden layer of the MLP

g = torch.Generator().manual_seed(2147483647) # for reproducibility
C  = torch.zeros(vocab_size,n_embd)#torch.randn((vocab_size, n_embd),            generator=g)
#C  = torch.randn((vocab_size, n_embd),            generator=g)
W1 = torch.zeros(n_embd * block_size, n_hidden)
W2 = torch.zeros(n_hidden, vocab_size)
b2 = torch.zeros(vocab_size)

# BatchNorm parameters
bngain = torch.zeros((1, n_hidden))
bnbias = torch.ones((1, n_hidden))
bnmean_running = torch.zeros((1, n_hidden))
bnstd_running = torch.ones((1, n_hidden))

parameters = [C, W1, W2, b2, bngain, bnbias]
for p in parameters:
  p.requires_grad = True

# same optimization as last time
max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):
  # minibatch construct
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
  Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y
  
  # forward pass
  emb = C[Xb] # embed the characters into vectors
  embcat = emb.view(emb.shape[0], -1) # concatenate the vectors
  # Linear layer
  hpreact = embcat @ W1 #+ b1 # hidden layer pre-activation
  # BatchNorm layer
  # -------------------------------------------------------------
  bnmeani = hpreact.mean(0, keepdim=True)
  bnstdi = hpreact.std(0, keepdim=True)
  hpreact = bngain * (hpreact - bnmeani) / (bnstdi + 1e-05)**0.5 + bnbias
  with torch.no_grad():
    bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
    bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi
  # -------------------------------------------------------------
  # Non-linearity
  h = torch.tanh(hpreact) # hidden layer
  logits = h @ W2 + b2 # output layer
  loss = F.cross_entropy(logits, Yb) # loss function

  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()
  
  # update
  lr = 0.1 if i < 100000 else 0.01 # step learning rate decay
  for p in parameters: p.data += -lr * p.grad
  
  names = ['C', 'W1', 'W2', 'b2', 'bngain', 'bnbias']
  count = 0
  
  if i == max_steps-1:
    print("loss: "+str(loss.item()))
    print("")
    for p in parameters:
      print(names[count]+":")
      print(p.grad)
      print("")
      count+=1

loss: 2.7348735332489014

C:
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 

If we imagine all weights to be zero, then that would mean that, regardless of the input, all layers would have a pre activation equal to the bias. Seeing as though the bias are also initialized as zero and f(0) = 0 on the tanh function. Therefore, for every hidden layer, both the input and the output are zero.
When we apply backpropagation, we determine the impact of a certain parameter on the final result by calculating the derivative of the error with respect to that parameter. Considering that, when everything else remains constant, changing any of the parameters leads to the same result, we conclude that the gradient for that parameter is zero, and its value will remain the same.
It is important to note that this thought process isn't valid when we consider activation functions where f(0)!=0 such as the sigmoid function. In this scenario, we expect the values to change equaly in each layer and also to have all equal collumns (this makes sense if we recall that all layers will have the same input and output).
After analysing the results, we note that there is some slow learning hapening on the second layer, as well as in the batchnorm layer. I suspect this is happening because 1e-05 is added on the batch norm layer, in order to prevent division by zero. This makes it so that some of the inputs are not actually zero, but a very small value that enables learning from that point on.

#### Ex2 - BatchNorm, unlike other normalization layers like LayerNorm/GroupNorm etc. has the big advantage that after training, the batchnorm gamma/beta can be "folded into" the weights of the preceeding Linear layers, effectively erasing the need to forward it at test time. Set up a small 3-layer MLP with batchnorms, train the network, then "fold" the batchnorm gamma/beta into the preceeding Linear layer's W,b by creating a new W2, b2 and erasing the batch norm. Verify that this gives the same forward pass during inference. i.e. we see that the batchnorm is there just for stabilizing the training, and can be thrown out after training is done! pretty cool.

Building the neural net the standard way:

In [3]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures

# read in all the words
words = open('names.txt', 'r').read().splitlines()
words[:8]

# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
vocab_size = len(itos)

# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):  
  X, Y = [], []
  
  for w in words:
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr,  Ytr  = build_dataset(words[:n1])     # 80%
Xdev, Ydev = build_dataset(words[n1:n2])   # 10%
Xte,  Yte  = build_dataset(words[n2:])     # 10%

n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 200 # the number of neurons in the hidden layer of the MLP

Training the neural net:

In [4]:
g = torch.Generator().manual_seed(2147483647)
C  = torch.randn((vocab_size, n_embd),            generator=g)

W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5) # Kunning innit
W2 = torch.randn((n_hidden, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5) # Kunning innit
W3 = torch.randn((n_hidden, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5) # Kunning innit

W4 = torch.randn((n_hidden, vocab_size),          generator=g) * 0.01
b4 = torch.zeros(vocab_size)

# BatchNorm parameters
bngain1 = torch.zeros((1, n_hidden))
bngain2 = torch.zeros((1, n_hidden))
bngain3 = torch.zeros((1, n_hidden))

bnbias1 = torch.ones((1, n_hidden))
bnbias2 = torch.ones((1, n_hidden))
bnbias3 = torch.ones((1, n_hidden))

bnmean_running1 = torch.zeros((1, n_hidden))
bnmean_running2 = torch.zeros((1, n_hidden))
bnmean_running3 = torch.zeros((1, n_hidden))

bnstd_running1 = torch.ones((1, n_hidden))
bnstd_running2 = torch.ones((1, n_hidden))
bnstd_running3 = torch.ones((1, n_hidden))

parameters = [C, W1, W2, W3, W4, b4, bngain1, bngain2, bngain3, bnbias1, bnbias2, bnbias3]
for p in parameters:
  p.requires_grad = True

# same optimization as last time
max_steps = 10000
batch_size = 32
lossi = []

for i in range(max_steps):
  # minibatch construct
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
  Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y
  
  # Forward pass

  emb = C[Xb] # embed the characters into vectors
  embcat = emb.view(emb.shape[0], -1) # concatenate the vectors

  # Layer 1

  hpreact1 = embcat @ W1 # hidden layer pre-activation
  bnmeani1 = hpreact1.mean(0, keepdim=True)
  bnstdi1 = hpreact1.std(0, keepdim=True)
  hpreact1 = bngain1 * (hpreact1 - bnmeani1) / (bnstdi1 + 1e-05)**0.5 + bnbias1
  with torch.no_grad():
    bnmean_running1 = 0.999 * bnmean_running1 + 0.001 * bnmeani1
    bnstd_running1 = 0.999 * bnstd_running1 + 0.001 * bnstdi1
  h1 = torch.tanh(hpreact1)

  # Layer 2

  hpreact2 = h1 @ W2 # hidden layer pre-activation
  bnmeani2 = hpreact2.mean(0, keepdim=True)
  bnstdi2 = hpreact2.std(0, keepdim=True)
  hpreact2 = bngain2 * (hpreact2 - bnmeani2) / (bnstdi2 + 1e-05)**0.5 + bnbias2
  with torch.no_grad():
    bnmean_running2 = 0.999 * bnmean_running2 + 0.001 * bnmeani2
    bnstd_running2 = 0.999 * bnstd_running2 + 0.001 * bnstdi2
  h2 = torch.tanh(hpreact2)
  
  # Layer 3

  hpreact3 = h2 @ W3 # hidden layer pre-activation
  bnmeani3 = hpreact3.mean(0, keepdim=True)
  bnstdi3 = hpreact3.std(0, keepdim=True)
  hpreact3 = bngain3 * (hpreact3 - bnmeani3) / (bnstdi3 + 1e-05)**0.5 + bnbias3
  with torch.no_grad():
    bnmean_running3 = 0.999 * bnmean_running3 + 0.001 * bnmeani3
    bnstd_running3 = 0.999 * bnstd_running3 + 0.001 * bnstdi3
  h3 = torch.tanh(hpreact3)

  # Output Layer

  logits = h3 @ W4 + b4
  loss = F.cross_entropy(logits, Yb) # loss function

  #backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  # update
  lr = 0.1 if i < 100000 else 0.01 # step learning rate decay
  for p in parameters:
    p.data += -lr * p.grad

  # track stats
  if i % 10000 == 0: # print every once in a while
    print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
  lossi.append(loss.log10().item())
  

      0/  10000: 3.2371


Inference the "usual" way:

In [24]:
ix = torch.randint(0, Xtr.shape[0], (1,),generator=g)
Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y
 
# Forward pass

emb = C[Xb] # embed the characters into vectors
embcat = emb.view(emb.shape[0], -1) # concatenate the vectors

# Layer 1

hpreact1 = embcat @ W1 # hidden layer pre-activation
bnmeani1 = bnmean_running1
bnstdi1 = bnstd_running1
hpreact1 = bngain1 * (hpreact1 - bnmeani1) / (bnstdi1 + 1e-05)**0.5 + bnbias1
h1 = torch.tanh(hpreact1)

# Layer 2

hpreact2 = h1 @ W2 # hidden layer pre-activation
bnmeani2 = bnmean_running2
bnstdi2 = bnstd_running2
hpreact2 = bngain2 * (hpreact2 - bnmeani2) / (bnstdi2 + 1e-05)**0.5 + bnbias2
h2 = torch.tanh(hpreact2)
  
# Layer 3

hpreact3 = h2 @ W3 # hidden layer pre-activation
bnmeani3 = bnmean_running3
bnstdi3 = bnstd_running3
hpreact3 = bngain3 * (hpreact3 - bnmeani3) / (bnstdi3 + 1e-05)**0.5 + bnbias3
h3 = torch.tanh(hpreact3)

# Output Layer

logits = h3 @ W4 + b4

Inference with the "folding" of the batch norm layer:

In [25]:
# Forward pass
# Layer 1

bnmeani1 = bnmean_running1
bnstdi1 = bnstd_running1
W12 = bngain1 * W1 / (bnstdi1 + 1e-05)**0.5
b12 = bngain1 * (-bnmeani1) / (bnstdi1 + 1e-05)**0.5
hpreact1 = embcat @ W12 + b12 # hidden layer pre-activation
newh1 = torch.tanh(hpreact1)

# Layer 2

bnmeani2 = bnmean_running2
bnstdi2 = bnstd_running2
W22 = bngain2 * W2 / (bnstdi2 + 1e-05)**0.5
b22 = bngain2 * (-bnmeani2) / (bnstdi2 + 1e-05)**0.5
hpreact2 = newh1 @ W22 + b22 # hidden layer pre-activation
newh2 = torch.tanh(hpreact2)

  
# Layer 3

bnmeani3 = bnmean_running3
bnstdi3 = bnstd_running3
W32 = bngain3 * W3 / (bnstdi3 + 1e-05)**0.5
b32 = bngain3 * (-bnmeani3) / (bnstdi3 + 1e-05)**0.5
hpreact3 = newh2 @ W32 + b32 # hidden layer pre-activation
newh3 = torch.tanh(hpreact3)


# Output Layer

newlogits = h3 @ W4 + b4

Comparing the two results:

In [26]:
logits == newlogits.all()

tensor([[True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True]])