# Overview

Batch normalization came out in 2015 from a team at google and it was an extremely impactful paper because it made possible to train every deep neural Nets quite reliably.

In notebook ["Kaiming init" calculating the init scale](https://www.kaggle.com/code/aisuko/kaiming-init-calculating-the-init-scale), we want our hidden states value to be roughly unit gaussian, a unit or one standard deviation at least at initialization.

We will implement normalize our hidden network stats to be unit gaussian by following paper [Batch Normalization](https://arxiv.org/pdf/1502.03167).

We follow the Algorithm1 in the paper `Batch normalizing`.

* Mini-batch mean
* mini-batch variance
* Normalize
* Scale ans shift

In [1]:
import random
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
from torch.utils.tensorboard import SummaryWriter


# read in all the words
with open('/kaggle/input/character-lm-without-framework/names.txt', 'r', encoding='utf-8') as f:
    words=f.read()

words=words.splitlines()

# build the vocabulary of characters and 
chars=sorted(list(set(''.join(words))))

stoi={s:i+1 for i,s in enumerate(chars)}
stoi['.']=0
itos={i:s for s,i in stoi.items()}
vocab_size=len(itos)


block_size=3 # context length: how many characters do we take to predict the next one?


def build_dataset(words):
    X,Y=[],[]
    
    for w in words:
        context=[0]*block_size
        for ch in w+'.':
            ix=stoi[ch]
            X.append(context)
            Y.append(ix)
            context=context[1:]+[ix] # crop and append
            
    X=torch.tensor(X)
    Y=torch.tensor(Y)
    print(X.shape, Y.shape)
    return X,Y

random.seed(42)
random.shuffle(words)
n1=int(0.8*len(words))
n2=int(0.9*len(words))

Xtr, Ytr=build_dataset(words[:n1])     # 80%
Xdev, Ydev=build_dataset(words[n1:n2]) # 10%
Xte, Yte=build_dataset(words[n2:])     # 10%


# MLP

n_embd=10 # the dimensionality of the character embedding vectors
n_hidden=200 # the number of neurons in the hidden layer of the MLP
max_steps=200000
batch_size=32
lossi=[]
writer=SummaryWriter()

g=torch.Generator().manual_seed(2147483647) # for reproducibility
C=torch.randn((vocab_size, n_embd),           generator=g)
W1=torch.randn((n_embd*block_size, n_hidden), generator=g) * ((5/3)/((n_embd*block_size)**0.5))
b1=torch.randn(n_hidden,                      generator=g) * 0.01
W2=torch.randn((n_hidden, vocab_size),        generator=g) * 0.01
b2=torch.randn(vocab_size,                    generator=g) * 0

parameters=[C, W1, b1, W2, b2]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
    p.requires_grad=True
    

for i in range(max_steps):
    # minibatch construct
    ix=torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb,Yb=Xtr[ix], Ytr[ix] # batch X,Y
    
    # forward pass
    emb=C[Xb] # embed the characters into vectors
    embcat=emb.view(emb.shape[0], -1) # concatenate the vectors
    hpreact=embcat@W1+b1 # hidden layer pre-activation
    
    # If all of these outputs h are in the flat regions of -1 and 1, 
    # then the gradients that are flowing through the network will just get
    # destroyed at this layer.
    h=torch.tanh(hpreact) # hidden layer

    logits=h@W2+b2 # output layer
    loss=F.cross_entropy(logits, Yb) # loss function
    writer.add_scalar("Loss/train", loss, i)
    
    # backward pass
    for p in parameters:
        p.grad=None
    loss.backward()
    
    # update
    lr=0.1 if i< 100000 else 0.01 # step learning rate decay
    for p in parameters:
        p.data+=-lr*p.grad
    
    # track stats
    if i%10000 ==0:
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())
    
    break

writer.flush()
writer.close()

2024-07-30 05:29:36.632986: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-30 05:29:36.633132: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-30 05:29:36.768535: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])
11897
      0/ 200000: 3.3179


In [2]:
# in the hidden layer, we have 32 examples by 200 neurons
hpreact.shape

torch.Size([32, 200])

In [3]:
# cross zero dimension and keep them as true

# We doing the mean over all the elements in the batch 
hpreact.mean(0, keepdim=True).shape

torch.Size([1, 200])

In [4]:
# calculating the standard deviation of these activations
hpreact.std(0, keepdim=True).shape

torch.Size([1, 200])

# How we standardize these values?

In [5]:
# MLP

n_embd=10 # the dimensionality of the character embedding vectors
n_hidden=200 # the number of neurons in the hidden layer of the MLP
max_steps=200000
batch_size=32
lossi=[]
writer=SummaryWriter()

g=torch.Generator().manual_seed(2147483647) # for reproducibility
C=torch.randn((vocab_size, n_embd),           generator=g)
W1=torch.randn((n_embd*block_size, n_hidden), generator=g) * ((5/3)/((n_embd*block_size)**0.5))
b1=torch.randn(n_hidden,                      generator=g) * 0.01
W2=torch.randn((n_hidden, vocab_size),        generator=g) * 0.01
b2=torch.randn(vocab_size,                    generator=g) * 0

parameters=[C, W1, b1, W2, b2]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
    p.requires_grad=True
    

for i in range(max_steps):
    # minibatch construct
    ix=torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb,Yb=Xtr[ix], Ytr[ix] # batch X,Y
    
    # forward pass
    emb=C[Xb] # embed the characters into vectors
    embcat=emb.view(emb.shape[0], -1) # concatenate the vectors
    hpreact=embcat@W1+b1 # hidden layer pre-activation
    
    
    # algorithm: we mini-batch =mean/standard_deviation
    hpreact = (hpreact-hpreact.mean(0, keepdim=True))/ hpreact.std(0, keepdim=True)
    
    
    # If all of these outputs h are in the flat regions of -1 and 1, 
    # then the gradients that are flowing through the network will just get
    # destroyed at this layer.
    h=torch.tanh(hpreact) # hidden layer

    logits=h@W2+b2 # output layer
    loss=F.cross_entropy(logits, Yb) # loss function
    writer.add_scalar("Loss/train", loss, i)
    
    # backward pass
    for p in parameters:
        p.grad=None
    loss.backward()
    
    # update
    lr=0.1 if i< 100000 else 0.01 # step learning rate decay
    for p in parameters:
        p.data+=-lr*p.grad
    
    # track stats
    if i%10000 ==0:
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())
    
    break

writer.flush()
writer.close()

11897
      0/ 200000: 3.3147


# How to do the normalization?

In [6]:
# MLP

n_embd=10 # the dimensionality of the character embedding vectors
n_hidden=200 # the number of neurons in the hidden layer of the MLP
max_steps=200000
batch_size=32
lossi=[]
writer=SummaryWriter()

g=torch.Generator().manual_seed(2147483647) # for reproducibility
C=torch.randn((vocab_size, n_embd),           generator=g)
W1=torch.randn((n_embd*block_size, n_hidden), generator=g) * ((5/3)/((n_embd*block_size)**0.5))
b1=torch.randn(n_hidden,                      generator=g) * 0.01
W2=torch.randn((n_hidden, vocab_size),        generator=g) * 0.01
b2=torch.randn(vocab_size,                    generator=g) * 0

bngain=torch.ones((1, n_hidden))
bnbias=torch.zeros((1, n_hidden))

parameters=[C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
    p.requires_grad=True
    

for i in range(max_steps):
    # minibatch construct
    ix=torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb,Yb=Xtr[ix], Ytr[ix] # batch X,Y
    
    # forward pass
    emb=C[Xb] # embed the characters into vectors
    embcat=emb.view(emb.shape[0], -1) # concatenate the vectors
    hpreact=embcat@W1+b1 # hidden layer pre-activation
    
    
    # algorithm: we mini-batch =mean/standard_deviation
    hpreact = bngain*(hpreact-hpreact.mean(0, keepdim=True))/ hpreact.std(0, keepdim=True)+bnbias
    
    
    # If all of these outputs h are in the flat regions of -1 and 1, 
    # then the gradients that are flowing through the network will just get
    # destroyed at this layer.
    h=torch.tanh(hpreact) # hidden layer

    logits=h@W2+b2 # output layer
    loss=F.cross_entropy(logits, Yb) # loss function
    writer.add_scalar("Loss/train", loss, i)
    
    # backward pass
    for p in parameters:
        p.grad=None
    loss.backward()
    
    # update
    lr=0.1 if i< 100000 else 0.01 # step learning rate decay
    for p in parameters:
        p.data+=-lr*p.grad
    
    # track stats
    if i%10000 ==0:
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())
    
    break

writer.flush()
writer.close()

12297
      0/ 200000: 3.3147


# How do we do when the neural net in a forward pass estimates the statistics of the mean energy standard deviation of a batch?

We would like to have a step after training that calculates and sets the batheoom mean and standard deviation a single time over the training set.

In [7]:
# calibrate the batch norm at the end of training

# we won't call .backward below
with torch.no_grad():
    # pass the training set through
    emb=C[Xtr] # take the training set
    embcat=emb.view(emb.shape[0], -1)
    hpreact=embcat @ W1+b1 # get the pre-activations for every single training examples
    # one single training time---measure the mean/std over the entire training set
    bnmean=hpreact.mean(0, keepdim=True)
    bnstd=hpreact.std(0, keepdim=True)

In [8]:
@torch.no_grad()
def split_loss(split):
    x,y={
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xte, Yte),
    }[split]
    emb=C[x] # (N, block_size, n_embd)
    embcat=emb.view(emb.shape[0], -1) # concat into (N, block_size * n_embd)
    
    hpreact=embcat@W1+b1
#     hpreact=bngain*(hpreact=hpreact.mean(0, keepdim=True)) / hpreact.std(0, keepdim=True)+bnbias
    hpreact=bngain*(hpreact-bnmean)/bnstd+bnbias
    h=torch.tanh(hpreact) # (N, n_hidden)
    logits=h@W2+b2 #(N, vocab_size)
    loss=F.cross_entropy(logits, y)
    print(split, loss.item())

split_loss('train')
split_loss('val')

train 3.268287420272827
val 3.267815113067627


# Summary

Here is another idea from the paper that we don't want to define further step after training. We can define them at the training process.

We are using batch normalization to control the statistics of activations in the neural net. It is a common to sprinkle batch normalization layer across the neural net and usally we will place it **after layers** that have **multiplications**, like a **linear layer** or a convolutional layer.

In [9]:
# MLP

n_embd=10 # the dimensionality of the character embedding vectors
n_hidden=200 # the number of neurons in the hidden layer of the MLP
max_steps=200000
batch_size=32
lossi=[]
writer=SummaryWriter()

g=torch.Generator().manual_seed(2147483647) # for reproducibility
C=torch.randn((vocab_size, n_embd),           generator=g)
W1=torch.randn((n_embd*block_size, n_hidden), generator=g) * ((5/3)/((n_embd*block_size)**0.5))
# batch normalization has itself bias, so we remove b1
# b1=torch.randn(n_hidden,                      generator=g) * 0.01
W2=torch.randn((n_hidden, vocab_size),        generator=g) * 0.01
b2=torch.randn(vocab_size,                    generator=g) * 0

# BatchNorm parameters
bngain=torch.ones((1, n_hidden))
bnbias=torch.zeros((1, n_hidden))
# two buffers, running means and running standard diviation of 
bnmean_running=torch.zeros((1, n_hidden))
bnstd_running=torch.ones((1, n_hidden))

parameters=[C, W1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
    p.requires_grad=True
    

for i in range(max_steps):
    # minibatch construct
    ix=torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb,Yb=Xtr[ix], Ytr[ix] # batch X,Y
    
    # forward pass
    emb=C[Xb] # embed the characters into vectors
    embcat=emb.view(emb.shape[0], -1) # concatenate the vectors
    
    # Linear layer
    hpreact=embcat@W1 # +b1 # hidden layer pre-activation
    
    # BatchNorm Layer
    # calculate the means and standard deviation of the activation that are feeding into the batchnorm layer
    bnmeani=hpreact.mean(0, keepdim=True)
    bnstdi=hpreact.std(0, keepdim=True)

    # it's centering that batch to be unit gaussian
    # and then it's offsetting and scaling it by the learned bias and gain
    # algorithm: we mini-batch =mean/standard_deviation
    hpreact = bngain*(hpreact-bnmeani)/ bnstdi+bnbias
    
    with torch.no_grad():
        bnmean_running=0.999*bnmean_running+0.001*bnmeani
        bnstd_running=0.999*bnstd_running+0.001*bnstdi
    
    
    # If all of these outputs h are in the flat regions of -1 and 1, 
    # then the gradients that are flowing through the network will just get
    # destroyed at this layer.
    h=torch.tanh(hpreact) # hidden layer

    logits=h@W2+b2 # output layer
    loss=F.cross_entropy(logits, Yb) # loss function
    writer.add_scalar("Loss/train", loss, i)
    
    # backward pass
    for p in parameters:
        p.grad=None
    loss.backward()
    
    # update
    lr=0.1 if i< 100000 else 0.01 # step learning rate decay
    for p in parameters:
        p.data+=-lr*p.grad
    
    # track stats
    if i%10000 ==0:
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())
    
#     break

writer.flush()
writer.close()

12097
      0/ 200000: 3.3239
  10000/ 200000: 2.0322
  20000/ 200000: 2.5675
  30000/ 200000: 2.0125
  40000/ 200000: 2.2446
  50000/ 200000: 1.8897
  60000/ 200000: 2.0785
  70000/ 200000: 2.3681
  80000/ 200000: 2.2918
  90000/ 200000: 2.0238
 100000/ 200000: 2.3673
 110000/ 200000: 2.3132
 120000/ 200000: 1.6414
 130000/ 200000: 1.9311
 140000/ 200000: 2.2231
 150000/ 200000: 2.0027
 160000/ 200000: 2.0997
 170000/ 200000: 2.4949
 180000/ 200000: 2.0199
 190000/ 200000: 2.1707


In [10]:
# calibrate the batch norm at the end of training

# we won't call .backward below
with torch.no_grad():
    # pass the training set through
    emb=C[Xtr] # take the training set
    embcat=emb.view(emb.shape[0], -1)
    hpreact=embcat @ W1 #+b1 # get the pre-activations for every single training examples
    # one single training time---measure the mean/std over the entire training set
    bnmean=hpreact.mean(0, keepdim=True)
    bnstd=hpreact.std(0, keepdim=True)

In [11]:
@torch.no_grad()
def split_loss(split):
    x,y={
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xte, Yte),
    }[split]
    emb=C[x] # (N, block_size, n_embd)
    embcat=emb.view(emb.shape[0], -1) # concat into (N, block_size * n_embd)
    
    hpreact=embcat@W1+b1
#     hpreact=bngain*(hpreact=hpreact.mean(0, keepdim=True)) / hpreact.std(0, keepdim=True)+bnbias
    hpreact=bngain*(hpreact-bnmean)/bnstd+bnbias
    h=torch.tanh(hpreact) # (N, n_hidden)
    logits=h@W2+b2 #(N, vocab_size)
    loss=F.cross_entropy(logits, y)
    print(split, loss.item())

split_loss('train')
split_loss('val')

train 2.0677990913391113
val 2.105621099472046


# Acknowledgements

* https://www.youtube.com/watch?v=P6sfmUTpUmc&t=2443s
* https://arxiv.org/pdf/1502.03167