# Batch Normalization
This notebook aims to help gain a better understanding of batch normalization and defining a residual neural network model without batch normalization and with batch normalization and comparing the variance of the hidden unit outputs for both models

### Imports

In [1]:
import numpy as np
import os
import torch, torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import StepLR
import matplotlib.pyplot as plt
import mnist1d
import random

### Import Input Data

In [2]:
args = mnist1d.data.get_dataset_args()
data = mnist1d.data.get_dataset(args, path='./mnist1d_data.pkl', download=False, regenerate=False)

Successfully loaded data from ./mnist1d_data.pkl


### Define Training and Validation Data
Separate the input data into training and validation data and store it in their respective variables

In [3]:
train_data_x = data['x'].transpose()
train_data_y = data['y']
val_data_x = data['x_test'].transpose()
val_data_y = data['y_test']

### Print Dimensions of Training and Validation Data

In [4]:
print("Train data: %d examples (columns), each of which has %d dimensions (rows)"%((train_data_x.shape[1],train_data_x.shape[0])))
print("Validation data: %d examples (columns), each of which has %d dimensions (rows)"%((val_data_x.shape[1],val_data_x.shape[0])))

Train data: 4000 examples (columns), each of which has 40 dimensions (rows)
Validation data: 4000 examples (columns), each of which has 40 dimensions (rows)


### Define Variance Function
Define a function that computes the variance between the hidden unit outputs per hidden layer

In [5]:
def print_variance(name, data):
  # First dimension (rows) is batch elements (# of input data)
  # Second dimension (columns) is number of neurons.
  np_data = data.detach().numpy()
  # Compute variance across neurons and average these variances over members of the batch
  neuron_variance = np.mean(np.var(np_data, axis=0))
  # Print out the name and the variance
  print("%s variance=%f"%(name,neuron_variance))

### Define He Initialization Function

In [6]:
def weights_init(layer_in):
  if isinstance(layer_in, nn.Linear):
    nn.init.kaiming_uniform_(layer_in.weight)
    layer_in.bias.data.fill_(0.0)

### Define Backpropagation Function
Define a function that computes the backward pass and updates the parameter of the model based on the computed gradient

In [7]:
def run_one_step_of_model(model, x_train, y_train):
  # Define cross entropy loss function as the loss function for the residual neural network
  loss_function = nn.CrossEntropyLoss()

  # Define the stochastic gradient descent step and initialize learning rate and momentum
  optimizer = torch.optim.SGD(model.parameters(), lr = 0.05, momentum=0.9)

  # Load the converted data into a class that creates the batches
  data_loader = DataLoader(TensorDataset(x_train,y_train), batch_size=200, shuffle=True, worker_init_fn=np.random.seed(1))

  # Initialize model weights
  model.apply(weights_init)

  # For each example in the input data
  for i, data in enumerate(data_loader):
    # Retrieve inputs and labels for this batch (example)
    x_batch, y_batch = data
    # Reset the parameter gradients to zero
    optimizer.zero_grad()
    # Compute the forward pass and the model output
    pred = model(x_batch)
    # Compute the loss
    loss = loss_function(pred, y_batch)
    # Compute the backward pass
    loss.backward()
    # Undergo the SGD update
    optimizer.step()

    break

### Format Training and Test Data
Convert the training and test data into proper format for training the neural network model

In [8]:
x_train = torch.tensor(train_data_x.transpose().astype('float32'))
y_train = torch.tensor(train_data_y.astype('long'))

### Define Residual Neural Network
Define a residual neural network model with 5 residual branches

In [9]:
class ResidualNetwork(torch.nn.Module):
  def __init__(self, input_size, output_size, hidden_size=100):
    super(ResidualNetwork, self).__init__()
    self.linear1 = nn.Linear(input_size, hidden_size)
    self.linear2 = nn.Linear(hidden_size, hidden_size)
    self.linear3 = nn.Linear(hidden_size, hidden_size)
    self.linear4 = nn.Linear(hidden_size, hidden_size)
    self.linear5 = nn.Linear(hidden_size, hidden_size)
    self.linear6 = nn.Linear(hidden_size, hidden_size)
    self.linear7 = nn.Linear(hidden_size, output_size)

  def count_params(self):
    return sum([p.view(-1).shape[0] for p in self.parameters()])

  def forward(self, x):
    print_variance("Input",x)
    f = self.linear1(x)
    print_variance("First preactivation",f)
    res1 = f+ self.linear2(f.relu())
    print_variance("After first residual connection",res1)
    res2 = res1 + self.linear3(res1.relu())
    print_variance("After second residual connection",res2)
    res3 = res2 + self.linear4(res2.relu())
    print_variance("After third residual connection",res3)
    res4 = res3 + self.linear5(res3.relu())
    print_variance("After fourth residual connection",res4)
    res5 = res4 + self.linear6(res4.relu())
    print_variance("After fifth residual connection",res5)
    return self.linear7(res5)

### Initialize Hyperparameters
Initialize hyperparameters for the residual neural network model

In [10]:
n_hidden = 100
n_input = 40
n_output = 10

### Define Residual Neural Network Model
Define the residual neural network model using the defined hyperparameters

In [11]:
model = ResidualNetwork(n_input, n_output, n_hidden)

### Compute Variance
For each backward pass, undergo backpropagation and determine the variance between the hidden unit outputs for each hidden layer

In [12]:
run_one_step_of_model(model, x_train, y_train)

Input variance=1.055562
First preactivation variance=2.071517
After first residual connection variance=3.586905
After second residual connection variance=6.299599
After third residual connection variance=11.865705
After fourth residual connection variance=21.049721
After fifth residual connection variance=39.599545


### Define Residual Neural Network Model with Batch Normalization
Define a residual neural network model that undergoes batch normalization before each pre-activation computation

In [13]:
class ResidualNetworkWithBatchNorm(torch.nn.Module):
  def __init__(self, input_size, output_size, hidden_size=100):
    super(ResidualNetworkWithBatchNorm, self).__init__()
    self.batchnorm1 = nn.BatchNorm1d(input_size)
    self.linear1 = nn.Linear(input_size, hidden_size)
    self.batchnorm2 = nn.BatchNorm1d(hidden_size)
    self.linear2 = nn.Linear(hidden_size, hidden_size)
    self.batchnorm3 = nn.BatchNorm1d(hidden_size)
    self.linear3 = nn.Linear(hidden_size, hidden_size)
    self.batchnorm4 = nn.BatchNorm1d(hidden_size)
    self.linear4 = nn.Linear(hidden_size, hidden_size)
    self.batchnorm5 = nn.BatchNorm1d(hidden_size)
    self.linear5 = nn.Linear(hidden_size, hidden_size)
    self.batchnorm6 = nn.BatchNorm1d(hidden_size)
    self.linear6 = nn.Linear(hidden_size, hidden_size)
    self.batchnorm7 = nn.BatchNorm1d(hidden_size)
    self.linear7 = nn.Linear(hidden_size, output_size)

  def count_params(self):
    return sum([p.view(-1).shape[0] for p in self.parameters()])

  def forward(self, x):
    print_variance("Input",x)
    x = self.batchnorm1(x)
    f = self.linear1(x)
    print_variance("First preactivation",f)
    f = self.batchnorm2(f)
    res1 = f+ self.linear2(f.relu())
    print_variance("After first residual connection",res1)
    res1 = self.batchnorm3(res1)
    res2 = res1 + self.linear3(res1.relu())
    print_variance("After second residual connection",res2)
    res2 = self.batchnorm4(res2)
    res3 = res2 + self.linear4(res2.relu())
    print_variance("After third residual connection",res3)
    res3 = self.batchnorm5(res3)
    res4 = res3 + self.linear5(res3.relu())
    print_variance("After fourth residual connection",res4)
    res4 = self.batchnorm6(res4)
    res5 = res4 + self.linear6(res4.relu())
    res5 = self.batchnorm7(res5)
    print_variance("After fifth residual connection",res5)
    return self.linear7(res5)

### Initialize Hyperparameters
Initialize hyperparameters for the residual neural network with batch normalization

In [14]:
n_hidden = 100
n_input = 40
n_output = 10

### Define Residual Neural Network Model with Batch Normalization
Define the residual neural network model with batch normalization using the defined hyperparameters

In [15]:
model = ResidualNetworkWithBatchNorm(n_input, n_output, n_hidden)

### Compute Variance
For each backward pass, undergo backpropagation and determine the variance between the hidden unit outputs for each hidden layer

In [16]:
run_one_step_of_model(model, x_train, y_train)

Input variance=1.031035
First preactivation variance=1.906459
After first residual connection variance=1.671842
After second residual connection variance=1.673384
After third residual connection variance=1.740552
After fourth residual connection variance=1.689274
After fifth residual connection variance=0.999994
