# Challenge - 01: (Document 5)

# Introduction to the concept of Batch Normalization 

### Batch Normalization 

In [2]:
import os
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("fivethirtyeight")
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [3]:
(X_train_full, y_train_full), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train_full = X_train_full / 255.0
X_test = X_test / 255.0
X_valid, X_train = X_train_full[:5000], X_train_full[5000:]
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]

In [4]:
LAYERS = [ tf.keras.layers.Flatten(input_shape=[28, 28]),
    tf.keras.layers.Dense(300, kernel_initializer="he_normal"),
    tf.keras.layers.LeakyReLU(),
    tf.keras.layers.Dense(100, kernel_initializer="he_normal"),
    tf.keras.layers.LeakyReLU(),
    tf.keras.layers.Dense(10, activation="softmax")]


model = tf.keras.models.Sequential(LAYERS)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [5]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 784)               0         
_________________________________________________________________
dense (Dense)                (None, 300)               235500    
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               30100     
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1010      
Total params: 266,610
Trainable params: 266,610
Non-trainable params: 0
__________________________________________________

In [6]:
model.compile(loss = "sparse_categorical_crossentropy", optimizer = "SGD", metrics = ['accuracy'])

In [7]:
model.fit(X_train, y_train, epochs = 10, validation_data = (X_test, y_test))

Train on 55000 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1cdb37cb4e0>

# Batch Normalization Approach 1 

In this approach , we add BN after each layer after the activation function 

In [8]:
del model 

In [9]:
LAYERS = [ tf.keras.layers.Flatten(input_shape=[28, 28]),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(300, kernel_initializer="he_normal"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(100, kernel_initializer="he_normal"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(10, activation="softmax")]


model = tf.keras.models.Sequential(LAYERS)

In [10]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 784)               0         
_________________________________________________________________
batch_normalization (BatchNo (None, 784)               3136      
_________________________________________________________________
dense_3 (Dense)              (None, 300)               235500    
_________________________________________________________________
batch_normalization_1 (Batch (None, 300)               1200      
_________________________________________________________________
dense_4 (Dense)              (None, 100)               30100     
_________________________________________________________________
batch_normalization_2 (Batch (None, 100)               400       
_________________________________________________________________
dense_5 (Dense)              (None, 10)               

In [11]:
784*4 #4 indicating 4 parameters mean,variance, gamma, beta 
# where non trainable parameters are means, variance
# where trainable parameters are gamma , beta 

3136

In [12]:
300 *4

1200

In [13]:
100*4

400

In [14]:
model.layers

[<tensorflow.python.keras.layers.core.Flatten at 0x1cdbea0a128>,
 <tensorflow.python.keras.layers.normalization.BatchNormalization at 0x1cdbea0a278>,
 <tensorflow.python.keras.layers.core.Dense at 0x1cdbea0a588>,
 <tensorflow.python.keras.layers.normalization.BatchNormalization at 0x1cdbea0a828>,
 <tensorflow.python.keras.layers.core.Dense at 0x1cdbea0ab38>,
 <tensorflow.python.keras.layers.normalization.BatchNormalization at 0x1cdbea0add8>,
 <tensorflow.python.keras.layers.core.Dense at 0x1cdbea1a128>]

In [15]:
bn1 = model.layers[1]

In [16]:
bn1.variables

[<tf.Variable 'batch_normalization/gamma:0' shape=(784,) dtype=float32>,
 <tf.Variable 'batch_normalization/beta:0' shape=(784,) dtype=float32>,
 <tf.Variable 'batch_normalization/moving_mean:0' shape=(784,) dtype=float32>,
 <tf.Variable 'batch_normalization/moving_variance:0' shape=(784,) dtype=float32>]

In [18]:
for variable in bn1.variables:
    print(variable.name, variable.trainable)

batch_normalization/gamma:0 True
batch_normalization/beta:0 True
batch_normalization/moving_mean:0 False
batch_normalization/moving_variance:0 False


In [19]:
model.compile(loss = "sparse_categorical_crossentropy", optimizer = "SGD", metrics = ['accuracy'])

In [20]:
model.fit(X_train, y_train, epochs = 10, validation_data = (X_test, y_test))

Train on 55000 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1cddeb84e10>

# Batch Normalization Approach 2 

In this appraoch, we are adding Normalization techniques after each layer before the activation functions 

In [21]:
del model

In [22]:
LAYERS_BN_BIAS_FALSE = [
    tf.keras.layers.Flatten(input_shape=[28, 28]),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(300, use_bias=False),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation("relu"),
    tf.keras.layers.Dense(100, use_bias=False),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation("relu"),
    tf.keras.layers.Dense(10, activation="softmax")
]

model = tf.keras.models.Sequential(LAYERS_BN_BIAS_FALSE)

In [23]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 784)               0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 784)               3136      
_________________________________________________________________
dense_6 (Dense)              (None, 300)               235200    
_________________________________________________________________
batch_normalization_4 (Batch (None, 300)               1200      
_________________________________________________________________
activation (Activation)      (None, 300)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 100)               30000     
_________________________________________________________________
batch_normalization_5 (Batch (None, 100)              

In [24]:
model.compile(loss = "sparse_categorical_crossentropy", optimizer = "SGD", metrics = ['accuracy'])

In [25]:
model.fit(X_train, y_train, epochs = 10, validation_data = (X_test, y_test))

Train on 55000 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1cdf20a4978>

# Observations 

1. gamma and beta -> get new trainable parameters 
2. mean and standard deviation --> non trainable parameters 
3. that means cost functions depends on 4 parameters, mean, SD, gamma, beta 

# Advantages 

1. You wont need scaling of data if we are using BN as a first layer
2. Although it adds two extra learnable parameters, still it converges faster , thus getting better results 
3. It helps to reduce your Gradient vanishing and exploading problem 
4. It does not get effected by choice of activation function and weight intialization 
5. It solves the problem of Internal Covariate shift 

# Disadvantages 

1. It incrases the complexity of network 
2. No of learnable paramters increased 
3. complexity -> runtime penalty hence prediction is slow but more accurate 
4. Training time increased but convergence is faster 

# When to use 

1. for deep neural Network layers more than 10
2. Mostly used in CNN 