# Implementing Batch Normalisation with Keras

## Load the dataset and create training set, test set and validation set

In [1]:
# import the lib
import tensorflow as tf
from tensorflow import keras

# fetch Fashion MNIST as (train, test) pairs of (images, labels)
(X_train_full, y_train_full), (X_test, y_test) = (
    keras.datasets.fashion_mnist.load_data()
)

# divide every pixel value by 255
X_train_full, X_test = X_train_full / 255.0, X_test / 255.0

# hold out the first 5000 training instances as the validation set
X_valid, y_valid = X_train_full[:5000], y_train_full[:5000]
X_train, y_train = X_train_full[5000:], y_train_full[5000:]

2021-11-07 08:01:41.807574: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-07 08:01:41.808623: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## create the model with BN layers

In [2]:
# import the lib
import numpy as np

# fix the NumPy and TensorFlow seeds
np.random.seed(42)
tf.random.set_seed(42)

# stack the layers one at a time: a BN layer follows the flattened input
# and each ELU hidden layer (i.e. BN is applied after the activation)
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(300, activation="elu",
                             kernel_initializer="he_normal"))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(100, activation="elu",
                             kernel_initializer="he_normal"))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(10, activation="softmax"))

2021-10-15 13:29:00.155920: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-10-15 13:29:00.156064: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-10-15 13:29:00.156173: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (DESKTOP-8E5U3B3): /proc/driver/nvidia/version does not exist
2021-10-15 13:29:00.167935: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# print the layer-by-layer summary (layer names, output shapes, parameter counts)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 784)               0         
_________________________________________________________________
batch_normalization (BatchNo (None, 784)               3136      
_________________________________________________________________
dense (Dense)                (None, 300)               235500    
_________________________________________________________________
batch_normalization_1 (Batch (None, 300)               1200      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               30100     
_________________________________________________________________
batch_normalization_2 (Batch (None, 100)               400       
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1

## check the parameters of the first BN layer

In [4]:
# pair each variable's name with its trainable flag for model.layers[1],
# the first BatchNormalization layer (it sits right after Flatten)
[(var.name, var.trainable) for var in model.layers[1].variables]

[('batch_normalization/gamma:0', True),
 ('batch_normalization/beta:0', True),
 ('batch_normalization/moving_mean:0', False),
 ('batch_normalization/moving_variance:0', False)]

## add BN layers before activation functions

In [9]:
# Here each hidden Dense layer is linear and bias-free (use_bias=False);
# BN is applied to its output and the ELU activation comes afterwards
# as a separate layer.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.BatchNormalization())

model.add(keras.layers.Dense(300, kernel_initializer="he_normal",
                             use_bias=False))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Activation("elu"))

model.add(keras.layers.Dense(100, kernel_initializer="he_normal",
                             use_bias=False))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Activation("elu"))

model.add(keras.layers.Dense(10, activation="softmax"))

In [10]:
# configure training: SGD (learning rate 1e-3) on the sparse categorical
# cross-entropy loss, reporting accuracy
model.compile(
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [11]:
# train for 10 epochs with the validation set passed as validation_data;
# keep the returned History object
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_valid, y_valid),
    epochs=10,
)

2021-10-15 19:18:33.271764: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Gradient Clipping

In [12]:
# define the optimizer: clipvalue=1.0 clips every component of the gradient
# to the range [-1.0, 1.0] before the update is applied
optimizer = keras.optimizers.SGD(clipvalue=1.0, learning_rate=1e-3)

# compile the model with the defined optimizer.
# `model` is the 10-class softmax classifier trained on integer labels, so the
# loss has to be sparse categorical cross-entropy (as in the earlier compile
# cell); "mse" is a regression loss and does not fit these labels or the
# accuracy metric.
model.compile(loss="sparse_categorical_crossentropy",
             optimizer=optimizer,
             metrics=["accuracy"])

## Reusing Pre-Trained Layers

### split the Fashion MNIST dataset into dataset A and dataset B

In [38]:
# define a function for splitting data
def split_dataset(X, y):
    """Split a labelled dataset into task A (8 classes) and task B (binary).

    Instances labelled 5 or 6 (sandals / shirts) make up dataset B; all the
    other instances make up dataset A.

    Returns ``((X_A, y_A), (X_B, y_B))`` where ``y_A`` holds the remaining
    labels renumbered so that 7, 8, 9 become 5, 6, 7, and ``y_B`` is a
    float32 array that is 1.0 where the original label is 6 and 0.0 where
    it is 5.
    """
    in_task_B = np.logical_or(y == 5, y == 6)
    in_task_A = np.logical_not(in_task_B)

    # boolean-mask indexing yields a fresh array, so relabelling it below
    # leaves the caller's y untouched
    labels_A = y[in_task_A]
    shifted = labels_A > 6
    labels_A[shifted] = labels_A[shifted] - 2

    # binary target for task B: is the instance labelled 6?
    labels_B = (y[in_task_B] == 6).astype(np.float32)

    return ((X[in_task_A], labels_A),
            (X[in_task_B], labels_B))

# carve the training, validation and test sets into their task-A and task-B parts
(X_train_A, y_train_A), (X_train_B, y_train_B) = split_dataset(X_train, y_train)
(X_valid_A, y_valid_A), (X_valid_B, y_valid_B) = split_dataset(X_valid, y_valid)
(X_test_A, y_test_A), (X_test_B, y_test_B) = split_dataset(X_test, y_test)

# keep only the first 200 task-B training instances
X_train_B, y_train_B = X_train_B[:200], y_train_B[:200]

In [27]:
# shape of the task-A training images
X_train_A.shape

(43986, 28, 28)

In [28]:
# shape of the task-B training images (already cut down to the first 200 instances)
X_train_B.shape

(200, 28, 28)

### create, compile and fit model A

In [30]:
# import the lib
import numpy as np
import tensorflow as tf
from tensorflow import keras

# seed NumPy and TensorFlow
np.random.seed(42)
tf.random.set_seed(42)

# model A: flattened 28x28 input, six SELU hidden layers, 8-way softmax output
model_A = keras.models.Sequential([keras.layers.Flatten(input_shape=[28, 28])])
for n_hidden in [300, 100, 100, 50, 50, 50]:
    model_A.add(keras.layers.Dense(n_hidden, activation="selu"))
model_A.add(keras.layers.Dense(8, activation="softmax"))

# SGD (learning rate 1e-3) on sparse categorical cross-entropy, reporting accuracy
model_A.compile(
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# train on task A for 20 epochs and keep the returned History object
history = model_A.fit(
    x=X_train_A,
    y=y_train_A,
    validation_data=(X_valid_A, y_valid_A),
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### save the model A

In [31]:
# save model A to the file my_model_A.h5 (a relative path); a later cell reloads it
model_A.save("my_model_A.h5")

### create, compile, and fit model B

In [34]:
# model B: flattened 28x28 input, five SELU hidden layers, one sigmoid output unit
model_B = keras.models.Sequential([keras.layers.Flatten(input_shape=[28, 28])])
for n_hidden in [300, 100, 50, 50, 50]:
    model_B.add(keras.layers.Dense(n_hidden, activation="selu"))
model_B.add(keras.layers.Dense(1, activation="sigmoid"))

# SGD (learning rate 1e-3) on binary cross-entropy, reporting accuracy
model_B.compile(
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# train on the 200-instance task-B training set for 20 epochs
history_B = model_B.fit(
    x=X_train_B,
    y=y_train_B,
    validation_data=(X_valid_B, y_valid_B),
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [35]:
# print model B's layer-by-layer summary (layer names, output shapes, parameter counts)
model_B.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_5 (Flatten)          (None, 784)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 300)               235500    
_________________________________________________________________
dense_23 (Dense)             (None, 100)               30100     
_________________________________________________________________
dense_24 (Dense)             (None, 50)                5050      
_________________________________________________________________
dense_25 (Dense)             (None, 50)                2550      
_________________________________________________________________
dense_26 (Dense)             (None, 50)                2550      
_________________________________________________________________
dense_27 (Dense)             (None, 1)                

### create  Model B_on_A

In [36]:
# restore model A from the saved h5 file
model_A = keras.models.load_model("my_model_A.h5")

# model B_on_A = every layer of model A except its output layer, topped with
# a new single-unit sigmoid output. The layer objects are taken from model_A
# itself (they are not copied).
model_B_on_A = keras.models.Sequential(
    [*model_A.layers[:-1], keras.layers.Dense(1, activation="sigmoid")]
)

# phase 1: freeze the reused layers so that only the new output layer trains.
# The trainable flags are set before compile().
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = False

model_B_on_A.compile(
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

history = model_B_on_A.fit(
    x=X_train_B,
    y=y_train_B,
    validation_data=(X_valid_B, y_valid_B),
    epochs=4,
)

# phase 2: unfreeze the reused layers, compile again, then train 16 more epochs
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = True

model_B_on_A.compile(
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

history = model_B_on_A.fit(
    x=X_train_B,
    y=y_train_B,
    validation_data=(X_valid_B, y_valid_B),
    epochs=16,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [39]:
# evaluate the from-scratch model B on the task-B test set; the result is [loss, accuracy]
model_B.evaluate(X_test_B, y_test_B)



[0.14390385150909424, 0.9704999923706055]

In [40]:
# evaluate the model built on A's reused layers on the same task-B test set; the result is [loss, accuracy]
model_B_on_A.evaluate(X_test_B, y_test_B)



[0.0721736028790474, 0.9909999966621399]

## Regularization with Dropout

In [2]:
# load the data
# import the lib
import tensorflow as tf
from tensorflow import keras

# fetch Fashion MNIST as (train, test) pairs of (images, labels)
(X_train_full, y_train_full), (X_test, y_test) = (
    keras.datasets.fashion_mnist.load_data()
)

# divide every pixel value by 255
X_train_full, X_test = X_train_full / 255.0, X_test / 255.0

# hold out the first 5000 training instances as the validation set
X_valid, y_valid = X_train_full[:5000], y_train_full[:5000]
X_train, y_train = X_train_full[5000:], y_train_full[5000:]

In [3]:
# standardise every pixel using the training set's per-pixel mean and
# standard deviation; the same statistics are applied to all three sets
pixel_means = X_train.mean(axis=0, keepdims=True)
pixel_stds = X_train.std(axis=0, keepdims=True)
X_train_scaled = (X_train - pixel_means) / pixel_stds
X_valid_scaled = (X_valid - pixel_means) / pixel_stds
X_test_scaled = (X_test - pixel_means) / pixel_stds

# build the network layer by layer: a Dropout layer (rate 0.2) sits in
# front of each Dense layer, including the softmax output layer
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))

model.add(keras.layers.Dropout(rate=0.2))
model.add(keras.layers.Dense(300, activation="elu",
                             kernel_initializer="he_normal"))

model.add(keras.layers.Dropout(rate=0.2))
model.add(keras.layers.Dense(100, activation="elu",
                             kernel_initializer="he_normal"))

model.add(keras.layers.Dropout(rate=0.2))
model.add(keras.layers.Dense(10, activation="softmax"))

# Nadam optimizer on sparse categorical cross-entropy, reporting accuracy
model.compile(
    optimizer="nadam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# train on the standardised data and keep the returned History object
n_epochs = 2
history = model.fit(
    x=X_train_scaled,
    y=y_train,
    validation_data=(X_valid_scaled, y_valid),
    epochs=n_epochs,
)

2021-11-07 08:17:04.415836: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-11-07 08:17:04.417252: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-11-07 08:17:04.417477: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (DESKTOP-8E5U3B3): /proc/driver/nvidia/version does not exist
2021-11-07 08:17:04.454818: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-07 08:17:06.663784: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLI

Epoch 1/2
Epoch 2/2
