# Setup

Import a few common modules

In [2]:
#CODE for points = 1
# import sklearn, numpy, os

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)
import sklearn
import numpy as np
import os

# Scikit-Learn ≥0.20 is required

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

%load_ext tensorboard



# to make this notebook's output stable across runs
np.random.seed(42)


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


# Vanishing/Exploding Gradients Problem

In [3]:
def logit(z):
    """Evaluate 1 / (1 + exp(-z)) element-wise with NumPy.

    Note: despite the function's name, this expression is the logistic
    (sigmoid) curve, which maps any real input into the interval (0, 1).
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [4]:
# 200 evenly spaced sample points covering the closed interval [-5, 5].
z = np.linspace(start=-5, stop=5, num=200)

## Xavier and He Initialization

In [9]:
#CODE for points = 1
# set activation to relu and kernel initializer to he_normal

def relu(z):
    """Element-wise rectifier: return the larger of ``z`` and 0 using NumPy."""
    floor = 0
    return np.maximum(z, floor)
# He-normal weight initializer instance.
initializer = tf.keras.initializers.HeNormal()
# Use Keras' built-in "relu" activation rather than the NumPy `relu` helper:
# a NumPy function is not a TensorFlow op, so it is not a reliable activation
# once the layer is called on Keras tensors or the model is saved.
keras.layers.Dense(10, activation="relu", kernel_initializer=initializer)

<keras.layers.core.dense.Dense at 0x7f95e98ad390>

### Leaky ReLU

In [11]:
#CODE for points = 1

# remember α is the hyperparameter that defines how much the function “leaks”
# set the value of alpha, use the value typically set

def leaky_relu(z, alpha=0.01):
    """Element-wise maximum of ``alpha * z`` and ``z``.

    ``alpha`` is the hyperparameter that defines how much the function "leaks";
    it defaults to 0.01.
    """
    leaked = alpha * z
    return np.maximum(leaked, z)

Let's train a neural network on MNIST using the Leaky ReLU:

In [13]:
#CODE for points = 1
#load MNIST dataset from keras
# Import from `tensorflow.keras` (not the standalone `keras` package) so the
# dataset loader comes from the same Keras implementation used to build the models.
from tensorflow.keras.datasets import mnist
(X_train_full, y_train_full), (X_test, y_test) = mnist.load_data()
# Divide by 255 to rescale the pixel values (assumes 8-bit intensities — TODO confirm).
X_train_full = X_train_full / 255.0
X_test = X_test / 255.0
# The first 5,000 training samples become the validation set; the rest is used for training.
X_valid, X_train = X_train_full[:5000], X_train_full[5000:]
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [17]:
#CODE for points = 1

# initialize kernel_initializer to "he_normal" and activation function to softmax

# Seed TensorFlow's and NumPy's global random generators with the same fixed
# value before the model below is built.
tf.random.set_seed(42)
np.random.seed(42)

def softmax(x, axis=None):
    """Compute a numerically stable softmax of ``x``.

    Args:
        x: array-like of scores.
        axis: axis along which to normalize. The default ``None`` normalizes
            over the whole array (the original behavior). Pass ``axis=-1`` to
            normalize each row of a batch independently.

    Returns:
        An array shaped like ``x`` whose entries sum to 1 along ``axis``.
    """
    x = np.asarray(x)
    # Subtracting the maximum leaves the result unchanged but prevents overflow in exp.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=axis, keepdims=True)


# Two hidden Dense layers, each followed by a LeakyReLU activation layer, and a
# 10-unit softmax output. The initializer is given by name ("he_normal") so each
# layer gets its own initializer and the cell does not rely on a variable
# defined in an earlier cell.
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, kernel_initializer="he_normal"),
    keras.layers.LeakyReLU(),
    keras.layers.Dense(100, kernel_initializer="he_normal"),
    keras.layers.LeakyReLU(),
    keras.layers.Dense(10, activation="softmax")
])

In [None]:
# Configure training: SGD with learning rate 1e-3, sparse categorical
# cross-entropy loss, accuracy reported as the metric.
model.compile(
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for 5 epochs, evaluating on the validation split after each epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=5,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Now look at what happens if we try to use the ReLU activation function instead:

In [18]:
# Reset NumPy's and TensorFlow's global random seeds to the same fixed value
# before the next model is built.
np.random.seed(42)
tf.random.set_seed(42)

In [19]:
# Deep stack: one 300-unit ReLU layer followed by 99 ReLU layers of 100 units
# (all He-normal initialized), then a 10-unit softmax output.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
for units in [300] + [100] * 99:
    model.add(keras.layers.Dense(units, activation="relu", kernel_initializer="he_normal"))
model.add(keras.layers.Dense(10, activation="softmax"))

In [20]:
# Same training configuration as before: SGD (learning rate 1e-3), sparse
# categorical cross-entropy loss, accuracy metric.
model.compile(
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    metrics=["accuracy"],
    loss="sparse_categorical_crossentropy",
)

In [None]:
# Fit the deep ReLU network for 5 epochs with per-epoch validation.
history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

Not great at all, we suffered from the vanishing/exploding gradients problem.

# Batch Normalization

In [22]:
# A BatchNormalization layer is placed after the flattened input and after each
# hidden ReLU layer; the output layer is a 10-unit softmax.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(300, activation="relu"))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(100, activation="relu"))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(10, activation="softmax"))

In [23]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the model.
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_5 (Flatten)         (None, 784)               0         
                                                                 
 batch_normalization (BatchN  (None, 784)              3136      
 ormalization)                                                   
                                                                 
 dense_113 (Dense)           (None, 300)               235500    
                                                                 
 batch_normalization_1 (Batc  (None, 300)              1200      
 hNormalization)                                                 
                                                                 
 dense_114 (Dense)           (None, 100)               30100     
                                                                 
 batch_normalization_2 (Batc  (None, 100)             

In [24]:
# Take the first BatchNormalization layer (index 1, right after Flatten) and
# list each of its variables together with its trainable flag.
bn1 = model.layers[1]
[(variable.name, variable.trainable) for variable in bn1.variables]

[('batch_normalization/gamma:0', True),
 ('batch_normalization/beta:0', True),
 ('batch_normalization/moving_mean:0', False),
 ('batch_normalization/moving_variance:0', False)]

In [None]:
#bn1.updates #deprecated

In [25]:
# Compile the batch-normalized model: SGD at learning rate 1e-3, sparse
# categorical cross-entropy loss, accuracy metric.
model.compile(
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [26]:
# Train the batch-normalized model for 5 epochs, validating after each one.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Sometimes applying BN before the activation function works better (there's a debate on this topic). Moreover, the layer before a `BatchNormalization` layer does not need bias terms: the `BatchNormalization` layer already has its own offset parameters, so extra biases would be a waste of parameters. You can therefore set `use_bias=False` when creating those layers:

In [27]:
# Variant with BatchNormalization applied before the activation: each hidden
# Dense layer is created without a bias and without an activation, then followed
# by BatchNormalization and a separate ReLU Activation layer.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(300, use_bias=False))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Activation("relu"))
model.add(keras.layers.Dense(100, use_bias=False))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Activation("relu"))
model.add(keras.layers.Dense(10, activation="softmax"))

In [28]:
# Training setup for the BN-before-activation model: SGD (learning rate 1e-3),
# sparse categorical cross-entropy loss, accuracy metric.
model.compile(
    metrics=["accuracy"],
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
)

In [29]:
# Run 5 training epochs with the validation split evaluated each epoch.
history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Faster Optimizers

## Momentum optimization

In [30]:
#CODE for points = 1
# initialize lr and momentum to typical values

# Momentum must be non-zero for SGD to apply momentum at all (0.0 is plain SGD);
# 0.9 is the commonly used starting value.
optimizer = keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)

## Nesterov Accelerated Gradient

In [32]:
#CODE for points = 1
# initialize lr and momentum to typical values. Set nesterov so that it is used

# SGD with Nesterov momentum enabled (nesterov=True), learning rate 0.1 and momentum 0.8.
# NOTE(review): 0.9 is the momentum value most often quoted as typical — confirm 0.8 is intended.
optimizer = keras.optimizers.SGD(learning_rate= 0.1, momentum= 0.8, nesterov= True)

# Avoiding Overfitting Through Regularization

## $\ell_1$ and $\ell_2$ regularization

In [45]:
#CODE for points = 0.5

# Use syntax for assigning l2 regularization with a factor 0.01 as given here - https://keras.io/api/layers/regularizers/

# `lr2` is an L2 weight regularizer with factor 0.01 (not a learning rate).
lr2 = tf.keras.regularizers.l2(0.01)
# Example Dense layer whose kernel is penalized by that regularizer.
layer = keras.layers.Dense(
    100,
    kernel_regularizer=lr2,
    kernel_initializer="he_normal",
    activation="elu",
)

In [51]:
#CODE for points = 1

# Use syntax for assigning l2 regularization with a factor 0.01 as given here - https://keras.io/api/layers/regularizers/
# nadam optimizer
from tensorflow.keras.optimizers import Nadam

# Two ELU hidden layers and a softmax output; every Dense kernel is penalized by
# the L2 regularizer `lr2` defined in the previous cell. The output layer
# previously referenced an undefined name `lr`, which raises NameError on a
# fresh kernel.
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, activation="elu",
                       kernel_initializer="he_normal",
                       kernel_regularizer=lr2),
    keras.layers.Dense(100, activation="elu",
                       kernel_initializer="he_normal",
                       kernel_regularizer=lr2),
    keras.layers.Dense(10, activation="softmax",
                       kernel_regularizer=lr2)
])

# Nadam() with default settings is what the string "nadam" resolves to.
model.compile(loss="sparse_categorical_crossentropy", optimizer=Nadam(), metrics=["accuracy"])
n_epochs = 2
history = model.fit(X_train, y_train, epochs=n_epochs,
                    validation_data=(X_valid, y_valid))

Epoch 1/2
Epoch 2/2


In [52]:
#CODE for points = .5

# Use syntax for assigning l2 regularization with a factor 0.01 as given here - https://keras.io/api/layers/regularizers/

from functools import partial

# Dense-layer factory with preset defaults: ELU activation, He-normal
# initialization and the `lr2` kernel regularizer. Any default can be
# overridden by keyword at call time.
RegularizedDense = partial(
    keras.layers.Dense,
    kernel_regularizer=lr2,
    kernel_initializer="he_normal",
    activation="elu",
)

In [54]:
#CODE for points = 1

# activation function initialized as softmax
# nadam optimizer


# Build the same regularized network from the RegularizedDense factory; the
# output layer overrides the default activation with softmax.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(RegularizedDense(300))
model.add(RegularizedDense(100))
model.add(RegularizedDense(10, activation="softmax"))
model.compile(optimizer="nadam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
n_epochs = 2
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=n_epochs,
)


Epoch 1/2
Epoch 2/2
