In [None]:
# Exploration of Vanishing Gradients and its solutions
# Copyright (C) 2019  Abien Fred Agarap, Joshua Cruzada
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

Towards Understanding of the Vanishing Gradients Problem and its solutions
===

## Overview

We explore the performance of a neural network with different activation functions.

## Setup

We install TensorFlow 2.0

In [None]:
!pip install tensorflow-gpu==2.0.0-alpha0

Load TensorBoard for visualization.

In [None]:
%load_ext tensorboard.notebook
%tensorboard --logdir tmp

## Deep Neural Network

In this notebook, we write deep neural network using the TensorFlow low-level API.

In [1]:
"""Implementation of Neural Network with different activations"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

__version__ = '1.0.0'
__author__ = 'Abien Fred Agarap'

import numpy as np
import tensorflow as tf

Allocate GPU memory as needed.

In [2]:
tf.config.gpu.set_per_process_memory_growth(True)

Set up seeds for reproducibility.

In [3]:
np.random.seed(42)
tf.random.set_seed(42)

We define the class for a deep neural net by first writing a constructor to accept the parameters for building the architecture.

In [4]:
class NeuralNet():
    def __init__(self, layers, activation):
        self.weights = []
        self.layers = layers
        self.num_layers = len(layers)
        self.activation = activation

We define a function for initializing the learning parameters.

In [5]:
class NeuralNet(NeuralNet):
    def initialize_params(self):
        for layer in range(1, self.num_layers):
            self.weights.append(tf.Variable(tf.random.normal([self.layers[layer], self.layers[layer - 1]])))

We define the forward pass for the model.

In [6]:
class NeuralNet(NeuralNet):
    def forward_prop(self, batch_features):
        activations = []
        linear_activations = []
        activations.append(tf.transpose(batch_features))
        for layer in range(1, self.num_layers):
            linear_activations.append(tf.matmul(self.weights[layer - 1], activations[layer - 1]))
            if layer != self.num_layers - 1:
                if self.activation == 'relu':
                    activations.append(tf.nn.relu(linear_activations[layer - 1]))
                elif self.activation == 'sigmoid':
                    activations.append(tf.nn.sigmoid(linear_activations[layer - 1]))
                elif self.activation == 'tanh':
                    activations.append(tf.nn.tanh(linear_activations[layer - 1]))
                elif self.activation == 'leaky_relu':
                    activations.append(tf.nn.leaky_relu(linear_activations[layer - 1]))
                elif self.activation == 'swish':
                    activations.append(tf.nn.sigmoid(linear_activations[layer - 1]) * linear_activations[layer - 1])
        return tf.transpose(linear_activations[self.num_layers - 2])

We define the prediction function that uses the forward pass of the model.

In [7]:
class NeuralNet(NeuralNet):
    @tf.function
    def predict(self, batch_features):
        logits = self.forward_prop(batch_features)
        return logits

We define the training loop for the model.

In [8]:
class NeuralNet(NeuralNet):
    def train(self, dataset, epochs=10):
        self.initialize_params()

        optimizer = tf.optimizers.Adam(learning_rate=3e-4)

        writer = tf.summary.create_file_writer('tmp')

        with writer.as_default():
            with tf.summary.record_if(True):
                for epoch in range(epochs):
                    epoch_loss = 0
                    epoch_accuracy = []
                    for step, (batch_features, batch_labels) in enumerate(dataset):
                        with tf.GradientTape() as tape:
                            logits = self.forward_prop(batch_features)
                            batch_loss = tf.losses.categorical_crossentropy(batch_labels, logits, from_logits=True)
                            batch_loss = tf.reduce_mean(batch_loss)
                        gradients = tape.gradient(batch_loss, self.weights)
                        optimizer.apply_gradients(zip(gradients, self.weights))

                        accuracy = tf.metrics.Accuracy()
                        accuracy(tf.argmax(self.predict(batch_features), 1), tf.argmax(batch_labels, 1))

                        epoch_loss += batch_loss
                        epoch_accuracy.append(accuracy.result())
                    epoch_loss = np.mean(epoch_loss)
                    epoch_accuracy = np.mean(epoch_accuracy)

                    tf.summary.scalar('loss', epoch_loss, step=step)
                    tf.summary.scalar('accuracy', epoch_accuracy, step=step)

                    if epoch != 0 and (epoch + 1) % 10 == 0:
                        print('Epoch {}/{}. Loss : {}, Accuracy : {}'.format(epoch + 1, epochs, epoch_loss, epoch_accuracy))

## Dataset 

We use synthetic dataset for training and testing deep neural nets.

In [9]:
from utils import create_dataset

We set some hyperparameters.

In [10]:
batch_size = 128
epochs = 100

We use blobs, circles, and moons synthetic datasets.

In [11]:
blobs_training_dataset, blobs_test_dataset = create_dataset(batch_size=batch_size, data='blobs', onehot=True)

In [12]:
circles_training_dataset, circles_test_dataset = create_dataset(batch_size=batch_size, data='circles', onehot=True)

In [13]:
moons_training_dataset, moons_test_dataset = create_dataset(batch_size=batch_size, data='moons', onehot=True)

## Varying Activation Functions

We vary the activation functions used for a deep neural network with 5 layers.

### Hyperbolic Tangent

$$ tanh(x) = \dfrac{e^x - e^{-x}}{e^x + e^{-x}} $$

In [19]:
model = NeuralNet([2, 16, 16, 16, 16, 16, 3], activation='tanh')

We train the 5-layer neural net with `tanh` activation.

In [20]:
model.train(blobs_training_dataset, epochs=epochs)

Epoch 10/100. Loss : 1.884418249130249, Accuracy : 0.9986185431480408
Epoch 20/100. Loss : 0.5408930778503418, Accuracy : 0.9998570680618286
Epoch 30/100. Loss : 0.20754793286323547, Accuracy : 1.0
Epoch 40/100. Loss : 0.07831117510795593, Accuracy : 1.0
Epoch 50/100. Loss : 0.03708554059267044, Accuracy : 1.0
Epoch 60/100. Loss : 0.015668511390686035, Accuracy : 1.0
Epoch 70/100. Loss : 0.006831524893641472, Accuracy : 1.0
Epoch 80/100. Loss : 0.0035598259419202805, Accuracy : 1.0
Epoch 90/100. Loss : 0.0019118780037388206, Accuracy : 1.0
Epoch 100/100. Loss : 0.0010167760774493217, Accuracy : 1.0


We measure the classification performance of the neural net. First, we set up the metrics.

In [21]:
accuracy = tf.keras.metrics.Accuracy()
precision = tf.keras.metrics.Precision()
recall = tf.keras.metrics.Recall()
auc = tf.keras.metrics.AUC()

We get the classification performance on the test predictions from the deep neural net with hyperbolic tangent activations.

In [22]:
tanh_predictions = tf.nn.softmax(model.predict(blobs_test_dataset[0]))
accuracy(tanh_predictions.numpy(), blobs_test_dataset[1])
precision(tanh_predictions.numpy(), blobs_test_dataset[1])
recall(tanh_predictions.numpy(), blobs_test_dataset[1])
auc(tanh_predictions.numpy(), blobs_test_dataset[1])

<tf.Tensor: id=6827281, shape=(), dtype=float32, numpy=0.0>

Display the performance measures.

In [23]:
print('Test accuracy : {}\nPrecision : {}\nRecall : {}\nAUC : {}'.format(accuracy.result(), precision.result(), recall.result(), auc.result()))

Test accuracy : 0.0
Precision : 1.0
Recall : 0.3333333432674408
AUC : 0.0


### Sigmoid

$$ \sigma(x) = \dfrac{1}{1 + e^{-x}} $$

In [24]:
model = NeuralNet([2, 16, 16, 16, 16, 16, 3], activation='sigmoid')

We train the 5-layer neural net with `sigmoid` activation.

In [25]:
model.train(blobs_training_dataset, epochs=epochs)

Epoch 10/100. Loss : 17.32961082458496, Accuracy : 1.0
Epoch 20/100. Loss : 2.4970147609710693, Accuracy : 1.0
Epoch 30/100. Loss : 0.878398060798645, Accuracy : 1.0
Epoch 40/100. Loss : 0.4019172191619873, Accuracy : 1.0
Epoch 50/100. Loss : 0.19001628458499908, Accuracy : 1.0
Epoch 60/100. Loss : 0.08843421190977097, Accuracy : 1.0
Epoch 70/100. Loss : 0.04012209549546242, Accuracy : 1.0
Epoch 80/100. Loss : 0.017887214198708534, Accuracy : 1.0
Epoch 90/100. Loss : 0.008018615655601025, Accuracy : 1.0
Epoch 100/100. Loss : 0.0036467076279222965, Accuracy : 1.0


We measure the classification performance of the neural net. First, we set up the metrics.

In [26]:
accuracy = tf.keras.metrics.Accuracy()
precision = tf.keras.metrics.Precision()
recall = tf.keras.metrics.Recall()
auc = tf.keras.metrics.AUC()

We get the classification performance on the test predictions from the deep neural net with hyperbolic tangent activations.

In [27]:
sigmoid_predictions = tf.nn.softmax(model.predict(blobs_test_dataset[0]))
accuracy(sigmoid_predictions.numpy(), blobs_test_dataset[1])
precision(sigmoid_predictions.numpy(), blobs_test_dataset[1])
recall(sigmoid_predictions.numpy(), blobs_test_dataset[1])
auc(sigmoid_predictions.numpy(), blobs_test_dataset[1])

<tf.Tensor: id=10240921, shape=(), dtype=float32, numpy=0.0>

Display the performance measures.

In [28]:
print('Test accuracy : {}\nPrecision : {}\nRecall : {}\nAUC : {}'.format(accuracy.result(), precision.result(), recall.result(), auc.result()))

Test accuracy : 0.0
Precision : 1.0
Recall : 0.3333333432674408
AUC : 0.0


### Rectified Linear Units

$$ relu(x) = max(0, x) $$

In [14]:
model = NeuralNet([2, 16, 16, 16, 16, 16, 3], activation='relu')

We train the 5-layer neural net with `relu` activation.

In [15]:
model.train(blobs_training_dataset, epochs=epochs)

Epoch 10/100. Loss : 12.686515808105469, Accuracy : 0.9982850551605225
Epoch 20/100. Loss : 0.998207151889801, Accuracy : 0.9998094439506531
Epoch 30/100. Loss : 0.0068329269997775555, Accuracy : 1.0
Epoch 40/100. Loss : 0.001769143738783896, Accuracy : 1.0
Epoch 50/100. Loss : 0.000889322254806757, Accuracy : 1.0
Epoch 60/100. Loss : 0.0005361908697523177, Accuracy : 1.0
Epoch 70/100. Loss : 0.0003328489838168025, Accuracy : 1.0
Epoch 80/100. Loss : 0.0002011979086091742, Accuracy : 1.0
Epoch 90/100. Loss : 0.00011687181540764868, Accuracy : 1.0
Epoch 100/100. Loss : 6.567626405740157e-05, Accuracy : 1.0


We measure the classification performance of the neural net. First, we set up the metrics.

In [16]:
accuracy = tf.keras.metrics.Accuracy()
precision = tf.keras.metrics.Precision()
recall = tf.keras.metrics.Recall()
auc = tf.keras.metrics.AUC()

We get the classification performance on the test predictions from the deep neural net with hyperbolic tangent activations.

In [17]:
relu_predictions = tf.nn.softmax(model.predict(blobs_test_dataset[0]))
accuracy(relu_predictions.numpy(), blobs_test_dataset[1])
precision(relu_predictions.numpy(), blobs_test_dataset[1])
recall(relu_predictions.numpy(), blobs_test_dataset[1])
auc(relu_predictions.numpy(), blobs_test_dataset[1])

<tf.Tensor: id=3413641, shape=(), dtype=float32, numpy=0.81797624>

Display the performance measures.

In [18]:
print('Test accuracy : {}\nPrecision : {}\nRecall : {}\nAUC : {}'.format(accuracy.result(), precision.result(), recall.result(), auc.result()))

Test accuracy : 0.7976296544075012
Precision : 1.0
Recall : 0.6359525322914124
AUC : 0.8179762363433838


### Leaky ReLU

$$ relu(x) = max(0.02x, x) $$

In [None]:
model = NeuralNet([2, 16, 16, 16, 16, 16, 3], activation='leaky_relu')

We train the 5-layer neural net with `leaky_relu` activation.

In [None]:
model.train(blobs_training_dataset, epochs=epochs)

We measure the classification performance of the neural net. First, we set up the metrics.

In [None]:
accuracy = tf.keras.metrics.Accuracy()
precision = tf.keras.metrics.Precision()
recall = tf.keras.metrics.Recall()
auc = tf.keras.metrics.AUC()

We get the classification performance on the test predictions from the deep neural net with hyperbolic tangent activations.

In [None]:
leaky_relu_predictions = tf.nn.softmax(model.predict(blobs_test_dataset[0]))
accuracy(leaky_relu_predictions.numpy(), blobs_test_dataset[1])
precision(leaky_relu_predictions.numpy(), blobs_test_dataset[1])
recall(leaky_relu_predictions.numpy(), blobs_test_dataset[1])
auc(leaky_relu_predictions.numpy(), blobs_test_dataset[1])

Display the performance measures.

In [None]:
print('Test accuracy : {}\nPrecision : {}\nRecall : {}\nAUC : {}'.format(accuracy.result(), precision.result(), recall.result(), auc.result()))