In [None]:
# Exploration of Vanishing Gradients and its solutions
# Copyright (C) 2019  Abien Fred Agarap, Joshua Cruzada
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

Towards Understanding of the Vanishing Gradients Problem and its solutions
===

## Overview

We explore the performance of a neural network with different activation functions.

## Setup

We import libraries.

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

__version__ = '1.0.0'
__author__ = 'Abien Fred Agarap, Joshua Raphaelle Cruzada'

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import time
assert tf.__version__.startswith('2')

Set up seeds for reproducibility.

In [2]:
np.random.seed(42)
tf.random.set_seed(42)

Allocate GPU memory as needed.

In [3]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)

## Dataset 

We use synthetic dataset for training and testing deep neural nets.

In [4]:
batch_size = 4096
epochs = 500

We use blobs, circles, and moons synthetic datasets.

In [5]:
(train_features, train_labels), (test_features, test_labels) = tf.keras.datasets.mnist.load_data()

train_features = train_features.astype(np.float32).reshape(-1, 784) / 255.
train_features += tf.random.normal(train_features.shape, stddev=5e-2, mean=0.)
test_features = test_features.astype(np.float32).reshape(-1, 784) / 255.

train_labels = tf.keras.utils.to_categorical(train_labels)
test_labels = tf.keras.utils.to_categorical(test_labels)

train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))
train_dataset = train_dataset.prefetch(train_features.shape[0] // batch_size)
train_dataset = train_dataset.shuffle(batch_size * 2)
train_dataset = train_dataset.batch(batch_size, drop_remainder=True)

W0612 10:13:14.373939 139936070940416 deprecation.py:323] From /home/abien_agarap/venv/lib/python3.5/site-packages/tensorflow/python/data/util/random_seed.py:58: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


## Deep Neural Network

In this notebook, we write a deep neural network using the TensorFlow Subclassing API.

In [6]:
class NeuralNet(tf.keras.Model):
    def __init__(self, units, num_classes):
        super(NeuralNet, self).__init__()
        self.hidden_layer_1 = tf.keras.layers.Dense(units=units, activation=tf.nn.relu)
        self.hidden_layer_2 = tf.keras.layers.Dense(units=units, activation=tf.nn.relu)
        self.hidden_layer_3 = tf.keras.layers.Dense(units=units, activation=tf.nn.relu)
        self.hidden_layer_4 = tf.keras.layers.Dense(units=units, activation=tf.nn.relu)
        self.hidden_layer_5 = tf.keras.layers.Dense(units=units, activation=tf.nn.relu)
        self.output_layer = tf.keras.layers.Dense(units=num_classes)
        self.optimizer = tf.optimizers.SGD(learning_rate=3e-4, momentum=9e-1)
    
    @tf.function
    def call(self, input_features):
        activations = self.hidden_layer_1(input_features)
        activations = self.hidden_layer_2(activations)
        activations = self.hidden_layer_3(activations)
        activations = self.hidden_layer_4(activations)
        activations = self.hidden_layer_5(activations)
        output = self.output_layer(activations)
        return output

We define the loss function for the model. For general applicability across the synthetic datasets, we shall use the softmax cross entropy function.

In [7]:
def loss_fn(logits, labels):
    return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits))

We define the train step for the model.

In [8]:
def train_step(model, loss, batch_features, batch_labels):
    with tf.GradientTape() as tape:
        logits = model(batch_features)
        train_loss = loss(logits=logits, labels=batch_labels)
    gradients = tape.gradient(train_loss, model.trainable_variables)
    model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return train_loss, gradients

We define a recorder of gradients for TensorBoard.

In [9]:
def plot_gradients(gradients, step):
    for index, gradient in enumerate(gradients):
        if len(gradient.shape) == 1:
            tf.summary.histogram('histogram/{}-bias-grad'.format(index), gradient, step)
        elif len(gradient.shape) != 1:
            tf.summary.histogram('histogram/{}-weights-grad'.format(index), gradient, step)

We define the training loop.

In [10]:
def train(model, loss_fn, dataset, epochs=10):
    
    writer = tf.summary.create_file_writer('tmp/{}-relu'.format(time.asctime()))    
    
    with writer.as_default():
        with tf.summary.record_if(True):
            step = 0
            for epoch in range(epochs):
                epoch_loss = 0
                epoch_accuracy = []
                for batch_features, batch_labels in dataset:
                    
                    batch_loss, train_gradients = train_step(model, loss_fn, batch_features, batch_labels)
                    
                    accuracy = tf.metrics.Accuracy()
                    accuracy(tf.argmax(model(batch_features), 1), tf.argmax(batch_labels, 1))

                    epoch_loss += batch_loss
                    epoch_accuracy.append(accuracy.result())
                    plot_gradients(train_gradients, step)
                    
                    step += 1
                    
                epoch_loss = tf.reduce_mean(epoch_loss)
                epoch_accuracy = tf.reduce_mean(epoch_accuracy)
                
                tf.summary.scalar('loss', epoch_loss, step=step)
                tf.summary.scalar('accuracy', epoch_accuracy, step=step)

                if epoch != 0 and (epoch + 1) % 100 == 0:
                    print('Epoch {}/{}. Loss : {}, Accuracy : {}'.format(epoch + 1, epochs, epoch_loss, epoch_accuracy))

### Rectified Linear Units

The ReLU function became a popular activation function as it solves the vanishing gradients problem by thresholding activations to 0 if they are less than or equal to 0.

$$ relu(x) = \max(0, x) $$

We instantiate a neural net class with `relu` activation function.

In [11]:
model = NeuralNet(units=512, num_classes=train_labels.shape[1])

We train the 5-layer neural net with `relu` activation.

In [12]:
start_time = time.time()
train(model, loss_fn, train_dataset, epochs)
print('training time : {}'.format(time.time() - start_time))

Epoch 100/500. Loss : 8.138785362243652, Accuracy : 0.8499232530593872
Epoch 200/500. Loss : 4.929592132568359, Accuracy : 0.8997279405593872
Epoch 300/500. Loss : 4.034863471984863, Accuracy : 0.9174456000328064
Epoch 400/500. Loss : 3.495504140853882, Accuracy : 0.9286412000656128
Epoch 500/500. Loss : 3.095698595046997, Accuracy : 0.9365059733390808
training time : 1530.5167181491852


We measure the classification performance of the neural net. First, we set up the metrics.

In [13]:
accuracy = tf.keras.metrics.Accuracy()
precision = tf.keras.metrics.Precision()
recall = tf.keras.metrics.Recall()
auc = tf.keras.metrics.AUC()

Then, we get test predictions.

In [14]:
predictions = tf.nn.softmax(model(test_features))

Lastly, we compute the classification performance.

In [15]:
test_accuracy = accuracy(np.argmax(predictions, 1), np.argmax(test_labels, 1))
test_precision = precision(predictions, test_labels)
test_recall = recall(predictions, test_labels)
test_auc = auc(predictions, test_labels)

Display the performance measures.

In [16]:
print('Accuracy : {}\nPrecision : {}\nRecall : {}\nAUC : {}'.format(test_accuracy.numpy(),
                                                                    test_precision.numpy(),
                                                                    test_recall.numpy(),
                                                                    test_auc.numpy()))

Accuracy : 0.9388999938964844
Precision : 1.0
Recall : 0.10000000149011612
AUC : 0.0


### Exporting trained model

Let's export the trained model for later analysis.

In [17]:
model.save_weights('models/relu/1', 'tf')

### Trust Score

In [18]:
from trustscore import TrustScore

In [19]:
ts_relu = TrustScore()
ts_relu.fit(train_features.numpy(), train_labels)

In [20]:
test_trust_score, _ = ts_relu.score(test_features, test_labels)

In [21]:
print(test_trust_score.mean())

1.422535205594187
