# DL(w3) DNN - Gradient Vanishment / Explosion
student ID: 7110018036\
name: Chieh-An, Chou

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

2023-03-05 18:03:51.827210: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-05 18:03:52.320768: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda-11.7/lib64:/usr/local/cuda-11.7/extras/CUPTI/lib64
2023-03-05 18:03:52.320815: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda-11.7/lib64:/usr/local/cuda-11.7/extras/CUPTI/

## 1. Weight initialization

In [2]:
[name for name in dir(keras.initializers) if not  name.startswith('_')]

['Constant',
 'GlorotNormal',
 'GlorotUniform',
 'HeNormal',
 'HeUniform',
 'Identity',
 'Initializer',
 'LecunNormal',
 'LecunUniform',
 'Ones',
 'Orthogonal',
 'RandomNormal',
 'RandomUniform',
 'TruncatedNormal',
 'VarianceScaling',
 'Zeros',
 'constant',
 'deserialize',
 'get',
 'glorot_normal',
 'glorot_uniform',
 'he_normal',
 'he_uniform',
 'identity',
 'lecun_normal',
 'lecun_uniform',
 'ones',
 'orthogonal',
 'random_normal',
 'random_uniform',
 'serialize',
 'truncated_normal',
 'variance_scaling',
 'zeros']

In [3]:
keras.layers.Dense(units=10,
                   activation='relu',
                   kernel_initializer='he_normal') # weight initialization

<keras.layers.core.dense.Dense at 0x7fb83d8d2ef0>

## 2. Activation function
#### Method 1: `keras.activations`

In [4]:
[name for name in dir(keras.activations) if not  name.startswith('_')]

['deserialize',
 'elu',
 'exponential',
 'gelu',
 'get',
 'hard_sigmoid',
 'linear',
 'relu',
 'selu',
 'serialize',
 'sigmoid',
 'softmax',
 'softplus',
 'softsign',
 'swish',
 'tanh']

In [5]:
keras.layers.Dense(units=10,
                   activation='selu',
                   kernel_initializer='lecun_normal')

<keras.layers.core.dense.Dense at 0x7fb782a8dba0>

#### Method 2: `keras.layers`

In [6]:
[name for name in dir(keras.layers) if 'elu' in name.lower()]

['ELU', 'LeakyReLU', 'PReLU', 'ReLU', 'ThresholdedReLU']

In [7]:
keras.layers.Dense(300, kernel_initializer='he_normal'),
keras.layers.LeakyReLU(alpha=.3),

<keras.layers.activation.leaky_relu.LeakyReLU at 0x7fb83d8d2830>

#### Method 3: `keras.layers.Activation`

In [8]:
keras.layers.Dense(300, kernel_initializer='he_normal'),
keras.layers.Activation('relu'),

<keras.layers.core.activation.Activation at 0x7fb782a8d180>

Ex1

In [9]:
# Load data
from tensorflow.keras.datasets import fashion_mnist
(x_train_set, y_train_set), (x_test, y_test) = fashion_mnist.load_data()

# Split data
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(x_train_set, y_train_set, random_state = 1)

# Preprocessing
x_train = x_train/255
x_valid = x_valid/255
x_test = x_test/255

In [10]:
# clear and setting random seed
keras.backend.clear_session()
np.random.seed(1)
tf.random.set_seed(1)

In [11]:
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28,28]),
    keras.layers.Dense(units=300, activation='relu', kernel_initializer='he_normal'), # activation
    keras.layers.Dense(units=200,kernel_initializer='he_normal'),
    keras.layers.LeakyReLU(alpha=.01),                              # activation
    keras.layers.Dense(units=100, kernel_initializer='he_normal'),
    keras.layers.PReLU(),                                           # activation
    keras.layers.Dense(units=10, activation='softmax')
])

2023-03-05 18:03:53.572328: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-05 18:03:53.896656: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6470 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080, pci bus id: 0000:01:00.0, compute capability: 8.6


`leaky_re_lu`: $\max(\alpha z_i,z_i)$, fixed $\alpha$.\
`p_re_lu`: $\max(a_iz_i,z_i)$, trainable $a_i$. (# of $a_i$ depends on the former layer)

In [12]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 784)               0         
                                                                 
 dense (Dense)               (None, 300)               235500    
                                                                 
 dense_1 (Dense)             (None, 200)               60200     
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 200)               0         
                                                                 
 dense_2 (Dense)             (None, 100)               20100     
                                                                 
 p_re_lu (PReLU)             (None, 100)               100       
                                                                 
 dense_3 (Dense)             (None, 10)                1

In [13]:
model.compile(loss='sparse_categorical_crossentropy', 
              optimizer='sgd',
              metrics=['accuracy'])

In [14]:
train = model.fit(x_train, y_train,
                  epochs=2,
                  validation_data=(x_valid, y_valid))

Epoch 1/2


2023-03-05 18:05:15.275306: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-03-05 18:05:15.297103: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x7fb3d6c62ef0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-03-05 18:05:15.297118: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 3080, Compute Capability 8.6
2023-03-05 18:05:15.327020: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/2


## 3. Batch Normalization

#### Method 1: After activation function

In [15]:
keras.layers.Dense(300, activation='relu'),
keras.layers.BatchNormalization(),

(<keras.layers.normalization.batch_normalization.BatchNormalization at 0x7fb7762bd6c0>,)

Ex2

In [16]:
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28,28]),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(units=200, activation='relu', kernel_initializer='he_normal'), # activation
    keras.layers.BatchNormalization(),                                                # BN after activation
    keras.layers.Dense(units=100, activation='relu', kernel_initializer='he_normal'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(units=10, activation='softmax')
])

$$z^{(i)}=\gamma\otimes\dfrac{x^{(i)}-\mu_B}{\sqrt{\sigma^2_B+\epsilon}}+\beta$$
+ `batch_normalization_1`: (784+784)+(784+784)
  + $\mu=784, \sigma^2=784$ - non-trainable, 
  + $\gamma=784, \beta=784$ - trainable.

+ `batch_normalization_2`: (200+200)+(200+200)
+ `batch_normalization_3`: (100+100)+(100+100)

In [17]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_1 (Flatten)         (None, 784)               0         
                                                                 
 batch_normalization_1 (Batc  (None, 784)              3136      
 hNormalization)                                                 
                                                                 
 dense_5 (Dense)             (None, 200)               157000    
                                                                 
 batch_normalization_2 (Batc  (None, 200)              800       
 hNormalization)                                                 
                                                                 
 dense_6 (Dense)             (None, 100)               20100     
                                                                 
 batch_normalization_3 (Batc  (None, 100)             

In [18]:
model.compile(loss='sparse_categorical_crossentropy', 
              optimizer='sgd',
              metrics=['accuracy'])

In [19]:
train = model.fit(x_train, y_train,
                  epochs=2,
                  validation_data=(x_valid, y_valid))

Epoch 1/2
Epoch 2/2


#### Method 2: Before activation function

In [20]:
keras.layers.Dense(300, use_bias=False),    # no need for bias 
keras.layers.BatchNormalization(),          # BN has "shift" (similar to bias)
keras.layers.Activation('relu'),

(<keras.layers.core.activation.Activation at 0x7fb775f6b940>,)

Ex3

In [21]:
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28,28]),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(units=200, use_bias=False), 
    keras.layers.BatchNormalization(),  # BN before activation
    keras.layers.Activation('relu'),    # activation
    keras.layers.Dense(units=100, use_bias=False),
    keras.layers.BatchNormalization(),
    keras.layers.LeakyReLU(alpha=.3),
    keras.layers.Dense(units=10, activation='softmax')
])

In [22]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_2 (Flatten)         (None, 784)               0         
                                                                 
 batch_normalization_5 (Batc  (None, 784)              3136      
 hNormalization)                                                 
                                                                 
 dense_9 (Dense)             (None, 200)               156800    
                                                                 
 batch_normalization_6 (Batc  (None, 200)              800       
 hNormalization)                                                 
                                                                 
 activation_1 (Activation)   (None, 200)               0         
                                                                 
 dense_10 (Dense)            (None, 100)              

In [23]:
model.compile(loss='sparse_categorical_crossentropy', 
              optimizer='sgd',
              metrics=['accuracy'])

In [24]:
train = model.fit(x_train, y_train,
                  epochs=2,
                  validation_data=(x_valid, y_valid))

Epoch 1/2
Epoch 2/2


## 4. Gradient Clipping

#### Method 1: `clipvalue`
`clipvalue = r` : $\nabla \in [-r,r]$

In [25]:
opt = keras.optimizers.SGD(clipvalue=1.0)

#### Method 2: `clipnorm`
`clipnorm = r` : $\lVert\nabla\rVert^2 \le r$

In [26]:
opt = keras.optimizers.SGD(clipnorm=0.9)

Ex4

In [27]:
opt = keras.optimizers.SGD(learning_rate=1e-3, clipnorm=0.9) # gradient cliping
model.compile(loss='sparse_categorical_crossentropy', 
              optimizer=opt,
              metrics=['accuracy'])

In [28]:
train = model.fit(x_train, y_train,
                  epochs=2,
                  validation_data=(x_valid, y_valid))

Epoch 1/2
Epoch 2/2
