In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Initialization in Keras

Default = Glorot initialization with uniform distribution

In [None]:
# ReLU layer with He-normal weight initialization
# ("he_uniform" selects the uniform variant instead).
dense = tf.keras.layers.Dense(
    units=50,
    activation="relu",
    kernel_initializer="he_normal",
)

### Other activation functions

`Leaky ReLU`

Use He initialization with Leaky ReLU

In [None]:
# Leaky ReLU with alpha=0.2 (the layer's default is 0.3), handed to the
# Dense layer as its activation; weights use He-normal initialization.
leaky_relu = tf.keras.layers.LeakyReLU(alpha=0.2)
dense = tf.keras.layers.Dense(
    units=50,
    activation=leaky_relu,
    kernel_initializer="he_normal",
)

Or as a separate layer

In [None]:
# A Dense layer without activation, followed by LeakyReLU as its own layer.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(units=50, kernel_initializer="he_normal"))
model.add(tf.keras.layers.LeakyReLU(alpha=0.2))

`Swish`

Also He initialization

In [None]:
# Swish activation, again paired with He-normal initialization.
dense = tf.keras.layers.Dense(
    units=50,
    activation="swish",
    kernel_initializer="he_normal",
)

### Batch Normalization

`Use after activation function`

In [2]:
# Batch Normalization placed after each activation (and once right after
# the flattened 28x28 input).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[28, 28]))
model.add(tf.keras.layers.BatchNormalization())

# Two hidden ReLU layers (300 then 100 units), each followed by BatchNorm.
for n_units in (300, 100):
    model.add(
        tf.keras.layers.Dense(
            n_units,
            activation="relu",
            kernel_initializer="he_normal",
        )
    )
    model.add(tf.keras.layers.BatchNormalization())

# 10-way softmax output layer.
model.add(tf.keras.layers.Dense(10, activation="softmax"))

`Parameter Calculation`

BatchNorm adds 4 parameters per input feature:

- Gamma, Beta = Learnable (scale and offset)
- Mu, Sigma = Non-learnable (moving-average statistics, not updated by backpropagation)

In this example:

- BN1: 784 inputs x 4 = 3136 parameters
- BN2: 300 x 4 = 1200 parameters
- BN3: 100 x 4 = 400 parameters

Total = 4736 parameters
Total non-trainable = 4736 / 2 = 2368 parameters

In [3]:
# Print the layer-by-layer summary (output shapes and parameter counts)
# to check the BatchNorm parameter calculation above.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 784)               0         
                                                                 
 batch_normalization (Batch  (None, 784)               3136      
 Normalization)                                                  
                                                                 
 dense (Dense)               (None, 300)               235500    
                                                                 
 batch_normalization_1 (Bat  (None, 300)               1200      
 chNormalization)                                                
                                                                 
 dense_1 (Dense)             (None, 100)               30100     
                                                                 
 batch_normalization_2 (Bat  (None, 100)               4

`Use before activation function`

use_bias=False -> BatchNorm already learns an offset (Beta) per input, which acts as a bias (Gamma is the learnable scale) -> no need for an extra bias in the preceding Dense layer

In [None]:
# Batch Normalization placed before each activation:
# bias-free linear Dense -> BatchNorm -> ReLU.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[28, 28]))

for n_units in (300, 100):
    model.add(
        tf.keras.layers.Dense(
            n_units,
            kernel_initializer="he_normal",
            use_bias=False,
        )
    )
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Activation("relu"))

# 10-way softmax output layer.
model.add(tf.keras.layers.Dense(10, activation="softmax"))

`Hyperparameters of BN`

- momentum: to compute moving averages
- axis: the axis (or axes) to normalize; one mean/variance pair is kept per index along this axis, computed across all the other axes

In [None]:
# These are the default settings, spelled out explicitly.
# For inputs shaped (batch_size, height, width), use axis=[1, 2] to compute
# the mean and variance per pixel instead of per width.
BN = tf.keras.layers.BatchNormalization(
    momentum=0.99,
    axis=-1,
)

### Gradient Clipping

- `clipvalue=1.0` -> optimizer will clip components of gradient to range [-1.0, 1.0] (may change direction of gradient)
- `clipnorm=1.0` -> clips norm of gradient to 1.0 (preserves direction of gradient)

In [None]:
# Just an example optimizer: clip each gradient's norm to 1.0 (default = None).
# Pass clipvalue=1.0 instead to clip every gradient component to [-1.0, 1.0].
# Set only one of the two: newer Keras versions reject an optimizer that is
# given both clipnorm and clipvalue.
rmsprop = tf.keras.optimizers.RMSprop(clipnorm=1.0)

### Transfer Learning

- Suppose we have a model trained on a similar task
- Note: model_A and model_B_on_A now share layers -> if you train model_B_on_A -> model_A will also be affected

In [None]:
# Load the existing model that was trained on the similar task.
model_A = tf.keras.models.load_model("my_model_A")

# Build the new model from all of model_A's layers except its output layer,
# followed by a fresh single-unit sigmoid output layer. The reused layers are
# the very same objects as in model_A, so training one model affects the other.
model_B_on_A = tf.keras.Sequential(
    [
        *model_A.layers[:-1],
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
)

`Clone models`

- clone_model() only copies architecture
- use set_weights() and get_weights() to copy them

In [None]:
# clone_model() only copies the architecture, so the weights are copied
# over explicitly afterwards.
model_A_clone = tf.keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())

# Reuse the clone's layers (minus its output layer) and add a new sigmoid
# output layer.
model_B_on_A = tf.keras.Sequential(
    [
        *model_A_clone.layers[:-1],
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
)

`Freeze Reused Layers`

Note: You must compile the model after freezing / unfreezing layers

In [None]:
# Freeze every reused layer; only the new output layer (the last one)
# stays trainable.
for reused_layer in model_B_on_A.layers[:-1]:
    reused_layer.trainable = False

# The model must be (re)compiled after changing `trainable`.
# SGD with this learning rate is just an example.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)
model_B_on_A.compile(
    loss="binary_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

`Unfreeze reused layers`

- After training for a few epochs -> optionally unfreeze (some) layers


In [None]:
# NOTE: X_train_B / y_train_B / X_valid_B / y_valid_B are not defined in this
# notebook section; they are assumed to hold the task-B training and
# validation data.

# Phase 1: train for a few epochs while the reused layers are still frozen.
history = model_B_on_A.fit(
    X_train_B,
    y_train_B,
    epochs=4,
    validation_data=(X_valid_B, y_valid_B),
)

# Phase 2: unfreeze the reused layers and recompile so the change applies.
for reused_layer in model_B_on_A.layers[:-1]:
    reused_layer.trainable = True

optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)
model_B_on_A.compile(
    loss="binary_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

# Continue training with all layers trainable.
history = model_B_on_A.fit(
    X_train_B,
    y_train_B,
    epochs=16,
    validation_data=(X_valid_B, y_valid_B),
)

### Regularization

`l1 and l2`

L2:

- Adds regularization term to cost function equal to sum of squared values of weights, multiplied by a constant (0.01 in this case)
- $Cost + \frac{C}{2} \sum_i w_i^2$
  - with $C$ = constant
  - with $w_i$ = weights
- Don't use with Adam optimizer -> use AdamW instead

L1:

- Adds regularization term to cost function equal to the sum of absolute values of weights, multiplied by a constant (0.01 in this case)
  - $Cost + C \sum_i |w_i|$
    - with $C$ = constant
    - with $w_i$ = weights
- Use L1 for sparse models

L1_L2:

- $Cost + C_1 \sum_i |w_i| + \frac{C_2}{2} \sum_i w_i^2$
    - with $C_1$ = constant of L1
    - with $C_2$ = constant of L2
    - with $w_i$ = weights

In [None]:
# Regularizers live in `tf.keras.regularizers` (plural).
kr1 = tf.keras.regularizers.l1(0.01)  # L1 penalty with constant 0.01
kr2 = tf.keras.regularizers.l2(0.01)  # L2 penalty with constant 0.01
kr1_2 = tf.keras.regularizers.l1_l2(l1=0.01, l2=0.01)  # both penalties combined

# Attach a regularizer to a layer's weights through `kernel_regularizer`.
layer = tf.keras.layers.Dense(100, activation="relu", kernel_initializer="he_normal", kernel_regularizer=kr1)

### Use partial to avoid repetition

In [None]:
from functools import partial

# Dense-layer factory with the shared settings baked in; individual calls
# can still override any of them (see the softmax output layer below).
RegularizedDense = partial(
    tf.keras.layers.Dense,
    activation="relu",
    kernel_initializer="he_normal",
    kernel_regularizer=tf.keras.regularizers.l2(0.01),
)

model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[28,28]))
model.add(RegularizedDense(100))
model.add(RegularizedDense(100))
model.add(RegularizedDense(10, activation="softmax"))

### Dropout in Keras

- Only does something during training (not during inference)
- If model is overfitting -> increase dropout rate and vice versa

In [None]:
# Dropout (rate 0.2) applied to the input of every Dense layer.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[28, 28]))

# Two hidden ReLU layers of 100 units, each preceded by Dropout.
for n_units in (100, 100):
    model.add(tf.keras.layers.Dropout(rate=0.2))
    model.add(
        tf.keras.layers.Dense(
            n_units,
            activation="relu",
            kernel_initializer="he_normal",
        )
    )

# Dropout before the 10-way softmax output layer as well.
model.add(tf.keras.layers.Dropout(rate=0.2))
model.add(tf.keras.layers.Dense(10, activation="softmax"))

`Monte Carlo (MC) Dropout`

In [None]:
# Suppose you have a model called `model` with Dropout layers
# and you want to use MC Dropout.

class MCDropout(tf.keras.layers.Dropout):
    """Dropout layer that is always active, for Monte Carlo Dropout."""

    def call(self, inputs, training=None):
        # Accept `training` like the parent `call` so that passing it
        # explicitly does not fail, but ignore its value: dropout is always
        # applied, also when predicting.
        return super().call(inputs, training=True)

Dropout = tf.keras.layers.Dropout

# Same layers as `model`, except that every Dropout layer is replaced by an
# MCDropout layer with the same rate. All other layers are reused as-is.
mc_model = tf.keras.Sequential([
    MCDropout(layer.rate) if isinstance(layer, Dropout) else layer
    for layer in model.layers
])

mc_model.set_weights(model.get_weights())