# How to set the weight initializer in a deep neural network

In [1]:
import tensorflow as tf
from tensorflow import keras

# Option 1: hand the layer the He-normal initializer taken from keras.initializers.
he_initialization = keras.initializers.he_normal
keras.layers.Dense(
    units=30,
    activation="relu",
    kernel_initializer=he_initialization,
)

# Option 2: refer to the same initializer by its string name.
keras.layers.Dense(
    units=30,
    activation="relu",
    kernel_initializer="he_normal",
)

2024-01-18 04:20:53.747030: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-18 04:20:53.940996: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-18 04:20:53.941051: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-18 04:20:53.943601: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-18 04:20:53.960879: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-18 04:20:53.961740: I tensorflow/core/platform/cpu_feature_guard.cc:1

<keras.src.layers.core.dense.Dense at 0x7fccb8baded0>

# He-style initialization with a uniform distribution based on fan_avg (instead of fan_in)

In [2]:
# VarianceScaling exposes the scale, the fan mode and the distribution explicitly,
# so any variant of the standard initializers can be spelled out by hand.
init = keras.initializers.VarianceScaling(
    scale=2,
    mode="fan_avg",
    distribution="uniform",
)
keras.layers.Dense(units=30, activation="relu", kernel_initializer=init)

<keras.src.layers.core.dense.Dense at 0x7fccb8d1acb0>

# Leaky ReLU

In [3]:
# LeakyReLU is added as its own layer right after a Dense layer that has no
# activation argument; that is the point of this example.
leaky_relu_layers = [
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(100, kernel_initializer="he_normal"),
    keras.layers.LeakyReLU(),
    keras.layers.Dense(10, activation="softmax"),
]
model = keras.models.Sequential(leaky_relu_layers)


# Batch Normalization

In [4]:
# Illustration only (not a complete model): Batch Normalization placed between
# a Dense layer and its activation function.
model = keras.models.Sequential()
# The Dense bias is switched off because BatchNormalization adds its own offset.
model.add(keras.layers.Dense(30, use_bias=False))
# `momentum` and `axis` are the two hyperparameters usually tweaked (see the axis notes below).
model.add(keras.layers.BatchNormalization(momentum=0.99, axis=-1))
# The activation is applied to the already-normalized pre-activations.
model.add(keras.layers.Activation("relu"))
model.add(keras.layers.Dense(10, activation="softmax"))


In [5]:
# axis=-1 (the default, channel-wise normalization): normalizes over the last axis, which is the feature axis of a
#          Dense output and the channel axis of channels-last image tensors, the most common setting in 2D CNNs.
#          One mean and one variance are computed per feature/channel across the whole batch (and, for images,
#          across all spatial positions) -- NOT within each image separately.
#          For instance, a "whisker detector" channel is normalized with the mean and variance of its activations over
#          all cat images in the batch, so whiskers from different cats are compared fairly even if the images differ
#          in brightness or contrast.
# axis=1 (also channel-wise): use this when the channel axis comes right after the batch axis
#          (data_format="channels_first", e.g. [batch, channels, height, width]). Only the position of the channel
#          axis differs; it is not specific to 3D CNNs.
# axis=None: NOTE(review) - Keras' BatchNormalization expects an integer axis (Keras 2 also accepts a list/tuple of
#          integers), so None does not appear to be a supported value. Normalizing over all dimensions at once is not
#          what this layer is designed for; verify against the installed Keras version before relying on it.

In [6]:
# Batch Normalization as the first layer after Flatten and after every hidden Dense layer.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.BatchNormalization())
for n_units in (300, 100):
    # Each hidden Dense layer is followed by its own BatchNormalization layer.
    model.add(keras.layers.Dense(n_units, activation="relu"))
    model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(10, activation="softmax"))

In [7]:
# Inspect the first BatchNormalization layer (index 1, right after Flatten).
# "Non-trainable" variables are not updated by backpropagation: here the moving
# mean and the moving variance, as opposed to the trainable gamma and beta.
bn1 = model.layers[1]
[(variable.name, variable.trainable) for variable in bn1.variables]

[('batch_normalization_1/gamma:0', True),
 ('batch_normalization_1/beta:0', True),
 ('batch_normalization_1/moving_mean:0', False),
 ('batch_normalization_1/moving_variance:0', False)]

# Gradient clipping

In [8]:
# Gradient clipping mitigates exploding gradients by putting a threshold on the gradient vector.
clip_threshold = 1.0
# Variant 1: clip the gradient by value.
optimizer = keras.optimizers.SGD(clipvalue=clip_threshold)
# Variant 2: clip the gradient by norm. This overrides the optimizer created just above.
optimizer = keras.optimizers.SGD(clipnorm=clip_threshold)

# Transfer learning

In [9]:
# Load the previously trained model A from disk.
model_A = keras.models.load_model("best_model.h5")

# Build model B on top of A: take every layer of A except its output layer (hence [:-1]).
# These are the very same layer objects, so B and A share them.
reused_layers = model_A.layers[:-1]
model_B_on_A = keras.models.Sequential(reused_layers)

# New output layer for model B: a single sigmoid unit.
output_layer_B = keras.layers.Dense(1, activation="sigmoid")
model_B_on_A.add(output_layer_B)


# We are reusing model A's layers (all but its output layer) in the new model, which performs binary classification

# Cloning model

In [10]:
# Training a model that reuses model A's layers also updates model A, because the layers are shared.
# To keep model A intact, work on a clone of it instead.

# clone_model only rebuilds the architecture ...
model_A_clone = keras.models.clone_model(model_A)
# ... so the learned parameters (weights and biases) are copied over in a separate step.
weights_of_A = model_A.get_weights()
model_A_clone.set_weights(weights_of_A)

# Freezing some layers

In [11]:
# Phase 1: freeze every layer except the last one (the reused layers coming from
# the other model) so that only the new output layer is trained at first.
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = False

# model.compile and model.fit go here, to train the non-reused layer

# Phase 2: make the reused layers trainable again
# (the model has to be compiled again for the change to take effect).
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = True

# Updating learning rate at each iteration

In [12]:
# If you want to update the learning rate at each iteration rather than at each epoch,
# you must write your own callback class:
# NOTE(review): `boston_housing` is not referenced in the visible cells - confirm this import is still needed.
from keras.datasets import boston_housing
# Short alias for the Keras backend module; the callback below reads and writes
# the optimizer's learning rate through K.get_value / K.set_value.
K = keras.backend

class ExponentialDecay(keras.callbacks.Callback):
    """Callback that decays the optimizer's learning rate at every batch.

    At the start of each batch the current learning rate is multiplied by
    ``0.1 ** (1 / s)``, i.e. it is divided by 10 every ``s`` batches.

    Args:
        s: Number of batches over which the learning rate shrinks by a
            factor of 10. It is used as a divisor, so it must not be 0.
    """

    def __init__(self, s=40000):
        super().__init__()
        self.s = s

    def on_batch_begin(self, batch, logs=None):
        """Scale the current learning rate by one decay step."""
        # Note: the `batch` argument is reset at each epoch, so the decay is
        # applied to the current rate instead of being derived from `batch`.
        optimizer = self.model.optimizer
        lr = K.get_value(optimizer.learning_rate)
        K.set_value(optimizer.learning_rate, lr * 0.1 ** (1 / self.s))

    def on_epoch_end(self, epoch, logs=None):
        """Record the current learning rate under the 'lr' key of `logs`."""
        # Write into the dict supplied by the caller. The previous
        # `logs = logs or {}` rebound the name to a fresh dict whenever the
        # caller passed an empty dict (empty dicts are falsy), so the 'lr'
        # entry was silently lost in that case.
        if logs is not None:
            logs['lr'] = K.get_value(self.model.optimizer.learning_rate)

# Self-normalizing (SELU + LeCun normal) classifier used to demonstrate the
# per-batch exponential learning-rate decay callback.
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    *[
        keras.layers.Dense(n_units, activation="selu", kernel_initializer="lecun_normal")
        for n_units in (300, 100)
    ],
    keras.layers.Dense(10, activation="softmax"),
])

# Initial learning rate; the callback shrinks it at every batch.
lr0 = 0.01
optimizer = keras.optimizers.Nadam(learning_rate=lr0)
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
n_epochs = 25

exp_decay = ExponentialDecay()
# Training call left disabled: the dataset arrays it refers to are not defined
# in the visible cells.
# history = model.fit(epochs=n_epochs,
#                     validation_data=(X_valid_scaled, y_valid),
#                     callbacks=[exp_decay])

In [13]:
import tensorflow as tf
from keras.layers import Dense
from keras.models import Sequential
from keras.constraints import max_norm

# Max-norm regularization demo: each Dense layer carries a max_norm kernel constraint.
hidden_layer = Dense(64, activation='relu', input_shape=(10,), kernel_constraint=max_norm(3))
output_layer = Dense(10, activation='softmax', kernel_constraint=max_norm(2))
model = Sequential([hidden_layer, output_layer])

# One-hot targets are used below, hence the (non-sparse) categorical cross-entropy loss.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Synthetic data: 1000 samples with 10 features each, and random integer
# labels in [0, 10) that are then one-hot encoded.
X_train = tf.random.normal((1000, 10))
class_ids = tf.random.uniform((1000,), minval=0, maxval=10, dtype=tf.int32)
y_train = tf.one_hot(class_ids, depth=10)

# Fit for 10 epochs; the max-norm constraints are enforced during training.
model.fit(X_train, y_train, epochs=10)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fcc3c656440>

In [15]:
import numpy as np

class MCDropout(keras.layers.Dropout):
    """Dropout layer that always runs its parent `call` with `training=True`."""

    def call(self, inputs):
        # Force training mode so the layer keeps dropping units at prediction time.
        stochastic_outputs = super().call(inputs, training=True)
        return stochastic_outputs

class MCAlphaDropout(keras.layers.AlphaDropout):
    """AlphaDropout layer that always runs its parent `call` with `training=True`."""

    def call(self, inputs):
        # Force training mode so the layer keeps dropping units at prediction time.
        stochastic_outputs = super().call(inputs, training=True)
        return stochastic_outputs

# Seed both TensorFlow and NumPy so the stochastic forward passes are reproducible.
tf.random.set_seed(42)
np.random.seed(42)


def to_mc_layer(layer):
    """Return the Monte Carlo counterpart of a dropout layer, or the layer itself.

    Both AlphaDropout and plain Dropout layers are swapped (the original cell
    only handled AlphaDropout, leaving MCDropout unused). Any other layer is
    returned unchanged, so it is shared with the source model.
    """
    if isinstance(layer, (MCDropout, MCAlphaDropout)):
        return layer  # already forced into training mode
    if isinstance(layer, keras.layers.AlphaDropout):
        return MCAlphaDropout(layer.rate)
    if isinstance(layer, keras.layers.Dropout):
        return MCDropout(layer.rate)
    return layer


mc_model = keras.models.Sequential([to_mc_layer(layer) for layer in model.layers])

optimizer = keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True)
# `y_train` is one-hot encoded and `model` was compiled with categorical
# cross-entropy, so the same (non-sparse) loss is used here.
mc_model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

mc_model.set_weights(model.get_weights())

# Monte Carlo estimate: average the class probabilities over repeated forward passes
# on the first training sample.
# NOTE: if `model` contains no dropout layers (as with the two-Dense max-norm model
# defined earlier in this notebook), every pass returns the same prediction.
n_mc_samples = 100
mc_predictions = [mc_model.predict(X_train[:1], verbose=0) for _ in range(n_mc_samples)]
np.round(np.mean(mc_predictions, axis=0), 2)




array([[0.08, 0.08, 0.11, 0.13, 0.11, 0.09, 0.1 , 0.09, 0.1 , 0.1 ]],
      dtype=float32)