In [None]:
# Importing necessary libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

In [None]:
# Load the MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# Normalize the dataset (0 to 1 range) and reshape input
x_train = x_train.reshape(-1, 28 * 28).astype("float32") / 255.0
x_test = x_test.reshape(-1, 28 * 28).astype("float32") / 255.0

In [None]:
# Convert labels to one-hot encoding
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

In [None]:
# Build the model
model = Sequential([
    Dense(128, activation='relu', input_shape=(784,)),  # Hidden layer
    Dense(10, activation='softmax')  # Output layer
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Compile the model with Adam optimizer and categorical cross-entropy loss
model.compile(optimizer=Adam(), loss=CategoricalCrossentropy(), metrics=['accuracy'])

In [None]:
# Custom callback to monitor the gradients during backpropagation
class GradientMonitor(tf.keras.callbacks.Callback):
    def on_batch_end(self, batch, logs=None):
        with tf.GradientTape() as tape:
            y_pred = self.model(x_train[:32])  # Use a small batch for gradient computation
            loss = tf.reduce_mean(tf.keras.losses.categorical_crossentropy(y_train[:32], y_pred))

        # Compute gradients
        grads = tape.gradient(loss, self.model.trainable_weights)

        # Print gradient norms to investigate vanishing/exploding gradients
        for i, grad in enumerate(grads):
            print(f"Layer {i} gradient norm: {np.linalg.norm(grad.numpy())}")

In [8]:
# Train the model and monitor gradients
model.fit(x_train, y_train, epochs=5, batch_size=32, validation_data=(x_test, y_test),
          callbacks=[GradientMonitor()])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Layer 1 gradient norm: 0.04950471222400665
Layer 2 gradient norm: 0.3750082850456238
Layer 3 gradient norm: 0.03240017965435982
Layer 0 gradient norm: 0.3981270492076874
Layer 1 gradient norm: 0.04716205224394798
Layer 2 gradient norm: 0.35756489634513855
Layer 3 gradient norm: 0.030836086720228195
[1m 627/1875[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m28s[0m 23ms/step - accuracy: 0.9880 - loss: 0.0411Layer 0 gradient norm: 0.37733474373817444
Layer 1 gradient norm: 0.0446072518825531
Layer 2 gradient norm: 0.3386405110359192
Layer 3 gradient norm: 0.02915327250957489
Layer 0 gradient norm: 0.36225149035453796
Layer 1 gradient norm: 0.042778126895427704
Layer 2 gradient norm: 0.3248521387577057
Layer 3 gradient norm: 0.0279499851167202
Layer 0 gradient norm: 0.34162434935569763
Layer 1 gradient norm: 0.04028243198990822
Layer 2 gradient norm: 0.3049949109554291
Layer 3 gradient norm: 0.02626638486981392
[1m 630/18

<keras.src.callbacks.history.History at 0x79389ee37610>