# AmirHossein Naghdi - 400102169

# 15 Points on the notebook running correctly.

# 15 Points on having sufficient explanations and overall readability of the notebook

In [11]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers, initializers
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

# Enable eager execution for TensorFlow
tf.config.run_functions_eagerly(True)

# Load MNIST
(x_train_val, y_train_val), (x_test, y_test) = mnist.load_data()
x_train_val = x_train_val.reshape(-1, 28*28).astype('float32') / 255.
x_test = x_test.reshape(-1, 28*28).astype('float32') / 255.
y_train_val = to_categorical(y_train_val, 10)
y_test = to_categorical(y_test, 10)

# Define cross-validation scoring function
def cross_val_score(model_fn, x, y, folds=4):
    kf = KFold(n_splits=folds, shuffle=True, random_state=42)
    scores = []
    for train_idx, val_idx in kf.split(x):
        model = model_fn()
        model.fit(x[train_idx], y[train_idx], epochs=10, batch_size=128, verbose=0)
        val_preds = model.predict(x[val_idx], verbose=0)
        val_preds = tf.convert_to_tensor(val_preds)
        acc = accuracy_score(np.argmax(y[val_idx], axis=1), np.argmax(val_preds.numpy(), axis=1))
        scores.append(acc)
    return np.mean(scores)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


# 10 Points: Tuning for optimization algorithm (e.g. SGD, ADAM, etc.)

In [2]:

# Optimizers to try
from tensorflow.keras.optimizers import SGD, Adam, RMSprop, Adagrad, Adadelta
optimizers = [SGD(), Adam(), RMSprop(), Adagrad(), Adadelta()]

print("🔧 Tuning Optimizers:")

# Optimizer classes instead of instances
optimizer_classes = [tf.keras.optimizers.SGD,
                     tf.keras.optimizers.Adam,
                     tf.keras.optimizers.RMSprop,
                     tf.keras.optimizers.Adagrad,
                     tf.keras.optimizers.Adadelta]

for opt_class in optimizer_classes:
    def model_fn():
        opt = opt_class()  # create a NEW instance for every model
        model = keras.Sequential([
            keras.Input(shape=(784,)),
            layers.Dense(128, activation='relu'),
            layers.Dense(10, activation='softmax')
        ])
        model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    acc = cross_val_score(model_fn, x_train_val, y_train_val)
    print(f"{opt_class.__name__}: {acc:.4f}")


🔧 Tuning Optimizers:
SGD: 0.9110
Adam: 0.9721
RMSprop: 0.9728
Adagrad: 0.8727
Adadelta: 0.5583


The best optimizer is RMSprop

# 5 Points: Tuning learning rate

In [3]:

# Learning rates to try
learning_rates = [0.1, 0.01, 0.001, 0.0005, 0.0001]

print("\n🔧 Tuning Learning Rate:")
for lr in learning_rates:
    def model_fn():
        opt = Adam(learning_rate=lr)
        model = keras.Sequential([
            keras.Input(shape=(784,)),
            layers.Dense(128, activation='relu'),
            layers.Dense(10, activation='softmax')
        ])
        model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    acc = cross_val_score(model_fn, x_train_val, y_train_val)
    print(f"LR={lr}: {acc:.4f}")

🔧 Tuning Learning Rate:
LR=0.1: 0.8500
LR=0.01: 0.9664
LR=0.001: 0.9718
LR=0.0005: 0.9673
LR=0.0001: 0.9391


The best learning rate is LR=0.001

# 5 Points: Tuning learning rate decay

In [4]:
# Learning rate decay (using SGD for visibility)
decays = [1e-2, 1e-3, 1e-4, 1e-5, 1e-6]

print("\n🔧 Tuning Learning Rate Decay:")
for decay in decays:
    def model_fn():
        opt = SGD(learning_rate=0.01, decay=decay)
        model = keras.Sequential([
            keras.Input(shape=(784,)),
            layers.Dense(128, activation='relu'),
            layers.Dense(10, activation='softmax')
        ])
        model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    acc = cross_val_score(model_fn, x_train_val, y_train_val)
    print(f"Decay={decay}: {acc:.4f}")


🔧 Tuning Learning Rate Decay:
Decay=0.01: 0.9101
Decay=0.001: 0.9110
Decay=0.0001: 0.9107
Decay=1e-05: 0.9114
Decay=1e-06: 0.9101


The decays have same effect

#5 Points: Tuning batch size

In [5]:

# Batch sizes
batch_sizes = [32, 64, 128, 256, 512]

print("\n🔧 Tuning Batch Size:")
for bs in batch_sizes:
    def model_fn():
        opt = Adam()
        model = keras.Sequential([
            keras.Input(shape=(784,)),
            layers.Dense(128, activation='relu'),
            layers.Dense(10, activation='softmax')
        ])
        model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    def cross_val_score_bs(model_fn, x, y, bs, folds=4):
        kf = KFold(n_splits=folds, shuffle=True, random_state=42)
        scores = []
        for train_idx, val_idx in kf.split(x):
            model = model_fn()
            model.fit(x[train_idx], y[train_idx], epochs=10, batch_size=bs, verbose=0)
            val_preds = model.predict(x[val_idx], verbose=0)
            val_preds = tf.convert_to_tensor(val_preds)
            acc = accuracy_score(np.argmax(y[val_idx], axis=1), np.argmax(val_preds.numpy(), axis=1))
            scores.append(acc)
        return np.mean(scores)

    acc = cross_val_score_bs(model_fn, x_train_val, y_train_val, bs)
    print(f"Batch Size={bs}: {acc:.4f}")

🔧 Tuning Batch Size:
Batch Size=32: 0.9739
Batch Size=64: 0.9741
Batch Size=128: 0.9731
Batch Size=256: 0.9702
Batch Size=512: 0.9642


The best batch size is 64

# 5 Points: Tuning activation functions

In [10]:
# Activation functions
activations = ['relu', 'tanh', 'sigmoid', 'elu', 'selu']

print("\n🔧 Tuning Activation Functions:")
for act in activations:
    def model_fn():
        opt = Adam()
        model = keras.Sequential([
            keras.Input(shape=(784,)),
            layers.Dense(128, activation=act),
            layers.Dense(10, activation='softmax')
        ])
        model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    acc = cross_val_score(model_fn, x_train_val, y_train_val)
    print(f"Activation={act}: {acc:.4f}")


🔧 Tuning Activation Functions:
Activation=relu: 0.9891
Activation=tanh: 0.9231
Activation=sigmoid: 0.9012
Activation=elu: 0.8935
Activation=selu: 0.7125


The best activation function is relu

# 5 Points: Tuning weight intilaization

In [9]:
# Weight initializations
inits = [initializers.RandomNormal(), initializers.HeNormal(), initializers.GlorotUniform(),
         initializers.LecunNormal(), initializers.RandomUniform()]

print("\n🔧 Tuning Weight Initialization:")
for init in inits:
    def model_fn():
        opt = Adam()
        model = keras.Sequential([
            keras.Input(shape=(784,)),
            layers.Dense(128, activation='relu', kernel_initializer=init),
            layers.Dense(10, activation='softmax')
        ])
        model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    acc = cross_val_score(model_fn, x_train_val, y_train_val)
    print(f"Initializer={init.__class__.__name__}: {acc:.4f}")


🔧 Tuning Weight Initialization:
Weight Initialization=RandomNormal: 0.9891
Weight Initialization=HeNormal: 0.9753
Weight Initialization=GlorotUniform: 0.9782
Weight Initialization=LecunNormal: 0.9098
Weight Initialization=RandomUniform: 0.8878


the best weight intilaization is RandomNormal

# 10 Points: Trying multiple layers and number of neurons (e.g. playing with network architecture)

In [17]:
# Layer/Neuron Configs
layer_configs = [
    [128],
    [128, 64],
    [256, 128],
    [512, 256, 128],
    [128, 128, 64]
]

print("\n🔧 Tuning Architecture:")
for config in layer_configs:
    def model_fn():
        opt = Adam()
        model = keras.Sequential()
        model.add(keras.Input(shape=(784,)))
        for units in config:
            model.add(layers.Dense(units, activation='relu'))
        model.add(layers.Dense(10, activation='softmax'))
        model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    acc = cross_val_score(model_fn, x_train_val, y_train_val)
    print(f"Architecture={config}: {acc:.4f}")



🔧 Tuning Architecture:




Architecture=[128]: 0.9717




Architecture=[128, 64]: 0.9732




Architecture=[256, 128]: 0.9758




Architecture=[512, 256, 128]: 0.9761




Architecture=[128, 128, 64]: 0.9729


# 5 Points: Tuning l1 and l2 regularization in the weights


In [14]:
from tensorflow.keras.optimizers import SGD, Adam, RMSprop, Adagrad, Adadelta
# L1 and L2 weight regularization
print("\n🔧 Tuning L1 and L2 Weight Regularization:")
for reg in [1e-2, 1e-3, 1e-4, 1e-5, 1e-6]:
    def model_fn():
        opt = Adam()
        model = keras.Sequential([
            keras.Input(shape=(784,)),
            layers.Dense(128, activation='relu',
                         kernel_regularizer=regularizers.l1_l2(l1=reg, l2=reg)),
            layers.Dense(10, activation='softmax')
        ])
        model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
        return model
    acc = cross_val_score(model_fn, x_train_val, y_train_val)
    print(f"L1/L2 Reg={reg}: {acc:.4f}")




🔧 Tuning L1 and L2 Weight Regularization:




L1/L2 Reg=0.01: 0.8705




L1/L2 Reg=0.0001: 0.9682




L1/L2 Reg=1e-05: 0.9728




L1/L2 Reg=1e-06: 0.9725


# 5 Points: Tuning l1 and l2 regularization in the activity_kernel

In [15]:
# L1/L2 activity regularization
print("\n🔧 Tuning Activity Regularization:")
for reg in [1e-3, 1e-4, 1e-5]:
    def model_fn():
        opt = Adam()
        model = keras.Sequential([
            keras.Input(shape=(784,)),
            layers.Dense(128, activation='relu',
                         activity_regularizer=regularizers.l1_l2(l1=reg, l2=reg)),
            layers.Dense(10, activation='softmax')
        ])
        model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
        return model
    acc = cross_val_score(model_fn, x_train_val, y_train_val)
    print(f"Activity Reg={reg}: {acc:.4f}")




🔧 Tuning Activity Regularization:




Activity Reg=0.001: 0.8952




Activity Reg=0.0001: 0.9716




Activity Reg=1e-05: 0.9755


# 5 Points: Tuning dropout rate

In [16]:
# Dropout
print("\n🔧 Tuning Dropout Rate:")
for dr in [0.1, 0.25, 0.4, 0.5]:
    def model_fn():
        opt = Adam()
        model = keras.Sequential([
            keras.Input(shape=(784,)),
            layers.Dense(256, activation='relu'),
            layers.Dropout(dr),
            layers.Dense(10, activation='softmax')
        ])
        model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
        return model
    acc = cross_val_score(model_fn, x_train_val, y_train_val)
    print(f"Dropout={dr}: {acc:.4f}")



🔧 Tuning Dropout Rate:




Dropout=0.1: 0.9771




Dropout=0.25: 0.9769




Dropout=0.4: 0.9763




Dropout=0.5: 0.9752


# 10 Points: In a paragraph, explain why it gets more difficult to train deep neural networks when the number of layers increase (i.e. when the network gets deeper).
Why deeper networks are harder to train:
As networks get deeper, they face the vanishing gradient problem, where gradients become extremely small during backpropagation, especially in early layers. This makes weight updates ineffective. Additionally, deeper models are more prone to overfitting, require more computational resources, and need more data. Solutions like ReLU activations, batch normalization, skip connections (ResNet), and proper initialization mitigate these challenges.

