In [35]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from functools import partial
import math

## Applying selu sctivation

In [2]:
# neutal net with 100 hidden layers
tf.random.set_seed(42)
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape=(28, 28)))
model.add(tf.keras.layers.Flatten())
for layer in range(100):
    model.add(
        tf.keras.layers.Dense(100, activation="selu", kernel_initializer="lecun_normal")
    )
model.add(tf.keras.layers.Dense(10, activation="softmax"))

In [3]:
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
    metrics=["accuracy"],
)

In [4]:
fashion_mnist = tf.keras.datasets.fashion_mnist.load_data()
(X_train_full, y_train_full), (X_test, y_test) = fashion_mnist
X_train, y_train = X_train_full[:-5000], y_train_full[:-5000]
X_valid, y_valid = X_train_full[-5000:], y_train_full[-5000:]
X_train, X_valid, X_test = X_train / 255, X_valid / 255, X_test / 255

In [5]:
class_names = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

In [6]:
# standardizing the pixels
pixel_means = X_train.mean(axis=0, keepdims=True)
pixel_stds = X_train.std(axis=0, keepdims=True)
X_train_scaled = (X_train - pixel_means) / pixel_stds
X_valid_scaled = (X_valid - pixel_means) / pixel_stds
X_test_scaled = (X_test - pixel_means) / pixel_stds

In [7]:
history = model.fit(
    X_train_scaled, y_train, epochs=5, validation_data=[X_valid_scaled, y_valid]
)

Epoch 1/5
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 12ms/step - accuracy: 0.4129 - loss: 1.5114 - val_accuracy: 0.6950 - val_loss: 0.8191
Epoch 2/5
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 12ms/step - accuracy: 0.7051 - loss: 0.8133 - val_accuracy: 0.7470 - val_loss: 0.7024
Epoch 3/5
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - accuracy: 0.7471 - loss: 0.6966 - val_accuracy: 0.7728 - val_loss: 0.6358
Epoch 4/5
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 13ms/step - accuracy: 0.7758 - loss: 0.6123 - val_accuracy: 0.7812 - val_loss: 0.6117
Epoch 5/5
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 12ms/step - accuracy: 0.7831 - loss: 0.5950 - val_accuracy: 0.7904 - val_loss: 0.5591


In [8]:
tf.keras.backend.clear_session()
tf.random.set_seed(42)

In [9]:
# Batch Normalization
dense_layer = partial(
    tf.keras.layers.Dense, activation="relu", kernel_initializer="he_normal"
)

model = tf.keras.Sequential(
    [
        tf.keras.layers.Input(shape=(28, 28)),
        tf.keras.layers.Flatten(),
        dense_layer(300, use_bias=False),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation("relu"),
        dense_layer(100, use_bias=False),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation("relu"),
        tf.keras.layers.Dense(10, activation="softmax"),
    ]
)

In [10]:
[(var.name, var.trainable) for var in model.layers[2].variables]

[('gamma', True),
 ('beta', True),
 ('moving_mean', False),
 ('moving_variance', False)]

In [11]:
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="sgd", metrics=["accuracy"]
)
model.fit(X_train_scaled, y_train, epochs=2, validation_data=[X_valid_scaled, y_valid])

Epoch 1/2
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.7237 - loss: 0.8530 - val_accuracy: 0.8490 - val_loss: 0.4218
Epoch 2/2
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8510 - loss: 0.4339 - val_accuracy: 0.8718 - val_loss: 0.3761


<keras.src.callbacks.history.History at 0x325e72bd0>

In [12]:
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="sgd", metrics=["accuracy"]
)
model.fit(X_train_scaled, y_train, epochs=2, validation_data=[X_valid_scaled, y_valid])

Epoch 1/2
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8722 - loss: 0.3630 - val_accuracy: 0.8746 - val_loss: 0.3578
Epoch 2/2
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8894 - loss: 0.3170 - val_accuracy: 0.8760 - val_loss: 0.3521


<keras.src.callbacks.history.History at 0x325b7ac90>

In [13]:
# Gradient Clipping

optimizer = tf.keras.optimizers.SGD(clipvalue=1.0)
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer)

optimizer = tf.keras.optimizers.SGD(clipnorm=1.0)
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer)

In [14]:
# Reusing pretrained layers
pos_class_id = class_names.index("Pullover")
neg_class_id = class_names.index("T-shirt/top")


def split_dataset(X, y):
    y_for_B = (y == pos_class_id) | (y == neg_class_id)
    y_A = y[~y_for_B]
    y_B = (y[y_for_B] == pos_class_id).astype(np.float32)
    old_class_ids = list(set(range(10)) - set([neg_class_id, pos_class_id]))
    for old_class_id, new_class_id in zip(old_class_ids, range(8)):
        y_A[y_A == old_class_id] = new_class_id  # reorder class ids for A
    return ((X[~y_for_B], y_A), (X[y_for_B], y_B))


(X_train_A, y_train_A), (X_train_B, y_train_B) = split_dataset(X_train, y_train)
(X_valid_A, y_valid_A), (X_valid_B, y_valid_B) = split_dataset(X_valid, y_valid)
(X_test_A, y_test_A), (X_test_B, y_test_B) = split_dataset(X_test, y_test)
X_train_B = X_train_B[:200]
y_train_B = y_train_B[:200]

tf.random.set_seed(42)

model_A = tf.keras.Sequential(
    [
        tf.keras.layers.Input(shape=(28, 28)),
        tf.keras.layers.Flatten(),
        dense_layer(100),
        dense_layer(100),
        dense_layer(100),
        tf.keras.layers.Dense(8, activation="softmax"),
    ]
)

model_A.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
    metrics=["accuracy"],
)

history = model_A.fit(
    X_train_A, y_train_A, epochs=20, validation_data=[X_valid_A, y_valid_A]
)
model_A.save("my_model_a.keras")

Epoch 1/20
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 833us/step - accuracy: 0.5237 - loss: 1.5058 - val_accuracy: 0.7824 - val_loss: 0.7006
Epoch 2/20
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 759us/step - accuracy: 0.7894 - loss: 0.6520 - val_accuracy: 0.8406 - val_loss: 0.5147
Epoch 3/20
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 837us/step - accuracy: 0.8404 - loss: 0.5018 - val_accuracy: 0.8586 - val_loss: 0.4380
Epoch 4/20
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 861us/step - accuracy: 0.8619 - loss: 0.4321 - val_accuracy: 0.8661 - val_loss: 0.3957
Epoch 5/20
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 784us/step - accuracy: 0.8721 - loss: 0.3914 - val_accuracy: 0.8737 - val_loss: 0.3694
Epoch 6/20
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 761us/step - accuracy: 0.8789 - loss: 0.3647 - val_accuracy: 0.8772 - val_loss: 0.3511
Epoc

In [15]:
# evaluating model B

tf.random.set_seed(42)
model_B = tf.keras.Sequential(
    [
        tf.keras.layers.Input(shape=(28, 28)),
        tf.keras.layers.Flatten(),
        dense_layer(100),
        dense_layer(100),
        dense_layer(100),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
)
model_B.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
    metrics=["accuracy"],
)

history = model_B.fit(
    X_train_B, y_train_B, epochs=20, validation_data=[X_valid_B, y_valid_B]
)
model_B.evaluate(X_test_B, y_test_B)

Epoch 1/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.4301 - loss: 1.2276 - val_accuracy: 0.4847 - val_loss: 0.9876
Epoch 2/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.4301 - loss: 1.0418 - val_accuracy: 0.4847 - val_loss: 0.8724
Epoch 3/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.4301 - loss: 0.9150 - val_accuracy: 0.4847 - val_loss: 0.7956
Epoch 4/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.4301 - loss: 0.8291 - val_accuracy: 0.4847 - val_loss: 0.7418
Epoch 5/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.4301 - loss: 0.7681 - val_accuracy: 0.4857 - val_loss: 0.7032
Epoch 6/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.4337 - loss: 0.7235 - val_accuracy: 0.4876 - val_loss: 0.6753
Epoch 7/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━

[0.5190897583961487, 0.8964999914169312]

In [16]:
model_A = tf.keras.models.load_model("my_model_a.keras")
model_B_on_A = tf.keras.Sequential(model_A.layers[:-1])  # unitl the last layer
model_B_on_A.add(tf.keras.layers.Dense(1, activation="sigmoid"))

# note that model_B_on_A andModel_A actually share layers now, so when we train one, it will update both models
# If we want to avoid that, we need to build model_B_on_A on top of clone of model_A

In [17]:
tf.random.set_seed(42)
model_A_clone = tf.keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())

In [18]:
model_B_on_A = tf.keras.Sequential(model_A_clone.layers[:-1])
model_B_on_A.add(tf.keras.layers.Dense(1, activation="sigmoid"))

# not training the/updating the weights that were taken from the previous model_A
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = False

optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)
model_B_on_A.compile(
    loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"]
)
history = model_B_on_A.fit(
    X_train_B, y_train_B, epochs=4, validation_data=(X_valid_B, y_valid_B)
)

for layer in model_B_on_A.layers[:-1]:
    layer.trainable = True

optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)
model_B_on_A.compile(
    loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"]
)
history = model_B_on_A.fit(
    X_train_B, y_train_B, epochs=16, validation_data=(X_valid_B, y_valid_B)
)

Epoch 1/4
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.5757 - loss: 1.0715 - val_accuracy: 0.5232 - val_loss: 0.7999
Epoch 2/4
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5444 - loss: 0.7284 - val_accuracy: 0.5589 - val_loss: 0.7306
Epoch 3/4
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6016 - loss: 0.6939 - val_accuracy: 0.5668 - val_loss: 0.7151
Epoch 4/4
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6054 - loss: 0.6826 - val_accuracy: 0.5875 - val_loss: 0.7011
Epoch 1/16
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.6519 - loss: 0.6589 - val_accuracy: 0.6677 - val_loss: 0.6487
Epoch 2/16
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6918 - loss: 0.6073 - val_accuracy: 0.7211 - val_loss: 0.5977
Epoch 3/16
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0

In [19]:
model_B_on_A.evaluate(X_test_B, y_test_B)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 738us/step - accuracy: 0.9027 - loss: 0.3320


[0.33885955810546875, 0.902999997138977]

In [20]:
# Faster Optimizers
# Momentum OPtimizer


def build_model(seed=42):
    tf.random.set_seed(seed)
    return tf.keras.Sequential(
        [
            tf.keras.layers.Input(shape=(28, 28)),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(
                100, activation="relu", kernel_initializer="he_normal"
            ),
            tf.keras.layers.Dense(
                100, activation="relu", kernel_initializer="he_normal"
            ),
            tf.keras.layers.Dense(
                100, activation="relu", kernel_initializer="he_normal"
            ),
            tf.keras.layers.Dense(10, activation="softmax"),
        ]
    )


def build_and_train_model(optimizer):
    model = build_model()
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=optimizer,
        metrics=["accuracy"],
    )

    return model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))


optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)
history_sgd = build_and_train_model(optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 941us/step - accuracy: 0.6627 - loss: 0.9948 - val_accuracy: 0.8158 - val_loss: 0.5080
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 818us/step - accuracy: 0.8293 - loss: 0.4897 - val_accuracy: 0.8328 - val_loss: 0.4599
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 809us/step - accuracy: 0.8478 - loss: 0.4342 - val_accuracy: 0.8434 - val_loss: 0.4310
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 824us/step - accuracy: 0.8575 - loss: 0.4024 - val_accuracy: 0.8518 - val_loss: 0.4086
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 790us/step - accuracy: 0.8663 - loss: 0.3799 - val_accuracy: 0.8566 - val_loss: 0.3913
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 813us/step - accuracy: 0.8712 - loss: 0.3619 - val_accuracy: 0.8616 - val_loss: 0.3806
Epoc

In [21]:
# Neterov Optimizer
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9, nesterov=True)
history_nesterov = build_and_train_model(optimizer=optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 919us/step - accuracy: 0.6761 - loss: 0.9837 - val_accuracy: 0.8208 - val_loss: 0.4970
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 862us/step - accuracy: 0.8312 - loss: 0.4799 - val_accuracy: 0.8350 - val_loss: 0.4503
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 872us/step - accuracy: 0.8482 - loss: 0.4307 - val_accuracy: 0.8450 - val_loss: 0.4235
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 989us/step - accuracy: 0.8571 - loss: 0.4022 - val_accuracy: 0.8496 - val_loss: 0.4090
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 901us/step - accuracy: 0.8650 - loss: 0.3806 - val_accuracy: 0.8532 - val_loss: 0.3962
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 890us/step - accuracy: 0.8702 - loss: 0.3634 - val_accuracy: 0.8576 - val_loss: 0.3868
Epoc

In [22]:
# AdaGrad OPtimizer
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.001)
history_ada_grad = build_and_train_model(optimizer=optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.5775 - loss: 1.3025 - val_accuracy: 0.7822 - val_loss: 0.6708
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.7827 - loss: 0.6596 - val_accuracy: 0.8084 - val_loss: 0.5740
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 952us/step - accuracy: 0.8070 - loss: 0.5790 - val_accuracy: 0.8176 - val_loss: 0.5302
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8190 - loss: 0.5389 - val_accuracy: 0.8246 - val_loss: 0.5038
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 950us/step - accuracy: 0.8264 - loss: 0.5137 - val_accuracy: 0.8312 - val_loss: 0.4861
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 941us/step - accuracy: 0.8318 - loss: 0.4963 - val_accuracy: 0.8350 - val_loss: 0.4729
Epoch 7/10

In [23]:
# RMSProp
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)
history_rms = build_and_train_model(optimizer=optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.7626 - loss: 0.6627 - val_accuracy: 0.8442 - val_loss: 0.4327
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8564 - loss: 0.4006 - val_accuracy: 0.8550 - val_loss: 0.3965
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8705 - loss: 0.3689 - val_accuracy: 0.8588 - val_loss: 0.4083
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 986us/step - accuracy: 0.8748 - loss: 0.3559 - val_accuracy: 0.8550 - val_loss: 0.4371
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 983us/step - accuracy: 0.8786 - loss: 0.3475 - val_accuracy: 0.8558 - val_loss: 0.4467
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 961us/step - accuracy: 0.8799 - loss: 0.3445 - val_accuracy: 0.8442 - val_loss: 0.5449
Epoch 7/10

In [24]:
# Adam optimizers
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
history_adam = build_and_train_model(optimizer=optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7739 - loss: 0.6320 - val_accuracy: 0.8268 - val_loss: 0.4286
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8577 - loss: 0.3919 - val_accuracy: 0.8420 - val_loss: 0.4055
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8721 - loss: 0.3468 - val_accuracy: 0.8440 - val_loss: 0.4070
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8840 - loss: 0.3170 - val_accuracy: 0.8570 - val_loss: 0.3690
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8922 - loss: 0.2959 - val_accuracy: 0.8666 - val_loss: 0.3703
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8961 - loss: 0.2793 - val_accuracy: 0.8710 - val_loss: 0.3557
Epoch 7/10
[1m1

In [25]:
optimizer = tf.keras.optimizers.Adamax(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
optimizer = tf.keras.optimizers.Nadam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
optimizer = tf.keras.optimizers.AdamW(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
history_adamw = build_and_train_model(optimizer=optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7736 - loss: 0.6421 - val_accuracy: 0.8398 - val_loss: 0.4102
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8598 - loss: 0.3847 - val_accuracy: 0.8434 - val_loss: 0.4117
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8735 - loss: 0.3425 - val_accuracy: 0.8518 - val_loss: 0.3955
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8833 - loss: 0.3143 - val_accuracy: 0.8492 - val_loss: 0.4035
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8917 - loss: 0.2948 - val_accuracy: 0.8622 - val_loss: 0.3898
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8985 - loss: 0.2774 - val_accuracy: 0.8598 - val_loss: 0.4056
Epoch 7/10
[1m1

In [26]:
# Learning Scheduling
# Power Scheduling

lr_schdeule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.01, decay_steps=10_000, decay_rate=1.0, staircase=False
)
optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schdeule)
history_power_scheduling = build_and_train_model(optimizer=optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 866us/step - accuracy: 0.6864 - loss: 0.9502 - val_accuracy: 0.8288 - val_loss: 0.4913
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 795us/step - accuracy: 0.8270 - loss: 0.4894 - val_accuracy: 0.8390 - val_loss: 0.4487
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 772us/step - accuracy: 0.8453 - loss: 0.4364 - val_accuracy: 0.8448 - val_loss: 0.4285
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 768us/step - accuracy: 0.8563 - loss: 0.4068 - val_accuracy: 0.8502 - val_loss: 0.4135
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 762us/step - accuracy: 0.8631 - loss: 0.3872 - val_accuracy: 0.8566 - val_loss: 0.4021
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 870us/step - accuracy: 0.8677 - loss: 0.3726 - val_accuracy: 0.8608 - val_loss: 0.3917
Epoc

In [27]:
# exponential Scheduling
# learning_rate = initial_learning_rate * decay_rate ** (step/ decay_steps)

lr_schdeule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.01, decay_steps=20_000, decay_rate=0.1, staircase=False
)
optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schdeule)
history_exponential_scheduling = build_and_train_model(optimizer=optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 835us/step - accuracy: 0.6757 - loss: 0.9730 - val_accuracy: 0.8266 - val_loss: 0.4951
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 831us/step - accuracy: 0.8277 - loss: 0.4921 - val_accuracy: 0.8394 - val_loss: 0.4495
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 816us/step - accuracy: 0.8447 - loss: 0.4402 - val_accuracy: 0.8470 - val_loss: 0.4274
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 777us/step - accuracy: 0.8558 - loss: 0.4120 - val_accuracy: 0.8516 - val_loss: 0.4098
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 789us/step - accuracy: 0.8624 - loss: 0.3934 - val_accuracy: 0.8568 - val_loss: 0.3983
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 779us/step - accuracy: 0.8658 - loss: 0.3800 - val_accuracy: 0.8602 - val_loss: 0.3894
Epoc

In [28]:
# custom scheduling callbacks


def exponential_decay(lr0, s):
    def exponential_decay_fn(epoch):
        return lr0 * 0.1 ** (epoch / s)

    return exponential_decay_fn


exponential_decay_fn = exponential_decay(lr0=0.01, s=20)

In [29]:
tf.random.set_seed(42)
model = build_model()
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)
model.compile(
    loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"]
)

n_epochs = 20
lr_schduler = tf.keras.callbacks.LearningRateScheduler(exponential_decay_fn)
history = model.fit(
    X_train,
    y_train,
    epochs=n_epochs,
    validation_data=(X_valid, y_valid),
    callbacks=[lr_schduler],
)

Epoch 1/20
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 834us/step - accuracy: 0.6915 - loss: 0.9541 - val_accuracy: 0.8256 - val_loss: 0.4851 - learning_rate: 0.0100
Epoch 2/20
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 763us/step - accuracy: 0.8302 - loss: 0.4862 - val_accuracy: 0.8426 - val_loss: 0.4423 - learning_rate: 0.0089
Epoch 3/20
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 751us/step - accuracy: 0.8472 - loss: 0.4337 - val_accuracy: 0.8484 - val_loss: 0.4234 - learning_rate: 0.0079
Epoch 4/20
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 769us/step - accuracy: 0.8573 - loss: 0.4037 - val_accuracy: 0.8514 - val_loss: 0.4119 - learning_rate: 0.0071
Epoch 5/20
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 795us/step - accuracy: 0.8638 - loss: 0.3830 - val_accuracy: 0.8574 - val_loss: 0.4011 - learning_rate: 0.0063
Epoch 6/20
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━

In [None]:
k = tf.keras.backend


class ExponentialDecay(tf.keras.callbacks.Callback):
    def __init__(self, n_steps=40000):
        super().__init__()
        self.n_steps = n_steps

    def on_batch_begin(self, batch, logs=None):
        # Note: the `batch` argument is reset at each epoch
        lr = self.model.optimizer.learning_rate.numpy()
        new_leatning_rate = lr * 0.1 ** (1 / self.n_steps)
        self.model.optimizer.learning_rate = new_leatning_rate

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        logs["lr"] = self.model.optimizer.learning_rate.numpy()

In [None]:
lr0 = 0.01
model = build_model()
optimizer = tf.keras.optimizers.SGD(learning_rate=lr0)
model.compile(
    loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"]
)

In [36]:
batch_size = 32
n_steps = n_epochs * math.ceil(len(X_train) / batch_size)
exp_decay = ExponentialDecay(n_steps)
history = model.fit(
    X_train,
    y_train,
    epochs=n_epochs,
    validation_data=(X_valid, y_valid),
    callbacks=[exp_decay],
)

Epoch 1/20
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.6865 - loss: 0.9407 - val_accuracy: 0.8274 - val_loss: 0.4930 - lr: 0.0089
Epoch 2/20
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 960us/step - accuracy: 0.8293 - loss: 0.4873 - val_accuracy: 0.8418 - val_loss: 0.4523 - lr: 0.0079
Epoch 3/20
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 961us/step - accuracy: 0.8475 - loss: 0.4363 - val_accuracy: 0.8472 - val_loss: 0.4330 - lr: 0.0071
Epoch 4/20
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 959us/step - accuracy: 0.8573 - loss: 0.4080 - val_accuracy: 0.8484 - val_loss: 0.4239 - lr: 0.0063
Epoch 5/20
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 972us/step - accuracy: 0.8627 - loss: 0.3887 - val_accuracy: 0.8510 - val_loss: 0.4164 - lr: 0.0056
Epoch 6/20
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 957us/step - accuracy: 0.868

In [None]:
lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=[50_000, 80_000], values=[0.01, 0.005, 0.001]
)
optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schdeule)
history_piecewise_scheduling = build_and_train_model(optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 863us/step - accuracy: 0.6684 - loss: 0.9834 - val_accuracy: 0.8306 - val_loss: 0.4933
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 775us/step - accuracy: 0.8253 - loss: 0.5006 - val_accuracy: 0.8396 - val_loss: 0.4439
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 769us/step - accuracy: 0.8451 - loss: 0.4466 - val_accuracy: 0.8454 - val_loss: 0.4220
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 776us/step - accuracy: 0.8557 - loss: 0.4179 - val_accuracy: 0.8510 - val_loss: 0.4088
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 788us/step - accuracy: 0.8618 - loss: 0.3990 - val_accuracy: 0.8566 - val_loss: 0.3985
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 790us/step - accuracy: 0.8655 - loss: 0.3855 - val_accuracy: 0.8588 - val_loss: 0.3901
Epoc

In [None]:
# performance Scheduling
model = build_model()
optimizer = tf.keras.optimizers.SGD(learning_rate=lr0)
model.compile(
    loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"]
)

In [None]:
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5)
history_performance_scheduler = model.fit(
    X_train,
    y_train,
    epochs=n_epochs,
    validation_data=(X_valid, y_valid),
    callbacks=[lr_schduler],
)

Epoch 1/20
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 835us/step - accuracy: 0.6724 - loss: 0.9592 - val_accuracy: 0.8290 - val_loss: 0.4922 - learning_rate: 0.0100
Epoch 2/20
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 768us/step - accuracy: 0.8270 - loss: 0.4896 - val_accuracy: 0.8386 - val_loss: 0.4477 - learning_rate: 0.0089
Epoch 3/20
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 785us/step - accuracy: 0.8481 - loss: 0.4341 - val_accuracy: 0.8490 - val_loss: 0.4269 - learning_rate: 0.0079
Epoch 4/20
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 766us/step - accuracy: 0.8588 - loss: 0.4035 - val_accuracy: 0.8514 - val_loss: 0.4108 - learning_rate: 0.0071
Epoch 5/20
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 772us/step - accuracy: 0.8663 - loss: 0.3832 - val_accuracy: 0.8566 - val_loss: 0.3978 - learning_rate: 0.0063
Epoch 6/20
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━

In [None]:
class OneCycleScheduler(tf.keras.callbacks.Callback):
    def __init__(
        self, iterations, max_lr=1e-3, start_lr=None, last_iterations=None, last_lr=None
    ):
        self.iterations = iterations
        self.max_lr = max_lr
        self.start_lr = start_lr or max_lr / 10
        self.last_iterations = last_iterations or iterations // 10 + 1
        self.half_iteration = (iterations - self.last_iterations) // 2
        self.last_lr = last_lr or self.start_lr / 1000
        self.iteration = 0

    def _interpolate(self, iter1, iter2, lr1, lr2):
        return (lr2 - lr1) * (self.iteration - iter1) / (iter2 - iter1) + lr1

    def on_batch_begin(self, batch, logs):
        if self.iteration < self.half_iteration:
            lr = self._interpolate(0, self.half_iteration, self.start_lr, self.max_lr)
        elif self.iteration < 2 * self.half_iteration:
            lr = self._interpolate(
                self.half_iteration, 2 * self.half_iteration, self.max_lr, self.start_lr
            )
        else:
            lr = self._interpolate(
                2 * self.half_iteration, self.iterations, self.start_lr, self.last_lr
            )
        self.iteration += 1
        self.model.optimizer.learning_rate = lr

In [None]:
model = build_model()
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=tf.keras.optimizers.SGD(),
    metrics=["accuracy"],
)
n_epochs = 25
onecycle = OneCycleScheduler(
    math.ceil(len(X_train) / batch_size) * n_epochs, max_lr=0.1
)
history = model.fit(
    X_train,
    y_train,
    epochs=n_epochs,
    validation_data=(X_test, y_test),
    callbacks=[onecycle],
)

Epoch 1/25
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 993us/step - accuracy: 0.6824 - loss: 0.9495 - val_accuracy: 0.8184 - val_loss: 0.5158
Epoch 2/25
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 908us/step - accuracy: 0.8321 - loss: 0.4795 - val_accuracy: 0.8206 - val_loss: 0.4938
Epoch 3/25
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 893us/step - accuracy: 0.8491 - loss: 0.4178 - val_accuracy: 0.8284 - val_loss: 0.4723
Epoch 4/25
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 914us/step - accuracy: 0.8612 - loss: 0.3847 - val_accuracy: 0.8455 - val_loss: 0.4252
Epoch 5/25
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 905us/step - accuracy: 0.8675 - loss: 0.3632 - val_accuracy: 0.8465 - val_loss: 0.4218
Epoch 6/25
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 915us/step - accuracy: 0.8733 - loss: 0.3439 - val_accuracy: 0.8459 - val_loss: 0.4206
Epoc

In [None]:
# Avoiding Overfittin through regularization
RegularDenselayer = partial(
    tf.keras.layers.Dense,
    activation="relu",
    kernel_initializer="he_normal",
    kernel_regularizer=tf.keras.regularizers.l2(0.01),
)

model = tf.keras.Sequential(
    [
        tf.keras.layers.Input(shape=(28, 28)),
        tf.keras.layers.Flatten(),
        RegularDenselayer(100),
        RegularDenselayer(100),
        RegularDenselayer(10, activation="softmax"),
    ]
)

In [None]:
optimizer = tf.keras.optimizers.SGD(learning_rate=0.02)
model.compile(
    loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"]
)
history = model.fit(X_train, y_train, epochs=2, validation_data=(X_valid, y_valid))

Epoch 1/2
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 812us/step - accuracy: 0.7088 - loss: 4.0575 - val_accuracy: 0.8220 - val_loss: 1.8543
Epoch 2/2
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 752us/step - accuracy: 0.8134 - loss: 1.6180 - val_accuracy: 0.8256 - val_loss: 1.1174
