In [57]:
import tensorflow as tf
import numpy as np
from tensorflow import keras

In [58]:
[name for name in dir(keras.initializers) if not name.startswith('_')]

['Constant',
 'GlorotNormal',
 'GlorotUniform',
 'HeNormal',
 'HeUniform',
 'Identity',
 'IdentityInitializer',
 'Initializer',
 'LecunNormal',
 'LecunUniform',
 'Ones',
 'Orthogonal',
 'OrthogonalInitializer',
 'RandomNormal',
 'RandomUniform',
 'TruncatedNormal',
 'VarianceScaling',
 'Zeros',
 'constant',
 'deserialize',
 'get',
 'glorot_normal',
 'glorot_uniform',
 'he_normal',
 'he_uniform',
 'identity',
 'lecun_normal',
 'lecun_uniform',
 'ones',
 'orthogonal',
 'random_normal',
 'random_uniform',
 'serialize',
 'truncated_normal',
 'variance_scaling',
 'zeros']

# Nonsaturating Activation Functions

In [60]:
dense = tf.keras.layers.Dense(50,activation = 'relu',
                             kernel_initializer= 'he_normal')

In [61]:
leaky_relu = tf.keras.layers.LeakyReLU(alpha = 0.2)
dense = tf.keras.layers.Dense(50,activation = leaky_relu,
                             kernel_initializer = 'he_normal')

In [62]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(50,kernel_initializer='he_normal'),
    tf.keras.layers.LeakyReLU(alpha = 0.2)
])

In [63]:
dense = tf.keras.layers.Dense(50,activation = 'elu',
                             kernel_initializer = 'he_normal')

In [64]:
dense = tf.keras.layers.Dense(50,activation = 'selu',
                              kernel_initializer = 'lecun_normal')

In [65]:
tf.random.set_seed(42)
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape = [28,28]))
for layer in range(100):
    model.add(tf.keras.layers.Dense(50,activation = 'selu',
                                   kernel_initializer= 'lecun_normal'))

model.add(tf.keras.layers.Dense(10,activation = 'softmax'))

In [66]:
model.compile(loss = 'sparse_categorical_crossentropy',
             optimizer = tf.keras.optimizers.SGD(learning_rate = 1e-3),
             metrics=['accuracy'])

In [67]:
fashion_mnist = tf.keras.datasets.fashion_mnist.load_data()
(X_train_full, y_train_full), (X_test, y_test) = fashion_mnist
X_train, y_train = X_train_full[:-5000], y_train_full[:-5000]
X_valid, y_valid = X_train_full[-5000:], y_train_full[-5000:]
X_train, X_valid, X_test = X_train / 255, X_valid / 255, X_test / 255

In [68]:
class_names = ["T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
               "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"]

In [69]:
pixel_means = X_train.mean(axis = 0, keepdims = True)
pixel_stds = X_train.std(axis = 0, keepdims = True)
X_train_scaled = (X_train-pixel_means)/pixel_stds
X_test_scaled = (X_test-pixel_means)/pixel_stds
X_valid_scaled = (X_valid - pixel_means)/pixel_stds

In [70]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape = [28,28]))
for layer in range(100):
    model.add(tf.keras.layers.Dense(50,activation = 'relu',
                                   kernel_initializer='he_normal'))

model.add(tf.keras.layers.Dense(10,activation = 'softmax'))

In [71]:
model.compile(loss = 'sparse_categorical_crossentropy',
             optimizer = tf.keras.optimizers.SGD(learning_rate = 1e-3),
             metrics = ['accuracy'])

In [72]:
history = model.fit(X_train_scaled,y_train,epochs = 10,
                   validation_data = (X_valid_scaled,y_valid))

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 12ms/step - accuracy: 0.1423 - loss: 2.2388 - val_accuracy: 0.2236 - val_loss: 2.0057
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 11ms/step - accuracy: 0.2612 - loss: 1.8971 - val_accuracy: 0.3950 - val_loss: 1.2998
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 12ms/step - accuracy: 0.4524 - loss: 1.2443 - val_accuracy: 0.5894 - val_loss: 1.0151
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 11ms/step - accuracy: 0.5515 - loss: 1.0399 - val_accuracy: 0.4650 - val_loss: 1.2862
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - accuracy: 0.5338 - loss: 1.1041 - val_accuracy: 0.6288 - val_loss: 0.9527
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 12ms/step - accuracy: 0.6187 - loss: 0.9558 - val_accuracy: 0.5676 - val_loss: 1.0203
Epoc

# Batch Normalization

In [74]:
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape = [28,28]),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(300,activation = 'relu',
                         kernel_initializer='he_normal'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(100,activation = 'relu',
                         kernel_initializer='he_normal'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(10,activation = 'softmax')
])

In [75]:
model.compile(loss = 'sparse_categorical_crossentropy',
             optimizer = tf.keras.optimizers.SGD(learning_rate = 1e-3),
             metrics = ['accuracy'])

In [76]:
history = model.fit(X_train_scaled,y_train,epochs = 2,
                   validation_data = (X_valid_scaled,y_valid))

Epoch 1/2
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.5588 - loss: 1.3383 - val_accuracy: 0.7808 - val_loss: 0.6327
Epoch 2/2
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.7760 - loss: 0.6483 - val_accuracy: 0.8116 - val_loss: 0.5337


In [77]:
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape = [28,28]),
    tf.keras.layers.Dense(300,kernel_initializer = 'he_normal'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.Dense(100,kernel_initializer = 'he_normal'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.Dense(10),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation('softmax')
])

In [78]:
model.compile(loss = 'sparse_categorical_crossentropy',
             optimizer = tf.keras.optimizers.SGD(learning_rate = 1e-3),
             metrics = ['accuracy'])

In [79]:
history = model.fit(X_train_scaled,y_train,epochs = 2,
                   validation_data = (X_valid_scaled,y_valid))

Epoch 1/2
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.5291 - loss: 1.4864 - val_accuracy: 0.7610 - val_loss: 0.9080
Epoch 2/2
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.7405 - loss: 0.9680 - val_accuracy: 0.7940 - val_loss: 0.7630


# Gardient Clipping

In [81]:
optimizer = tf.keras.optimizers.SGD(clipvalue = 1)
model.compile(loss = 'sparse_cateorical_crossentropy',optimizer = optimizer)

In [82]:
optimizer = tf.keras.optimizers.SGD(clipnorm = 1)
model.compile(loss = 'sparse_cateorical_crossentropy',optimizer = optimizer)

# Transfer Learning

In [84]:
pos_class_id = class_names.index("Pullover")
neg_class_id = class_names.index("T-shirt/top")

def split_dataset(X, y):
    y_for_B = (y == pos_class_id) | (y == neg_class_id)
    y_A = y[~y_for_B]
    y_B = (y[y_for_B] == pos_class_id).astype(np.float32)
    old_class_ids = list(set(range(10)) - set([neg_class_id, pos_class_id]))
    for old_class_id, new_class_id in zip(old_class_ids, range(8)):
        y_A[y_A == old_class_id] = new_class_id  # reorder class ids for A
    return ((X[~y_for_B], y_A), (X[y_for_B], y_B))

(X_train_A, y_train_A), (X_train_B, y_train_B) = split_dataset(X_train, y_train)
(X_valid_A, y_valid_A), (X_valid_B, y_valid_B) = split_dataset(X_valid, y_valid)
(X_test_A, y_test_A), (X_test_B, y_test_B) = split_dataset(X_test, y_test)
X_train_B = X_train_B[:200]
y_train_B = y_train_B[:200]

model_A = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape = [28,28]),
    tf.keras.layers.Dense(100,activation = 'relu',
                         kernel_initializer = 'he_normal'),
    tf.keras.layers.Dense(100,activation = 'relu',
                         kernel_initializer = 'he_normal'),
    tf.keras.layers.Dense(100,activation = 'relu',
                         kernel_initializer = 'he_normal'),
    tf.keras.layers.Dense(8,activation = 'softmax')
])


In [85]:
model_A.compile(loss = 'sparse_categorical_crossentropy',
             optimizer = tf.keras.optimizers.SGD(learning_rate = 1e-3),
             metrics = ['accuracy'])

In [86]:
history = model_A.fit(X_train_A,y_train_A,epochs = 10,
                     validation_data = (X_valid_A,y_valid_A))

Epoch 1/10
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.4308 - loss: 1.6578 - val_accuracy: 0.7626 - val_loss: 0.7252
Epoch 2/10
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7828 - loss: 0.6657 - val_accuracy: 0.8260 - val_loss: 0.5267
Epoch 3/10
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8369 - loss: 0.5099 - val_accuracy: 0.8556 - val_loss: 0.4463
Epoch 4/10
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8588 - loss: 0.4374 - val_accuracy: 0.8666 - val_loss: 0.4001
Epoch 5/10
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8718 - loss: 0.3939 - val_accuracy: 0.8739 - val_loss: 0.3706
Epoch 6/10
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8789 - loss: 0.3647 - val_accuracy: 0.8777 - val_loss: 0.3501
Epoch 7/10
[1m1

In [87]:
model_A.save('my_model_A.keras')

In [88]:
model_B = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape = [28,28]),
    tf.keras.layers.Dense(100,activation = 'relu',kernel_initializer = 'he_normal'),
    tf.keras.layers.Dense(100,activation = 'relu',kernel_initializer  = 'he_normal'),
    tf.keras.layers.Dense(100,activation = 'relu',kernel_initializer = 'he_normal'),
    tf.keras.layers.Dense(1,activation = 'sigmoid')
])

In [89]:
model_B.compile(loss = 'binary_crossentropy',
             optimizer = tf.keras.optimizers.SGD(learning_rate = 1e-3),
             metrics = ['accuracy'])

In [90]:
history = model_B.fit(X_train_B,y_train_B,epochs = 20,
                     validation_data = (X_valid_B,y_valid_B))

Epoch 1/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 78ms/step - accuracy: 0.4430 - loss: 0.7409 - val_accuracy: 0.5173 - val_loss: 0.6881
Epoch 2/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.5002 - loss: 0.6915 - val_accuracy: 0.5915 - val_loss: 0.6541
Epoch 3/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.6072 - loss: 0.6517 - val_accuracy: 0.6894 - val_loss: 0.6265
Epoch 4/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.7151 - loss: 0.6197 - val_accuracy: 0.7844 - val_loss: 0.6033
Epoch 5/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.8328 - loss: 0.5933 - val_accuracy: 0.8229 - val_loss: 0.5832
Epoch 6/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.8463 - loss: 0.5706 - val_accuracy: 0.8536 - val_loss: 0.5653
Epoch 7/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━

In [91]:
model_A = tf.keras.models.load_model('my_model_A.keras')
model_B_on_A = tf.keras.Sequential(model_A.layers[:-1])
model_B_on_A.add(tf.keras.layers.Dense(1,activation = 'sigmoid'))

In [92]:
model_A_clone = tf.keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())

In [93]:
model_B_on_A = tf.keras.Sequential(model_A_clone.layers[:-1])
model_B_on_A.add(tf.keras.layers.Dense(1,activation = 'sigmoid'))

In [94]:
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = False

model_B_on_A.compile(loss = 'binary_crossentropy',
                    optimizer = tf.keras.optimizers.SGD(learning_rate = 0.001),
                    metrics = ['accuracy'])

In [95]:
history = model_B_on_A.fit(X_train_B,y_train_B,epochs = 4,
                          validation_data = (X_valid_B,y_valid_B))

Epoch 1/4
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 78ms/step - accuracy: 0.5891 - loss: 0.7012 - val_accuracy: 0.7043 - val_loss: 0.5339
Epoch 2/4
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.7430 - loss: 0.4916 - val_accuracy: 0.8467 - val_loss: 0.4723
Epoch 3/4
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.8806 - loss: 0.4518 - val_accuracy: 0.8754 - val_loss: 0.4609
Epoch 4/4
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.8531 - loss: 0.4450 - val_accuracy: 0.8823 - val_loss: 0.4568


In [96]:
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = True

model_B_on_A.compile(loss = 'binary_crossentropy',
                    optimizer = tf.keras.optimizers.SGD(learning_rate = 1e-3),
                    metrics = ['accuracy'])

In [97]:
history= model_B_on_A.fit(X_train_B,y_train_B,epochs = 16,
                          validation_data = (X_valid_B,y_valid_B))

Epoch 1/16
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 78ms/step - accuracy: 0.8713 - loss: 0.4360 - val_accuracy: 0.8872 - val_loss: 0.4293
Epoch 2/16
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.8713 - loss: 0.4073 - val_accuracy: 0.9041 - val_loss: 0.4026
Epoch 3/16
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.9016 - loss: 0.3795 - val_accuracy: 0.9110 - val_loss: 0.3800
Epoch 4/16
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.9282 - loss: 0.3561 - val_accuracy: 0.9149 - val_loss: 0.3608
Epoch 5/16
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9535 - loss: 0.3363 - val_accuracy: 0.9209 - val_loss: 0.3446
Epoch 6/16
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.9535 - loss: 0.3197 - val_accuracy: 0.9248 - val_loss: 0.3308
Epoch 7/16
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━

In [98]:
model_B_on_A.evaluate(X_test_B,y_test_B)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9361 - loss: 0.2552


[0.2629643380641937, 0.9330000281333923]

# Faster Optimizers

In [99]:
def build_model(seed = 42):
    tf.random.set_seed(seed)
    return tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape = [28,28]),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(100,activation = 'relu',
                             kernel_initializer='he_normal'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(100,activation = 'relu',
                             kernel_initializer = 'he_normal'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(100, activation = 'relu',
                             kernel_initializer = 'he_normal'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(10,activation = 'softmax')
    ])

In [100]:
def build_and_train_model(optimizer):
    model = build_model()
    model.compile(loss = 'sparse_categorical_crossentropy',
                 optimizer = optimizer,
                 metrics = ['accuracy'])
    return model.fit(X_train,y_train,epochs = 10,
                    validation_data=(X_valid,y_valid))

In [101]:
optimizer = tf.keras.optimizers.SGD(learning_rate = 1e-3, momentum = 0.9)

In [102]:
build_and_train_model(optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.7181 - loss: 0.8365 - val_accuracy: 0.8442 - val_loss: 0.4332
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8425 - loss: 0.4526 - val_accuracy: 0.8574 - val_loss: 0.3930
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8641 - loss: 0.3944 - val_accuracy: 0.8596 - val_loss: 0.3777
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8758 - loss: 0.3570 - val_accuracy: 0.8644 - val_loss: 0.3691
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8863 - loss: 0.3278 - val_accuracy: 0.8676 - val_loss: 0.3676
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.8952 - loss: 0.3030 - val_accuracy: 0.8662 - val_loss: 0.3673
Epoch 7/10
[1m

<keras.src.callbacks.history.History at 0x142b2010bd0>

In [105]:
optimizer = tf.keras.optimizers.SGD(learning_rate = 1e-3,momentum=0.9,nesterov = True)

In [107]:
build_and_train_model(optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 5ms/step - accuracy: 0.7203 - loss: 0.8249 - val_accuracy: 0.8446 - val_loss: 0.4218
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.8431 - loss: 0.4466 - val_accuracy: 0.8560 - val_loss: 0.3854
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8630 - loss: 0.3867 - val_accuracy: 0.8596 - val_loss: 0.3708
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.8758 - loss: 0.3477 - val_accuracy: 0.8680 - val_loss: 0.3645
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.8864 - loss: 0.3176 - val_accuracy: 0.8672 - val_loss: 0.3611
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.8956 - loss: 0.2925 - val_accuracy: 0.8678 - val_loss: 0.3612
Epoch 7/10
[1m

<keras.src.callbacks.history.History at 0x142e8995a10>

## AdaGrad Optimizer

In [116]:
optimizer = tf.keras.optimizers.Adagrad(learning_rate = 0.001)
histoty = build_and_train_model(optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 5ms/step - accuracy: 0.6412 - loss: 1.0685 - val_accuracy: 0.7928 - val_loss: 0.5826
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.7827 - loss: 0.6284 - val_accuracy: 0.8154 - val_loss: 0.5181
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.8007 - loss: 0.5662 - val_accuracy: 0.8238 - val_loss: 0.4877
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8125 - loss: 0.5325 - val_accuracy: 0.8308 - val_loss: 0.4690
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8205 - loss: 0.5097 - val_accuracy: 0.8378 - val_loss: 0.4562
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8265 - loss: 0.4926 - val_accuracy: 0.8410 - val_loss: 0.4460
Epoch 7/10
[1m

## RMSProp

In [119]:
optimizer = tf.keras.optimizers.RMSprop(learning_rate = 0.001,rho =0.9)
histoty = build_and_train_model(optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 5ms/step - accuracy: 0.7839 - loss: 0.6225 - val_accuracy: 0.8618 - val_loss: 0.3737
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.8594 - loss: 0.3861 - val_accuracy: 0.8758 - val_loss: 0.3435
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.8812 - loss: 0.3313 - val_accuracy: 0.8772 - val_loss: 0.3489
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.8939 - loss: 0.2931 - val_accuracy: 0.8790 - val_loss: 0.3512
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9040 - loss: 0.2644 - val_accuracy: 0.8838 - val_loss: 0.3536
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9128 - loss: 0.2373 - val_accuracy: 0.8798 - val_loss: 0.3725
Epoch 7/10
[1m

## Adam Optimizer

In [121]:
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001,beta_1 = 0.9,beta_2 = 0.999)
histoty = build_and_train_model(optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 7ms/step - accuracy: 0.7766 - loss: 0.6370 - val_accuracy: 0.8512 - val_loss: 0.3879
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.8587 - loss: 0.3891 - val_accuracy: 0.8674 - val_loss: 0.3506
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.8771 - loss: 0.3339 - val_accuracy: 0.8712 - val_loss: 0.3481
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8916 - loss: 0.2949 - val_accuracy: 0.8708 - val_loss: 0.3572
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9022 - loss: 0.2654 - val_accuracy: 0.8694 - val_loss: 0.3680
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9111 - loss: 0.2376 - val_accuracy: 0.8664 - val_loss: 0.3926
Epoch 7/10
[1

## AdaMax Optimier

In [124]:
optimizer = tf.keras.optimizers.Adamax(learning_rate = 0.001,beta_1 = 0.9,beta_2 = 0.999)
histoty = build_and_train_model(optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 6ms/step - accuracy: 0.7496 - loss: 0.7351 - val_accuracy: 0.8552 - val_loss: 0.3890
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.8538 - loss: 0.4103 - val_accuracy: 0.8718 - val_loss: 0.3534
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.8776 - loss: 0.3458 - val_accuracy: 0.8796 - val_loss: 0.3399
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.8922 - loss: 0.3041 - val_accuracy: 0.8772 - val_loss: 0.3360
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.9024 - loss: 0.2721 - val_accuracy: 0.8808 - val_loss: 0.3410
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9131 - loss: 0.2431 - val_accuracy: 0.8788 - val_loss: 0.3465
Epoch 7/10
[

## Nadam

In [131]:
optimizer = tf.keras.optimizers.Nadam(learning_rate = 0.001,beta_1 = 0.9,beta_2 = 0.999)
histoty = build_and_train_model(optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 6ms/step - accuracy: 0.7779 - loss: 0.6422 - val_accuracy: 0.8594 - val_loss: 0.3599
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.8587 - loss: 0.3842 - val_accuracy: 0.8686 - val_loss: 0.3416
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.8767 - loss: 0.3298 - val_accuracy: 0.8724 - val_loss: 0.3361
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.8934 - loss: 0.2900 - val_accuracy: 0.8726 - val_loss: 0.3446
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9036 - loss: 0.2585 - val_accuracy: 0.8744 - val_loss: 0.3601
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.9123 - loss: 0.2334 - val_accuracy: 0.8734 - val_loss: 0.3760
Epoch 7/10


## AdamW Optimizer

In [135]:
optimizer = tf.keras.optimizers.Adam(weight_decay = 1e-5,learning_rate = 0.001,beta_1 = 0.9,beta_2 = 0.999)
histoty = build_and_train_model(optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 6ms/step - accuracy: 0.7726 - loss: 0.6525 - val_accuracy: 0.8630 - val_loss: 0.3699
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.8588 - loss: 0.3893 - val_accuracy: 0.8660 - val_loss: 0.3555
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.8773 - loss: 0.3336 - val_accuracy: 0.8720 - val_loss: 0.3476
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.8913 - loss: 0.2936 - val_accuracy: 0.8716 - val_loss: 0.3570
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9012 - loss: 0.2637 - val_accuracy: 0.8728 - val_loss: 0.3671
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.9104 - loss: 0.2399 - val_accuracy: 0.8670 - val_loss: 0.4045
Epoch 7/10


# Learning Rate Scheduling

## Power scheduling

In [155]:
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate = 0.01,
    decay_steps = 10_000,
    decay_rate = 1,
    staircase = False
)
optimizer = tf.keras.optimizers.SGD(learning_rate = lr_schedule)
histoty = build_and_train_model(optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.7310 - loss: 0.7974 - val_accuracy: 0.8378 - val_loss: 0.4270
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.8428 - loss: 0.4463 - val_accuracy: 0.8538 - val_loss: 0.3936
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.8618 - loss: 0.3902 - val_accuracy: 0.8606 - val_loss: 0.3774
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.8720 - loss: 0.3568 - val_accuracy: 0.8638 - val_loss: 0.3691
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.8804 - loss: 0.3327 - val_accuracy: 0.8648 - val_loss: 0.3640
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.8880 - loss: 0.3132 - val_accuracy: 0.8684 - val_loss: 0.3599
Epoch 7/10
[1m1

## ExponentialDecay

In [151]:
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate = 0.01,
    decay_steps = 20_000,
    decay_rate = 1,
    staircase = False
)
optimizer = tf.keras.optimizers.SGD(learning_rate = lr_schedule)
histoty = build_and_train_model(optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.5158 - loss: 1.4842 - val_accuracy: 0.7802 - val_loss: 0.6650
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.7606 - loss: 0.7088 - val_accuracy: 0.8122 - val_loss: 0.5534
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.7892 - loss: 0.6078 - val_accuracy: 0.8238 - val_loss: 0.5049
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.8060 - loss: 0.5558 - val_accuracy: 0.8312 - val_loss: 0.4755
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.8181 - loss: 0.5213 - val_accuracy: 0.8386 - val_loss: 0.4558
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8271 - loss: 0.4961 - val_accuracy: 0.8442 - val_loss: 0.4416
Epoch 7/10
[1m1

## Piecewise constant scheduling

In [158]:
lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries = [50_00,80_000],
    values = [0.01,0.005,0.001]
)
optimizer = tf.keras.optimizers.SGD(learning_rate = lr_schedule)
history_piecewise = build_and_train_model(optimizer)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.7369 - loss: 0.7819 - val_accuracy: 0.8474 - val_loss: 0.4185
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8427 - loss: 0.4442 - val_accuracy: 0.8606 - val_loss: 0.3808
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.8620 - loss: 0.3839 - val_accuracy: 0.8704 - val_loss: 0.3579
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.8800 - loss: 0.3390 - val_accuracy: 0.8732 - val_loss: 0.3534
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8872 - loss: 0.3204 - val_accuracy: 0.8746 - val_loss: 0.3507
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8925 - loss: 0.3046 - val_accuracy: 0.8744 - val_loss: 0.3502
Epoch 7/10
[1m

# Avoiding Overfiting

## ℓ1 and ℓ2 Regularization

In [165]:
layers = tf.keras.layers.Dense(100,activation = 'relu',
                              kernel_initializer = 'he_normal',
                              kernel_regularizer = tf.keras.regularizers.l2(0.01))

In [171]:
tf.random.set_seed(42)

In [185]:
from functools import partial

RegularizeDense = partial(tf.keras.layers.Dense,
                         activation = 'relu',
                         kernel_initializer = 'he_normal',
                         kernel_regularizer = tf.keras.regularizers.l2(0.01))

model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape = [28,28]),
    RegularizeDense(100),
    RegularizeDense(100),
    RegularizeDense(10,activation = 'softmax')
])

model.compile(loss = 'sparse_categorical_crossentropy',
             optimizer = tf.keras.optimizers.SGD(learning_rate = 0.02),
             metrics = ['accuracy'])

histrory = model.fit(X_train,y_train,epochs =2,
                    validation_data = (X_valid,y_valid))

  super().__init__(**kwargs)


Epoch 1/2
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.7198 - loss: 4.1198 - val_accuracy: 0.8262 - val_loss: 1.8791
Epoch 2/2
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.8146 - loss: 1.6363 - val_accuracy: 0.8246 - val_loss: 1.1240


## Dropout

In [193]:
mode = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape = [28,28]),
    tf.keras.layers.Dropout(rate = 0.2),
    tf.keras.layers.Dense(100,activation = 'relu',
                         kernel_initializer = 'he_normal'),
    tf.keras.layers.Dropout(rate = 0.2),
    tf.keras.layers.Dense(100,activation = 'relu',
                         kernel_initializer = 'he_normal'),
    tf.keras.layers.Dropout(rate = 0.2),
    tf.keras.layers.Dense(100,activation = 'relu',
                         kernel_initializer = 'he_normal'),
    tf.keras.layers.Dropout(rate = 0.2),
    tf.keras.layers.Dense(10,activation = 'softmax')
])

model.compile(loss = 'sparse_categorical_crossentropy',
             optimizer = tf.keras.optimizers.SGD(learning_rate = 0.02),
             metrics = ['accuracy'])

histrory = model.fit(X_train,y_train,epochs =10,
                    validation_data = (X_valid,y_valid))

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.8193 - loss: 0.8928 - val_accuracy: 0.8238 - val_loss: 0.8664
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.8200 - loss: 0.8793 - val_accuracy: 0.8238 - val_loss: 0.8601
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.8199 - loss: 0.8732 - val_accuracy: 0.8250 - val_loss: 0.8566
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.8195 - loss: 0.8694 - val_accuracy: 0.8242 - val_loss: 0.8543
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8198 - loss: 0.8665 - val_accuracy: 0.8248 - val_loss: 0.8523
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.8197 - loss: 0.8641 - val_accuracy: 0.8234 - val_loss: 0.8507
Epoch 7/10
[1m

In [194]:
model.evaluate(X_train,y_train)

[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8273 - loss: 0.8465


[0.8489376306533813, 0.8253818154335022]

## MC Dropout

In [208]:
y_probas = np.stack([model(X_test,training = True) for sample in range(100)])   # predict edir 100 defe, np.stack sa bir yere yigir
y_probas.mean(axis = 0)

array([[1.5976126e-03, 1.6145157e-03, 1.1288245e-03, ..., 3.3341634e-01,
        3.0063409e-02, 2.9054248e-01],
       [3.7417712e-03, 6.3227868e-04, 8.4106451e-01, ..., 1.3219779e-06,
        1.5907229e-03, 4.9929917e-05],
       [3.3810723e-04, 9.9560487e-01, 7.3681021e-04, ..., 1.9620005e-04,
        1.8683920e-06, 2.4765272e-05],
       ...,
       [7.4169666e-02, 9.0527046e-04, 7.2520222e-03, ..., 8.5444441e-03,
        6.4090830e-01, 3.5498524e-03],
       [1.4842604e-03, 9.3996185e-01, 3.3831787e-03, ..., 4.5983726e-03,
        4.0082941e-05, 2.7504188e-04],
       [2.8247437e-03, 4.0887664e-03, 5.7104486e-03, ..., 2.1579172e-01,
        4.5343563e-02, 2.5630387e-02]], dtype=float32)

In [212]:
model.predict(X_test[:1])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step


array([[0.00159761, 0.00161452, 0.00112883, 0.00124662, 0.00090978,
        0.33834454, 0.00113604, 0.33341643, 0.03006344, 0.29054227]],
      dtype=float32)