In [1]:
import numpy as np
import tensorflow as tf
from tqdm.notebook import trange
from collections import OrderedDict

2023-06-24 20:53:22.933003: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Fetch the Fashion-MNIST train/test arrays that ship with Keras.
(X_train_full, y_train_full), (X_test, y_test) = (
    tf.keras.datasets.fashion_mnist.load_data()
)

# Cast the images to float32 and divide by 255.0
# (presumably mapping 8-bit pixel intensities into [0, 1]).
X_train_full = X_train_full.astype(np.float32) / 255.0
X_test = X_test.astype(np.float32) / 255.0

# Hold out the first 5000 training samples for validation; train on the rest.
X_valid, y_valid = X_train_full[:5000], y_train_full[:5000]
X_train, y_train = X_train_full[5000:], y_train_full[5000:]

# Seed both NumPy and TensorFlow so batch sampling and weight
# initialisation are repeatable.
np.random.seed(69)
tf.random.set_seed(69)

In [3]:
# Small fully connected classifier: flatten the 28x28 input, one hidden
# ReLU layer of 100 units, then a 10-way softmax output.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[28, 28]))
model.add(tf.keras.layers.Dense(100, activation="relu"))
model.add(tf.keras.layers.Dense(10, activation="softmax"))

# Hyperparameters for the hand-written training loop.
n_epochs = 10
batch_size = 32
n_steps = len(X_train) // batch_size  # optimisation steps per epoch

# Optimiser, per-sample loss function and running metrics used by the loop.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
loss_fn = tf.keras.losses.sparse_categorical_crossentropy
mean_loss = tf.keras.metrics.Mean()
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

In [4]:
def random_batch(X, y, batch_size=32):
    """Draw a random mini-batch from ``X`` and ``y``.

    Row indices are drawn uniformly from ``[0, len(X))`` with NumPy's global
    random state, so the same index may appear more than once in a batch.

    Args:
        X: Indexable array of inputs; must support integer-array indexing.
        y: Indexable array of targets, indexed with the same positions as ``X``.
        batch_size: Number of samples to draw.

    Returns:
        A ``(X_batch, y_batch)`` tuple holding the selected rows.
    """
    picks = np.random.randint(0, len(X), size=batch_size)
    return X[picks], y[picks]

In [5]:
def print_status_bar(step, total, loss, metrics=None):
    """Print a one-line, carriage-return-refreshed progress summary.

    Args:
        step: Current step number.
        total: Total number of steps; once ``step`` reaches it the line is
            terminated with a newline instead of being left open.
        loss: Object exposing ``name`` and ``result()``; shown first.
        metrics: Optional list of further objects with the same interface.
    """
    tracked = [loss] + (metrics or [])
    parts = []
    for m in tracked:
        parts.append(f"{m.name}: {m.result():.4f}")
    summary = " - ".join(parts)
    # Keep rewriting the same line until the final step is reached.
    if step < total:
        terminator = ""
    else:
        terminator = "\n"
    print(f"\r{step}/{total} - " + summary, end=terminator)

In [6]:
# Hand-written training loop for the single-optimizer model: an outer progress
# bar tracks epochs and an inner bar tracks the optimisation steps of each one.
with trange(1, n_epochs + 1, desc="All epochs") as epochs:
    for epoch in epochs:
        with trange(1, n_steps + 1, desc=f"Epoch {epoch}/{n_epochs}") as steps:
            # Created before the step loop so the validation code below still
            # has a dict to fill in if n_steps happens to be 0.
            status = OrderedDict()
            for step in steps:
                # Use the configured batch_size rather than the helper's default.
                X_batch, y_batch = random_batch(X_train, y_train, batch_size)
                with tf.GradientTape() as tape:
                    # training=True keeps the loop correct if layers that behave
                    # differently during training (dropout, batch norm) are added.
                    y_pred = model(X_batch, training=True)
                    main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
                    # Add any extra loss terms the model registered on itself.
                    loss = tf.add_n([main_loss] + model.losses)
                gradients = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(gradients, model.trainable_variables))
                # Re-apply variable constraints by hand after the update.
                for variable in model.variables:
                    if variable.constraint is not None:
                        variable.assign(variable.constraint(variable))
                # Update the running loss/metrics and show them on the step bar.
                mean_loss(loss)
                status["loss"] = mean_loss.result().numpy()
                for metric in metrics:
                    metric(y_batch, y_pred)
                    status[metric.name] = metric.result().numpy()
                steps.set_postfix(status)
            # End of epoch: evaluate on the whole validation set in one pass.
            y_pred = model(X_valid, training=False)
            status["val_loss"] = np.mean(loss_fn(y_valid, y_pred))
            status["val_accuracy"] = np.mean(
                tf.keras.metrics.sparse_categorical_accuracy(
                    tf.constant(y_valid, dtype=np.float32), y_pred
                )
            )
            steps.set_postfix(status)
        # Start the next epoch with fresh running averages. reset_state() is
        # the current Keras name; reset_states() is only a legacy alias.
        for metric in [mean_loss] + metrics:
            metric.reset_state()

All epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/10:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 2/10:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 3/10:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 4/10:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 5/10:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 6/10:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 7/10:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 8/10:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 9/10:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 10/10:   0%|          | 0/1718 [00:00<?, ?it/s]

In [14]:
# What happens if we use a different optimizer setup, with one learning rate
# for the lower layers and another for the upper layers?
# Reset Keras' global state first, then re-seed both RNGs so this experiment
# starts from the same seeds as the previous one.
tf.keras.backend.clear_session()
tf.random.set_seed(69)
np.random.seed(69)

In [15]:
# Split the network into two sub-models so that each part can be updated by
# its own optimizer, then chain them into the full model.
lower_layers = tf.keras.models.Sequential()
lower_layers.add(tf.keras.layers.Flatten(input_shape=[28, 28]))
lower_layers.add(tf.keras.layers.Dense(100, activation="relu"))

upper_layers = tf.keras.models.Sequential()
upper_layers.add(tf.keras.layers.Dense(10, activation="softmax"))

model = tf.keras.models.Sequential()
model.add(lower_layers)
model.add(upper_layers)

# Learning-rate notes from earlier tries:
#   - 1e-4 (lower) and 1e-2 (upper) resulted in .797
#   - 1e-4 (lower) and 1e-3 (upper) didn't work well at all; it was stuck
#     around 70% accuracy
lower_optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3)
upper_optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2)

# Same loop hyperparameters as before, with fresh running metrics for this run.
n_epochs = 10
batch_size = 32
n_steps = len(X_train) // batch_size  # optimisation steps per epoch
loss_fn = tf.keras.losses.sparse_categorical_crossentropy
mean_loss = tf.keras.metrics.Mean()
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

In [16]:
# Hand-written training loop for the two-optimizer model: the lower and upper
# sub-models are updated from the same loss, each with its own optimizer.
with trange(1, n_epochs + 1, desc="All epochs") as epochs:
    for epoch in epochs:
        with trange(1, n_steps + 1, desc=f"Epoch {epoch}/{n_epochs}") as steps:
            # Created before the step loop so the validation code below still
            # has a dict to fill in if n_steps happens to be 0.
            status = OrderedDict()
            for step in steps:
                # Use the configured batch_size rather than the helper's default.
                X_batch, y_batch = random_batch(X_train, y_train, batch_size)
                # persistent=True because tape.gradient() is called twice below,
                # once per sub-model.
                with tf.GradientTape(persistent=True) as tape:
                    # training=True keeps the loop correct if layers that behave
                    # differently during training (dropout, batch norm) are added.
                    y_pred = model(X_batch, training=True)
                    main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
                    # Add any extra loss terms the model registered on itself.
                    loss = tf.add_n([main_loss] + model.losses)
                # Loop names are deliberately not `optimizer`, so this cell does
                # not overwrite the global optimizer defined for the earlier model.
                for sub_model, sub_optimizer in (
                    (lower_layers, lower_optimizer),
                    (upper_layers, upper_optimizer),
                ):
                    gradients = tape.gradient(loss, sub_model.trainable_variables)
                    sub_optimizer.apply_gradients(
                        zip(gradients, sub_model.trainable_variables)
                    )
                # Release the persistent tape's resources as soon as possible.
                del tape
                # Re-apply variable constraints by hand after the update.
                for variable in model.variables:
                    if variable.constraint is not None:
                        variable.assign(variable.constraint(variable))
                # Update the running loss/metrics and show them on the step bar.
                mean_loss(loss)
                status["loss"] = mean_loss.result().numpy()
                for metric in metrics:
                    metric(y_batch, y_pred)
                    status[metric.name] = metric.result().numpy()
                steps.set_postfix(status)
            # End of epoch: evaluate on the whole validation set in one pass.
            y_pred = model(X_valid, training=False)
            status["val_loss"] = np.mean(loss_fn(y_valid, y_pred))
            status["val_accuracy"] = np.mean(
                tf.keras.metrics.sparse_categorical_accuracy(
                    tf.constant(y_valid, dtype=np.float32), y_pred
                )
            )
            steps.set_postfix(status)
        # Start the next epoch with fresh running averages. reset_state() is
        # the current Keras name; reset_states() is only a legacy alias.
        for metric in [mean_loss] + metrics:
            metric.reset_state()

All epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/10:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 2/10:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 3/10:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 4/10:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 5/10:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 6/10:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 7/10:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 8/10:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 9/10:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 10/10:   0%|          | 0/1718 [00:00<?, ?it/s]