#### MNIST Deep Neural Network

In [1]:
import numpy as np
import tensorflow as tf

import tensorflow_datasets as tfds

In [2]:
### Load the dataset ###

#  tfds.list_builders() # List all registered datasets

# Load datasets from tf datasets. Also saves data locally to C:\Users\*\tensorflow_datasets the first time dataset is loaded
# as_supervised=True loads in a tuple with structure: [input, target]
# with_info=True will also return another tuple containing info on version, features, # samples of the dataset, etc.
mnist_dataset, mnist_info = tfds.load(name='mnist', with_info=True, as_supervised=True)

In [3]:
mnist_dataset

{'train': <_PrefetchDataset element_spec=(TensorSpec(shape=(28, 28, 1), dtype=tf.uint8, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>,
 'test': <_PrefetchDataset element_spec=(TensorSpec(shape=(28, 28, 1), dtype=tf.uint8, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>}

In [4]:
mnist_info

tfds.core.DatasetInfo(
    name='mnist',
    full_name='mnist/3.0.1',
    description="""
    The MNIST database of handwritten digits.
    """,
    homepage='http://yann.lecun.com/exdb/mnist/',
    data_dir='C:\\Users\\alowe\\tensorflow_datasets\\mnist\\3.0.1',
    file_format=tfrecord,
    download_size=11.06 MiB,
    dataset_size=21.00 MiB,
    features=FeaturesDict({
        'image': Image(shape=(28, 28, 1), dtype=uint8),
        'label': ClassLabel(shape=(), dtype=int64, num_classes=10),
    }),
    supervised_keys=('image', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=10000, num_shards=1>,
        'train': <SplitInfo num_examples=60000, num_shards=1>,
    },
    citation="""@article{lecun2010mnist,
      title={MNIST handwritten digit database},
      author={LeCun, Yann and Cortes, Corinna and Burges, CJ},
      journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},
      volume={2},
      year={2010}
    }""",
)

In [5]:
### Pre-process the data ###

# Extract traing and test data
mnist_train, mnist_test = mnist_dataset['train'], mnist_dataset['test'] # 60k and 10k examples, respectively

# Establish an amount to split for validation dataset, and cast it as a tensor of that size
num_val_samples = 0.1 * mnist_info.splits['train'].num_examples # 10% (arbitrary) of 60k train dataset will be for validation
num_val_samples = tf.cast(num_val_samples, tf.int64) # Convert/cast to integer (avoid having a float)

# Do the same as the above step but for a tensor for the test dataset
num_test_samples = mnist_info.splits['test'].num_examples
num_test_samples = tf.cast(num_test_samples, tf.int64)

# Scale data so that inputs are b/w 0 and 1 (integers b/w 0-255 to floats b/w 0-1)
def scale_image(image, label):
    image = tf.cast(image, tf.float32)
    image /= 255.
    return image, label

# Apply the above function using tf's inbuilt dataset.map(*function*), which applies a custom transformation to a given dataset.
# Note: This .map(*function*) method can only apply a function that takes in (input, label) and returns (input, label).
scaled_train_and_val_data = mnist_train.map(scale_image)
test_data = mnist_test.map(scale_image)

# Shuffle the data
BUFFER_SIZE = 10000 # When dealing with enormous datasets, we can't shuffle all data at once (b/c of system memory), so we'll shuffle 10k at a time
shuffled_train_and_val_data = scaled_train_and_val_data.shuffle(BUFFER_SIZE)

# Partition data - Now we can "take" validation data from the shuffled train+val data and "skip" that same data for the test data
val_data = shuffled_train_and_val_data.take(num_val_samples)
train_data = shuffled_train_and_val_data.skip(num_val_samples)

# Note: At this point, we now have scaled (0-1) data for test, val and train (10k, 6k, and 54k, respectively)

# Setup batching for mini-batch gradient descent (Note: SGD is technically batch-size=1, though the term SGD is often used to describe Mini-Batch GD)
# Note: Only batch on training data, as for val data we only forward-propogate (whereas we forward and backpropogatae for training data), AND also b/c for
# each epoch, we'd rather calculate exact loss/accuracy for val (using all val data together), whereas for training data we take the average loss across batches.
BATCH_SIZE = 100
train_data = train_data.batch(BATCH_SIZE)
val_data = val_data.batch(num_val_samples) # Only batch on training data, but model expects val and test data in batch form too. (i.e., only 1 batch)
test_data = test_data.batch(num_test_samples)

# Extract inputs and targets, as we loaded the MNIST data as iterable and in a 2-tuple format (when we set as_supervised=True)
val_inputs, val_targets = next(iter(val_data))


In [6]:
### Outlining the model ###

# 4 layers: 784 (28x28x1 flattened img) -> 50 -> 50 -> 10 digits/classes. (Baseline hyperparameters)
# Will later tune: NN width, depth, activation functions, etc.
input_size = 784
output_size = 10
hidden_layer_size = 50

"""
model = tf.keras.Sequential(
    [
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)), # Replace this line with .Input(shape=(28, 28, 1)) and .Flatten() layers?
        tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
        tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
        tf.keras.layers.Dense(output_size, activation='softmax')
    ]
)
"""

# Note, the below model definition code may better conform with Keras best practices and make the architecture's input more explicit
model = tf.keras.Sequential(
    [
        tf.keras.layers.Input(shape=(28, 28, 1)), # Explicit Input layer
        tf.keras.layers.Flatten(), # Note: Moved input from this line to the above input layer
        tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
        tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
        tf.keras.layers.Dense(output_size, activation='softmax') # Softmax usually used for output layer in classification problems
    ]
)

In [7]:
### Choose the optimizer, loss function, and optionally any metrics to calculate throughout training and testing ###

# Optimizer: Adam, or adaptive moment estimation, is very common (combines adaptive learning rate + momentum)
# Loss: Will choose one of the crossentropy options, since this is a classification problem
#       - categorical_crossentropy expects that you've one-hot encoded the targets
#       - sparse_categorical_crossentropy applies one-hot encoding
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [8]:
### Training ###

NUM_EPOCHS = 5 # Arbitrary - Should probably train more epochs, then stop training just before overfitting (i.e., val loss starts increasing)

history = model.fit(
    train_data,
    epochs=NUM_EPOCHS,
    validation_data=(val_inputs, val_targets),
    verbose=2 # Provide details at each epoch
)

Epoch 1/5
540/540 - 2s - 4ms/step - accuracy: 0.8858 - loss: 0.4112 - val_accuracy: 0.9403 - val_loss: 0.1980
Epoch 2/5
540/540 - 1s - 2ms/step - accuracy: 0.9468 - loss: 0.1827 - val_accuracy: 0.9567 - val_loss: 0.1448
Epoch 3/5
540/540 - 1s - 2ms/step - accuracy: 0.9585 - loss: 0.1382 - val_accuracy: 0.9625 - val_loss: 0.1168
Epoch 4/5
540/540 - 1s - 2ms/step - accuracy: 0.9667 - loss: 0.1107 - val_accuracy: 0.9680 - val_loss: 0.1044
Epoch 5/5
540/540 - 1s - 2ms/step - accuracy: 0.9706 - loss: 0.0974 - val_accuracy: 0.9727 - val_loss: 0.0906


In [9]:
### Test the model (in practice, the test data should really only be used at the very end, after hyperparameter tuning) ###

test_loss, test_accuracy = model.evaluate(test_data)
print(f'Test loss: {test_loss:.4f}. Test accuracy: {test_accuracy*100:.2f}%.')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 155ms/step - accuracy: 0.9654 - loss: 0.1185
Test loss: 0.1185. Test accuracy: 96.54%.


#### MNIST - Hyperparameter Tuning

In [10]:
### Hyperparameter tuning (and also adding Early Stopping) ###

# Note 1: Manually implementing a tuning process very close to GridSearchCV, but in practice using an API
#         like the Keras Tuner's Bayesian Hyperparameter Optimization is likely preferred.
# Note 2: The "Pre-process the data" and earlier cells must be executed prior to this cell.

import time

NUM_EPOCHS = 100
input_size = 784
output_size = 10

hidden_layer_sizes = [32, 64, 128, 256, 512]
hidden_layer_size_decreases = ['constant', 'half']
hidden_layer_depths = [2, 3, 4]
activation_functions = ['relu', 'elu', 'tanh']

total_num_scenarios = (
    len(hidden_layer_sizes) *
    len(hidden_layer_size_decreases) * 
    len(hidden_layer_depths) *
    len(activation_functions)
)
curr_scenario_num = 0 # Increment for each loop

# Initialize best model and parameters to store info on best during loops
best_val_loss = float('inf')
best_val_accuracy = 0
best_train_accuracy = 0
best_model = None
best_params = {}


for hidden_layer_size in hidden_layer_sizes:
    for hidden_layer_size_decrease in hidden_layer_size_decreases:
        for hidden_layer_depth in hidden_layer_depths:
            for activation_function in activation_functions:

                loop_start_time = time.time()

                # Reset the global state, releasing memory
                tf.keras.backend.clear_session()

                # Create the EarlyStopping callback
                early_stopping = tf.keras.callbacks.EarlyStopping(
                    monitor='val_loss',
                    patience=2, # Number of epochs to wait for improvement before stopping
                    restore_best_weights=True,
                )

                # Build the model by adding each layer, w/ dynamic # of hidden layers
                curr_model = tf.keras.Sequential()
                curr_model.add(tf.keras.layers.Input(shape=(28, 28, 1)))
                curr_model.add(tf.keras.layers.Flatten())
                for i in range(hidden_layer_depth):
                    if hidden_layer_size_decrease == 'constant': # Constant size
                        current_layer_size = hidden_layer_size
                    elif hidden_layer_size_decrease == 'decrease_half': # Each hidden layer reduces by half
                        current_layer_size = hidden_layer_size // (2 ** i)
                    curr_model.add(tf.keras.layers.Dense(current_layer_size, activation=activation_function))
                curr_model.add(tf.keras.layers.Dense(output_size, activation='softmax'))

                # Compile model optimizer and loss then fit/train the model
                curr_model.compile(
                    optimizer='adam',
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy']
                )
                curr_history = curr_model.fit(
                    train_data,
                    epochs=NUM_EPOCHS,
                    validation_data=(val_inputs, val_targets),
                    verbose=0,
                    callbacks=[early_stopping]
                )

                # Check if current model is best so far, and if so, update best model w/ current one. Print training info.
                val_accuracy = curr_history.history['val_accuracy'][-1]
                train_accuracy = curr_history.history['accuracy'][-1]
                epoch_stop = len(curr_history.history['val_loss'])
                curr_scenario_num += 1
                curr_scenario_elapsed_time = time.time() - loop_start_time
                print(
                    f'({curr_scenario_num}/{total_num_scenarios}) '
                    f'val_accuracy={val_accuracy*100:.2f}% | train_accuracy={train_accuracy*100:.2f}% for '
                    f'width={hidden_layer_size}, '
                    f'layer_size_decrease={hidden_layer_size_decrease}, '
                    f'depth={hidden_layer_depth}, '
                    f'activation={activation_function} with '
                    f'epochs={epoch_stop} and '
                    f'training_time={int(curr_scenario_elapsed_time // 60)}min {int(curr_scenario_elapsed_time % 60)}sec.'
                )
                if epoch_stop == NUM_EPOCHS:
                    print(f'Warning: Model reached epoch #{NUM_EPOCHS} for the above accuracy. Consider re-running with higher NUM_EPOCHS.')
                if val_accuracy > best_val_accuracy:
                    best_val_loss = curr_history.history['val_loss'][-1]
                    best_val_accuracy = val_accuracy
                    best_train_accuracy = train_accuracy
                    best_model = curr_model
                    best_params = {
                        'hidden_layer_size': hidden_layer_size,
                        'hidden_layer_size_variation': hidden_layer_size_decrease,
                        'hidden_layer_depth': hidden_layer_depth,
                        'activation_function': activation_function
                    }


# Print info about best model
print('----------------------------------------------------------------------')
print(f'{best_val_loss=:.4f} | {best_val_accuracy=:.4f} | {best_train_accuracy=:.4f}')
print("Best model available at 'best_model'")
print('Best Hyperparameters:')
print(best_params)







(1/90) val_accuracy=98.63% | train_accuracy=98.91% for width=32, layer_size_decrease=constant, depth=2, activation=relu with epochs=24 and training_time=0min 23sec.
(2/90) val_accuracy=98.48% | train_accuracy=99.23% for width=32, layer_size_decrease=constant, depth=2, activation=elu with epochs=26 and training_time=0min 22sec.
(3/90) val_accuracy=99.25% | train_accuracy=99.51% for width=32, layer_size_decrease=constant, depth=2, activation=tanh with epochs=34 and training_time=0min 29sec.
(4/90) val_accuracy=97.97% | train_accuracy=98.43% for width=32, layer_size_decrease=constant, depth=3, activation=relu with epochs=15 and training_time=0min 13sec.
(5/90) val_accuracy=98.33% | train_accuracy=98.71% for width=32, layer_size_decrease=constant, depth=3, activation=elu with epochs=16 and training_time=0min 14sec.
(6/90) val_accuracy=99.42% | train_accuracy=99.59% for width=32, layer_size_decrease=constant, depth=3, activation=tanh with epochs=35 and training_time=0min 31sec.
(7/90) val_a

In [12]:
### Test the model ###

test_loss, test_accuracy = best_model.evaluate(test_data)
print(f'Test loss: {test_loss:.4f} | Test accuracy: {test_accuracy*100:.2f}%')
print(f'{best_val_loss=:.4f} | {best_val_accuracy=:.4f} | {best_train_accuracy=:.4f}')
print('Best Hyperparameters:')
print(best_params)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step - accuracy: 0.9801 - loss: 0.0849
Test loss: 0.0849 | Test accuracy: 98.01%
best_val_loss=0.0055 | best_val_accuracy=0.9988 | best_train_accuracy=0.9985
Best Hyperparameters:
{'hidden_layer_size': 128, 'hidden_layer_size_variation': 'constant', 'hidden_layer_depth': 2, 'activation_function': 'tanh'}
