In [4]:
# !pip install tensorflow_datasets
# The example follows this: https://www.tensorflow.org/tutorials/distribute/keras
import tensorflow_datasets as tfds
import tensorflow as tf

import os

In [5]:
# Show the installed TensorFlow version so the run can be reproduced later.
print(tf.__version__)

2.4.1


In [6]:
# Load MNIST as supervised (image, label) pairs together with its metadata.
datasets, info = tfds.load('mnist', as_supervised=True, with_info=True)

# Keep separate handles for the two splits used in this notebook.
mnist_train = datasets['train']
mnist_test = datasets['test']

[1mDownloading and preparing dataset 11.06 MiB (download: 11.06 MiB, generated: 21.00 MiB, total: 32.06 MiB) to /root/tensorflow_datasets/mnist/3.0.1...[0m


Dl Completed...:   0%|          | 0/4 [00:00<?, ? file/s]


[1mDataset mnist downloaded and prepared to /root/tensorflow_datasets/mnist/3.0.1. Subsequent calls will reuse this data.[0m


In [7]:
# Create the distribution strategy; later cells use it for the model scope
# and for deriving the global batch size.
strategy = tf.distribute.MirroredStrategy()





INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


In [8]:
# Report how many replicas the strategy keeps in sync.
replica_count = strategy.num_replicas_in_sync
print('Number of devices: {}'.format(replica_count))

Number of devices: 1


In [9]:
# Split sizes as reported by the dataset metadata.
num_train_examples, num_test_examples = (
    info.splits['train'].num_examples,
    info.splits['test'].num_examples,
)

# Size of the shuffle buffer for the training pipeline.
BUFFER_SIZE = 10000

# The global batch size is the per-replica batch size times the replica count.
BATCH_SIZE_PER_REPLICA = 64
BATCH_SIZE = strategy.num_replicas_in_sync * BATCH_SIZE_PER_REPLICA

In [11]:
def scale(image, label):
    """Cast `image` to float32 and divide it by 255; return it with `label` unchanged."""
    scaled = tf.cast(image, tf.float32) / 255
    return scaled, label

In [12]:
# Training pipeline: scale, cache the scaled examples, shuffle, then batch.
train_dataset = (
    mnist_train
    .map(scale)
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

# Evaluation pipeline: scale and batch only (no caching or shuffling).
eval_dataset = (
    mnist_test
    .map(scale)
    .batch(BATCH_SIZE)
)

In [13]:
# Build and compile the model inside the distribution strategy's scope.
with strategy.scope():
  model = tf.keras.Sequential()

  # Convolutional feature extractor over 28x28 single-channel inputs.
  model.add(tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)))
  model.add(tf.keras.layers.MaxPooling2D())

  # Classifier head; the final layer emits 10 raw logits (no activation).
  model.add(tf.keras.layers.Flatten())
  model.add(tf.keras.layers.Dense(64, activation='relu'))
  model.add(tf.keras.layers.Dense(10))

  # from_logits=True matches the activation-free output layer above.
  model.compile(
      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
      optimizer=tf.keras.optimizers.Adam(),
      metrics=['accuracy'],
  )

In [14]:
# Directory under which the training checkpoints are stored.
checkpoint_dir = './training_checkpoints'

# Path prefix for the checkpoint files; the "{epoch}" placeholder is kept
# verbatim here and is not formatted in this cell.
checkpoint_prefix = os.path.join(
    checkpoint_dir,
    "ckpt_{epoch}",
)

In [15]:
# Step-wise learning-rate schedule keyed on the epoch index.
# Swap in any other decay function if needed.
def decay(epoch):
  """Return the learning rate for `epoch`.

  Epochs below 3 get 1e-3, epochs 3 through 6 get 1e-4, and every
  later epoch gets 1e-5.
  """
  if epoch < 3:
    return 1e-3
  # Reaching this point already implies epoch >= 3.
  if epoch < 7:
    return 1e-4
  return 1e-5

In [16]:
# Callback for printing the LR at the end of each epoch.
class PrintLR(tf.keras.callbacks.Callback):
  """Keras callback that prints the optimizer's learning rate after every epoch."""

  def on_epoch_end(self, epoch, logs=None):
    """Print the learning rate currently set on the model's optimizer.

    Args:
      epoch: Epoch index passed in by Keras; it is printed as `epoch + 1`.
      logs: Metrics dictionary passed in by Keras; not used here.
    """
    # Use the model Keras attached to this callback (`self.model`) instead of
    # a notebook-global `model`, so the callback reports the optimizer of the
    # model actually being trained and works with any model it is passed to.
    print('\nLearning rate for epoch {} is {}'.format(
        epoch + 1, self.model.optimizer.lr.numpy()))

In [17]:
# Writes training logs under ./logs for TensorBoard.
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir='./logs')

# Saves weights only (not the full model) to the checkpoint prefix.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

# Applies the `decay` schedule to the learning rate.
lr_schedule_cb = tf.keras.callbacks.LearningRateScheduler(decay)

# Order matches the list passed to `model.fit`.
callbacks = [tensorboard_cb, checkpoint_cb, lr_schedule_cb, PrintLR()]

In [18]:
# Train for 8 epochs on the training pipeline with the callbacks defined above.
model.fit(
    x=train_dataset,
    epochs=8,
    callbacks=callbacks,
)

Epoch 1/8

Learning rate for epoch 1 is 0.0010000000474974513
Epoch 2/8

Learning rate for epoch 2 is 0.0010000000474974513
Epoch 3/8

Learning rate for epoch 3 is 0.0010000000474974513
Epoch 4/8

Learning rate for epoch 4 is 9.999999747378752e-05
Epoch 5/8

Learning rate for epoch 5 is 9.999999747378752e-05
Epoch 6/8

Learning rate for epoch 6 is 9.999999747378752e-05
Epoch 7/8

Learning rate for epoch 7 is 9.999999747378752e-05
Epoch 8/8

Learning rate for epoch 8 is 9.999999747378752e-06


<tensorflow.python.keras.callbacks.History at 0x7fa85c236710>

In [20]:
# List the contents of the checkpoint directory to confirm the checkpoint
# files were written (IPython shell escape; `{checkpoint_dir}` is interpolated
# from the Python variable).
!ls {checkpoint_dir}

checkpoint		    ckpt_5.data-00000-of-00001
ckpt_1.data-00000-of-00001  ckpt_5.index
ckpt_1.index		    ckpt_6.data-00000-of-00001
ckpt_2.data-00000-of-00001  ckpt_6.index
ckpt_2.index		    ckpt_7.data-00000-of-00001
ckpt_3.data-00000-of-00001  ckpt_7.index
ckpt_3.index		    ckpt_8.data-00000-of-00001
ckpt_4.data-00000-of-00001  ckpt_8.index
ckpt_4.index


In [21]:
# Restore the weights from the most recent checkpoint, then evaluate.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    # latest_checkpoint() returns None when the directory holds no checkpoint;
    # fail with a clear message instead of an obscure error from load_weights.
    raise FileNotFoundError(
        'No checkpoint found in {!r}; run the training cell first.'.format(
            checkpoint_dir))
model.load_weights(latest_checkpoint)

# evaluate() returns the loss followed by the compiled metrics (accuracy here).
eval_loss, eval_acc = model.evaluate(eval_dataset)

print('Eval loss: {}, Eval Accuracy: {}'.format(eval_loss, eval_acc))

Eval loss: 0.04011186957359314, Eval Accuracy: 0.9872000217437744
