In [1]:
import os

import tensorflow as tf
from tensorflow import keras

# Print the installed TensorFlow version so readers can see which release produced the outputs.
print(tf.version.VERSION)

2022-11-29 08:38:22.165110: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-29 08:38:22.165123: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


2.8.0-rc1


This example uses the MNIST dataset. To keep the runs fast, only the first 1000 examples of each split are used.

In [2]:
# Load MNIST from tf.keras.datasets as (images, labels) pairs for the train and test splits.
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

In [3]:
# Keep only the first 1000 examples of each split so the demo trains quickly.
train_images, train_labels = train_images[:1000], train_labels[:1000]
test_images, test_labels = test_images[:1000], test_labels[:1000]

# Flatten every image into a single row of 28*28 = 784 values and divide the
# pixel values by 255.0 (this also converts them to floating point).
train_images = train_images.reshape(-1, 28*28) / 255.0
test_images = test_images.reshape(-1, 28*28) / 255.0

## Define model

In [5]:
def create_model():
    """Build and compile a small fully-connected classifier.

    The network accepts 784-feature inputs and stacks a 512-unit ReLU layer,
    dropout with rate 0.2, and a 10-unit output layer with no activation.
    Because the last layer emits raw logits, the loss is configured with
    ``from_logits=True``.

    Returns:
        A ``tf.keras.Sequential`` model compiled with the Adam optimizer,
        sparse categorical cross-entropy loss, and sparse categorical
        accuracy as its metric.
    """
    layer_stack = [
        keras.layers.Dense(512, activation='relu', input_shape=(784,)),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(10),
    ]
    model = tf.keras.Sequential(layer_stack)

    # Labels are integer class ids (sparse), and the model outputs logits.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy()
    model.compile(optimizer='adam', loss=loss_fn, metrics=[accuracy_metric])

    return model

# Build a fresh model with create_model().
model = create_model()

# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 512)               401920    
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_3 (Dense)             (None, 10)                5130      
                                                                 
Total params: 407,050
Trainable params: 407,050
Non-trainable params: 0
_________________________________________________________________


## Save checkpoints during training

The `tf.keras.callbacks.ModelCheckpoint` callback allows you to continually save the model both during and at the end of training.

### Checkpoint callback usage

Create a `tf.keras.callbacks.ModelCheckpoint` callback that saves weights only during training.

In [6]:
# Every checkpoint file for this run is written under the 'training_1' directory.
checkpoint_path = 'training_1/cp.ckpt'
checkpoint_dir = os.path.dirname(checkpoint_path)

# Callback that writes only the model's weights to `checkpoint_path` and
# logs each save (verbose=1).
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
)

# Train for 10 epochs with the callback attached; the test subset is passed
# as validation data.
model.fit(
    train_images,
    train_labels,
    epochs=10,
    validation_data=(test_images, test_labels),
    callbacks=[cp_callback],
)

Epoch 1/10
 1/32 [..............................] - ETA: 7s - loss: 2.3345 - sparse_categorical_accuracy: 0.1562
Epoch 1: saving model to training_1/cp.ckpt
Epoch 2/10
Epoch 2: saving model to training_1/cp.ckpt
Epoch 3/10
Epoch 3: saving model to training_1/cp.ckpt
Epoch 4/10
 1/32 [..............................] - ETA: 0s - loss: 0.1332 - sparse_categorical_accuracy: 1.0000
Epoch 4: saving model to training_1/cp.ckpt
Epoch 5/10
Epoch 5: saving model to training_1/cp.ckpt
Epoch 6/10
 1/32 [..............................] - ETA: 0s - loss: 0.0955 - sparse_categorical_accuracy: 1.0000
Epoch 6: saving model to training_1/cp.ckpt
Epoch 7/10
Epoch 7: saving model to training_1/cp.ckpt
Epoch 8/10
 1/32 [..............................] - ETA: 0s - loss: 0.0858 - sparse_categorical_accuracy: 1.0000
Epoch 8: saving model to training_1/cp.ckpt
Epoch 9/10
Epoch 9: saving model to training_1/cp.ckpt
Epoch 10/10
 1/32 [..............................] - ETA: 0s - loss: 0.0190 - sparse_categorical_

<keras.callbacks.History at 0x7f7ff474e390>

This creates a single collection of TensorFlow checkpoint files that are updated at the end of each epoch.

In [7]:
# List the files that were written to the checkpoint directory.
os.listdir(checkpoint_dir)

['cp.ckpt.data-00000-of-00001', 'checkpoint', 'cp.ckpt.index']

In [8]:
# Create a basic model instance; nothing has been trained or loaded into it yet
model = create_model()

# Evaluate on the test subset before loading any weights, as a baseline
loss, acc = model.evaluate(test_images, test_labels, verbose=1)
print('Untrained model, accuracy: {:5.2f}%'.format(100*acc))

Untrained model, accuracy:  6.40%


In [9]:
# Load the weights stored at `checkpoint_path` into the freshly created model
model.load_weights(checkpoint_path)

# Re-evaluate on the same test subset to confirm the trained weights were restored
loss, acc = model.evaluate(test_images, test_labels, verbose=1)
# Message fixed from 'Restores model' to match the wording used by the other cells
print('Restored model, accuracy: {:5.2f}%'.format(100*acc))

Restores model, accuracy: 87.40%


## Checkpoint callback options

The callback provides several options to provide unique names for checkpoints and adjust the checkpointing frequency.

Train a new model, and save uniquely named checkpoints once every five epochs.

In [10]:
# `{epoch:04d}` is filled in per save, so each checkpoint gets a unique,
# zero-padded epoch number in its file name.
checkpoint_path = 'training_2/cp-{epoch:04d}.ckpt'
checkpoint_dir = os.path.dirname(checkpoint_path)

batch_size = 32

# An integer `save_freq` is measured in batches, not epochs, so "every 5
# epochs" must be expressed as 5 * (batches per epoch). Ceiling division is
# used because the final, partial batch of an epoch still counts as a batch.
# (The previous `5*batch_size` only worked because batches-per-epoch happened
# to equal batch_size for this 1000-example subset.)
n_batches = -(-len(train_images) // batch_size)

# Create a callback that saves the model's weights every 5 epochs
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 verbose=1,
                                                 save_weights_only=True,
                                                 save_freq=5*n_batches)

# Create a new model instance
model = create_model()

# Save the initial weights as "epoch 0" using the same path pattern
model.save_weights(checkpoint_path.format(epoch=0))

# Train for 50 epochs with the periodic-checkpoint callback attached
model.fit(train_images,
          train_labels,
          epochs=50,
          batch_size=batch_size,
          callbacks=[cp_callback],
          validation_data=(test_images, test_labels),
          verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
 1/32 [..............................] - ETA: 0s - loss: 0.1241 - sparse_categorical_accuracy: 0.9688
Epoch 5: saving model to training_2/cp-0005.ckpt
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 10: saving model to training_2/cp-0010.ckpt
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 15: saving model to training_2/cp-0015.ckpt
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 20: saving model to training_2/cp-0020.ckpt
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
 1/32 [..............................] - ETA: 0s - loss: 0.0050 - sparse_categorical_accuracy: 1.0000
Epoch 25: saving model to training_2/cp-0025.ckpt
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 30: saving model to training_2/cp-0030.ckpt
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
 1/32 [..............................] - ETA: 0s - loss: 0.0045 - sparse_categorical_a

Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
 1/32 [..............................] - ETA: 0s - loss: 0.0026 - sparse_categorical_accuracy: 1.0000
Epoch 40: saving model to training_2/cp-0040.ckpt
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 45: saving model to training_2/cp-0045.ckpt
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 50: saving model to training_2/cp-0050.ckpt


<keras.callbacks.History at 0x7f7ff00adcc0>

In [11]:
# List the checkpoint files now present in the checkpoint directory.
os.listdir(checkpoint_dir)


['cp-0040.ckpt.index',
 'cp-0010.ckpt.data-00000-of-00001',
 'cp-0030.ckpt.data-00000-of-00001',
 'cp-0025.ckpt.index',
 'cp-0015.ckpt.index',
 'cp-0010.ckpt.index',
 'cp-0045.ckpt.index',
 'cp-0040.ckpt.data-00000-of-00001',
 'cp-0050.ckpt.index',
 'cp-0030.ckpt.index',
 'cp-0000.ckpt.index',
 'cp-0050.ckpt.data-00000-of-00001',
 'cp-0035.ckpt.index',
 'cp-0005.ckpt.index',
 'cp-0020.ckpt.index',
 'cp-0015.ckpt.data-00000-of-00001',
 'cp-0045.ckpt.data-00000-of-00001',
 'cp-0000.ckpt.data-00000-of-00001',
 'checkpoint',
 'cp-0020.ckpt.data-00000-of-00001',
 'cp-0005.ckpt.data-00000-of-00001',
 'cp-0025.ckpt.data-00000-of-00001',
 'cp-0035.ckpt.data-00000-of-00001']

Choose the latest checkpoint:

In [12]:
# Look up the most recent checkpoint in `checkpoint_dir`.
latest = tf.train.latest_checkpoint(checkpoint_dir)
# A bare name as the last expression displays its value as the cell output.
latest

'training_2/cp-0050.ckpt'

To test, reset the model, and load the latest checkpoint.

In [13]:
# Create a new model instance
model = create_model()

# Load the weights from the checkpoint path stored in `latest`
model.load_weights(latest)

# Re-evaluate the model on the test subset
loss, acc = model.evaluate(test_images, test_labels, verbose=1)
print('Restored model, accuracy: {:5.2f}%'.format(100*acc))

Restored model, accuracy: 87.50%


## The files

The above code stores the weights to a collection of checkpoint-formatted files that contain only the trained weights in a binary format. Checkpoints contain:
- One or more shards that contain your model's weights.
- An index file that indicates which weights are stored in which shard.

## Manually save weights

To save weights manually, use `tf.keras.Model.save_weights`. By default, `tf.keras`—and the `Model.save_weights` method in particular—uses the **TensorFlow Checkpoint** format with a `.ckpt` extension. To save in the **HDF5** format with a `.h5` extension, refer to the Save and load models guide.

In [14]:
# Save the current weights manually under the './checkpoints/my_checkpoint' prefix
model.save_weights('./checkpoints/my_checkpoint')

# Create a new model instance
model = create_model()

# Restore the weights that were just saved
model.load_weights('./checkpoints/my_checkpoint')

# Evaluate the restored model on the test subset
loss, acc = model.evaluate(test_images, test_labels, verbose=1)
print('Restored model, accuracy: {:5.2f}%'.format(100*acc))

Restored model, accuracy: 87.50%


## Save the entire model
Call `tf.keras.Model.save` to save a model's architecture, weights, and training configuration in a single file/folder. This allows you to export a model so it can be used without access to the original Python code*. Since the optimizer state is recovered, we can resume training from exactly where we left off.
<br><br>
An entire model can be saved in two different file formats (`SavedModel` and `HDF5`). The TensorFlow `SavedModel` format is the default file format in TF2.x. However, models can also be saved in `HDF5` format. More details on saving entire models in the two file formats are described below.
<br><br>
Saving a fully-functional model is very useful—we can load it in TensorFlow.js (Saved Model, HDF5) and then train and run it in web browsers, or convert it to run on mobile devices using TensorFlow Lite (Saved Model, HDF5).
#### *Custom objects (for example, subclassed models or layers) require special attention when saving and loading. Refer to the Saving custom objects section below.

### SavedModel format

The SavedModel format is another way to serialize models. Models saved in this format can be restored using `tf.keras.models.load_model` and are compatible with TensorFlow Serving. The SavedModel guide goes into detail about how to serve and inspect a SavedModel.

In [15]:
# Create and train a new model instance.
model = create_model()
model.fit(train_images, train_labels, epochs=5)

# Save the entire model as a SavedModel.
# os.makedirs(..., exist_ok=True) replaces the `!mkdir -p` shell escape so the
# cell does not depend on a POSIX shell being available; it is a no-op when
# the directory already exists.
os.makedirs('saved_model', exist_ok=True)
model.save('saved_model/my_model')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
INFO:tensorflow:Assets written to: saved_model/my_model/assets


2022-11-29 09:05:13.008584: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


The SavedModel format is a directory containing a protobuf binary and a TensorFlow checkpoint. Inspect the saved model directory:

In [17]:
ls saved_model

[0m[01;34mmy_model[0m/


In [18]:
ls saved_model/my_model

[0m[01;34massets[0m/  keras_metadata.pb  saved_model.pb  [01;34mvariables[0m/


### Reload a fresh Keras model from the saved model

In [19]:
# Load the complete model back from the SavedModel directory.
new_model = tf.keras.models.load_model('saved_model/my_model')
# Print the architecture of the reloaded model.
new_model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 512)               401920    
                                                                 
 dropout_6 (Dropout)         (None, 512)               0         
                                                                 
 dense_13 (Dense)            (None, 10)                5130      
                                                                 
Total params: 407,050
Trainable params: 407,050
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Evaluate the reloaded model on the test subset
loss, acc = new_model.evaluate(test_images,test_labels, verbose=1)
print('Restored model, accuracy: {:5.2f}%'.format(100*acc))

# Run inference with the reloaded model and print the shape of the predictions
print(new_model.predict(test_images).shape)

Restored model, accuracy: 86.30%
(1000, 10)


### HDF5 format


In [28]:
# Create and train a new model instance.
model = create_model()
model.fit(train_images, train_labels, epochs=5)

# Save the entire model to a single file; the '.h5' extension indicates the HDF5 format.
model.save('my_model.h5')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [29]:
# Load the complete model back from the HDF5 file.
new_model = tf.keras.models.load_model('my_model.h5')
# Print the architecture of the reloaded model.
new_model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_18 (Dense)            (None, 512)               401920    
                                                                 
 dropout_9 (Dropout)         (None, 512)               0         
                                                                 
 dense_19 (Dense)            (None, 10)                5130      
                                                                 
Total params: 407,050
Trainable params: 407,050
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Evaluate the model reloaded from HDF5 on the test subset
loss, acc = new_model.evaluate(test_images, test_labels, verbose=1)
print('Restored model, accuracy: {:5.2f}%'.format(100 * acc))

Restored model, accuracy: 86.30%


Keras saves models by inspecting their architectures. This technique saves everything:
- The weight values
- The model's architecture
- The model's training configuration (what is passed to the `.compile()` method)
- The optimizer and its state, if any (this enables us to restart training where we left off)
