In [1]:
import os
import json
import tensorflow as tf
%load_ext tensorboard

# Multi-worker configuration

In [2]:
# Abort early when TensorFlow sees no GPU; otherwise enable memory growth
# on every detected GPU.
gpu_devices = tf.config.list_physical_devices('GPU')
if not gpu_devices:
    raise SystemError('GPU device not found')
for gpu in gpu_devices:
    tf.config.experimental.set_memory_growth(gpu, enable=True)

In [3]:
# Cluster description for this process: two workers, and this notebook is
# worker 0. The JSON form is exported through the TF_CONFIG environment
# variable; a plain assignment overwrites any value left from an earlier run.
tf_config = dict(
    cluster=dict(worker=['192.168.1.1:12345', '192.168.1.2:12345']),
    task=dict(type='worker', index=0),
)
os.environ['TF_CONFIG'] = json.dumps(tf_config)

In [4]:
# Select RING as the collective communication implementation, then create
# the multi-worker strategy with those options.
ring_options = tf.distribute.experimental.CommunicationOptions(
    implementation=tf.distribute.experimental.CollectiveCommunication.RING
)
strategy = tf.distribute.MultiWorkerMirroredStrategy(
    communication_options=ring_options
)

INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:0/device:GPU:0']
INFO:tensorflow:Waiting for the cluster, timeout = inf
INFO:tensorflow:Cluster is ready.
INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['192.168.1.1:12345', '192.168.1.2:12345']}, task_type = 'worker', task_id = 0, num_workers = 2, local_devices = ('/job:worker/task:0/device:GPU:0',), communication = CommunicationImplementation.RING


# Preparing data

In [5]:
# Training hyper-parameters.
EPOCHS = 10
IMAGE_SIZE = (224, 224)  # passed as image_size when loading the images

# The global batch size is the per-worker batch size times the number of
# workers listed in the cluster configuration.
PER_WORKER_BATCH_SIZE = 32
NUM_WORKERS = len(tf_config['cluster']['worker'])
GLOBAL_BATCH_SIZE = NUM_WORKERS * PER_WORKER_BATCH_SIZE

In [6]:
from tensorflow.keras.layers import Rescaling
# Fetch the flower photos archive and unpack it (untar=True).
data_url = 'https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz'
data_root = tf.keras.utils.get_file('flower_photos', data_url, untar=True)

# Count the .jpg files under data_root in pure Python rather than through
# `find ... -name *.jpg | wc -l`: an unquoted glob can be expanded by the shell
# against the current directory before find sees it, and the shell pipeline is
# only available where find/wc exist.
data_length = sum(
    filename.endswith('.jpg')
    for _, _, filenames in os.walk(data_root)
    for filename in filenames
)

In [7]:
def get_dataset(batch_size, subset):
    """Build the batched, rescaled image dataset for one split.

    Args:
        batch_size: Batch size handed to image_dataset_from_directory.
        subset: 'training' or 'validation'.

    Returns:
        A (dataset, length) tuple. length is derived from data_length
        (90% for training, 10% for validation); it is not counted from the
        dataset itself.

    Raises:
        NameError: If subset is neither 'training' nor 'validation'.
    """
    shuffle = False
    if subset == 'validation':
        length = int(data_length * 0.1)
    elif subset == 'training':
        length = int(data_length * 0.9)
        shuffle = True
    else:
        raise NameError("subset must be 'training' or 'validation'")

    dataset = tf.keras.utils.image_dataset_from_directory(
        str(data_root),
        validation_split = 0.1,
        subset = subset,
        image_size = IMAGE_SIZE,
        batch_size = batch_size,
        seed = 123,
    )

    # Rescale pixel values with x / 127.5 - 1.
    normalization_layer = Rescaling(1./127.5, offset=-1)
    dataset = dataset.map(lambda x, y: (normalization_layer(x), y))

    # Cache BEFORE shuffling. A cache placed after shuffle replays the order
    # recorded during the first pass, so later epochs would never be reshuffled.
    dataset = dataset.cache()
    if shuffle:
        # Elements are batches at this point; length (an image count) is an
        # upper bound on the number of batches, so the buffer spans the split.
        dataset = dataset.shuffle(buffer_size=length)
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
    return dataset, length

In [8]:
# Both splits are batched with the global batch size.
ds_train, num_train = get_dataset(batch_size=GLOBAL_BATCH_SIZE, subset='training')
ds_val, num_val = get_dataset(batch_size=GLOBAL_BATCH_SIZE, subset='validation')

Found 3670 files belonging to 5 classes.
Using 3303 files for training.
Found 3670 files belonging to 5 classes.
Using 367 files for validation.


# Model implementation

## Define the model

In [9]:
import tensorflow_hub as hub
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import SparseCategoricalCrossentropy

In [10]:
def build_and_compile_model(
    num_classes=5,
    hub_handle='https://tfhub.dev/google/tf2-preview/mobilenet_v2/classification/4',
):
    """Create and compile the hub-backed image classifier.

    The model is a Sequential stack of a trainable hub.KerasLayer followed by
    a Dense layer that emits one logit per class.

    Args:
        num_classes: Number of units in the final Dense layer. Defaults to 5,
            the value previously hard-coded.
        hub_handle: Handle passed to hub.KerasLayer. Defaults to the handle
            previously hard-coded.

    Returns:
        The compiled model (adam optimizer, sparse categorical cross-entropy
        on logits, accuracy metric).
    """
    # NOTE(review): the default handle is the *classification* variant; the
    # recorded model summary shows this layer emitting 1001 values. If a pure
    # feature extractor was intended, a feature_vector handle is probably the
    # better choice -- confirm before changing the default.
    feature_extractor_layer = hub.KerasLayer(
        hub_handle,
        input_shape = IMAGE_SIZE + (3,),
        trainable = True
    )
    model = Sequential([feature_extractor_layer, Dense(num_classes)])
    model.compile(
        optimizer = 'adam',
        # The final Dense layer has no activation, so the loss takes logits.
        loss = SparseCategoricalCrossentropy(from_logits=True),
        metrics = ['accuracy']
    )
    return model

In [11]:
# Build and compile the model inside the distribution strategy's scope.
with strategy.scope():
    model = build_and_compile_model()

# Printing the summary only reads the model, so it is done after the scope.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 1001)              3540265   
                                                                 
 dense (Dense)               (None, 5)                 5010      
                                                                 
Total params: 3,545,275
Trainable params: 3,511,163
Non-trainable params: 34,112
_________________________________________________________________


## Callbacks

In [12]:
def decay(epoch):
    """Return the learning rate for a zero-based epoch index.

    Epochs below 3 use 1e-3, epochs 3 to 6 use 1e-4, and every later epoch
    uses 1e-5.
    """
    # (exclusive upper epoch bound, learning rate), checked in ascending order.
    schedule = ((3, 1e-3), (7, 1e-4))
    for upper_bound, rate in schedule:
        if epoch < upper_bound:
            return rate
    return 1e-5

In [13]:
# Callback that prints the optimizer's learning rate at the end of each epoch.
class PrintLR(tf.keras.callbacks.Callback):
    """Print the current learning rate after every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Print the learning rate of the model this callback is attached to.

        Args:
            epoch: Zero-based epoch index; printed one-based.
            logs: Unused.
        """
        # Read the model through self.model (set by Keras when the callback is
        # attached) instead of a notebook-global `model`, so the callback does
        # not depend on a global of that exact name and reports the model that
        # is actually being trained.
        learning_rate = self.model.optimizer.learning_rate
        print(f'\nLearning rate for epoch {epoch + 1} is {learning_rate.numpy()}')

In [14]:
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, LearningRateScheduler
callbacks = [
    TensorBoard(log_dir='./logs'),
    EarlyStopping(monitor='val_loss', patience=3, verbose=1),
    LearningRateScheduler(decay),
    PrintLR()
]
!rm -rf logs

# Training

In [15]:
history = model.fit(
    ds_train, 
    validation_data = ds_val,
    callbacks = callbacks,
    epochs = EPOCHS, 
    verbose = 1,
)
%tensorboard --logdir=logs

Epoch 1/10
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 160 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 160 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 160 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 160 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1




INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1



Learning rate for epoch 1 is 0.0010000000474974513
Epoch 2/10
Learning rate for epoch 2 is 0.0010000000474974513
Epoch 3/10
Learning rate for epoch 3 is 0.0010000000474974513
Epoch 4/10
Learning rate for epoch 4 is 9.999999747378752e-05
Epoch 5/10
Learning rate for epoch 5 is 9.999999747378752e-05
Epoch 6/10
Learning rate for epoch 6 is 9.999999747378752e-05
Epoch 7/10
Learning rate for epoch 7 is 9.999999747378752e-05
Epoch 8/10
Learning rate for epoch 8 is 9.999999747378752e-06
Epoch 9/10
Learning rate for epoch 9 is 9.999999747378752e-06
Epoch 10/10
Learning rate for epoch 10 is 9.999999747378752e-06
Epoch 00010: early stopping


Reusing TensorBoard on port 6006 (pid 28580), started 0:18:32 ago. (Use '!kill 28580' to kill it.)