In [1]:
import os
import json
import h5py
import tensorflow as tf
%load_ext tensorboard

# Multi-worker configuration

In [2]:
# Abort early when no GPU is visible; otherwise enable on-demand memory growth
# for every detected GPU.
gpu_devices = tf.config.list_physical_devices('GPU')
if not gpu_devices:
    raise SystemError('GPU device not found')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

In [3]:
# Describe the cluster and this process's role in it, then publish the
# description as JSON through the TF_CONFIG environment variable.
worker_addresses = ['192.168.1.1:12345', '192.168.1.2:12345']
tf_config = {
    'cluster': {'worker': worker_addresses},
    'task': {'type': 'worker', 'index': 0},
}
# Drop any value left over from an earlier run before writing the new one.
os.environ.pop('TF_CONFIG', None)
os.environ['TF_CONFIG'] = json.dumps(tf_config)

In [4]:
# Multi-worker mirrored strategy using the RING collective implementation.
# `CommunicationImplementation` is the current name of this enum; the
# previously used `CollectiveCommunication` is a deprecated alias of it.
communication_options = tf.distribute.experimental.CommunicationOptions(
    implementation = tf.distribute.experimental.CommunicationImplementation.RING
)
strategy = tf.distribute.MultiWorkerMirroredStrategy(
    communication_options = communication_options
)

INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:0/device:GPU:0']
INFO:tensorflow:Waiting for the cluster, timeout = inf
INFO:tensorflow:Cluster is ready.
INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['192.168.1.1:12345', '192.168.1.2:12345']}, task_type = 'worker', task_id = 0, num_workers = 2, local_devices = ('/job:worker/task:0/device:GPU:0',), communication = CommunicationImplementation.RING


# Preparing data

In [None]:
# Locations of the three dataset splits, all under one root directory.
DATASET_ROOT = 'Dataset'
TRAIN_PATH = f'{DATASET_ROOT}/Train'
VALIDATE_PATH = f'{DATASET_ROOT}/Validate'
TEST_PATH = f'{DATASET_ROOT}/Test'

In [6]:
# Training hyper-parameters.
EPOCHS = 2
CLASSES = 30                # number of output classes
IMAGE_SIZE = (224, 224)     # spatial size images are resized to

# Batch sizing: one entry in the cluster spec per worker, and the global batch
# is the per-worker batch multiplied by the number of workers.
PER_WORKER_BATCH_SIZE = 32
cluster_workers = tf_config['cluster']['worker']
NUM_WORKERS = len(cluster_workers)
GLOBAL_BATCH_SIZE = NUM_WORKERS * PER_WORKER_BATCH_SIZE

In [7]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# Random augmentations applied to training images only.
augmentation_settings = dict(
    rotation_range = 40,
    width_shift_range = 0.2,
    height_shift_range = 0.2,
    shear_range = 0.2,
    zoom_range = 0.2,
    horizontal_flip = True,
)
# Both generators scale pixel values by 1/255; validation gets no augmentation.
train_generator = ImageDataGenerator(rescale = 1./255, **augmentation_settings)
validate_generator = ImageDataGenerator(rescale = 1./255)

In [8]:
# Both directory iterators share the same image size and batch size.
flow_settings = {
    'target_size': IMAGE_SIZE,
    'batch_size': GLOBAL_BATCH_SIZE,
}
generated_train_data = train_generator.flow_from_directory(TRAIN_PATH, **flow_settings)
generated_validate_data = validate_generator.flow_from_directory(VALIDATE_PATH, **flow_settings)

Found 17581 images belonging to 30 classes.
Found 2515 images belonging to 30 classes.


# Input data

In [9]:
# Dataset options shared by the training and validation pipelines:
# auto-sharding across workers uses the DATA policy.
shard_policy = tf.data.experimental.AutoShardPolicy.DATA
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = shard_policy

In [10]:
# Wrap the Keras training iterator in a tf.data pipeline.
#
# * Only full batches are forwarded. The last batch of each pass over the
#   directory can be shorter than GLOBAL_BATCH_SIZE, which would not match the
#   fixed `output_shapes` declared below and would make `from_generator` fail.
# * No `.cache()`: the Keras iterator loops over the directory indefinitely, so
#   a cache could never be completed and reused; it would only keep
#   accumulating batches in memory (and would defeat random augmentation).
train_dataset = tf.data.Dataset.from_generator(
    lambda: (
        batch for batch in generated_train_data
        if len(batch[0]) == GLOBAL_BATCH_SIZE
    ),
    output_types = (tf.float32, tf.float32),
    output_shapes = ([GLOBAL_BATCH_SIZE, *IMAGE_SIZE, 3], [GLOBAL_BATCH_SIZE, CLASSES])
).prefetch(buffer_size=tf.data.AUTOTUNE).with_options(options)

In [11]:
# Wrap the Keras validation iterator in a tf.data pipeline.
#
# * Only full batches are forwarded. The last batch of each pass over the
#   directory can be shorter than GLOBAL_BATCH_SIZE, which would not match the
#   fixed `output_shapes` declared below and would make `from_generator` fail.
# * No `.cache()`: the Keras iterator loops over the directory indefinitely, so
#   a cache could never be completed and reused; it would only keep
#   accumulating batches in memory.
validate_dataset = tf.data.Dataset.from_generator(
    lambda: (
        batch for batch in generated_validate_data
        if len(batch[0]) == GLOBAL_BATCH_SIZE
    ),
    output_types = (tf.float32, tf.float32),
    output_shapes = ([GLOBAL_BATCH_SIZE, *IMAGE_SIZE, 3], [GLOBAL_BATCH_SIZE, CLASSES])
).prefetch(buffer_size=tf.data.AUTOTUNE).with_options(options)

In [12]:
# Sample counts used to derive the number of steps per epoch.
# They are read from the directory iterators (the numbers reported as
# "Found N images ...") instead of shelling out to `find | wc -l`, which is
# not portable and also counts files that are not images.
num_train = generated_train_data.samples
num_validate = generated_validate_data.samples

# Model implementation

In [13]:
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.models import Model

## Define the model

In [14]:
def build_and_compile_model():
    """Build and compile a MobileNetV2-based image classifier.

    The backbone is MobileNetV2 without its top and without pre-trained
    weights (``weights=None``). It is followed by global average pooling,
    a 512-unit ReLU layer, dropout (rate 0.2) and a ``CLASSES``-way softmax.

    Returns:
        The compiled Keras ``Model`` (optimizer ``'adam'``, loss
        ``'categorical_crossentropy'``, metric ``'accuracy'``).
    """
    base_model = MobileNetV2(
        input_shape = IMAGE_SIZE + (3,),
        include_top = False,
        weights = None
    )

    # `preprocess_input` scales *input* pixels; applying it to the backbone's
    # output feature maps distorts the learned features, so the pooling layer
    # consumes the backbone output directly. Input scaling is already done by
    # the image generators (rescale = 1/255).
    x = GlobalAveragePooling2D()(base_model.output)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.2)(x)
    outputs = Dense(CLASSES, activation='softmax')(x)

    model = Model(inputs=base_model.input, outputs=outputs)
    model.compile(
        optimizer = 'adam',
        loss = 'categorical_crossentropy',
        metrics = ['accuracy']
    )
    return model

## Callbacks

In [15]:
def decay(epoch):
    """Return the learning rate to use for the given epoch index.

    Epochs below 3 use 1e-3, epochs from 3 up to (but excluding) 7 use 1e-4,
    and every later epoch uses 1e-5.
    """
    # (exclusive upper epoch bound, learning rate), checked in order.
    schedule = ((3, 1e-3), (7, 1e-4))
    for upper_bound, learning_rate in schedule:
        if epoch < upper_bound:
            return learning_rate
    return 1e-5

In [16]:
# Define a callback for printing the learning rate at the end of each epoch.
class PrintLR(tf.keras.callbacks.Callback):
    """Keras callback that prints the optimizer's learning rate after each epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Print the current learning rate; the epoch number is shown as ``epoch + 1``."""
        # Read the model Keras attached to this callback rather than a global
        # `model` variable, so the callback does not depend on notebook state.
        print(f'\nLearning rate for epoch {epoch + 1} is {self.model.optimizer.lr.numpy()}')

In [17]:
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, LearningRateScheduler
callbacks = [
    TensorBoard(log_dir='./logs'),
    EarlyStopping(monitor='val_loss', patience=3, verbose=1),
    LearningRateScheduler(decay),
    PrintLR()
]
!rm -rf logs

# Training

In [18]:
# Build and compile the model inside the distribution strategy's scope.
with strategy.scope():
    model = build_and_compile_model()

# Step counts are derived from the sample counts.
# NOTE(review): the divisor is GLOBAL_BATCH_SIZE * NUM_WORKERS; an earlier
# variant divided by GLOBAL_BATCH_SIZE only -- confirm which one is intended.
samples_per_step = GLOBAL_BATCH_SIZE * NUM_WORKERS
train_steps = num_train // samples_per_step
validate_steps = num_validate // samples_per_step

history = model.fit(
    train_dataset,
    epochs = EPOCHS,
    steps_per_epoch = train_steps,
    validation_data = validate_dataset,
    validation_steps = validate_steps,
    callbacks = callbacks,
    verbose = 1
)

INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 

INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 

  layer_config = serialize_layer_fn(layer)


INFO:tensorflow:Collective all_reduce tensors: 160 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 2, implementation = RING, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 160 all_reduces, num_device

In [None]:
# Save the trained model to 'model.hdf5' (path relative to the working directory).
model.save('model.hdf5')
# Open TensorBoard inline on the 'logs' directory used by the TensorBoard callback.
%tensorboard --logdir=logs