In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# create a data generator
datagen = ImageDataGenerator(
    rescale=1.0/255,
    brightness_range=[1.0, 1.75],
    zoom_range=[1.0, 1.5],
    horizontal_flip = True,
    validation_split=0.2)

# load and iterate training dataset
train_it = datagen.flow_from_directory(
    '../master-data/train_large/rgb', 
    target_size=(224, 224),
    subset="training", 
    class_mode="categorical",
    shuffle=True,
    batch_size=512)

val_it = datagen.flow_from_directory(
    '../master-data/train_large/rgb', 
    target_size=(224, 224), 
    subset="validation", 
    class_mode="categorical",
    shuffle=True,
    batch_size=512)

Found 72000 images belonging to 11 classes.
Found 18000 images belonging to 11 classes.


In [None]:
# Create a MirroredStrategy.
strategy = tf.distribute.MirroredStrategy()
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='max', verbose=1, save_best_only=True)

# Open a strategy scope.
with strategy.scope():
  # Everything that creates variables should be under the strategy scope.
  # In general this is only model construction & `compile()`.
    conv_model = VGG16(weights='imagenet', include_top=False, classes=11, input_shape=(224,224,3))
    for layer in conv_model.layers:
        layer.trainable = False
    
    # flatten the output of the convolutional part: 
    x = Flatten()(conv_model.output)
    # three hidden layers
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(256, activation='relu')(x)

    # final softmax layer
    predictions = Dense(11, activation='softmax')(x)

    # creating the full model:
    full_model = Model(inputs=conv_model.input, outputs=predictions)

    full_model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(),
                      metrics=['acc'])

# Train the model on all available devices.
history = full_model.fit(train_it, validation_data = val_it, workers=14, epochs=16, callbacks=[es, mc])

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')


2021-11-24 17:28:24.319168: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2021-11-24 17:28:24.818857: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:1a:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s
2021-11-24 17:28:24.821190: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 1 with properties: 
pciBusID: 0000:1c:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s
2021-11-24 17:28:24.823433: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 2 with properties: 
pciBusID: 0000:1d:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s
2

Number of devices: 4
Epoch 1/16
INFO:tensorflow:batch_all_reduce: 6 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:batch_all_reduce: 6 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:lo

2021-11-24 17:29:07.433090: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2021-11-24 17:29:46.338537: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).

Epoch 00001: val_loss improved from -inf to 1.88298, saving model to best_model.h5
Epoch 2/16
Epoch 00002: val_loss did not improve from 1.88298
Epoch 3/16
Epoch 00003: val_loss did not improve from 1.88298
Epoch 4/16
Epoch 00004: val_loss did not improve from 1.88298
Epoch 5/16
Epoch 00005: val_loss did not improve from 1.88298
Epoch 6/16
Epoch 00006: val_loss did not improve from 1.88298
Epoch 7/16
Epoch 00007: val_loss did not improve from 1.88298
Epoch 8/16
Epoch 00008: val_loss did not improve from 1.88298
Epoch 9/16
Epoch 00009: val_loss did not improve from 1.88298
Epoch 10/16
Epoch 00010: val_loss did not improve from 1.88298
Epoch 11/16

In [None]:
def plot_history(history):
    '''Plot loss and accuracy as a function of the epoch,
    for the training and validation datasets.
    '''
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']

    # Get number of epochs
    epochs = range(len(acc))

    # Plot training and validation accuracy per epoch
    plt.plot(epochs, acc, label="acc")
    plt.plot(epochs, val_acc, label="val_acc")
    plt.title('Training and validation accuracy')
    plt.legend()
    
    # Plot training and validation loss per epoch
    plt.figure()

    plt.plot(epochs, loss, label="loss")
    plt.plot(epochs, val_loss, label="val_loss")
    plt.title('Training and validation loss')
    plt.legend()
    
    plt.show()

In [None]:
plot_history(history)

Epoch 10/50
282/282 [==============================] - 601s 2s/step - acc: 0.6664 - loss: 0.9677 - val_acc: 0.5649 - val_loss: 1.4936