In [1]:
from keras_preprocessing import image
from keras_preprocessing.image import ImageDataGenerator
from keras.models import Input, Model
from keras.layers import Conv2D
import numpy as np
from keras import backend as K
import tensorflow as tf
import keras

2022-09-21 22:13:42.490525: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [7]:
img_size = 9
target_size = (img_size,img_size)
num_classes = 11
batch_size = 32
number_of_epochs = 50

# Create Datasets

In [8]:
train_dir = "/mnt/sda1/image_results-small/Train/"
valid_dir = "/mnt/sda1/image_results-small/Validation/"

train_datagen = ImageDataGenerator()


train_generator = train_datagen.flow_from_directory(train_dir,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    class_mode='categorical',
                                                    target_size=target_size,
                                                    color_mode="grayscale")

validation_datagen = ImageDataGenerator()
validation_generator = validation_datagen.flow_from_directory(valid_dir, 
                                                              shuffle=True,
                                                              batch_size=batch_size,
                                                              class_mode='categorical',
                                                              target_size=target_size,
                                                              color_mode="grayscale")

Found 567989 images belonging to 11 classes.
Found 55000 images belonging to 11 classes.


# Custom Metrics not supported in Tensforflow

In [9]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

# Verify GPU support

In [10]:
print("Running tensorflow version: {}".format(tf.keras.__version__))
print("Running tensorflow.keras version: {}".format(tf.__version__))
print("Running keras version: {}".format(keras.__version__))
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
tf.config.experimental.list_physical_devices('GPU')

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only allocate 2GB of memory on the first GPU
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    print(e)

Running tensorflow version: 2.4.0
Running tensorflow.keras version: 2.4.1
Running keras version: 2.4.3
Num GPUs Available:  1
1 Physical GPUs, 1 Logical GPUs


# Train model

In [11]:
with tf.device(tf.DeviceSpec(device_type="GPU", device_index='0')):
    # Grab pretrained model, include_top removes the classification layer
    original_ResNet50_model = tf.keras.applications.ResNet50V2(weights='imagenet', include_top=False, classes=num_classes)
    
    # Resnet wants a three chanel input, but we have grayscale images
    input_tensor = Input(shape=(img_size,img_size,1))
    # x has a dimension of (IMG_SIZE,IMG_SIZE,3)
    x = Conv2D(3,target_size,padding='same')(input_tensor) 
    # Prepend the Resnet model with the input tensor
    out = original_ResNet50_model(x) 
    ResNet50_model = Model(inputs=input_tensor,outputs=out)
    
    # Layers are frozen by default, performance seems to tank if we freeze them
    for layer in ResNet50_model.layers:
        layer.trainable = True
    
    # Creating fully connected layer for learning
    resnet50_x = tf.keras.layers.Flatten()(ResNet50_model.output)
    resnet50_x = tf.keras.layers.Dense(512,activation='relu')(resnet50_x)
    resnet50_x = tf.keras.layers.Dense(num_classes,activation='softmax')(resnet50_x)
    resnet50_x_final_model = tf.keras.Model(inputs=ResNet50_model.input, outputs=resnet50_x)
    
    #opt = tf.keras.optimizers.Adam(lr=0.01)
    opt = tf.keras.optimizers.SGD(lr=0.01,momentum=0.7)
    resnet50_x_final_model.compile(loss = 'categorical_crossentropy', optimizer= opt, metrics=['acc',f1_m,precision_m, recall_m])
    resnet_filepath = '/mnt/sda1/resnet-models/resnet50'+'-saved-model-{epoch:02d}-val_acc-{val_acc:.2f}.hdf5'
    resnet_checkpoint = tf.keras.callbacks.ModelCheckpoint(resnet_filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    resnet_early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.05, patience=5, min_lr=0.000002)
    tb_callback = tf.keras.callbacks.TensorBoard('./tb_logs', update_freq=1)
    callbacklist = [resnet_checkpoint,resnet_early_stopping,reduce_lr,tb_callback]

    resnet50_history = resnet50_x_final_model.fit(train_generator, epochs = number_of_epochs ,validation_data = validation_generator,callbacks=callbacklist,verbose=1)


2022-09-21 22:14:59.369036: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2022-09-21 22:14:59.369061: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.
2022-09-21 22:14:59.369158: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
2022-09-21 22:14:59.369236: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1487] CUPTI activity buffer flushed


Epoch 1/50


2022-09-21 22:15:02.394085: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.41GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-09-21 22:15:02.710700: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.27GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-09-21 22:15:04.232969: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.27GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-09-21 22:15:04.560015: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Alloc

    2/17750 [..............................] - ETA: 39:54 - loss: 2.6915 - acc: 0.1094 - f1_m: 0.0000e+00 - precision_m: 0.0000e+00 - recall_m: 0.0000e+00    

2022-09-21 22:15:23.779397: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2022-09-21 22:15:23.779422: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.
2022-09-21 22:15:23.823986: I tensorflow/core/profiler/lib/profiler_session.cc:71] Profiler session collecting data.
2022-09-21 22:15:23.825131: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1487] CUPTI activity buffer flushed
2022-09-21 22:15:23.833466: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:228]  GpuTracer has collected 1579 callback api events and 1555 activity events. 
2022-09-21 22:15:23.862069: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
2022-09-21 22:15:23.890656: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: ./tb_logs/train/plugins/profile/2022_09_21_22_15_23
2022-09-21 22:15:23.910683: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped gzip



2022-09-21 22:22:04.321409: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.29GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-09-21 22:22:04.745707: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.36GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-09-21 22:22:04.967354: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.36GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-09-21 22:22:05.299320: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Alloc

