In [1]:
from keras_preprocessing import image
from keras_preprocessing.image import ImageDataGenerator
from keras.models import Input, Model
from keras.layers import Conv2D
import numpy as np
from keras import backend as K
import tensorflow as tf
import keras

2022-09-22 19:19:58.551519: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [2]:
img_size = 32
target_size = (img_size,img_size)
num_classes = 8
batch_size = 32
number_of_epochs = 50

# Create Datasets

In [3]:
train_dir = "/mnt/sda1/image-results-darknet/Train/"
valid_dir = "/mnt/sda1/image-results-darknet/Validation/"

train_datagen = ImageDataGenerator()


train_generator = train_datagen.flow_from_directory(train_dir,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    class_mode='categorical',
                                                    target_size=target_size
                                                    )

validation_datagen = ImageDataGenerator()
validation_generator = validation_datagen.flow_from_directory(valid_dir, 
                                                              shuffle=True,
                                                              batch_size=batch_size,
                                                              class_mode='categorical',
                                                              target_size=target_size)

Found 470467 images belonging to 8 classes.
Found 80000 images belonging to 8 classes.


# Custom Metrics not supported in Tensforflow

In [4]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

# Verify GPU support

In [5]:
print("Running tensorflow version: {}".format(tf.keras.__version__))
print("Running tensorflow.keras version: {}".format(tf.__version__))
print("Running keras version: {}".format(keras.__version__))
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
tf.config.experimental.list_physical_devices('GPU')

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only allocate 2GB of memory on the first GPU
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    print(e)

Running tensorflow version: 2.4.0
Running tensorflow.keras version: 2.4.1
Running keras version: 2.4.3
Num GPUs Available:  1
1 Physical GPUs, 1 Logical GPUs


2022-09-22 19:20:13.857959: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-09-22 19:20:13.858844: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-09-22 19:20:13.887502: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-22 19:20:13.887822: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce GTX 1080 computeCapability: 6.1
coreClock: 1.7715GHz coreCount: 20 deviceMemorySize: 7.92GiB deviceMemoryBandwidth: 298.32GiB/s
2022-09-22 19:20:13.887842: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2022-09-22 19:20:13.923312: I tensorflow/stream_executor/platform/d

# Train model

In [7]:
with tf.device(tf.DeviceSpec(device_type="GPU", device_index='0')):
    # Grab pretrained model, include_top removes the classification layer
    ResNet50_model = tf.keras.applications.ResNet50V2(weights='imagenet', include_top=False, classes=num_classes, input_shape=(img_size,img_size,3))
    
    # Resnet wants a three chanel input, but we have grayscale images
    #input_tensor = Input(shape=(img_size,img_size,1))
    # x has a dimension of (IMG_SIZE,IMG_SIZE,3)
    #x = Conv2D(3,target_size,padding='same')(input_tensor) 
    # Prepend the Resnet model with the input tensor
    #out = original_ResNet50_model(x) 
    #ResNet50_model = Model(inputs=input_tensor,outputs=out)
    
    # Layers are frozen by default, performance seems to tank if we freeze them
    for layer in ResNet50_model.layers:
        layer.trainable = True
    
    # Creating fully connected layer for learning
    resnet50_x = tf.keras.layers.Flatten()(ResNet50_model.output)
    resnet50_x = tf.keras.layers.Dense(512,activation='relu')(resnet50_x)
    resnet50_x = tf.keras.layers.Dense(num_classes,activation='softmax')(resnet50_x)
    resnet50_x_final_model = tf.keras.Model(inputs=ResNet50_model.input, outputs=resnet50_x)
    
    #opt = tf.keras.optimizers.Adam(lr=0.01)
    opt = tf.keras.optimizers.SGD(lr=0.01,momentum=0.7)
    resnet50_x_final_model.compile(loss = 'categorical_crossentropy', optimizer= opt, metrics=['acc',f1_m,precision_m, recall_m])
    resnet_filepath = '/mnt/sda1/resnet-models/resnet50'+'-saved-model-{epoch:02d}-val_acc-{val_acc:.2f}.hdf5'
    resnet_checkpoint = tf.keras.callbacks.ModelCheckpoint(resnet_filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    resnet_early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.05, patience=5, min_lr=0.000002)
    tb_callback = tf.keras.callbacks.TensorBoard('./tb_logs', update_freq=1)
    callbacklist = [resnet_checkpoint,resnet_early_stopping,reduce_lr,tb_callback]

    resnet50_history = resnet50_x_final_model.fit(train_generator, epochs = number_of_epochs ,validation_data = validation_generator,callbacks=callbacklist,verbose=1)


2022-09-22 19:45:07.166240: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2022-09-22 19:45:07.166266: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.
2022-09-22 19:45:07.166347: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
2022-09-22 19:45:07.166419: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1487] CUPTI activity buffer flushed


Epoch 1/50
    2/14703 [..............................] - ETA: 30:16 - loss: 2.4721 - acc: 0.1641 - f1_m: 0.0545 - precision_m: 0.2232 - recall_m: 0.0312   

2022-09-22 19:45:10.191677: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2022-09-22 19:45:10.191701: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.


    5/14703 [..............................] - ETA: 35:08 - loss: 2.4322 - acc: 0.1679 - f1_m: 0.0493 - precision_m: 0.1955 - recall_m: 0.0284  

2022-09-22 19:45:10.403128: I tensorflow/core/profiler/lib/profiler_session.cc:71] Profiler session collecting data.
2022-09-22 19:45:10.405787: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1487] CUPTI activity buffer flushed
2022-09-22 19:45:10.415570: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:228]  GpuTracer has collected 1587 callback api events and 1563 activity events. 
2022-09-22 19:45:10.449231: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
2022-09-22 19:45:10.476822: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: ./tb_logs/train/plugins/profile/2022_09_22_19_45_10
2022-09-22 19:45:10.496577: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped gzipped tool data for trace.json.gz to ./tb_logs/train/plugins/profile/2022_09_22_19_45_10/drake-pc.trace.json.gz
2022-09-22 19:45:10.550307: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: ./tb_log

 1021/14703 [=>............................] - ETA: 8:46 - loss: 1.7289 - acc: 0.3809 - f1_m: 0.2550 - precision_m: 0.5901 - recall_m: 0.1758

2022-09-22 19:46:00.696122: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.27GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-09-22 19:46:01.025410: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.28GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-09-22 19:46:01.195562: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.28GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-09-22 19:46:01.442499: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Alloc


Epoch 00001: val_acc improved from -inf to 0.50964, saving model to /mnt/sda1/resnet-models/resnet50-saved-model-01-val_acc-0.51.hdf5
Epoch 2/50

Epoch 00002: val_acc improved from 0.50964 to 0.73256, saving model to /mnt/sda1/resnet-models/resnet50-saved-model-02-val_acc-0.73.hdf5
Epoch 3/50

Epoch 00003: val_acc improved from 0.73256 to 0.83184, saving model to /mnt/sda1/resnet-models/resnet50-saved-model-03-val_acc-0.83.hdf5
Epoch 4/50

Epoch 00004: val_acc improved from 0.83184 to 0.84188, saving model to /mnt/sda1/resnet-models/resnet50-saved-model-04-val_acc-0.84.hdf5
Epoch 5/50

Epoch 00005: val_acc did not improve from 0.84188
Epoch 6/50

Epoch 00006: val_acc did not improve from 0.84188
Epoch 7/50

Epoch 00007: val_acc improved from 0.84188 to 0.85406, saving model to /mnt/sda1/resnet-models/resnet50-saved-model-07-val_acc-0.85.hdf5
Epoch 8/50

Epoch 00008: val_acc did not improve from 0.85406
Epoch 9/50

Epoch 00009: val_acc improved from 0.85406 to 0.94214, saving model to 

KeyboardInterrupt: 