# Experimental Template
The following notebook acts as a template for experiments. The one step not included is the data cleaning phase!
<br />
<br />
I have put FIXME tags next to the areas that you will need to address.
<br />
<br />
If you are curious to know more about how the code works, look at the `helper_functions.py` file.

## Imports and Setup

In [1]:
from helper_functions import pd, os, shutil
from helper_functions import convert_samples_to_binary, get_column_data_types, print_library_versions, add_id_column

In [2]:
print_library_versions()

pandas version:           1.4.1
matplotlib version:       3.5.1
numpy version:            1.18.5
bitstring version:        3.1.9
joblib version:           1.1.0
PIL version:              8.2.0


## Data Loading and Cleaning

In [3]:
# FIXME - update the filename to point to your dataset
filename = '/mnt/sda1/iris.csv'
df = pd.read_csv(filename)

In [4]:
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


# NOTE:
The following code segments are specific to my dataset. This is where you will need to perform your own data cleaning!!!
<br />
<br />
This will be the most difficult/time intensive aspect.

In [6]:
# FIXME - change the value 'species' to match the 'y' value of your dataset
# This function will give your dataframe an 'id' column which will be used to identify the samples in the dataset
df = add_id_column(df, 'Species')
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1-setosa,5.1,3.5,1.4,0.2,Iris-setosa
1,2-setosa,4.9,3.0,1.4,0.2,Iris-setosa
2,3-setosa,4.7,3.2,1.3,0.2,Iris-setosa
3,4-setosa,4.6,3.1,1.5,0.2,Iris-setosa
4,5-setosa,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,46-virginica,6.7,3.0,5.2,2.3,Iris-virginica
146,47-virginica,6.3,2.5,5.0,1.9,Iris-virginica
147,48-virginica,6.5,3.0,5.2,2.0,Iris-virginica
148,49-virginica,6.2,3.4,5.4,2.3,Iris-virginica


In [7]:
# FIXME - Perform any data cleaning or preprocessing steps here

In [8]:
# FIXME - get the correct values in x and Y for your dataset
x = df.drop(['Species', 'Id'], axis=1)
Y = df['Species']
print(f"x {x.shape}")
print(f"Y {Y.shape}")
print(f"This value should be True: {x.shape[0] == Y.shape[0]}")

x (150, 4)
Y (150,)
This value should be True: True


## Image Generation

In [9]:
# FIXME - update the image directory to point to a directory where you want the images to be saved. Create a folder there.
# Change any other constants you want to change
image_directory = "/mnt/sda1/image-results-iris"
feature_types = get_column_data_types(x)
precision = 64
one = 128
zero = 0
n_jobs = -1
# For feature_types, 0 = float, 1 = int, 2 = bool
print(feature_types)

[0, 0, 0, 0]


In [10]:
convert_samples_to_binary(x, df["Id"], image_directory, precision, one, zero, n_jobs, feature_types)

## Place Folders into the Correct Categories

In [11]:
dirs = Y.unique().tolist()
if not os.path.exists(image_directory + '/data'):
    os.mkdir(image_directory + '/data')
new_dir = image_directory + '/data/'
if not os.path.exists(new_dir+'Train/'):
    os.mkdir(new_dir+'Train/')
if not os.path.exists(new_dir+'Validation/'):
    os.mkdir(new_dir+'Validation/')
for i in dirs:
    i = str(i).split('-')[1]
    if not os.path.exists(new_dir+'Train/'+i):
        os.mkdir(new_dir+'Train/'+i)
    if not os.path.exists(new_dir+'Validation/'+i):
        os.mkdir(new_dir+'Validation/'+i)

### Place pictures into the correct folder

In [12]:
total_images = 0
type_counts = {value.split("-")[1]: 0 for value in dirs}
for file in os.listdir(image_directory):
    try:
        dir = file.split("-")[1].split(".")[0]
    except:
        continue
    type_counts[dir]+=1
    shutil.move(f"{image_directory}/{file}", f"{new_dir}Train/{dir}/{file}")
    total_images += 1
print(total_images)
print(type_counts)

150
{'setosa': 50, 'versicolor': 50, 'virginica': 50}


### Place 20% of the data into the test folder

In [13]:
import random
for dir in os.listdir(new_dir+"Train/"):
    images_to_move = int(type_counts[dir] * 0.2)
    # select images_to_move random images from the directory
    for _ in range(images_to_move):
        image = random.choice(os.listdir(f"{new_dir}Train/{dir}"))
        shutil.move(f"{new_dir}Train/{dir}/{image}", f"{new_dir}Validation/{dir}/{image}")

## Train ResNet50

In [1]:
from keras_preprocessing import image
from keras_preprocessing.image import ImageDataGenerator
from keras.models import Input, Model
from keras.layers import Conv2D
import numpy as np
from keras import backend as K
import tensorflow as tf
import keras

2023-03-02 16:45:23.753821: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [2]:
target_size = (4,64)
num_classes = 3
batch_size = 32
number_of_epochs = 50

In [4]:
train_dir = "/mnt/sda1/image-results-iris/data/Train/"
valid_dir = "/mnt/sda1/image-results-iris/data/Validation/"

train_datagen = ImageDataGenerator()


train_generator = train_datagen.flow_from_directory(train_dir,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    class_mode='categorical',
                                                    target_size=target_size
                                                    )

validation_datagen = ImageDataGenerator()
validation_generator = validation_datagen.flow_from_directory(valid_dir, 
                                                              shuffle=True,
                                                              batch_size=batch_size,
                                                              class_mode='categorical',
                                                              target_size=target_size)

Found 120 images belonging to 3 classes.
Found 30 images belonging to 3 classes.


In [5]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [6]:
print("Running tensorflow version: {}".format(tf.keras.__version__))
print("Running tensorflow.keras version: {}".format(tf.__version__))
print("Running keras version: {}".format(keras.__version__))
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
tf.config.experimental.list_physical_devices('GPU')

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only allocate 2GB of memory on the first GPU
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    print(e)

Running tensorflow version: 2.4.0
Running tensorflow.keras version: 2.4.1
Running keras version: 2.4.3
Num GPUs Available:  1
1 Physical GPUs, 1 Logical GPUs


2023-03-02 16:46:01.499739: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-03-02 16:46:01.500446: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2023-03-02 16:46:01.527597: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-02 16:46:01.527906: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce GTX 1080 computeCapability: 6.1
coreClock: 1.7715GHz coreCount: 20 deviceMemorySize: 7.92GiB deviceMemoryBandwidth: 298.32GiB/s
2023-03-02 16:46:01.527924: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-03-02 16:46:01.561279: I tensorflow/stream_executor/platform/d

In [9]:
with tf.device(tf.DeviceSpec(device_type="GPU", device_index='0')):
    # Grab pretrained model, include_top removes the classification layer
    ResNet50_model = tf.keras.applications.ResNet50V2(weights='imagenet', include_top=False, classes=num_classes, input_shape=(64,64,3))
    
    # Resnet wants a three chanel input, but we have grayscale images
    #input_tensor = Input(shape=(img_size,img_size,1))
    # x has a dimension of (IMG_SIZE,IMG_SIZE,3)
    #x = Conv2D(3,target_size,padding='same')(input_tensor) 
    # Prepend the Resnet model with the input tensor
    #out = original_ResNet50_model(x) 
    #ResNet50_model = Model(inputs=input_tensor,outputs=out)
    
    # Layers are frozen by default, performance seems to tank if we freeze them
    for layer in ResNet50_model.layers:
        layer.trainable = True
    
    # Creating fully connected layer for learning
    resnet50_x = tf.keras.layers.Flatten()(ResNet50_model.output)
    resnet50_x = tf.keras.layers.Dense(512,activation='relu')(resnet50_x)
    resnet50_x = tf.keras.layers.Dense(num_classes,activation='softmax')(resnet50_x)
    resnet50_x_final_model = tf.keras.Model(inputs=ResNet50_model.input, outputs=resnet50_x)
    
    #opt = tf.keras.optimizers.Adam(lr=0.01)
    opt = tf.keras.optimizers.SGD(lr=0.01,momentum=0.7)
    resnet50_x_final_model.compile(loss = 'categorical_crossentropy', optimizer= opt, metrics=['acc',f1_m,precision_m, recall_m])
    resnet_filepath = '/mnt/sda1/resnet-models/resnet50'+'-saved-model-{epoch:02d}-val_acc-{val_acc:.2f}.hdf5'
    resnet_checkpoint = tf.keras.callbacks.ModelCheckpoint(resnet_filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    resnet_early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.05, patience=5, min_lr=0.000002)
    tb_callback = tf.keras.callbacks.TensorBoard('./tb_logs', update_freq=1)
    callbacklist = [resnet_checkpoint,resnet_early_stopping,reduce_lr,tb_callback]

    resnet50_history = resnet50_x_final_model.fit(train_generator, epochs = number_of_epochs ,validation_data = validation_generator,verbose=1)


2023-03-02 16:48:36.380592: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2023-03-02 16:48:36.380620: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.
2023-03-02 16:48:36.380710: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
2023-03-02 16:48:36.380745: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1487] CUPTI activity buffer flushed


Epoch 1/50


InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument:  logits and labels must be broadcastable: logits_size=[16,3] labels_size=[32,3]
	 [[node categorical_crossentropy/softmax_cross_entropy_with_logits (defined at tmp/ipykernel_51580/3209915692.py:33) ]]
	 [[Func/cond/then/_0/input/_52/_44]]
  (1) Invalid argument:  logits and labels must be broadcastable: logits_size=[16,3] labels_size=[32,3]
	 [[node categorical_crossentropy/softmax_cross_entropy_with_logits (defined at tmp/ipykernel_51580/3209915692.py:33) ]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_20719]

Function call stack:
train_function -> train_function
