### Load libraries

In [None]:
from datetime import datetime
import numpy as np
import pandas as pd
import PIL
import PIL.Image
import tensorflow as tf
# from tensorflow.data import Dataset
from tensorflow._api.v2.data import Dataset
from tensorflow.keras import *
from tensorflow.keras.layers import *

print(f"Using Tensorflow {tf.__version__}")

import pathlib
import matplotlib.pyplot as plt
plt.style.use('dark_background')

# TensorFlow configuration
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)

# Import TensorDash
# from tensordash.tensordash import Tensordash

# Load the TensorBoard notebook extension
%load_ext tensorboard
log_dir = "logs/" + "bc=64"
tf.debugging.experimental.enable_dump_debug_info(log_dir, tensor_debug_mode="FULL_HEALTH", circular_buffer_size=-1)

### Load the dataset

In [None]:
# Folder holding the training images; report how many JPEG files it contains
data_dir = pathlib.Path("data/train")
image_count = sum(1 for _ in data_dir.glob('*.jpg'))
print(f"{image_count} images found!")

### Load the CSV file

In [None]:
# Batch size and image size shared by the input pipeline and the model
batch_size = 64
images_shapes = (32, 32)
# Class labels; integer label i in the CSV is replaced by class_names[i] below
class_names = ["no_cactus", "cactus"]

# Replace the integer "has_cactus" values with their class names, so the
# column holds string labels by the time it is handed to flow_from_dataframe
label_names = dict(enumerate(class_names))
images_csv = pd.read_csv("data/train.csv").replace({"has_cactus": label_names})
print(images_csv.head())

### TensorDash init

In [None]:
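# NOTE: credentials must never be committed with the notebook; read them from
# the environment (and rotate any password that was previously stored here).
# histories = Tensordash(
#     ModelName=f"Cactus (bc={batch_size}, no MaxPooling)",
#     email=os.environ["TENSORDASH_EMAIL"],
#     password=os.environ["TENSORDASH_PASSWORD"]
# )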

### Check the class distribution

In [None]:
def get_weights(class_names, samples):
    """Compute balanced class weights and plot the class distribution.

    A bar chart of the per-class sample counts is drawn as a side effect.

    Args:
        class_names: Sequence of class labels. The position of a label in
            this sequence is the key it receives in the returned dict.
        samples: Single-column pandas Series holding one label per sample.

    Returns:
        Dict mapping each class position to
        ``n_samples / (n_samples_in_class * n_classes)``.

    Raises:
        KeyError: If a label of ``class_names`` never occurs in ``samples``.
    """
    total_samples = samples.size
    counts_per_class = samples.value_counts()
    # Visual check of how the samples are spread over the classes
    counts_per_class.plot.bar()

    n_classes = len(class_names)
    weights_by_index = {}
    for index, label in enumerate(class_names):
        weights_by_index[index] = total_samples / (counts_per_class[label] * n_classes)
    return weights_by_index

# Compute class weights from each class's share of the dataset.
# Keras reads `class_weight` keyed by the label index the model is trained on.
# flow_from_dataframe is called without `classes=`, so it numbers the classes
# in sorted (alphanumeric) order of their names: "cactus" -> 0, "no_cactus" -> 1.
# That is not the order of `class_names`, so passing `class_names` directly
# would hand each class the other class's weight. Passing the sorted names
# makes key i the weight of the class that is trained as index i.
weights = get_weights(sorted(class_names), images_csv["has_cactus"])
print(weights)

### Preprocess the data

In [None]:
# Pixel rescaling plus sample-wise centering / std-normalization
normalization_options = dict(
    rescale=1. / 255,
    samplewise_center=True,
    samplewise_std_normalization=True,
)
# Random geometric augmentations
augmentation_options = dict(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode="nearest",
)
# A single generator serves both subsets; 20% of the rows are held out
# for validation.
# NOTE(review): the validation subset comes from this same generator, so it
# presumably receives the random augmentations too - confirm this is intended.
datagen = preprocessing.image.ImageDataGenerator(
    validation_split=0.2,
    **normalization_options,
    **augmentation_options,
)
def get_generator(subset):
    """Create the Keras dataframe iterator for one subset of `images_csv`.

    Args:
        subset: Subset name forwarded to `flow_from_dataframe`
            (the notebook uses "training" and "validation").

    Returns:
        A pair `(factory, n_batches)`. `factory` is a zero-argument callable
        that always returns the same underlying iterator object, and
        `n_batches` is `len()` of that iterator.
    """
    flow_options = dict(
        dataframe=images_csv,
        directory="data/train/",
        x_col="id",
        y_col="has_cactus",
        subset=subset,
        batch_size=batch_size,
        #seed=123,
        shuffle=True,  # Default value
        class_mode="categorical",
        target_size=images_shapes,  # All images should already be in 32x32
    )
    batches = datagen.flow_from_dataframe(**flow_options)

    def make_generator():
        # Callers get a callable rather than the iterator itself; every call
        # hands back the one shared iterator, not a fresh copy.
        return batches

    return make_generator, len(batches)

def get_dataset(subset):
    """Wrap the batch generator of `subset` in a `tf.data` dataset.

    One batch is drawn from the generator up front and the shape of its
    image tensor is printed.

    Args:
        subset: Subset name passed on to `get_generator`.

    Returns:
        A pair `(dataset, n_batches)`: the dataset yields
        `(images, one-hot labels)` float32 batches, and `n_batches` is the
        batch count reported by `get_generator`.
    """
    make_generator, batch_count = get_generator(subset)

    first_images = next(make_generator())[0]
    print(first_images.shape)

    # Leading dimension is None because the batch size is not fixed here
    batch_signature = (
        tf.TensorSpec(shape=(None, *images_shapes, 3), dtype=tf.float32),
        tf.TensorSpec(shape=(None,2), dtype=tf.float32),
    )
    dataset = Dataset.from_generator(make_generator, output_signature=batch_signature)
    return dataset, batch_count

In [None]:
# Build the training pipeline first, then the validation pipeline; each call
# returns the dataset together with its number of batches per epoch
(train_ds, train_ds_steps), (val_ds, val_ds_steps) = (
    get_dataset(subset) for subset in ("training", "validation")
)

In [None]:
# Show the tensor specs (shape and dtype) of the elements the dataset yields
print(train_ds.element_spec)

### Preview the images

In [None]:
# Preview up to nine images of one training batch in a 3x3 grid
plt.figure(figsize=(10, 10))
for images, labels in train_ds.take(1):
    # Never index past the end of a batch that holds fewer than nine images
    for i in range(min(9, images.shape[0])):
        plt.subplot(3, 3, i + 1)
        plt.imshow(images[i].numpy().astype(np.float32))
        # plt.title("Cactus" if labels[i] == 1. else "Not a cactus")
        # Labels are one-hot vectors. A set first column selects
        # class_names[1] ("cactus"); this relies on label column 0 being the
        # "cactus" class (Keras' sorted class order - confirm against the
        # generator's class_indices). The comparison is converted to a plain
        # int because a tensor is not a reliable Python list index.
        first_column_set = bool(labels[i][0] == 1)
        plt.title(class_names[int(first_column_set)])
        plt.axis("off")

In [None]:
# Print the shapes of the image tensor and the label tensor of one batch
for image_batch, labels_batch in train_ds.take(1):
    for batch in (image_batch, labels_batch):
        print(batch.shape)

### Configure the dataset for performance (cache + prefetch)

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

# Caching is left disabled; only prefetching is applied to both datasets.
# train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
# val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
train_ds, val_ds = (
    dataset.prefetch(buffer_size=AUTOTUNE) for dataset in (train_ds, val_ds)
)

# Check that a batch still has the expected shapes after prefetching
for image_batch, labels_batch in train_ds.take(1):
    for batch in (image_batch, labels_batch):
        print(batch.shape)

# Number of elements TensorFlow reports for the training dataset
print(tf.data.experimental.cardinality(train_ds))

### Define the model

In [None]:
# Convolutional feature extractor (the MaxPooling layer is commented out)
conv_layers = [
    Conv2D(16, 3, activation="relu", padding="same", input_shape=(*images_shapes, 3)),
    Conv2D(16, 3, activation="relu", padding="same"),
    BatchNormalization(),
    # MaxPooling2D(strides=2),
    Conv2D(32, 3, activation="relu", padding="same"),
    Dropout(0.2),
    Conv2D(64, 5, activation="relu", padding="same"),
    BatchNormalization(),
    Dropout(0.2),
]
# Fully connected classifier head ending in a 2-way softmax
dense_layers = [
    Flatten(),
    Dense(512, activation="relu"),
    Dropout(0.2),
    Dense(256, activation="relu"),
    Dense(2, activation="softmax"),
]

model = Sequential(conv_layers + dense_layers)
model.summary()

### Create callback

#### ReduceLROnPlateau

Reduces learning rate when a metric has stopped improving.

In [None]:
# Learning-rate schedule driven by the validation loss: once "val_loss" has
# stopped improving for `patience` epochs, the learning rate is multiplied by
# `factor`, and it is never lowered below `min_lr`.
reduce_lr_on_plateau = callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.15, # new_lr = lr * factor
    patience=3,
    min_lr=1e-5
)

#### EarlyStopping

Stops training when a monitored metric has stopped improving.

In [None]:
# Stop training once "val_loss" has gone `patience` epochs without improving.
# NOTE(review): `restore_best_weights` is not set, so the model presumably
# keeps the weights of the last epoch rather than the best one - confirm
# that this is intended.
early_stopping = callbacks.EarlyStopping(
    monitor="val_loss",
    patience=12
)

#### TensorBoard

In [None]:
# Write TensorBoard logs to `log_dir`; histogram_freq=1 requests histograms
# every epoch.
tensorboard_callback = callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

### Compile the model

In [None]:
model.compile(
    optimizer=optimizers.Adam(learning_rate=1e-3),
    # The targets are one-hot vectors (class_mode="categorical") and the model
    # ends in a 2-unit softmax, so categorical cross-entropy is the loss that
    # matches this setup. Binary cross-entropy is meant for independent
    # sigmoid outputs; it only happened to give the same value here because
    # the two softmax probabilities sum to one.
    loss=losses.CategoricalCrossentropy(),
    metrics=["accuracy"]
)

### Train the model

In [None]:
# Upper bound on the number of epochs; early stopping may end training sooner
epochs = 150

# Callbacks run during training (the TensorDash reporter stays disabled)
training_callbacks = [
    # histories,
    tensorboard_callback,
    reduce_lr_on_plateau,
    early_stopping,
]

# try:
# Explicit step counts are passed for both the training and validation datasets
history = model.fit(
    train_ds,
    epochs=epochs,
    steps_per_epoch=train_ds_steps,
    validation_data=val_ds,
    validation_steps=val_ds_steps,
    class_weight=weights,
    callbacks=training_callbacks,
)

# except:
#     histories.sendCrash()

In [None]:
# Per-epoch metrics recorded by model.fit
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

# Use the number of epochs that actually ran. EarlyStopping can end training
# before `epochs` is reached, and range(epochs) would then be longer than the
# recorded metrics, which makes plt.plot raise a shape-mismatch error.
epochs_range = range(len(acc))

plt.figure(figsize=(20, 8))
# Left panel: accuracy curves
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

# Right panel: loss curves
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()