### Making the necessary imports

In [None]:
import os
import time

import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout
# from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

### Setting the batch size, number of epochs and target size

In [None]:
# Training hyperparameters: images per batch and number of passes over the data.
batch_size, epochs = 16, 25
# Size (in pixels) every image is resized to when the datasets are loaded.
image_size = 512, 512

### Creating training and validation images data

Here we use the function *image_dataset_from_directory* setting the training data from the *Dataset/training* folder.

We set *validation_split* to "*0.2*" and *subset* to "*training*" for the training dataset and to "*validation*" for the
validation dataset. In other words, from all the images inside the *training* folder, 80% are used for training and 20% for validation.

The *image_size* and *batch_size* are the same as defined above.

In [None]:
# Options shared by both loaders: they read the same folder with the same
# split fraction and seed, and differ only in which subset they request.
dataset_options = dict(
    validation_split=0.2,
    seed=1337,
    image_size=image_size,
    batch_size=batch_size,
    label_mode='categorical',
)

# 80% portion of the images, used to fit the model.
train_ds = keras.preprocessing.image_dataset_from_directory(
    'Dataset/training', subset='training', **dataset_options
)
# Remaining 20% portion, used to validate the model during training.
val_ds = keras.preprocessing.image_dataset_from_directory(
    'Dataset/training', subset='validation', **dataset_options
)

### Print the class names and set the number of classes

In [None]:
# Read the class labels from the training dataset and count them; the count
# sizes the output layer of the model.
class_names = train_ds.class_names
num_classes = len(class_names)
print(f'Class names: {class_names}', f'Number of classes: {num_classes}', sep='\n')

### Prepare data augmentation and normalization layers

The dataset used is very small (it only has 1216 images), so the model might have a tendency to overfit. To prevent that
from happening, we apply random transformations to the images to increase the amount and the variety of images that the model is processing.

Our images have RGB coefficients ranging from 0 to 255, and values that large are not well suited for our model to
process. In order to simplify this, we use a normalization layer which, by rescaling with a factor of 1./255, changes
all pixel values to a range of 0 to 1.

In [None]:
# Shorthand for the namespace that holds the preprocessing layers.
preprocessing_layers = keras.layers.experimental.preprocessing
# Shape of a single input image: height, width and 3 colour channels (RGB).
augmentation_input_shape = (image_size[0], image_size[1], 3)

# Random flips, rotation and zoom, applied in this order.
augmentation_steps = [
    preprocessing_layers.RandomFlip('horizontal', input_shape=augmentation_input_shape),
    preprocessing_layers.RandomFlip('vertical', input_shape=augmentation_input_shape),
    preprocessing_layers.RandomRotation(0.1),
    preprocessing_layers.RandomZoom(0.1),
]
data_augmentation = keras.Sequential(augmentation_steps)

# Multiplies every pixel value by 1/255.
normalization_layer = preprocessing_layers.Rescaling(1./255)

### Prefetching, Shuffling and Caching

The *prefetch* method separates the moment when the data is created from the moment when the data is consumed, basically
it uses a separate thread and an internal buffer to prefetch entry data elements before the moment they are consumed.

The *shuffle* method randomly shuffles elements from the dataset.

And the *cache* method stores the dataset in memory or local storage, preventing unnecessary operations (like file
openings and data readings) from happening during each epoch.

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

# Training pipeline: cache, then shuffle with a buffer of 1000 elements, then prefetch.
train_ds = train_ds.cache()
train_ds = train_ds.shuffle(1000)
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)

# Validation pipeline: cache and prefetch only (no shuffling).
val_ds = val_ds.cache()
val_ds = val_ds.prefetch(buffer_size=AUTOTUNE)

### Creating the model

Here we create the structure of the model, being the first two layers the **data augmentation** and **normalization**
layers we previously created, followed by two **2D convolutional** layers with **16** output filters, kernel size of
__3__, padding "*same*" to guarantee that the output has the same width and height as the input and activation **ReLU**
(Rectified Linear Units).

Then a **2D max pooling** layer, which downsamples the input along its spatial dimensions (height and width) by taking
the maximum value over an input window for each channel of the input. The size of this input window is defined by a
parameter called "*pool_size*" and we used its default value, which is **(2, 2)**.

After this we have three more **2D convolutional** layers interspersed with three **2D max pooling** layers. These
layers work the same way as described above, the only change is in the convolutional layers where the new output filters
are 32, 64 and 128.

We then have a **dropout** layer, that randomly sets input units to 0 with a frequency of *__0.2__* at each step during
training time. This is used to prevent overfitting.

Then comes a **flatten** layer, which flattens the input, transforming all its channels into a single array.

And lastly, three **dense** layers. The first has **128** units and activation "**ReLU**", the second has **64** units and
the same activation as the previous one, and the last layer, the one that outputs the prediction, has the **number of
classes** as the number of units, in our case __2__, and its activation is "**softmax**".

#### ReLU and Softmax

ReLU is defined by the formula: $ReLU(x) = max(0, x)$

And Softmax is defined by the formula: $Softmax(x_i) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}$

In [None]:
# Convolutional part: two 16-filter convolutions followed by a max pooling,
# then one convolution + max pooling stage each for 32, 64 and 128 filters.
conv_stack = [
    Conv2D(16, 3, padding='same', activation='relu'),
    Conv2D(16, 3, padding='same', activation='relu'),
    MaxPooling2D(),
]
for n_filters in (32, 64, 128):
    conv_stack.append(Conv2D(n_filters, 3, padding='same', activation='relu'))
    conv_stack.append(MaxPooling2D())

# Classifier part: dropout, flatten, two hidden dense layers and a softmax
# output with one unit per class.
classifier_head = [
    Dropout(0.2),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax'),
]

# Augmentation and normalization come first, so they are part of the model itself.
model = Sequential([data_augmentation, normalization_layer] + conv_stack + classifier_head)

# Version tag of the layer structure above; it is embedded in the file names
# of the saved checkpoint, plot and model.
model_version = 4

### Show the model summary

In [None]:
# Print an overview of the model's layers.
model.summary()

### Set the optimizer and compile the model

For the optimizer we used *__Adam__*, an algorithm based on another well-known algorithm called "**Stochastic Gradient
Descent**", with a *learning rate* of *__0.001__*.

To compile our model, we use the method **compile** with 3 parameters:
- *__optimizer__*: the Adam optimizer
- *__loss__*: Categorical Crossentropy, the cross-entropy loss computed on the Softmax output of the last layer, defined
by $CE = -\sum_{i=1}^{C} t_i \log(s_i)$, where $C$ is the number of classes, $t_i$ is the ground truth and $s_i$ is the CNN score for each class.
- *__metrics__*: Accuracy, which is the metric we want the model to evaluate during training and testing.

In [None]:
# Adam optimizer with a learning rate of 0.001.
opt = Adam(learning_rate=1e-3)

# Categorical cross-entropy matches the one-hot ('categorical') labels of the
# datasets; accuracy is reported for both training and validation.
compile_options = {
    'optimizer': opt,
    'loss': keras.losses.categorical_crossentropy,
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

### Set checkpoints and early stopping

We use the callbacks *__ModelCheckpoint__* and *__EarlyStopping__*. The first one saves our progress whenever the model
improves during training, by saving a version of the current model in a specified location in our computer. The second
one stops the training of the model if there are no improvements in the monitored metric (in this case **accuracy**,
defined in the "*monitor*" parameter) in **20** epochs (defined in the "*patience*" parameter).

In [None]:
# Save the full model (not only its weights) whenever the training accuracy
# improves. The file name encodes the batch size, epoch count and model version.
# save_freq='epoch' checks the monitored metric once per epoch; an integer
# save_freq is counted in batches, so a value of 1 would trigger the check
# (and possibly a save) after every single batch, based on partial-epoch metrics.
checkpoint = ModelCheckpoint(f'Models/ModelCheckpoint-{batch_size}-{epochs}-v{model_version}.h5',
                             monitor='accuracy', verbose=1, save_best_only=True,
                             save_weights_only=False, mode='auto', save_freq='epoch')
# Stop training once the training accuracy has not improved for 20 epochs.
early = EarlyStopping(monitor='accuracy', min_delta=0, patience=20, verbose=1, mode='auto')


### Train the model

Here we start our model training.

For the parameters we used:
- *__train_ds__*: the training dataset we prepared in the beginning.
- *__validation_data__*: *__val_ds__*, the validation dataset we prepared in the beginning.
- *__epochs__*: The *__epochs__* variable we set in the beginning.
- *__callbacks__*: The *__checkpoint__* and *__early stopping__* functions we just made.

We save our training in the *__hist__* variable so we can plot the training and validation data.

We also print out the elapsed time just for a better perception of the process.

In [None]:
# Wall-clock timestamp taken right before training starts.
start = time.time()

# Train with the checkpoint and early-stopping callbacks; the returned history
# object holds the per-epoch metrics used for plotting.
fit_options = dict(validation_data=val_ds, epochs=epochs, callbacks=[checkpoint, early])
hist = model.fit(train_ds, **fit_options)

# Elapsed training time, converted from seconds to minutes.
elapsed_seconds = time.time() - start
total_time = elapsed_seconds / 60
print(f'\nElapsed time: {total_time:.2f} minutes')

### Plot the training data

Here we store our model's training data in variables just for clarity.

We plot two graphs, the first with the training and validation accuracy and the second with the training and validation
loss.

We then save these plotted graphs to a PNG file inside the _Plots_ folder, with its name stating the **batch size** and
**epochs** used as well as which **model structure version** we are using. The latter only needs to be changed if the
structure in "**Creating the model**" is changed. This variable is right below the model structure.

In [None]:
# Per-epoch metrics recorded during training.
acc = hist.history['accuracy']
val_acc = hist.history['val_accuracy']

loss = hist.history['loss']
val_loss = hist.history['val_loss']

plt.figure(figsize=(8, 8))

# Left panel: training vs. validation accuracy.
plt.subplot(1, 2, 1)
plt.plot(acc, label='Training Accuracy')
plt.plot(val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

# Right panel: training vs. validation loss.
plt.subplot(1, 2, 2)
plt.plot(loss, label='Training Loss')
plt.plot(val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')

# savefig() does not create missing folders, so make sure the target folder
# exists; otherwise the figure would be lost after a full training run.
os.makedirs('Plots', exist_ok=True)
plt.savefig(f'Plots/Batch Size {batch_size} - Epochs {epochs} - Model v{model_version}.png')

### Save the model

Finally, we save our model in the _Models_ folder with the same information as the plot PNG.

In [None]:
# The file name carries the batch size, epoch count and model version, like the plot PNG.
final_model_path = f'Models/Batch Size {batch_size} - Epochs {epochs} - Model v{model_version}.h5'
model.save(final_model_path)
