### Loading necessary libraries

In [1]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.image import imread
import seaborn as sns
from PIL import Image
import glob

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Dropout
from sklearn.metrics import classification_report

from tensorflow.keras.callbacks import EarlyStopping

import joblib

### Defining file paths

In [10]:
# Root folders for the image dataset and for the persisted model
data_dir = '../data/'
model_dir = '../model/'

# Location of the serialized CNN inside the model folder
model_file = f'{model_dir}CNN.joblib'

# Names of the entries found directly under the data folder
dir_ = os.listdir(data_dir)

In [11]:
# Display the entry names that os.listdir returned for data_dir
dir_

['valid', 'train', 'test', 'images2predict']

### Retrieving directories list

In [15]:
# Map each entry of dir_ to the paths found directly beneath it
all_dir = {
    name: glob.glob(data_dir + name + '/*')
    for name in dir_
}

### Retrieving data files

In [80]:
# For every data folder, collect the paths of the files to work with
files = {}

for key in dict.fromkeys(dir_) :
    entries = all_dir[key]

    if key == 'images2predict' :
        # Entries of this folder are taken as they are
        files[key] = list(entries)
        continue

    # Other folders: gather the .jpg files one level below each entry
    files[key] = [
        jpg_path
        for entry in entries
        for jpg_path in glob.glob(entry + '/*.jpg')
    ]


### Investigate dataset resolution distribution

In [84]:
for key in dict.fromkeys(dir_) :
    # Shape tuple of every image in this folder, as reported by plt.imread.
    # Only the shapes are kept: holding on to the decoded pixel arrays as well
    # would keep the whole folder in memory although nothing reads them.
    shape = [plt.imread(file).shape for file in files[key]]

    # Nothing to summarise for a folder without images
    if not shape :
        print(f'No images found in the {key} folder\n\n')
        continue

    # Regroup the shapes per axis: one tuple of values for each dimension.
    # zip stops at the shortest shape, so when shapes of different length are
    # mixed only the leading axes they all share are summarised.
    dims = tuple(zip(*shape))

    # Per-axis sum, percentiles and extremes
    shape_sum = tuple(map(sum, dims))
    shape_median = tuple(np.percentile(dim, 50) for dim in dims)
    shape_per25 = tuple(np.percentile(dim, 25) for dim in dims)
    shape_per75 = tuple(np.percentile(dim, 75) for dim in dims)
    shape_per90 = tuple(np.percentile(dim, 90) for dim in dims)
    shape_min = tuple(map(min, dims))
    shape_max = tuple(map(max, dims))

    # Calculating the shape average (integer division, so it stays a whole number)
    shape_avg = tuple(i // len(shape) for i in shape_sum)

    # Print results
    print(f'The average dimensions for images in the {key} folder is {shape_avg}')
    print(f'The maximum dimensions for images in the {key} folder is {shape_max}')
    print(f'The mimimum dimensions for images in the {key} folder is {shape_min}')
    print(f'The median dimensions for images in the {key} folder is {shape_median}')
    print(f'The 25th percentile dimensions for images in the {key} folder is {shape_per25}')
    print(f'The 75th percentile dimensions for images in the {key} folder is {shape_per75}')
    print(f'The 90th percentile dimensions for images in the {key} folder is {shape_per90}\n\n')

The average dimensions for images in the valid folder is (224, 224, 3)
The maximum dimensions for images in the valid folder is (224, 224, 3)
The mimimum dimensions for images in the valid folder is (224, 224, 3)
The median dimensions for images in the valid folder is (np.float64(224.0), np.float64(224.0), np.float64(3.0))
The 25th percentile dimensions for images in the valid folder is (np.float64(224.0), np.float64(224.0), np.float64(3.0))
The 75th percentile dimensions for images in the valid folder is (np.float64(224.0), np.float64(224.0), np.float64(3.0))
The 90th percentile dimensions for images in the valid folder is (np.float64(224.0), np.float64(224.0), np.float64(3.0))


The average dimensions for images in the train folder is (224, 224, 3)
The maximum dimensions for images in the train folder is (224, 224, 3)
The mimimum dimensions for images in the train folder is (224, 224, 3)
The median dimensions for images in the train folder is (np.float64(224.0), np.float64(224.0), np

### Defining the image resolution

In [None]:
# Resolution handed to the data generators as target_size; it matches the
# 224 x 224 image dimensions reported by the resolution check above
res = (224, 224)

### Image Augmentation and Scaling

In [None]:
# Generator configuration: pixel values are multiplied by 1/255 and 20% of the
# images can be set aside through validation_split. No augmentation transform
# (rotation, flip, zoom, ...) is configured here.
img_gen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,
)

### Apply Image augmentation and scaling to dataset

In [None]:
# Read the training images from the 'train' folder. Pointing the generator at
# data_dir itself would make it use the split folders ('valid', 'train',
# 'test', 'images2predict') as class labels instead of the real classes.
# No subset is requested: data_dir already holds a dedicated 'valid' folder,
# so no fraction of the 'train' images needs to be held back.
train_data = img_gen.flow_from_directory(os.path.join(data_dir, 'train'),
                                         target_size=res, batch_size=64, class_mode='binary',
                                         shuffle=True)

# Class name -> label index mapping inferred from the sub-folders
train_data.class_indices

In [None]:
# Read the validation images from the 'valid' folder. Pointing the generator
# at data_dir itself would make it use the split folders ('valid', 'train',
# 'test', 'images2predict') as class labels instead of the real classes.
# No subset is requested so that every image of the folder is used.
# shuffle=False keeps the sample order fixed across passes.
val_data = img_gen.flow_from_directory(os.path.join(data_dir, 'valid'),
                                        target_size=res,batch_size=1, shuffle=False,
                                        class_mode='binary')

# Class name -> label index mapping inferred from the sub-folders
val_data.class_indices

In [None]:
# Read the test images from the 'test' folder. Reading data_dir with
# subset='validation' would yield exactly the same samples as the validation
# generator, so the model would never be evaluated on held-out test images.
# No subset is requested so that every image of the folder is used.
# shuffle=False keeps the sample order fixed across passes.
test_data = img_gen.flow_from_directory(os.path.join(data_dir, 'test'),
                                        target_size=res, batch_size=1, shuffle=False,
                                        class_mode='binary')

# Class name -> label index mapping inferred from the sub-folders
test_data.class_indices

### Load model if it already exists

In [4]:
# Reload previously saved artifacts instead of rebuilding them.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load artifacts that were produced by this project.
scaler_file = model_dir + 'CNN_scaler.joblib'

if os.path.exists(model_file) :
    # Load from the same path that was just checked
    cnn = joblib.load(model_file)

    # The scaler lives in its own file, so check for it separately
    if os.path.exists(scaler_file) :
        scaler = joblib.load(scaler_file)
    else :
        print('Scaler file not found')

    print('Model exists')

### Define model if it does not already exist

In [9]:
# Report when no saved model is found at model_file
model_missing = not os.path.exists(model_file)

if model_missing :
    print('Model does not exists')
    

Model does not exists


### Train the model on the dataset

### Save model

In [None]:
# Make sure the target folder exists before writing into it
os.makedirs(model_dir, exist_ok=True)

# Persist the model at the same path the load cell checks (model_file) and the
# scaler next to it.
# NOTE(review): cnn is presumably a Keras model; confirm that it round-trips
# through joblib, otherwise use the model's own save method.
joblib.dump(cnn, model_file)
joblib.dump(scaler, model_dir + 'CNN_scaler.joblib')