### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import os.path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.metrics import confusion_matrix
import shutil
import random

2022-12-12 18:07:01.539949: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Creating new folders for respective classes

In [7]:

os.mkdir('dogs-vs-cats/images/cat')
os.mkdir('dogs-vs-cats/images/dog')

folder = 'dogs-vs-cats/images/'

for file in os.listdir(folder):
    if file.startswith('cat.'):
        shutil.move(folder + file, folder + 'cat')
    elif file.startswith('dog.'):
        shutil.move(folder + file, folder + 'dog')
        


FileExistsError: [Errno 17] File exists: 'dogs-vs-cats/images'

### Removing images

In [14]:
files = os.listdir('dogs-vs-cats/images/cat')
for file in random.sample(files,12000):
    os.remove('dogs-vs-cats/images/cat/' + file)

files = os.listdir('dogs-vs-cats/images/dog')
for file in random.sample(files,12000):
    os.remove('dogs-vs-cats/images/dog/' + file)
    


### Set image directory

In [2]:
image_dir = Path('../cats-and-dogs-data-mining/dogs-vs-cats/images/')

### Create filepath dataframe

In [3]:
filepaths = list(image_dir.glob(r'**/*.jpg')) #find all .jpg files within the current folder
labels = list(map(lambda x: os.path.split(os.path.split(x)[0])[1], filepaths)) #how we pull labels

filepaths = pd.Series(filepaths, name = 'Filepath').astype(str)
labels = pd.Series(labels, name = 'Label')

image_df = pd.concat([filepaths, labels], axis = 1)

### Train-Test Split

In [4]:
train_df, test_df = train_test_split(image_df, train_size = 0.7, shuffle = True, random_state = 1)

### Load Image Data

In [5]:
# Allows us to load a subset of images at a time, train them and recycle the memory so we don't run out
train_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale = 1./255, # scale pixel intensity values from 0 - 255 down to 0 - 1
    horizontal_flip=True, # Make our model more resilient to horizontally flipped pics
    width_shift_range=0.2, # Shift width by 20%
    height_shift_range=0.2, # Shift height by 20%
    validation_split = 0.2 # Pull train and test images through the same generato
)

test_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale = 1./255
)

In [6]:
# Flow the images (specify how the images will be loaded)
train_images = train_generator.flow_from_dataframe(
    dataframe = train_df,
    x_col="Filepath",
    y_col="Label",
    target_size = (224,224), # Standardize image size
    color_mode='rgb', # Our images are colorized
    class_mode='binary', # we have 2 classes only
    batch_size = 32, # how many images to load at a time
    shuffle = True, # Shuffle for training
    seed=42, # makes sure the shuffling is always the same way, and always the same subset
    subset = 'training'
)

val_images = train_generator.flow_from_dataframe(
    dataframe = train_df,
    x_col="Filepath",
    y_col="Label",
    target_size = (224,224), # Standardize image size
    color_mode='rgb', # Our images are colorized
    class_mode='binary', # we have 2 classes only
    batch_size = 32, # how many images to load at a time
    shuffle = True, # Shuffle for training
    seed=42, # makes sure the shuffling is always the same way, and always the same subset
    subset = 'validation'
)

test_images = train_generator.flow_from_dataframe(
    dataframe = test_df,
    x_col="Filepath",
    y_col="Label",
    target_size = (224,224), # Standardize image size
    color_mode='rgb', # Our images are colorized
    class_mode='binary', # we have 2 classes only
    batch_size = 32, # how many images to load at a time
    shuffle = False, # False since we are only evaluating, not training
)

Found 560 validated image filenames belonging to 2 classes.
Found 140 validated image filenames belonging to 2 classes.
Found 300 validated image filenames belonging to 2 classes.


### Model Training

In [8]:
inputs = tf.keras.Input(shape=(224, 224, 3)) # One for each color channel
# 16 filters, kernal size of 3x3. 
# The convolutional layer will look at the image, slide a window across the image, and the window
# weights will multiply by the pixel values, sum them up, and send that to a new 2D feature
# We will end up with a new 2D array with the values. 
# Filters specify how many times we want to do this full pass over the image.
# The kernal size represents how big the window is
# The whole point of a Convolutional Neural Network is to extract features that 
# are useful for predicting
# If we were to pass each pixel as an individual feature, the model would be too complex and likely
# overfit. Also, there is no way to capture the spatial relationship between the data
x = tf.keras.layers.Conv2D(filters=16, kernel_size=(3, 3), activation='relu')(inputs)
# maxpool also sends a window across the image, and takes a max of 4 pixels.
# allows the next convolutional data to reduce the dimensions of the data and keep the most
# important pixels (simplified, high level view of each image)
# Each time we maxpool, we lose information, but make it easier for the next layer to grasp
# high level relationships in the data
x = tf.keras.layers.MaxPool2D()(x)
x = tf.keras.layers.Conv2D(filters=32, kernel_size=(3,3), activation = 'relu')(x)
x = tf.keras.layers.MaxPool2D()(x)
# Average over the first 2 dimensions so that we just end up with 32 features. 
# These features could be anything like pointy ears for cats, or floppy ears for dogs
x = tf.keras.layers.GlobalAveragePooling2D()(x)
# create the actual classifier, a 2 hidden layer dense NN
x = tf.keras.layers.Dense(128, activation = 'relu')(x)
x = tf.keras.layers.Dense(128, activation = 'relu')(x)
# outputs is another dense layer with 1 output value and sigmoid activation since
# it is a binary classification task
# sigmoid gives it the effect of being betweem 0 or 1
# so the output is a single prob estimate of the prob that one of the classes is present in the image
# In this case, 1 = dog and 0 = cat
# So the output is the probability of a dog since that is the positive class
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

# Create the model
model = tf.keras.Model(inputs = inputs, outputs = outputs)

# Model compiler with adam optimizer, binary crossentropy loss, and accuracy as the metric
model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy']
)

# 100 epochs with early stopping callback. Early stopping will look at the validation loss so we can
# monitor the validation loss, when the val loss has not improved after 5 epochs, it will stop
# training and restore the weights from the best epoch
# We chose to reduce the learning rate to stabilize model training. Validation loss was fluctuating
# a lot previously.
history = model.fit(
    train_images,
    validation_data = val_images,
    epochs = 100,
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor = "val_loss",
            patience = 5,
            restore_best_weights = True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor = "val_loss",
            patience = 3
        )
    ]
)

2022-12-12 17:39:37.978350: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
