In [1]:
import tensorflow as tf
from tensorflow import keras

from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from keras.optimizers import Adam

from sklearn.metrics import accuracy_score, classification_report

import numpy as np
import pandas as pd
import random

# use pillow for checking image sizes
import os
from PIL import Image

# Checking Image Sizes

In [2]:
# function to identify all unique image sizes for images in a directory
def print_img_sizes(path):
    # Get a list of all image file names in the directory
    image_files = [file for file in os.listdir(path) if file.endswith(('.jpg', '.jpeg', '.png'))]

    # Create a set to store unique image sizes
    unique_sizes = set()

    # Iterate over each image file and store its size in the set
    for image_file in image_files:
        image_path = os.path.join(path, image_file)
        with Image.open(image_path) as image:
            width, height = image.size
            size = (width, height)
            unique_sizes.add(size)

    # Print the unique image sizes
    print("Unique Image Sizes:")
    for size in unique_sizes:
        print(f"{size[0]}x{size[1]}")

In [3]:
print_img_sizes('../data/test/FAKE/')

Unique Image Sizes:
32x32


Images are 32x32, and are in color.

# Build ImageDataGenerator Classes for Training

In [4]:
# build an instance of the ImageDataGenerator class
datagen = ImageDataGenerator(
    rescale=1.0/255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True
)

In [5]:
# build the training set
train_generator = datagen.flow_from_directory(
    directory="../data/train",
    target_size=(32,32),
    batch_size=32,
    class_mode="categorical"
)

Found 100000 images belonging to 2 classes.


In [6]:
print("Image shape of each observation: ",train_generator.image_shape)
print("Number of classes: ",train_generator.num_classes)

Image shape of each observation:  (32, 32, 3)
Number of classes:  2


# Build ImageDataGenerator Class for Testing

In [7]:
# Generate the test set
test_generator = datagen.flow_from_directory(
    directory="../data/test",
    target_size=(32,32),
    class_mode="categorical",
    shuffle=False # Make sure this is false so that predictions will align w correct image labels later on
)

Found 20000 images belonging to 2 classes.


# Build the Neural Network

In [8]:
# Define model as a function so that we can perform hyperparameter tuning
def create_model(filters=32,kernel_size=(3,3), pool_size=(2,2), hidden_units=128, dropout_rate=.5, learning_rate=0.001):

    # Create an instance of Sequential
    classifier = Sequential()

    # Add a Conv2D layer. Applies a set of filters to the input data, each filter learns to recognize different patterns or features
    classifier.add(Conv2D(filters=filters, 
                        kernel_size=kernel_size, 
                        input_shape=train_generator.image_shape, 
                        activation='relu')
                        )

    # Add a MaxPooling2d layer. Performs downsampling on the data, reduces dimensions. Divides input data into non-overlapping regions (pooling windows).
    # Maximum value is output within each window.
    classifier.add(MaxPooling2D(pool_size))

    classifier.add(Conv2D(filters=filters, 
                        kernel_size=kernel_size, 
                        input_shape=train_generator.image_shape, 
                        activation='relu')
                        )
    
    classifier.add(MaxPooling2D(pool_size))

    # Add a Flatten layer. Reshape data into a 1d array. Transition the convolution and pooling layers to the fully connected layers.
    classifier.add(Flatten())

    # Add a Dense layer. A fully connected layer, allows for the learning of relationships. Activation function introduces non-linearity
    classifier.add(Dense(hidden_units,
                        activation='relu')
                        )
    
    classifier.add(Dropout(dropout_rate))

    # Add a Final Dense layer. This will output our probabilities.
    classifier.add(Dense(units=train_generator.num_classes,
                        activation='sigmoid')
                        )

    # Compile the model
    classifier.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return classifier

### Train and Evaluate the Model

In [9]:
 # Create and compile a model with the selected hyperparameters
model = create_model(filters=64,kernel_size=(3,3), pool_size=(3,3), hidden_units=256, dropout_rate=.2, learning_rate=0.001)

# Train the model
model.fit(train_generator, epochs=5, batch_size=32, steps_per_epoch=64, verbose=1)

# save the model to disk
model.save('mod_1.h5')
print('Model saved to disk')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model saved to disk


In [10]:
# function for evaluating model performance and generating predictions
def evaluate_model(path_to_mod, generator):
    # Load the trained model
    model = keras.models.load_model(path_to_mod)

    # Make predictions on the test set
    predictions = model.predict(generator)

    # Convert the predictions to class labels
    predicted_classes = np.argmax(predictions, axis=1)

    # Get the true class labels
    true_classes = generator.classes

    accuracy = accuracy_score(predicted_classes, true_classes)
    print('Prediction Accuracy: ', accuracy)

    return predicted_classes,true_classes

In [11]:
preds, actual = evaluate_model('mod_1.h5', test_generator)
print(classification_report(preds, actual))
print("Class Labels: ", test_generator.class_indices)

Prediction Accuracy:  0.76915
              precision    recall  f1-score   support

           0       0.76      0.78      0.77      9727
           1       0.78      0.76      0.77     10273

    accuracy                           0.77     20000
   macro avg       0.77      0.77      0.77     20000
weighted avg       0.77      0.77      0.77     20000

Class Labels:  {'FAKE': 0, 'REAL': 1}


### Random Grid Search

Performing a random grid search on a number of hyper parameters will assist us with fune tuing our model for the best performance.

In [25]:
# Define a hyper parameter grid
param_grid = {
    'filters': [16, 24, 32, 64, 128, 256],
    'kernel_size': [(2, 2), (3, 3)],
    'pool_size': [(2, 2), (3,3)],
    'hidden_units': [64, 128, 256],
    'dropout_rate': [0.2, 0.3, 0.5],
    'learning_rate': [.0001, 0.001, 0.005]
}

# Perform random grid search
num_iterations = 2
best_accuracy = 0.0 # placeholder to track best accuracy
best_params = None # placeholder to track best parameters

# Create a dataframe to track model performance using various hyper parameters
col_names = list(param_grid.keys()) # names of hyper parameters
params_df = pd.DataFrame(columns=col_names)
mod_df = pd.DataFrame(columns=['mod_name','accuracy'])
params_df = pd.concat([params_df, mod_df], axis=1) # combine to create df for performance tracking

for i in range(num_iterations):
    # Randomly select hyperparameters from the grid
    params = {param: random.choice(values) for param, values in param_grid.items()}

    print(params)

    # Create and compile the model with the selected hyperparameters
    model = create_model(**params)

    # Train the model
    model.fit(train_generator, epochs=5, batch_size=32, steps_per_epoch=64, verbose=0)

    # save the model to disk
    mod_name = 'mod_rgs_'+str(i)+'.h5'
    model.save(mod_name)
    print('Model saved to disk')

    preds, actual = evaluate_model(mod_name, test_generator)

    accuracy = accuracy_score(preds,actual)

    # add a row to our tracker dataframe
    converted_list = [str(item) if isinstance(item, set) else item for item in list(params.values())] # convert items in a list that are a set to string
    converted_list.extend([mod_name, accuracy])
    params_df = pd.concat([params_df, pd.DataFrame([converted_list], columns=params_df.columns)], ignore_index=True) # add new row to tracker df

    # Check if the current model outperforms the previous best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = params

        model.save('best_model.h5')

{'filters': 24, 'kernel_size': (2, 2), 'pool_size': (2, 2), 'hidden_units': 256, 'dropout_rate': 0.2, 'learning_rate': 0.001}
Model saved to disk
Prediction Accuracy:  0.80375
{'filters': 32, 'kernel_size': (3, 3), 'pool_size': (3, 3), 'hidden_units': 256, 'dropout_rate': 0.2, 'learning_rate': 0.001}
Model saved to disk
Prediction Accuracy:  0.7396


In [26]:
params_df

Unnamed: 0,filters,kernel_size,pool_size,hidden_units,dropout_rate,learning_rate,mod_name,accuracy
0,24,"(2, 2)","(2, 2)",256,0.2,0.001,mod_rgs_0.h5,0.80375
1,32,"(3, 3)","(3, 3)",256,0.2,0.001,mod_rgs_1.h5,0.7396
