In [None]:
!pip install -U numpy keras pandas tensorflow matplotlib sklearn 

In [None]:
# Convolutional Neural Network (CNN) algorithm based on Marsh work (https://www.kaggle.com/vbookshelf/cnn-how-to-use-160-000-images-without-crashing).
#
# Algorithm Developed to carry out the Course Conclusion Work (Trabalho de Conclusão de Curso - TCC) in Computer Science 
# at Universidade Estadual Paulista Júlio de Mesquita Filho Campus of Bauru (UNESP)
# Work developed by Gabriel Vieira under the guidance of Prof. Dr. Kelton Costa  
#
# Algorithm responsible for the execution of the neural network, 
# that is, the entire file configuration, model configuration, training and testing.
#
# !pip install -U numpy keras pandas tensorflow matplotlib sklearn 

import itertools
import os
import cv2
import shutil
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical 
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from keras.models import model_from_json

from sklearn.model_selection import train_test_split
# from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
from sklearn import metrics
from sklearn.model_selection import train_test_split

#--- Function for create Confusion Matrix
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    The matrix is drawn with ``plt.imshow`` on the current matplotlib figure;
    this function neither creates a figure nor calls ``plt.show()``.

    Args:
        cm: 2-D array-like confusion matrix. Rows are plotted on the
            'True label' axis and columns on the 'Predicted label' axis.
        classes: Sequence of class names used as tick labels on both axes.
        normalize: If True, every row is divided by its row sum before
            printing and plotting, and cells are formatted with two decimals.
            Rows whose sum is zero are left as zeros.
        title: Title of the plot.
        cmap: Matplotlib colormap used for the heat map.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class without any true sample has a zero row total; dividing by it
        # would fill the row with NaN and poison cm.max() below, so such rows
        # are kept at 0 instead.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text so it stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
        
#--- Getting access to Google Drive
# This script runs on Google Colab: the Drive is mounted first so that the
# project folder below becomes reachable through the local file system.
from google.colab import drive
drive.mount('/content/drive/', force_remount = True)

#--- Moving to the project folder
# Every relative path used later in this script ('input/...', 'base_dir/...',
# 'test_dir/...', 'model.h5', 'model.json') is resolved against this folder.
path = '/content/drive/My Drive/TCC_code_database/'
os.chdir(path)

# Side length (in pixels) of the square images fed to the network.
IMAGE_SIZE = 256
# NOTE(review): IMAGE_WIDTH, IMAGE_HEIGHT and IMAGE_CHANNELS are not referenced
# anywhere else in the visible script - presumably they describe the original
# images; confirm before relying on them.
IMAGE_WIDTH = 640
IMAGE_HEIGHT = 480
IMAGE_CHANNELS = 3
SAMPLE_SIZE = 3500 # the number of images we use from each of the three classes

# #--- List the amount of content in the 'train' and 'test' input folders
# print(len(os.listdir('input/train')))
# print(len(os.listdir('input/test')))

#--- Create a DataFrame listing all labelled images
# The code below reads the 'class' column of this CSV; the 'filename' column is
# read further down in the script.
df_data = pd.read_csv('input/training.csv')
print(df_data.shape)

#--- Check the class distribution
# The result has to be printed explicitly: a bare expression shows nothing
# when it is not the last statement of a notebook cell.
print(df_data['class'].value_counts())

#--- Preview the first rows
print(df_data.head())

#--- Balance the target distribution
# Draw the same number of rows (SAMPLE_SIZE) from each of the three classes so
# that no class dominates training. Note that .sample() raises a ValueError
# when a class holds fewer than SAMPLE_SIZE rows.
df_seam_carving = df_data[df_data['class'] == 'seam_carving'].sample(SAMPLE_SIZE, random_state=101)
df_seam_insertion = df_data[df_data['class'] == 'seam_insertion'].sample(SAMPLE_SIZE, random_state=101)
df_uncompressed = df_data[df_data['class'] == 'uncompressed'].sample(SAMPLE_SIZE, random_state=101)

df_data = pd.concat([df_seam_carving, df_seam_insertion, df_uncompressed], axis=0).reset_index(drop=True)
# Seed the shuffle like every other random step; an unseeded shuffle made the
# train/validation split below different on every run.
df_data = shuffle(df_data, random_state=101)
print(df_data['class'].value_counts())
print(df_data.head())

#--- Split into train and validation sets
# stratify=y keeps the class proportions identical in both sets.
y = df_data['class']
df_train, df_val = train_test_split(df_data, test_size=0.10, random_state=101, stratify=y)
print(df_train)
print(df_train.shape)
print(df_val.shape)
print(df_train['class'].value_counts())
print(df_val['class'].value_counts())

#--- Directory structure for the train and validation images
#
# base_dir/
#     train_dir/
#         seam_carving/   seam_insertion/   uncompressed/
#     val_dir/
#         seam_carving/   seam_insertion/   uncompressed/
#
# The folders are not created here: the snippets that build and fill them are
# kept below as commented-out, reference-only code.
base_dir = 'base_dir'
train_dir = os.path.join(base_dir, 'train_dir')
val_dir = os.path.join(base_dir, 'val_dir')

# Reference only - creation of the folder tree (one folder per class inside
# each of train_dir and val_dir):
# os.mkdir(base_dir)
# for split_dir in (train_dir, val_dir):
#     os.mkdir(split_dir)
#     for class_name in ('seam_carving', 'seam_insertion', 'uncompressed'):
#         os.mkdir(os.path.join(split_dir, class_name))

# Check that the class folders exist
print(os.listdir(train_dir))

# Index df_data by file name so the class of an image can be looked up directly
df_data.set_index('filename', inplace=True)

# File names belonging to the train and validation splits
train_list = list(df_train['filename'])
val_list = list(df_val['filename'])

# Reference only - copy of every image into the folder of its split and class.
# The class name doubles as the folder name; images that cannot be copied are
# skipped.
# for split_dir, image_list in ((train_dir, train_list), (val_dir, val_list)):
#     for image in image_list:
#         label = df_data.loc[image, 'class']
#         src = os.path.join('input/train', image)
#         dst = os.path.join(split_dir, label, image)
#         try:
#             shutil.copyfile(src, dst)
#         except OSError:
#             continue

# Number of images per class: first the three train folders, then the three
# validation folders
for split_dir in (train_dir, val_dir):
    for class_name in ('seam_carving', 'seam_insertion', 'uncompressed'):
        print(len(os.listdir(os.path.join(split_dir, class_name))))

#--- Directory structure for the test images
#
# test_dir/
#     test_images/
#
# Keras requires a path that points to a folder containing image folders, not
# to the images themselves; that is why the test images sit in test_images,
# nested inside test_dir.
test_dir = 'test_dir'
test_images = os.path.join(test_dir, 'test_images')

# Reference only - creation of both folders:
# os.mkdir(test_dir)
# os.mkdir(test_images)

# Check that test_dir exists (prints how many entries it holds)
print(len(os.listdir(test_dir)))

# Names of the raw test images
test_list = os.listdir('input/test')

# Reference only - copy of the raw test images into test_images
# (57458 images are expected there afterwards):
# for image in test_list:
#     shutil.copyfile(os.path.join('input/test', image),
#                     os.path.join(test_images, image))
# print(len(os.listdir(test_images)))

#--- Set up the data generators
train_path = 'base_dir/train_dir'
valid_path = 'base_dir/val_dir'
# Parent of the test_images folder: flow_from_directory() expects sub-folders.
test_path = 'test_dir'

train_batch_size = 10
val_batch_size = 10

# Pixel values are rescaled from [0, 255] to [0, 1]; no augmentation is applied.
datagen = ImageDataGenerator(rescale=1.0/255)


def _make_generator(directory, batch_size, shuffle_images=True):
    """Return a directory iterator over `directory` sharing the common settings.

    Images are resized to IMAGE_SIZE x IMAGE_SIZE and labels are one-hot
    encoded (class_mode='categorical').

    Args:
        directory: Folder that contains one sub-folder per class.
        batch_size: Number of images per batch.
        shuffle_images: Whether the iterator shuffles the images. Pass False
            when predictions must stay aligned with the iterator's
            `filenames` / `classes` attributes.
    """
    return datagen.flow_from_directory(directory,
                                       target_size=(IMAGE_SIZE, IMAGE_SIZE),
                                       batch_size=batch_size,
                                       class_mode='categorical',
                                       shuffle=shuffle_images)


train_gen = _make_generator(train_path, train_batch_size)
val_gen = _make_generator(valid_path, val_batch_size)

# Unshuffled, one-image-per-batch pass over the *validation* folder; used for
# the evaluation, confusion matrix and classification report.
test_gen_training = _make_generator(valid_path, 1, shuffle_images=False)

# Unshuffled, one-image-per-batch pass over the test images.
test_gen_testing = _make_generator(test_path, 1, shuffle_images=False)

# Sample and step counts come from what the generators actually found on disk.
# The DataFrame split is not a reliable source: the folders are filled by a
# separate one-off copy step, so their content may differ from df_train/df_val.
# len(generator) is the integer number of batches per epoch (np.ceil returned
# a float).
num_train_samples = train_gen.samples
num_val_samples = val_gen.samples
train_steps = len(train_gen)
val_steps = len(val_gen)

#--- Create the model architecture
# Three convolutional stages (3 x Conv2D -> MaxPooling2D -> Dropout) followed
# by a dense classifier with a 3-way softmax output.
kernel_size = (3, 3)
pool_size = (2, 2)
first_filters = 32
second_filters = 64
third_filters = 128

dropout_conv = 0.3
dropout_dense = 0.3

# Note: 'relu' is a piecewise-linear function that outputs its input directly
# when it is positive and outputs zero otherwise.
model = Sequential()

# The very first layer is the only one that declares the input shape.
model.add(Conv2D(first_filters, kernel_size, activation='relu',
                 input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3)))

# Remaining convolutions of each stage (the first stage already has one),
# each stage being closed by pooling and dropout.
for n_filters, n_convs in ((first_filters, 2), (second_filters, 3), (third_filters, 3)):
    for _ in range(n_convs):
        model.add(Conv2D(n_filters, kernel_size, activation='relu'))
    model.add(MaxPooling2D(pool_size=pool_size))
    model.add(Dropout(dropout_conv))

# Classifier head
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(dropout_dense))
model.add(Dense(3, activation='softmax'))

# model.summary()

#--- Train the model
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by recent Keras releases.
model.compile(Adam(learning_rate=0.0001), loss='categorical_crossentropy',
              metrics=['accuracy'])

# Get the labels that are associated with each index
print(val_gen.class_indices)

# With metrics=['accuracy'] tf.keras logs the validation metric under the key
# 'val_accuracy'. Monitoring 'val_acc' meant the key was never found, so the
# checkpoint was never written and the learning rate was never reduced.
filepath = "model.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1,
                             save_best_only=True, mode='max')

reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=3,
                              verbose=1, mode='max', min_lr=0.00001)

callbacks_list = [checkpoint, reduce_lr]

# Model.fit accepts generators directly (fit_generator is deprecated). The
# step counts are cast to int because they may have been computed with np.ceil.
history = model.fit(train_gen,
                    steps_per_epoch=int(train_steps),
                    validation_data=val_gen,
                    validation_steps=int(val_steps),
                    epochs=20, verbose=1,
                    callbacks=callbacks_list)

#--- Metric names, i.e. the order of the values returned by evaluate()
print(model.metrics_names)

#--- Use the best epoch
# The checkpoint file holds the weights of the epoch with the highest
# validation accuracy; load them before anything is serialized or evaluated.
model.load_weights(filepath)

#--- Write and read the JSON model file
# Serialize the architecture to JSON. The weights are already on disk in
# model.h5 (written by ModelCheckpoint); they are deliberately NOT saved again
# here, because doing so used to overwrite the best-epoch checkpoint with the
# last-epoch weights.
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
print("Saved model to disk")

# Round trip: rebuild the model from JSON and load the stored weights into it
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = models.model_from_json(loaded_model_json)
loaded_model.load_weights(filepath)
print("Loaded model from disk")

#--- Evaluate the model using the val set
# test_gen_training yields one unshuffled validation image per step, so
# len(test_gen_training) steps cover every image on disk exactly once.
val_loss, val_acc = model.evaluate(test_gen_training,
                                   steps=len(test_gen_training),
                                   verbose=0)

print('val_loss:', val_loss)
print('val_acc:', val_acc)

#--- Make a prediction on the val set
# These predictions feed the confusion matrix and the classification report.
# test_gen_training is unshuffled with batch_size=1, so running exactly
# len(test_gen_training) steps yields one prediction per validation image, in
# the same order as test_gen_training.classes. (Model.predict replaces the
# deprecated predict_generator.)
predictions = model.predict(test_gen_training, steps=len(test_gen_training), verbose=1)
print(predictions.shape)


#--- A note on Keras class index values
# Keras infers the classes from the sub-folder names and assigns each one an
# integer index. The mapping printed here is what decides which class every
# prediction column stands for.
print(test_gen_training.class_indices)

# Class names ordered by their Keras index, so that labels, DataFrame columns
# and prediction columns always line up.
cm_plot_labels = sorted(test_gen_training.class_indices,
                        key=test_gen_training.class_indices.get)
class_ids = list(range(len(cm_plot_labels)))

# Put the predictions into a dataframe, one probability column per class.
df_preds = pd.DataFrame(predictions, columns=cm_plot_labels)
print(df_preds.head())

# Get the true labels
y_true = test_gen_training.classes
# print(y_true)
# print(y_true.shape)

#--- What is the AUC Score?
# For a multi-class problem the whole probability matrix must be passed:
# roc_auc_score(y_true, predictions, multi_class='ovr')

#--- Create a Confusion Matrix
# Get the labels of the validation images.
test_labels_training = test_gen_training.classes
print(test_labels_training.shape)
print(test_labels_training)

# Print the label associated with each class
print(test_gen_training.class_indices)

# argmax returns the index of the max value in a row, i.e. the predicted class.
# Passing `labels` keeps one row/column per class even if a class is absent.
cm = confusion_matrix(test_labels_training, predictions.argmax(axis=1), labels=class_ids)
plot_confusion_matrix(cm, cm_plot_labels)


#--- Create a Classification Report
# For this to work we need y_pred as categorical labels, not as probabilities
y_pred_categorical = predictions.argmax(axis=1)
report = classification_report(y_true, y_pred_categorical, target_names=cm_plot_labels, labels=class_ids)
print(report)

# Recall = Given a class, will the classifier be able to detect it?
# Precision = Given a class prediction from a classifier, how likely is it to be correct?
# F1 Score = The harmonic mean of the recall and precision. Essentially, it punishes extreme values.
# Read the confusion matrix together with this report to compare how well each of the three classes is detected.

#--- Predict the test images
# The file names listed below come from test_gen_testing, so the predictions
# paired with them must come from the same generator. Previously the
# validation-set predictions were reused here and silently matched to test
# file names by row position. test_gen_testing is unshuffled with batch_size=1,
# so len(test_gen_testing) steps give one prediction per test image, in the
# order of test_gen_testing.filenames.
test_predictions = model.predict(test_gen_testing, steps=len(test_gen_testing), verbose=1)

# Are the number of predictions correct?
# Should be 57458.
print(len(test_predictions))
# Put the predictions into a dataframe
df_preds = pd.DataFrame(test_predictions, columns=['seam_carving', 'seam_insertion', 'uncompressed'])
# print(df_preds.head())

# This outputs the file names in the sequence in which
# the generator processed the test images.
test_filenames = pd.DataFrame(test_gen_testing.filenames)
print(test_filenames)
print(test_filenames.shape)

# Add the file names to the dataframe. A plain list is inserted so that a
# length mismatch raises an error instead of being hidden by index alignment.
df_preds.insert(loc=3, column='file_names', value=test_gen_testing.filenames)
print(df_preds.head())