In [None]:
import numpy as np # For number storage and calculations
import os # For files
from sklearn.metrics import confusion_matrix # To evaluate the correctness of our model
import seaborn as sn; sn.set(font_scale=1.4) # To format our charts
from sklearn.utils import shuffle # To shuffle input data
import matplotlib.pyplot as plt # To plot our progress
import cv2 # Computer vision framework
import tensorflow as tf # Neural network framework
from tensorflow import keras # To build the layers of a network

In [None]:
# Load the image data from the class sub-folders of a downloaded folder
class_names = ['benign', 'malignant']
# Integer label of each class = its position in class_names
class_names_label = dict(zip(class_names, range(len(class_names))))
nb_classes = len(class_names)
# Size every image is resized to (passed straight to cv2.resize)
IMAGE_SIZE = (150, 150)
def load_data(directory=r"C:\Users\faith\OneDrive\Documents\Image_Classification",
              categories=("train", "test")):
    """Load every labelled image found under ``directory/<category>/<class name>/``.

    Args:
        directory: Root folder of the dataset. Defaults to the original
            hard-coded location so existing calls keep working.
        categories: Sub-folders to load, in the order they should be returned.

    Returns:
        A list with one ``(images, labels)`` tuple per category. ``images`` is a
        float32 array of RGB images resized to ``IMAGE_SIZE``; ``labels`` is an
        int32 array of the matching ``class_names_label`` values.
    """
    output = []
    for category in categories:
        path = os.path.join(directory, category)
        print(path)
        images = []
        labels = []
        print("Loading {}".format(category))
        # Sort the listings: os.listdir order is arbitrary, and a stable order is
        # needed for the seeded shuffle that follows to be reproducible
        for folder in sorted(os.listdir(path)):
            folder_path = os.path.join(path, folder)
            # Ignore stray files and folders that are not one of the known classes
            if folder not in class_names_label or not os.path.isdir(folder_path):
                print("Skipping unexpected entry: {}".format(folder_path))
                continue
            label = class_names_label[folder]
            for file in sorted(os.listdir(folder_path)):
                img_path = os.path.join(folder_path, file)
                image = cv2.imread(img_path)

                # cv2.imread returns None (it does not raise) for unreadable files
                if image is None:
                    print("Skipping unreadable file: {}".format(img_path))
                    continue

                # Convert default color formatting for CV2
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

                # Resize an image to a standard size
                image = cv2.resize(image, IMAGE_SIZE)

                # Add the images and labels to corresponding lists
                images.append(image)
                labels.append(label)

        # Convert the lists into numpy arrays
        images = np.array(images, dtype = 'float32')
        labels = np.array(labels, dtype = 'int32')
        output.append((images, labels))

    return output

In [None]:
# Load both splits; the test split is kept aside to evaluate the model later
train_split, test_split = load_data()

# Shuffle each split with a fixed seed, keeping every image aligned with its label
train_images, train_labels = shuffle(*train_split, random_state=25)
test_images, test_labels = shuffle(*test_split, random_state=25)

In [None]:
# Count the samples per class to check that benign (0) and malignant (1)
# are close in size for training
zeros = int(np.count_nonzero(train_labels == 0))
ones = int(np.count_nonzero(train_labels == 1))  # counted from zero, so the total is exact
print("zeros: " + str(zeros))
print("ones: " + str(ones))

In [None]:
# Count the samples per class in the test split: benign (0) and malignant (1)
zeros = int(np.count_nonzero(test_labels == 0))
ones = int(np.count_nonzero(test_labels == 1))  # counted from zero, so the total is exact
print("zeros: " + str(zeros))
print("ones: " + str(ones))

In [None]:
# Basic model: two convolution + pooling stages followed by a small dense classifier
the_model = tf.keras.Sequential([
    # Input layer has the dimensions of our images. IMAGE_SIZE is the (width, height)
    # given to cv2.resize, while the image arrays are (height, width, channels).
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu',
                           input_shape=(IMAGE_SIZE[1], IMAGE_SIZE[0], 3)),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),  # Reduces overfitting
    # One softmax output per class
    tf.keras.layers.Dense(nb_classes, activation='softmax'),
])

In [None]:
# Advanced model: a deeper CNN built from two double-convolution stages

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D

model = Sequential([
    # Stage 1: two 5x5 convolutions (32 filters), 2x2 max-pooling, light dropout
    Conv2D(filters=32, kernel_size=(5, 5), padding='Same', activation='relu', input_shape=(150, 150, 3)),
    Conv2D(filters=32, kernel_size=(5, 5), padding='Same', activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Stage 2: two 3x3 convolutions (64 filters), 2x2 max-pooling with stride 2, light dropout
    Conv2D(filters=64, kernel_size=(3, 3), padding='Same', activation='relu'),
    Conv2D(filters=64, kernel_size=(3, 3), padding='Same', activation='relu'),
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
    Dropout(0.25),
    # Classifier head: flatten, one hidden dense layer, heavier dropout, 2-way softmax
    Flatten(),
    Dense(256, activation="relu"),
    Dropout(0.5),
    Dense(2, activation="softmax"),
])

In [None]:
# ResNet50 transfer learning: a frozen, pre-trained ResNet50 base with a new dense head

# Everything is imported from the public `keras` package so the layers, the
# Sequential container and ResNet50 all come from the same implementation;
# `tensorflow.python.keras` is a private module and must not be mixed with it.
from keras.models import Sequential
from keras.layers import Dense, Flatten, BatchNormalization, Dropout, Conv2D, MaxPool2D
from keras.applications.resnet import ResNet50
# NOTE(review): this file name looks like the full (with-top) weights release;
# with include_top=False the "_notop" file is normally expected - confirm.
resnet_weights_path = '../Image_Classification/resnet50_weights_tf_dim_ordering_tf_kernels.h5'

# NOTE(review): this rebinds `model`, expects 224x224 inputs (load_data resizes to
# IMAGE_SIZE) and ends in a single sigmoid unit, which pairs with binary
# cross-entropy rather than the sparse categorical loss compiled for `the_model` -
# confirm the intended preprocessing and loss before training it.
model = Sequential()
# `classes` is not passed: it only applies when include_top=True
model.add(ResNet50(include_top=False, input_tensor=None, input_shape=(224, 224, 3), pooling='avg', weights=resnet_weights_path))
model.add(Flatten())  # output is already flat after pooling='avg'; kept so the layer list is unchanged
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dense(1, activation='sigmoid'))

# Freeze the ResNet50 base so only the new dense head is trained
model.layers[0].trainable = False
model.summary()

In [None]:
# Plot accuracy and validation accuracy of the model training
def plot_accuracy(history):
    """Plot per-epoch training accuracy, plus validation accuracy when it was recorded.

    Args:
        history: Object returned by ``model.fit``. Its ``history`` dict must hold an
            ``'accuracy'`` list and may hold a ``'val_accuracy'`` list (absent when
            the model was fitted without validation data).
    """
    metrics = history.history
    # One axes filling the whole figure, instead of one cell of an otherwise empty 2x2 grid
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(metrics['accuracy'], 'bo--', label='acc')
    if 'val_accuracy' in metrics:
        ax.plot(metrics['val_accuracy'], 'ro--', label="val_acc")
    ax.set_title("train_acc vs val_acc")
    ax.set_ylabel("accuracy")
    ax.set_xlabel("epochs")
    ax.legend()
    plt.show()

In [None]:
# TensorBoard charts the training progress and the structure of our model.
# It reads its data from a log directory; a fresh, timestamped directory is created for every run.

In [None]:
import tensorboard

In [None]:
%load_ext tensorboard

In [None]:
import datetime
# A timestamped folder name gives each run its own TensorBoard log directory
run_stamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
log_dir = os.path.join('logs', 'fit', run_stamp)
print(log_dir)

In [None]:
# Delete logs from earlier sessions so TensorBoard only shows the runs made from here on.
# shutil replaces the PowerShell call so the cell works on any OS and does not
# complain when the folder does not exist yet.
import shutil
shutil.rmtree('logs', ignore_errors=True)

In [None]:
# Create this run's log directory, then list the run folders to confirm it exists
os.makedirs(log_dir, exist_ok=True)
# Listed with Python instead of PowerShell so the cell is not tied to Windows
sorted(os.listdir(os.path.join('logs', 'fit')))

In [None]:
# Check the randomization of the test labels (the first few are enough to see the mix)
test_labels[:20]

In [None]:
# Configure training: the Adam optimizer, sparse categorical cross-entropy for the
# integer class labels, and accuracy reported after each epoch
the_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Callback that writes training logs to log_dir for TensorBoard;
# histogram_freq=1 also records histograms every epoch
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    histogram_freq=1,
    log_dir=log_dir,
)

In [None]:
# Train the model with the images and labels
# - batch_size: 128 was found by experiment to suit the size of this dataset
# - epochs: the best number depends on the model; more helps until accuracy plateaus
# - validation_split: the last 20% of the samples are held out and only used to score
#   the model after each epoch; the model never learns from them
# - shuffle: the remaining training samples are reshuffled before every epoch
history = the_model.fit(
    train_images,
    train_labels,
    epochs=5,
    batch_size=128,
    validation_split=0.2,
    shuffle=True,
    callbacks=[tensorboard_callback],
)

In [None]:
# Chart the accuracy curves recorded in `history` by the training cell
plot_accuracy(history)

In [None]:
# Wrap the model for magnitude-based weight pruning
# NOTE(review): `model` here is whichever network was assigned last, not the
# `the_model` that the training cell fits, and `pruned_model` is not used by any
# later cell shown in this notebook - confirm which model is meant to be pruned.

import tensorflow_model_optimization as tfmot

sparsity_api = tfmot.sparsity.keras

# Constant 80% target sparsity, applied from step 0 to step 1000
pruning_schedule = sparsity_api.ConstantSparsity(target_sparsity=0.8, begin_step=0, end_step=1000)

pruned_model = sparsity_api.prune_low_magnitude(model, pruning_schedule=pruning_schedule)

In [None]:
# Print a confusion matrix for the trained model.
# Rows are the true class and columns the predicted class, in label order
# (0 = benign, 1 = malignant):
# True negative,  False positive
# False negative, True positive

# Evaluate `the_model`: it is the network that was compiled and fitted above
preds = the_model.predict(test_images)
# The predicted class is the one with the highest softmax probability; testing
# `probability == 1` would label almost every malignant sample as benign
rev = np.argmax(preds, axis=1)
# Passing the labels explicitly keeps the matrix 2x2 even if one class never appears
cm = confusion_matrix(test_labels, rev, labels=list(range(nb_classes)))
print(cm)
sn.heatmap(cm, cmap='Blues', annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names);


In [None]:
# Clean up TensorBoard: stop any leftover TensorBoard processes and clear its temp directory before it is launched below

In [None]:
# Windows-only (PowerShell) cleanup of TensorBoard processes and temp files.

# Show any running processes named "tensorboard"
! powershell "echo 'checking for existing tensorboard processes'"
! powershell "ps | Where-Object {$_.ProcessName -eq 'tensorboard'}"

# Kill each of those processes
! powershell "ps | Where-Object {$_.ProcessName -eq 'tensorboard'}| %{kill $_}"

# Delete the files in TensorBoard's info folder under the user's TEMP directory
! powershell "echo 'cleaning tensorboard temp dir'"
! powershell "rm $env:TEMP\.tensorboard-info\*"

# List the processes again to confirm that none are left
! powershell "ps | Where-Object {$_.ProcessName -eq 'tensorboard'}"

In [None]:
# Start TensorBoard inside the notebook, reading the runs stored under logs/fit
%tensorboard --logdir="logs/fit" --host localhost 


# Reminder of the fallback: open TensorBoard directly in the browser
! echo If it has timed out in jupyter, then go to http://localhost:6006 in the browser and check

In [None]:
# Save the model as an .hdf5 model

In [None]:
# The ".h5" extension makes Keras write a single HDF5 file; without an extension
# the path is not saved as HDF5.
# NOTE(review): this saves `model`, while the training cell fits `the_model` -
# confirm which network is meant to be saved.
model.save("Image_Classification.h5")

In [None]:
# Optionally save the model under a name typed in by the user
save_model = input("Do you wish to save this model [y/n]: ").strip().lower()

if save_model in ('y', 'yes'):
    model_name = input("Model Name: ").strip()
    if not model_name:
        print("Saving failed: no model name was given")
    else:
        try:
            tf.keras.models.save_model(model, model_name)
        except Exception as error:
            # Report the cause instead of hiding it; unlike a bare `except`, this
            # still lets KeyboardInterrupt stop the cell
            print("Saving failed: {}".format(error))