# Requirements

In [None]:
import cv2
import shutil
import random
import zipfile
import warnings
from PIL import Image
import numpy as np
# %load_ext cudf.pandas
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.applications.resnet import ResNet50
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.metrics import precision_recall_fscore_support
from tensorflow.keras.callbacks import CSVLogger

# Suppress warnings
warnings.filterwarnings("ignore")

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [None]:
# Balance the dataset by down-sampling the majority (benign) class so that it
# keeps at most as many images as the minority (malignant) class.
from random import sample
min_path = "/kaggle/input/skin-canser-b584m584/Melanoma-b584m584/malignant"
maj_path = "/kaggle/input/skin-canser-b584m584/Melanoma-b584m584/benign"
address = os.listdir(maj_path)
cut = len(os.listdir(min_path))
# random.sample raises ValueError when asked for more items than exist, so cap
# the request at the size of the majority folder (nothing is removed then).
cut_list = sample(address, min(cut, len(address)))
# A set gives O(1) membership tests; the list lookup made this loop quadratic.
keep = set(cut_list)
for index in tqdm(address):
    if index not in keep:
        # NOTE(review): /kaggle/input is normally mounted read-only, so this
        # call would raise if any file actually had to be deleted - confirm
        # whether balancing should run on the writable copy instead.
        os.remove(os.path.join(maj_path, index))

# Verify the number of samples
benigns = len(os.listdir(maj_path))
melignant = len(os.listdir(min_path))
print(f"\nNumber of benign Samples: {benigns}\nNumber of malignant Samples: {melignant}")

In [None]:
# The dataset location is read-only, so mirror it into a writable directory
# before any files get copied, moved or deleted.
new_path = '/kaggle/working/root'
os.makedirs(new_path, exist_ok=True)
shutil.copytree(
    src='/kaggle/input/skin-canser-b584m584/Melanoma-b584m584',
    dst=new_path,
    dirs_exist_ok=True,
)
# Sanity check: show what ended up in the working copy
print(os.listdir(new_path))

In [None]:
# Function to divide a test set
def divide_test_set(temp_path, cut_percentage):
    """
    Returns a random selection of file paths from a directory
    ------------------------------------------------------------------------
    temp_path:
                    path to the directory whose entries are sampled

    cut_percentage:
                    fraction of the entries to select (e.g. 0.20); the count
                    is truncated to an integer

    Returns a list of paths, each one being temp_path joined with an entry
    name. random.sample raises ValueError if the resulting count is negative
    or larger than the number of entries."""

    # List the directory once so the population and the sample size are
    # computed from the same snapshot of its contents.
    entry_names = os.listdir(temp_path)
    rel_image_paths = [os.path.join(temp_path, name) for name in entry_names]
    cut_set = random.sample(rel_image_paths, int(cut_percentage * len(entry_names)))
    return cut_set

In [None]:
def copy_data(input_list, path):
    """Copy every file named in input_list into the directory `path`
    -------------------------------------------------------------
    input_list:
                iterable of source file paths

    path:
                destination directory; created (with parents) if missing"""

    os.makedirs(path, exist_ok=True)
    for source_file in input_list:
        shutil.copy(src=source_file, dst=path)

In [None]:
def move_data(input_list, path):
    """Move every file named in input_list into the directory `path`
    -------------------------------------------------------------
    input_list:
                iterable of source file paths

    path:
                destination directory; created (with parents) if missing"""

    os.makedirs(path, exist_ok=True)
    for source_file in input_list:
        shutil.move(src=source_file, dst=path)

In [None]:
def set_remainder(input_list, root_path):
    """
    Returns the paths of the entries in root_path that are NOT in input_list
    (nothing is deleted or moved on disk)
    -------------------------------------------------------------------------------------
    input_list: paths to exclude, written as root_path joined with the entry name
    root_path: directory whose entries are listed

    """
    # A set makes each exclusion test O(1) instead of scanning the whole list
    # for every directory entry.
    excluded = set(input_list)
    all_paths = [os.path.join(root_path, i) for i in os.listdir(root_path)]
    filtered_list = [image for image in all_paths if image not in excluded]

    return filtered_list
    

In [None]:
# Data generator for training, validation and testing
def create_generator(DIR):
    """
    Builds a directory-backed image generator with binary labels
    -------------------------------------------------------------
    DIR:
            directory handed to flow_from_directory

    Pixel values are rescaled by 1/255 and images are resized to 224x224.
    The batch size is read from the module-level `batch_size` at call time."""
    rescaler = ImageDataGenerator(rescale=1/255)
    return rescaler.flow_from_directory(
        directory=DIR,
        target_size=(224, 224),
        class_mode='binary',
        batch_size=batch_size,
    )

In [None]:
def _load_image_for_model(img_path):
    """Read one image file and return it as a (1, 224, 224, 3) array scaled to [0, 1]."""
    # The context manager closes the file handle; convert() returns a loaded copy.
    with Image.open(img_path) as raw_image:
        image_file = raw_image.convert('RGB')
    image_array = np.array(image_file)
    image_array = cv2.resize(image_array, (224, 224))
    image_array = image_array / 255.0
    return image_array.reshape(1, 224, 224, 3)


def test_me(root):
    """
    This function captures prediction/label pairs and return predictions, labels lists
    ------------------------------------------------
    root:
            path to the test directory; its "benign" and "malignant"
            sub-directories are scored with the module-level `model`

    Returns (predictions, labels): the squeezed model output for every image
    and the matching ground-truth label (0 = benign, 1 = malignant), covering
    BOTH classes. Entries of `root` with any other name are skipped."""
    class_ids = {"benign": 0, "malignant": 1}
    predictions = []
    labels = []
    for label in tqdm(os.listdir(root)):
        if label not in class_ids:
            # Stray entries are reported and ignored rather than aborting
            # the evaluation with a non-tuple return value.
            print(f"\nSkipping unexpected entry in test directory: {label}")
            continue

        new_root = os.path.join(root, label)
        for image in tqdm(os.listdir(new_root)):
            # read, convert, and normalize the image
            image_array = _load_image_for_model(os.path.join(new_root, image))
            # Prediction
            predictions.append(model.predict(image_array, verbose=0).squeeze())
            labels.append(class_ids[label])

    # Return only after every class directory has been processed; returning
    # inside the loop would score just the first directory listed.
    return predictions, labels

In [None]:
# Function to build the model
def build_model():
    """
    Assembles the classifier: an ImageNet-weighted ResNet50 without its top
    layers, followed by global average pooling, dropout (rate 0.4) and a
    single sigmoid unit. The input shape is 224x224x3.
    Returns the uncompiled Keras model."""
    image_size = 224
    backbone = ResNet50(
        weights="imagenet",
        include_top=False,
        input_shape=(image_size, image_size, 3),
    )
    features = tf.keras.layers.GlobalAveragePooling2D()(backbone.output)
    features = tf.keras.layers.Dropout(rate=0.4)(features)
    probability = tf.keras.layers.Dense(1, activation='sigmoid')(features)
    return tf.keras.models.Model(inputs=backbone.input, outputs=probability)

In [None]:
# Training parameters
batch_size = 4
EPOCHS = 40
# Per-run test results, one entry appended for every training run
test_accs = []
test_recalls = []
test_precisions = []
tps = []
fps = []
tns = []
fns = []


# Allocate GPU memory on demand instead of reserving it all up front.
# TensorFlow requires the memory-growth setting to be identical on every
# visible GPU, so apply it to all of them rather than only the first one.
physical_devices = tf.config.list_physical_devices('GPU')
try:
    for gpu in physical_devices:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Raised when the GPUs were already initialized (e.g. the cell is re-run);
    # the setting can then no longer be changed, so report it and carry on.
    print(err)


# Number of independent train/validation/test re-splits (one model per split)
NUM_RUNS = 6

for i in tqdm(range(NUM_RUNS)):
    # TEST SAMPLING -----------------------------------------------
    # Pick a random 20% of each class for testing
    input_path = "/kaggle/working/root/benign"
    benign_test_set = divide_test_set(input_path, 0.20)
    input_path = "/kaggle/working/root/malignant"
    malignant_test_set = divide_test_set(input_path, 0.20)

    # Copy selected testing images into their corresponding folder
    copy_data(input_list = benign_test_set , path = f"/kaggle/working/test_{i+1}/benign")
    copy_data(input_list = malignant_test_set , path = f"/kaggle/working/test_{i+1}/malignant")

    # TRAIN SAMPLING ----------------------------------------------
    # Constant Paths
    root_benign_dir = "/kaggle/working/root/benign"
    root_malignant_dir = "/kaggle/working/root/malignant"

    # Everything that was not picked for testing becomes training data
    benign_train_set = set_remainder(benign_test_set, root_benign_dir)
    malignant_train_set = set_remainder(malignant_test_set, root_malignant_dir)

    # Copy selected training images into their corresponding folder
    copy_data(input_list = benign_train_set , path = f"/kaggle/working/train_{i+1}/benign")
    copy_data(input_list = malignant_train_set , path = f"/kaggle/working/train_{i+1}/malignant")

    # VALIDATION SAMPLING -----------------------------------------
    # Pick a random 20% of the training images of each class for validation
    input_path = f"/kaggle/working/train_{i+1}/benign"
    benign_val_set = divide_test_set(input_path, 0.20)
    input_path = f"/kaggle/working/train_{i+1}/malignant"
    malignant_val_set = divide_test_set(input_path, 0.20)

    # Move (not copy) them, so train and validation folders do not overlap
    move_data(input_list = benign_val_set , path = f"/kaggle/working/val_{i+1}/benign")
    move_data(input_list = malignant_val_set , path = f"/kaggle/working/val_{i+1}/malignant")
    print(f"\n{i+1} out of {NUM_RUNS} Dataset splitted.")

    # Folder sizes; the training counts feed steps_per_epoch below
    va = len(os.listdir(f"/kaggle/working/val_{i+1}/benign")) # benign validation
    tr = len(os.listdir(f"/kaggle/working/train_{i+1}/benign")) # benign train
    va_ = len(os.listdir(f"/kaggle/working/val_{i+1}/malignant")) # malignant validation
    tr_ = len(os.listdir(f"/kaggle/working/train_{i+1}/malignant")) # malignant train
    print(f"\nNumber of benign train Samples: {tr}\nNumber of benign validation Samples: {va}")
    print(f"\nNumber of malignant train Samples: {tr_}\nNumber of malignant validation Samples: {va_}\n")

    # Create data generators
    Train_Dir = f"/kaggle/working/train_{i+1}/"
    train_generator = create_generator(DIR=Train_Dir)
    Val_Dir = f"/kaggle/working/val_{i+1}/"
    validation_generator = create_generator(DIR=Val_Dir)

    # Build a fresh model and optimizer for this run.
    # `model` must stay a module-level name: test_me() reads it as a global.
    model = build_model()
    opt = Adam(learning_rate=0.001)

    # Callback and Logger
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.25, patience=4, min_delta=0.0001, mode='auto', verbose=1)
    csv_logger = CSVLogger(f'/kaggle/working/model_version_{i+1}_log.csv', append=True, separator=',')

    # Compile and Run
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy', 'Recall', 'Precision'])
    history = model.fit(
        train_generator,
        steps_per_epoch= (tr+tr_)//batch_size,
        epochs=EPOCHS,
        verbose=1,
        validation_data=validation_generator,
        callbacks=[reduce_lr, csv_logger]
    )

    model.save_weights(f'/kaggle/working/model_version_{i+1}.weights.h5')

    # Testing begins here ...
    test_path = f"/kaggle/working/test_{i+1}/"
    test_generator = create_generator(DIR=test_path)

    # Capture common metrics; evaluate() returns the loss first, then the
    # metrics in the order given to compile(): accuracy, recall, precision
    c_metrics = model.evaluate(test_generator)
    test_accs.append(c_metrics[1])
    test_recalls.append(c_metrics[2])
    test_precisions.append(c_metrics[3])

    # Make predictions and threshold the sigmoid outputs at 0.5
    predictions, labels = test_me(test_path)
    new_list = [0 if value <= 0.50 else 1 for value in predictions]

    # Pin the label order so the matrix is always 2x2, even if one class is
    # missing from the labels or the predictions; otherwise the indexing
    # below raises IndexError.
    cf = confusion_matrix(labels, new_list, labels=[0, 1])
    tn = cf[0, 0]  # True Negatives
    fp = cf[0, 1]  # False Positives
    fn = cf[1, 0]  # False Negatives
    tp = cf[1, 1]  # True Positives

    # Just logging everything
    print(f"\nTraining number {i+1}/{NUM_RUNS} model performance:\nTN: {tn}, FP: {fp}, FN: {fn}, TP: {tp}\n")
    print(f"Test ACC: {c_metrics[1]}, Test RECALL: {c_metrics[2]}, Test PRECISION: {c_metrics[3]}\n")
    tns.append(tn)
    fps.append(fp)
    fns.append(fn)
    tps.append(tp)
    print("="*100)

    # Clear the split folders and release the model before the next run
    shutil.rmtree(f"/kaggle/working/train_{i+1}/")
    shutil.rmtree(f"/kaggle/working/val_{i+1}/")
    shutil.rmtree(f"/kaggle/working/test_{i+1}/")
    del model
    del history
    del train_generator, validation_generator, test_generator
    tf.keras.backend.clear_session()
    
    