# The code is modified to work properly on the dataset directories on kaggle
- Please run it (with or without a mirror, it's your call); I currently have major internet problems that prevent a stable connection to Kaggle.
- I estimate a full run to take around 2:30-3 hours on a single P100 (2 epochs per minute) without a mirror.
- I have modified the number of epochs and also the dropout rate to make the training a bit more time-efficient with minimal loss of learned knowledge during training.
- I also cleaned up the code in general (removed redundant functions for generators and visualizations).
- Please make sure to commit any changed version as a new version to the repo.

@Good Luck

# Requirements

In [None]:
import os

# TF_CPP_MIN_LOG_LEVEL is read when TensorFlow's native library is loaded,
# so it must be set BEFORE `import tensorflow` to silence the C++ logs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import cv2
import shutil
import random
import zipfile
import warnings
import numpy as np
# %load_ext cudf.pandas
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.applications.resnet import ResNet50
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.metrics import precision_recall_fscore_support
from tensorflow.keras.callbacks import CSVLogger

# Suppress Python-level warnings for the rest of the notebook
warnings.filterwarnings("ignore")

In [None]:
# Count how many images each class folder of the input dataset contains.
benign_files = os.listdir("/kaggle/input/skin-canser-b584m584/Melanoma-b584m584/benign")
malignant_files = os.listdir("/kaggle/input/skin-canser-b584m584/Melanoma-b584m584/malignant")
benigns, melignant = len(benign_files), len(malignant_files)
print(f"Number of benign Samples: {benigns}\nNumber of malignant Samples: {melignant}")



> Removing random samples from the majority class



In [None]:
# Balance the dataset by removing excess samples from the majority class
from random import sample
min_path = "/kaggle/input/skin-canser-b584m584/Melanoma-b584m584/malignant"
maj_path = "/kaggle/input/skin-canser-b584m584/Melanoma-b584m584/benign"
address = os.listdir(maj_path)
cut = len(os.listdir(min_path))
# random.sample raises ValueError when asked for more items than exist, so
# cap the request at the size of the "majority" folder. A set makes the
# membership test inside the loop O(1) instead of O(n).
cut_list = set(sample(address, min(cut, len(address))))
for index in tqdm(address):
    if index not in cut_list:
        # NOTE(review): the copy step further down describes the input
        # dataset path as read-only; if that holds, this call raises OSError
        # whenever a file actually has to be removed. Consider balancing the
        # writable copy under /kaggle/working instead.
        os.remove(os.path.join(maj_path, index))

# Verify the number of samples
benigns = len(os.listdir("/kaggle/input/skin-canser-b584m584/Melanoma-b584m584/benign"))
melignant = len(os.listdir("/kaggle/input/skin-canser-b584m584/Melanoma-b584m584/malignant"))
print(f"\nNumber of benign Samples: {benigns}\nNumber of malignant Samples: {melignant}")

In [None]:
# Function to divide test set
def divide_test_set(temp_path, cut_percentage):
    """Randomly pick a fraction of the entries found in a directory.

    Args:
        temp_path: Directory whose entries are candidates for selection.
        cut_percentage: Fraction of the entries to pick; the resulting count
            is truncated to an integer.

    Returns:
        A list of paths (``temp_path`` joined with each chosen entry name),
        drawn without replacement by ``random.sample``.
    """
    candidates = []
    for entry_name in os.listdir(temp_path):
        candidates.append(os.path.join(temp_path, entry_name))
    how_many = int(cut_percentage * len(os.listdir(temp_path)))
    return random.sample(candidates, how_many)



> Remember to rename the "Melanoma-b584m584" folder to "train" before running the next cells (the copy step below does this by copying the dataset into /kaggle/working/train)


In [None]:
# The dataset path is read-only, so work on a copy in another directory
new_path = '/kaggle/working/train'
os.makedirs(new_path, exist_ok=True)
shutil.copytree(
    src='/kaggle/input/skin-canser-b584m584/Melanoma-b584m584',
    dst=new_path,
    dirs_exist_ok=True,
)
# Show what ended up in the copy as a quick sanity check
print(os.listdir(new_path))

In [None]:
# Pick 20% of each class from the working copy to serve as the test set.
# Only the paths are chosen here; the files are moved in the following cells.
input_path = "/kaggle/working/train/benign"
benign_test_set = divide_test_set(temp_path=input_path, cut_percentage=0.20)

input_path = "/kaggle/working/train/malignant"
malignant_test_set = divide_test_set(temp_path=input_path, cut_percentage=0.20)

In [None]:
# Move the selected benign images out of the training folder into test/benign
benign_test_path = "/kaggle/working/test/benign"
os.makedirs(benign_test_path, exist_ok=True)
for held_out_image in tqdm(benign_test_set):
    shutil.move(src=held_out_image, dst=benign_test_path)

# Number of benign images now in the test folder
print(len(os.listdir(benign_test_path)))

In [None]:
# Move the selected malignant images out of the training folder into test/malignant
malignant_test_path = "/kaggle/working/test/malignant"
os.makedirs(malignant_test_path, exist_ok=True)
for held_out_image in tqdm(malignant_test_set):
    shutil.move(src=held_out_image, dst=malignant_test_path)

# Number of malignant images now in the test folder
print(len(os.listdir(malignant_test_path)))



> Load and shuffle the data



In [None]:
# Data generator for training, validation and testing
def create_generator(DIR, shuffle=True):
    """Create a directory-backed image generator for binary classification.

    Images are rescaled by 1/255 and resized to 224x224. The batch size is
    taken from the module-level ``batch_size`` variable at call time, so that
    variable must be defined before this function is called.

    Args:
        DIR: Root directory containing one sub-folder per class.
        shuffle: Whether the generator shuffles the samples. Defaults to
            True, which is also the Keras default. Pass False when the
            predictions must line up with ``generator.classes`` (e.g. to
            build a confusion matrix or a ROC curve).

    Returns:
        The iterator returned by ``ImageDataGenerator.flow_from_directory``.
    """
    datagen = ImageDataGenerator(rescale=1/255)
    generator = datagen.flow_from_directory(directory=DIR,
                                            batch_size=batch_size,
                                            class_mode='binary',
                                            target_size=(224, 224),
                                            shuffle=shuffle)
    return generator

In [None]:
# Function to shuffle and split data
def shuffle_and_split_data(src_dir, dest_dir, split_percent):
    """Move a random fraction of the files in ``src_dir`` to ``dest_dir``.

    Args:
        src_dir: Directory whose entries are shuffled and partially moved.
        dest_dir: Directory receiving the moved files; created if missing.
        split_percent: Fraction of the entries to move; the resulting count
            is truncated to an integer and clamped at zero.

    Returns:
        None. The files are moved on disk with ``shutil.move``.
    """
    # Moving into a missing directory would fail, so make sure it exists.
    os.makedirs(dest_dir, exist_ok=True)

    image_filenames = os.listdir(src_dir)
    random.shuffle(image_filenames)

    # Clamp at zero so a negative fraction moves nothing (a negative slice
    # bound would otherwise select almost every file).
    num_images_to_move = max(0, int(len(image_filenames) * split_percent))

    for filename in tqdm(image_filenames[:num_images_to_move]):
        shutil.move(os.path.join(src_dir, filename),
                    os.path.join(dest_dir, filename))

In [None]:
# Paths of the per-class training folders and of the validation folders
train_benign_dir = "/kaggle/working/train/benign"
train_malignant_dir = "/kaggle/working/train/malignant"
validation_dir = "/kaggle/working/validation"
validation_benign_dir = os.path.join(validation_dir, "benign")
validation_malignant_dir = os.path.join(validation_dir, "malignant")

# Create the validation root first, then one sub-folder per class
for folder in (validation_dir, validation_benign_dir, validation_malignant_dir):
    os.makedirs(folder, exist_ok=True)

In [None]:
# Move 20% of each class from the training folders into the validation folders
split_percent = 0.20
class_dir_pairs = (
    (train_benign_dir, validation_benign_dir),
    (train_malignant_dir, validation_malignant_dir),
)
for class_train_dir, class_validation_dir in class_dir_pairs:
    shuffle_and_split_data(class_train_dir, class_validation_dir, split_percent)

print("\nData shuffling and splitting completed.")

In [None]:
# Verification: benign image counts after the train/validation split
benign_validation_files = os.listdir("/kaggle/working/validation/benign")
benign_train_files = os.listdir("/kaggle/working/train/benign")
va, tr = len(benign_validation_files), len(benign_train_files)
print(f"\nNumber of benign train Samples: {tr}\nNumber of benign validation Samples: {va}")

In [None]:
# Verification: malignant image counts after the train/validation split
malignant_validation_files = os.listdir("/kaggle/working/validation/malignant")
malignant_train_files = os.listdir("/kaggle/working/train/malignant")
va_, tr_ = len(malignant_validation_files), len(malignant_train_files)
print(f"\nNumber of malignant train Samples: {tr_}\nNumber of malignant validation Samples: {va_}")

In [None]:
# Combined train + validation image count over both classes
# (it should be ~935)
tr + tr_ + va + va_

In [None]:
# Training parameters
# batch_size is read as a module-level global by create_generator, so it
# must be defined before any generator is created.
batch_size = 4
# EPOCHS is passed as `epochs=` to model.fit in the training loop.
EPOCHS = 40

In [None]:
# Generator over the training images (one sub-folder per class)
Train_Dir = '/kaggle/working/train'
train_generator = create_generator(Train_Dir)

In [None]:
# Generator over the validation images (one sub-folder per class)
Val_Dir = '/kaggle/working/validation'
validation_generator = create_generator(Val_Dir)



> For Future Debugging...



# Pilot Modeling

In [None]:
# Function to build the model
def build_model(image_size=224, dropout_rate=0.4):
    """Build a ResNet50-based binary classifier.

    The network is an ImageNet-initialised ResNet50 without its top layers,
    followed by global average pooling, dropout and a single sigmoid unit.

    Args:
        image_size: Height and width of the square RGB input. Defaults to
            224. NOTE(review): create_generator resizes images to 224x224,
            so change both together.
        dropout_rate: Dropout rate applied before the output layer.
            Defaults to 0.4.

    Returns:
        An uncompiled ``tf.keras.models.Model`` mapping an image batch to one
        sigmoid output per image.
    """
    backbone = ResNet50(weights="imagenet",
                        include_top=False,
                        input_shape=(image_size, image_size, 3))
    features = tf.keras.layers.GlobalAveragePooling2D()(backbone.output)
    features = tf.keras.layers.Dropout(rate=dropout_rate)(features)
    probability = tf.keras.layers.Dense(1, activation='sigmoid')(features)
    return tf.keras.models.Model(inputs=backbone.input, outputs=probability)

In [None]:
# Callback that lowers the learning rate when the validation loss plateaus.
# Settings are collected first, then handed to the callback in one go.
plateau_settings = {
    'monitor': 'val_loss',   # quantity watched
    'factor': 0.4,           # multiplier applied to the learning rate
    'patience': 4,           # epochs to wait before reducing
    'min_delta': 0.0001,     # threshold for counting a change as improvement
    'mode': 'auto',
    'verbose': 1,
}
reduce_lr = ReduceLROnPlateau(**plateau_settings)

In [None]:
# Per-run test-set metrics, appended to by the training loop
specificities, sensitivities = [], []

In [None]:
# Ensure TensorFlow uses the GPU
physical_devices = tf.config.list_physical_devices('GPU')
# Enable memory growth on every visible GPU rather than only the first one.
# NOTE(review): TensorFlow is understood to require the same memory-growth
# setting on all GPUs, which matters on multi-GPU accelerators — confirm.
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)

# Train the model 6 times
for i in range(6):
    # Release the previous run BEFORE building the next model instead of
    # deleting at the end of the iteration. This way `model` and `history`
    # from the final run stay bound for the plotting / evaluation cells below.
    model = None
    history = None
    tf.keras.backend.clear_session()

    model = build_model()
    opt = Adam(learning_rate=0.001)
    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy', 'Recall', 'Precision'])

    Train_Dir = '/kaggle/working/train'
    train_gen = create_generator(DIR=Train_Dir)
    Val_Dir = '/kaggle/working/validation'
    val_gen = create_generator(DIR=Val_Dir)

    csv_logger = CSVLogger(f'/kaggle/working/model_version_{i+1}_log.csv',
                           append=True,
                           separator=',')

    # steps_per_epoch is left to the generator's own length so that the last
    # partial batch is not dropped by a floor division.
    history = model.fit(
        train_gen,
        epochs=EPOCHS,
        verbose=1,
        validation_data=val_gen,
        callbacks=[reduce_lr, csv_logger]
    )

    test_path = "/kaggle/working/test"
    # The test generator must NOT shuffle: `test_gen.classes` lists labels in
    # file order, so predictions have to be produced in that same order for
    # the confusion matrix (and the later ROC curve) to be meaningful.
    test_gen = ImageDataGenerator(rescale=1/255).flow_from_directory(
        directory=test_path,
        batch_size=batch_size,
        class_mode='binary',
        target_size=(224, 224),
        shuffle=False)

    # Flatten the (N, 1) sigmoid outputs to a 1-D vector of probabilities
    predictions = model.predict(test_gen, verbose=1).ravel()
    labels = test_gen.classes

    new_list = [0 if value <= 0.50 else 1 for value in predictions]
    # Pin the label set so the matrix is always 2x2, even if one class is
    # never predicted or never present.
    cm = confusion_matrix(labels, new_list, labels=[0, 1])

    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)

    specificities.append(specificity)
    sensitivities.append(sensitivity)

    model.save_weights(f'/kaggle/working/model_version_{i+1}.weights.h5')

print("Specificities: ", specificities)
print("Sensitivities: ", sensitivities)

In [None]:
# Box plot for specificity and sensitivity
plt.figure(figsize=(10, 5))
plt.boxplot([specificities, sensitivities], labels=['Specificity', 'Sensitivity'])
plt.title('Specificity and Sensitivity across 6 Runs')
plt.ylabel('Scores')
plt.show()

# Plot ROC Curve
# `labels` and `predictions` are the values left by the training loop, i.e.
# they describe the last trained model only. The scores are flattened
# because roc_curve expects a 1-D score vector.
fpr, tpr, _ = roc_curve(labels, np.ravel(predictions))
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# Plot the per-run operating points in ROC space
# Each point comes from a different trained model at the fixed 0.50
# threshold, so the points are drawn as separate markers: joining them in
# run order would not be a ROC curve.
plt.figure()
plt.plot(1 - np.array(specificities), sensitivities, marker='o', linestyle='none', color='darkorange', label='Per-run operating point')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1 - Specificity')
plt.ylabel('Sensitivity')
plt.title('Receiver Operating Characteristic using Specificity and Sensitivity')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Plot recall and precision per epoch for the training and validation sets.
# Requires `history` from the training cell to still be bound.
epochs_range = range(1, len(history.epoch) + 1)

# (train key, validation key, y-axis label, panel title)
recall_precision_panels = (
    ('recall', 'val_recall', 'recall', 'Model recall'),
    ('precision', 'val_precision', 'Precision', 'Model Precision'),
)

plt.figure(figsize=(11,4))

for position, (train_key, val_key, y_label, panel_title) in enumerate(recall_precision_panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(epochs_range, history.history[train_key], label='Train Set')
    plt.plot(epochs_range, history.history[val_key], label='Val Set')
    plt.legend(loc="best")
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Plot accuracy and loss per epoch for the training and validation sets.
# Requires `history` from the training cell to still be bound.
epochs_range = range(1, len(history.epoch) + 1)

# (train key, validation key, y-axis label, panel title)
accuracy_loss_panels = (
    ('accuracy', 'val_accuracy', 'Accuracy', 'Model Accuracy'),
    ('loss', 'val_loss', 'Loss', 'Model Loss'),
)

plt.figure(figsize=(11,4))

for position, (train_key, val_key, y_label, panel_title) in enumerate(accuracy_loss_panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(epochs_range, history.history[train_key], label='Train Set')
    plt.plot(epochs_range, history.history[val_key], label='Val Set')
    plt.legend(loc="best")
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Accuracy and loss plots (wide layout, fixed legend positions).
# Requires `history` from the training cell to still be bound.
recorded = history.history
epochs_range = range(1, len(history.epoch) + 1)

plt.figure(figsize=(14, 6))

# Left panel: accuracy
plt.subplot(1, 2, 1)
for series_key, series_label in (('accuracy', 'Train Accuracy'),
                                 ('val_accuracy', 'Validation Accuracy')):
    plt.plot(epochs_range, recorded[series_key], label=series_label)
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

# Right panel: loss
plt.subplot(1, 2, 2)
for series_key, series_label in (('loss', 'Train Loss'),
                                 ('val_loss', 'Validation Loss')):
    plt.plot(epochs_range, recorded[series_key], label=series_label)
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()

# Testing

In [None]:
# Generator over the held-out test images (one sub-folder per class)
test_path = "/kaggle/working/test"
test_generator = create_generator(DIR=test_path)

In [None]:
# Evaluate the model on the test generator; as the last expression of the
# cell, the result is shown as the cell output.
# NOTE(review): this requires `model` to still be bound when the cell runs,
# i.e. the training cell must leave the last trained model in scope.
model.evaluate(test_generator)



> ### Test Report

Precision = 0.76

Recall = 0.71

Accuracy = 0.74



# These lines are just for experimenting

In [None]:
# from PIL import Image

# root = "/content/SkinCancerDataset/data/test"
# predictions = []
# labels = []

# for label in tqdm(os.listdir(root)):
#   if label == "malignant":
#     new_root = os.path.join(root, label)
#     for image in tqdm(os.listdir(new_root)):
#       # read, convert, and normalize the image
#       img_path = os.path.join(new_root, image)
#       image_file = Image.open(img_path).convert('RGB')
#       image_array = np.array(image_file)
#       # image_array = image_array * 255.0/image_array.max()
#       image_array = cv2.resize(image_array, (224,224))
#       image_array = image_array / 255.0
#       image_array = image_array.reshape(1, 224,224, 3)
#       # Prediction
#       predictions.append(model.predict(image_array, verbose=0).squeeze())
#       labels.append(1)


#   elif label == "benign":
#     new_root = os.path.join(root, label)
#     for image in tqdm(os.listdir(new_root)):
#       # read, convert, and normalize the image
#       img_path = os.path.join(new_root, image)
#       image_file = Image.open(img_path).convert('RGB')
#       image_array = np.array(image_file)
#       # image_array = image_array * 255.0/image_array.max()
#       image_array = cv2.resize(image_array, (224,224))
#       image_array = image_array / 255.0
#       image_array = image_array.reshape(1, 224,224, 3)
#       # Prediction
#       predictions.append(model.predict(image_array, verbose=0).squeeze())
#       labels.append(0)

#   else:
#     print("\nSomething is not right!")




In [None]:
# new_list = [0 if value <= 0.50 else 1 for value in predictions]

In [None]:
# from sklearn.metrics import confusion_matrix, classification_report
# import warnings
# warnings.filterwarnings('ignore')

# confusion_matrix(labels, new_list)

In [None]:
# sns.heatmap(confusion_matrix(labels, new_list), annot=True, cmap="Blues")