<a href="https://colab.research.google.com/github/DermaScan-Bangkit-2024-CapstoneProject/DermaScan-Machine-Learning/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
# Colab sets COLAB_RELEASE_TAG; any non-empty value means we are running there.
IN_COLAB = bool(os.getenv("COLAB_RELEASE_TAG"))

print("Running in Google Colab:", IN_COLAB)

In [None]:
%pip install -q kaggle

### **Consideration Table**
| Condition                  | Cancer Association       | Type                     |
|----------------------------|--------------------------|--------------------------|
| Actinic Keratosis          | Precancerous             | Potentially cancerous    |
| Basal Cell Carcinoma       | Cancerous                | Cancerous                |
| Bowen’s Disease            | Early-stage cancer       | Cancerous                |
| Melanoma                   | Cancerous                | Cancerous                |
| Skin Cancer (General)      | Cancerous                | Cancerous                |
| Moles                      | Generally benign         | Generally benign         |
| Sun/Sunlight Damage        | Indirectly linked        | Risk factor for cancer   |
| Benign Keratosis-like Lesions | Benign               | Non-cancerous            |
| Benign Tumors              | Benign                   | Non-cancerous            |
| Seborrheic Keratoses       | Benign                   | Non-cancerous            |
| Vascular Tumors            | Mostly benign            | Mostly non-cancerous     |
| Others (Acne, Eczema, etc.)| Benign                   | Non-cancerous            |


In [None]:
# Working directory and dataset root differ between Colab (mounted Drive)
# and a local checkout.
if IN_COLAB:
    content_path = "/content"
    dataset_path = "/content/drive/MyDrive/datasets"
else:
    content_path = "."
    dataset_path = "./Datasets"

In [None]:
import zipfile
import os

# Unpack every .zip archive found directly in dataset_path into a sibling
# folder named after the archive (without the ".zip" suffix).
for file_name in os.listdir(dataset_path):
    if not file_name.endswith('.zip'):
        continue

    file_path = os.path.join(dataset_path, file_name)
    folder_name = os.path.join(dataset_path, file_name[:-len('.zip')])
    os.makedirs(folder_name, exist_ok=True)

    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(folder_name)

    print(f"Extracted {file_name} into folder: {folder_name}")

In [None]:
import random
import cv2
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import math

def display_images(processed_images):
    """Show up to ten images in a 2x5 grid.

    Args:
        processed_images: iterable of images accepted by ``Axes.imshow``.
            Only the first ten are drawn; extra images are ignored.
    """
    fig, axes = plt.subplots(2, 5, figsize=(15, 6))  # 2x5 grid -> 10 slots
    axes = axes.flatten()  # 1-D view so slots can be paired with images

    # Hide the axis frame on every slot up front so that slots left unused
    # (fewer than ten images) do not show an empty set of axes.
    for ax in axes:
        ax.axis('off')

    for img, ax in zip(processed_images, axes):
        ax.imshow(img)

    plt.tight_layout()
    plt.show()

def draw_hair(image):
    """Return a copy of ``image`` with 4-7 random dark line segments drawn on it.

    The input is expected to have three dimensions (height, width, channels);
    it is not modified. Each segment gets a random start point, length
    (40-150), angle, grey shade and thickness (1-2).
    """
    hair_count = random.randint(4, 7)

    # Work on a copy so the caller's image stays untouched
    canvas = image.copy()
    rows, cols, _ = image.shape

    # All candidate shades are grey, so channel order does not matter
    shades = [(0, 0, 0), (50, 50, 50), (80, 80, 80)]

    for _ in range(hair_count):
        # Random start point inside the image
        x0 = random.randint(0, cols - 1)
        y0 = random.randint(0, rows - 1)

        # Random length and direction
        # NOTE(review): the angle range starts at 40, so directions in
        # 0-39 degrees are never drawn - confirm this is intended.
        reach = random.randint(40, 150)
        theta = np.radians(random.randint(40, 360))

        # End point, clamped to the image boundaries
        x1 = np.clip(x0 + reach * np.cos(theta), 0, cols - 1)
        y1 = np.clip(y0 + reach * np.sin(theta), 0, rows - 1)

        shade = random.choice(shades)
        stroke = random.randint(1, 2)  # random thickness for variation
        cv2.line(canvas, (int(x0), int(y0)), (int(x1), int(y1)), shade, stroke)

    return canvas

In [None]:
# Locate the HAM10000 folder and its ground-truth table
base_ham1000_path = f"{dataset_path}/ham1000-segmentation-and-classification/"
csv_path = f"{base_ham1000_path}GroundTruth.csv"

# Turn the image ids into full .jpg paths
df = pd.read_csv(csv_path)
df['image'] = base_ham1000_path + 'images/' + df['image'] + '.jpg'

# Preview: rows 1..10 of the table (a fixed slice, not a random sample)
random_images = df['image'].iloc[1:11].tolist()
images_with_hair = []
for img in random_images:
    rgb = cv2.cvtColor(cv2.imread(img), cv2.COLOR_BGR2RGB)
    images_with_hair.append(draw_hair(rgb))
display_images(images_with_hair)

## Preprocessing

In [None]:
def bresenham(x1, y1, x2, y2, r):
    """Sample points along the segment (x1, y1)-(x2, y2), one about every ``r`` units.

    Despite the name this is plain linear interpolation, not Bresenham's
    integer algorithm: the segment is divided into ``int(length / r)`` equal
    steps and the start of each step is returned.

    Args:
        x1, y1: start point of the segment.
        x2, y2: end point of the segment.
        r: approximate spacing between consecutive samples.

    Returns:
        A list of ``(x, y)`` float pairs beginning at the start point. The end
        point itself is not included. Segments shorter than ``r`` (including
        zero-length ones) yield an empty list.
    """
    dx = x2 - x1
    dy = y2 - y1
    steps = int(math.hypot(dx, dy) / r)

    # A segment shorter than r has no full step; return early instead of
    # dividing by zero below.
    if steps <= 0:
        return []

    offsets = np.arange(steps)
    xs = x1 + offsets * (dx / steps)
    ys = y1 + offsets * (dy / steps)

    return list(zip(xs, ys))

def detection_points(source, new_img, linesP, r):
    """Draw every detected segment on ``new_img`` and sample points along it.

    Args:
        source: image that is passed through unchanged.
        new_img: image the segments are drawn onto (green, thickness 2).
        linesP: iterable of entries whose first element is ``(x1, y1, x2, y2)``.
        r: sampling spacing forwarded to ``bresenham``.

    Returns:
        ``(source, new_img, points)`` where ``points`` collects the samples of
        all segments in order.
    """
    sampled = []
    for entry in linesP:
        x1, y1, x2, y2 = entry[0]
        cv2.line(new_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        sampled += bresenham(x1, y1, x2, y2, r)
    return source, new_img, sampled

def reduce_points(points, min_gap=2):
    """Keep only the points that are isolated from every other point.

    A point is kept when, compared with each *other* point, every coordinate
    differs by more than ``min_gap``. Points that have a close neighbour are
    all dropped (the neighbour included).

    The comparison of a point with itself is ignored: its difference is 0,
    which would otherwise reject every point and make the result always empty.

    Args:
        points: sequence of equal-length coordinate tuples, e.g. ``(x, y)``.
        min_gap: minimum per-coordinate distance (exclusive) to all other points.

    Returns:
        The surviving points as a list of lists, in input order. An empty
        input yields an empty list.

    Note:
        Builds an n x n comparison table, so memory grows quadratically with
        the number of points.
    """
    points_array = np.asarray(points)
    if points_array.size == 0:
        return []

    differences = np.abs(points_array[:, None] - points_array)
    far_apart = np.all(differences > min_gap, axis=-1)
    # Ignore each point's comparison with itself
    np.fill_diagonal(far_apart, True)
    keep = np.all(far_apart, axis=1)
    return points_array[keep].tolist()

def histogram_distribution(H_flat_norm):
    """Return the variance of a 10-bin histogram of ``H_flat_norm`` over [0, 1].

    Values are counted into ten equal-width bins spanning 0..1; the result is
    the variance of those ten counts.
    """
    counts = np.histogram(np.array(H_flat_norm), bins=10, range=(0, 1))[0]
    return np.var(counts)

def get_density(points, gridsize, **kwargs):
    """Summarise how evenly ``points`` are spread over a grid laid on the image.

    The points are binned into a 2-D histogram; the counts are normalised by
    the largest cell count and a few spread statistics are computed from them.

    Args:
        points: sequence of ``(x, y)`` pairs. May be empty.
        gridsize: bin specification forwarded to ``np.histogram2d``.
        **kwargs: optional ``xlim`` / ``ylim`` (each ``[min, max]``) to override
            the default extent of 600 x 450; all other keys are ignored.

    Returns:
        ``(D_max, hist_variance, std_dev, variance)`` where ``D_max`` is the
        largest raw cell count, ``hist_variance`` is
        ``histogram_distribution`` of the normalised counts, and ``std_dev`` /
        ``variance`` are taken over the normalised counts.
    """
    xlim = kwargs.get("xlim", [0, 600])
    ylim = kwargs.get("ylim", [0, 450])

    # zip(*points) cannot be unpacked into x, y when there are no points
    if len(points) > 0:
        x, y = zip(*points)
    else:
        x, y = np.empty(0), np.empty(0)

    D, _, _ = np.histogram2d(x, y, bins=gridsize, range=[xlim, ylim])
    D_flat = D.flatten()
    D_max = np.max(D_flat)

    # With no point inside the extent every count is 0; keep the normalised
    # counts at 0 instead of producing NaN from 0 / 0.
    if D_max > 0:
        D_norm_flat = D_flat / D_max
    else:
        D_norm_flat = np.zeros_like(D_flat)

    variance = np.var(D_norm_flat)
    std_dev = np.std(D_norm_flat)

    hist_variance = histogram_distribution(D_norm_flat)
    return D_max, hist_variance, std_dev, variance

def plot_density(x, y, D):
    """Plot a density grid with its points overlaid, then a histogram of the counts.

    Args:
        x, y: point coordinates drawn as an orange scatter on top of the grid.
        D: 2-D array of cell counts; shown transposed over a 600 x 450 extent.
    """
    xlim, ylim = [0, 600], [0, 450]
    gridsize = 5
    D_flat = D.flatten()

    fig, ax = plt.subplots(figsize=(10, 10))
    # Pass the colormap by name: plt.cm.get_cmap() is no longer available in
    # recent Matplotlib releases, while a name string works everywhere.
    im = ax.imshow(D.T, origin='upper', extent=[xlim[0], xlim[1], ylim[1], ylim[0]], cmap='viridis_r')
    ax.scatter(x, y, color='orange')

    cbar = fig.colorbar(im, ax=ax, shrink=0.5)
    cbar.set_label('Density')
    plt.show()

    plt.hist(D_flat, bins=gridsize**2, alpha=0.8, density=False)
    plt.xlabel('Density')
    plt.ylabel('Count')
    plt.show()

def _is_too_cluttered(line_count, max_density, histogram_variance, std_dev, variance, caps):
    """Decide whether the detected lines look too dense/uneven to be hair."""
    # No check at all unless the densest grid cell exceeds the density cap
    if not max_density > caps.get("max_density_cap"):
        return False
    if histogram_variance > caps.get("max_hist_variance"):
        return True
    if variance > caps.get("max_variance_cap"):
        return True
    # Stricter limits once many lines were detected
    if line_count > caps.get("max_lines_cap"):
        return bool(std_dev > caps.get("max_std_dev_cap")
                    or histogram_variance > caps.get("max_hist_variance_cap"))
    return False


def hair_removal(image, file_name, **kwargs):
    """Detect line segments in ``image`` and repaint the pixels around them.

    Edges are found with Canny, segments with the probabilistic Hough
    transform, and points are sampled along each segment. Unless the density
    statistics of those points exceed the configured caps, a (2*r x 2*r)
    window around every sampled point is repeatedly replaced by a blend of the
    pixels bordering it plus a little noise, and the result is written to
    ``dest_folder`` under the base name of ``file_name``.

    Nothing is written when no segments are found, when 600 or more segments
    are found, when no points could be sampled, or when the image is judged
    too cluttered.

    Args:
        image: 3-channel image array (the grayscale conversion assumes BGR).
        file_name: path whose base name is used for the output file.
        **kwargs: tuning values - ``r``, ``canny_A``, ``canny_B``, ``aperture``,
            ``L2gradient``, the ``hough_*`` settings, the ``max_*`` caps and
            ``dest_folder``.

    Raises:
        ValueError: if ``image`` is None (e.g. the file could not be read).
    """
    if image is None:
        raise ValueError(f"hair_removal received no image data for {file_name!r}")

    height, width = image.shape[:2]

    # Half-size of the repainted window and sampling step along each line
    r = kwargs.get("r")

    # Image that gets repainted and finally written
    source = image.copy()

    # Scratch copy that detection_points draws the detected lines on
    new_img = image.copy()

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Aperture and L2gradient are passed by keyword: the fourth positional
    # argument of cv2.Canny is the optional output array, not the aperture.
    edges = cv2.Canny(gray, kwargs.get("canny_A"), kwargs.get("canny_B"),
                      apertureSize=kwargs.get("aperture", 3),
                      L2gradient=kwargs.get("L2gradient", False))

    # Line detection with Probabilistic Line Transform
    # NOTE(review): these arguments are positional. Per the OpenCV Python
    # signature HoughLinesP(image, rho, theta, threshold[, lines[,
    # minLineLength[, maxLineGap]]]) that would make rho the enum value,
    # hough_min_length the `lines` buffer, hough_max_gap the minimum length and
    # hough_iter the maximum gap. Left unchanged on purpose because correcting
    # it alters which lines are detected - confirm and retune.
    linesP = cv2.HoughLinesP(edges, cv2.HOUGH_PROBABILISTIC, np.pi / kwargs.get("hough_resolution"),kwargs.get("hough_threshold"),kwargs.get("hough_min_length"),kwargs.get("hough_max_gap"),kwargs.get("hough_iter"))

    # Only process images with a manageable number of lines
    if linesP is None or len(linesP) >= 600:
        return

    source, new_img, all_points = detection_points(source, new_img, linesP, r)
    if not all_points:
        # Every segment was shorter than r: nothing to repaint
        return

    max_density, histogram_variance, std_dev, variance = get_density(all_points, gridsize=5, **kwargs)

    if _is_too_cluttered(len(linesP), max_density, histogram_variance, std_dev, variance, kwargs):
        return

    patch_shape = (2 * r, 2 * r, 3)
    for _ in range(32):
        for point in all_points:
            x1, y1 = map(int, point)
            if not (r < x1 < width - r and r < y1 < height - r):
                continue

            y_range = slice(y1 - r, y1 + r)
            x_range = slice(x1 - r, x1 + r)

            # The border samples sit 2*r away from the point. Clamp them to
            # the image so they neither wrap around through a negative index
            # nor run past the last row/column.
            col_before = max(x1 - 2 * r, 0)
            col_after = min(x1 + 2 * r, width - 1)
            row_before = max(y1 - 2 * r, 0)
            row_after = min(y1 + 2 * r, height - 1)

            top = 0.25 * source[y_range, col_before]
            left = 0.25 * source[row_before, x_range]
            bottom = 0.25 * source[row_after, x_range]
            right = 0.25 * source[y_range, col_after]

            noise = np.random.uniform(-5, 5, patch_shape)
            # Clip before storing: the blend plus noise can leave 0..255 and
            # would otherwise overflow when written into the image array.
            source[y_range, x_range] = np.clip(left + top + right + bottom + noise, 0, 255)

    cv2.imwrite(os.path.join(kwargs['dest_folder'], os.path.basename(file_name)), source)


In [None]:
# Declare width and height of the patches
# Patch half-size in pixels: hair_removal repaints a (2*r x 2*r) window around
# each sampled point, and bresenham samples one point about every r pixels
r=6

# Canny parameters (canny_A / canny_B are the two thresholds given to cv2.Canny)
canny_A = 100
canny_B = 100
aperture = 3
L2gradient = False

# HoughLinesP parameters
# NOTE(review): hair_removal hands the values after hough_resolution to
# cv2.HoughLinesP positionally; the OpenCV signature has an optional `lines`
# output between threshold and minLineLength, so confirm each value reaches
# the argument its name suggests.
hough_method = 'cv2.HOUGH_PROBABILISTIC'
hough_resolution = 720   # hair_removal uses np.pi / hough_resolution as the angle step (pi/720 rad = 0.25 degrees)
hough_threshold  = 35    # number of required votes
hough_min_length = 1     # the minimum line length
hough_max_gap    = 15    # maximum allowed gap
hough_iter       = 16    # number of iterations

# Thresholds used by hair_removal to decide whether an image is skipped
max_lines_cap           = 99    # Above this many lines the std-dev / stricter histogram checks apply
max_density_cap         = 123   # Skip checks only run when the densest grid cell holds more points than this
max_hist_variance       = 30    # Skip when the histogram variance exceeds this
max_hist_variance_cap   = 5     # Stricter histogram-variance limit used when there are many lines
max_std_dev_cap         = 0.30  # Std-dev limit of the normalised density, used when there are many lines
max_variance_cap        = 0.10  # Skip when the variance of the normalised density exceeds this

In [None]:
# Bundle every tuning value under the keyword name hair_removal looks it up by
parameters = dict(
    r=r,
    canny_A=canny_A,
    canny_B=canny_B,
    aperture=aperture,
    L2gradient=L2gradient,
    max_lines_cap=max_lines_cap,
    max_density_cap=max_density_cap,
    max_hist_variance=max_hist_variance,
    max_hist_variance_cap=max_hist_variance_cap,
    max_std_dev_cap=max_std_dev_cap,
    max_variance_cap=max_variance_cap,
    hough_method=hough_method,
    hough_resolution=hough_resolution,
    hough_threshold=hough_threshold,
    hough_min_length=hough_min_length,
    hough_max_gap=hough_max_gap,
    hough_iter=hough_iter,
    dest_folder="dest",
)

In [None]:
import os
import threading

# Number of images processed so far; updated by process_image
i = 0
# process_image runs on several threads, so the counter is guarded by a lock
_progress_lock = threading.Lock()

def process_image(image_path):
    """Run hair_removal on one image file and print a running count.

    Args:
        image_path: path of the image to read with ``cv2.imread``.

    Images that cannot be read are reported and skipped.
    """
    global i  # without this the increment below raises UnboundLocalError

    image = cv2.imread(image_path)
    if image is None:
        print(f"Could not read {image_path}, skipping.")
        return

    # exist_ok avoids a FileExistsError when two threads create the folder
    # at the same time
    os.makedirs(parameters["dest_folder"], exist_ok=True)

    # Apply hair_removal function with your parameters
    hair_removal(image, image_path, **parameters)

    with _progress_lock:
        i = i + 1
        done = i
    print(done)

# Full paths of every .jpg in the HAM10000 images folder
image_paths = [
    os.path.join(base_ham1000_path + "images", entry)
    for entry in os.listdir(base_ham1000_path + "images")
    if entry.endswith('.jpg')
]
print(image_paths[:5], len(image_paths))

## Execute hair removal with workers

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed

# Fan the images out over a thread pool. Each future is inspected so that a
# failure inside process_image is reported; results of executor.map that are
# never read would hide such exceptions completely.
with ThreadPoolExecutor(max_workers=14) as executor:
    futures = {executor.submit(process_image, path): path for path in image_paths}
    for future in as_completed(futures):
        error = future.exception()
        if error is not None:
            print(f"Failed to process {futures[future]}: {error!r}")

In [None]:
import shutil

# Define the source and destination directories
# Where the preprocessed images were written, and where the originals live
source_dir = parameters["dest_folder"]
dest_dir = f"{base_ham1000_path}images"

# Every file produced by the hair-removal step
preprocessed_images = os.listdir(source_dir)

# Overwrite each original with its preprocessed counterpart (same file name)
for image_name in preprocessed_images:
    source_path, dest_path = (
        os.path.join(source_dir, image_name),
        os.path.join(dest_dir, image_name),
    )
    shutil.copy(source_path, dest_path)
    print(f"Replaced {dest_path} with preprocessed image.")

print("All images have been replaced with preprocessed images.")

## Group by label and split HAM10000 into train/test sets (80:20)

In [None]:
import os
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split

# Derive the image folder from base_ham1000_path (the same root csv_path is
# built from) instead of a hard-coded "./Datasets/..." path, so this cell also
# works when the datasets live on Colab's mounted Drive. For a local run the
# resulting path is identical to the previous literal.
image_folder = base_ham1000_path + "images"
output_folder = image_folder + "/labeled_splitted"
file_extension = ".jpg"

# Map the label columns of the CSV to folder-friendly names
label_mapping = {
    "MEL": "melanoma",
    "NV": "melanocytic_nevi",
    "BCC": "basal_cell_carcinoma",
    "AKIEC": "actinic_keratoses_and_bowens_disease",
    "BKL": "benign_keratosis_like_lesions",
    "DF": "dermatofibroma",
    "VASC": "vascular_lesions",
}

# Load CSV
# Read the ground-truth table (first column: image id, remaining columns: labels)
data = pd.read_csv(csv_path)

# 80/20 split with a fixed seed, stratified on all label columns
train_data, test_data = train_test_split(
    data,
    stratify=data.iloc[:, 1:],
    test_size=0.2,
    random_state=42,
)

# Function to organize images into train or test directories
def organize_images(dataframe, split_name):
    """Copy each row's image into ``output_folder/<split_name>/<label folder>``.

    For every row, the image file is looked up in ``image_folder``; missing
    files are reported and skipped. The image is copied once for every label
    column whose value equals 1, into the folder named by ``label_mapping``.
    """
    for _, record in dataframe.iterrows():
        filename = record['image'] + file_extension
        image_path = os.path.join(image_folder, filename)

        if os.path.exists(image_path):
            for label, value in record.items():
                # Skip the id column; only columns flagged with 1 apply
                if label != 'image' and value == 1:
                    split_dir = os.path.join(output_folder, split_name, label_mapping[label])
                    os.makedirs(split_dir, exist_ok=True)

                    # The original file stays in place; this is a copy
                    shutil.copy(image_path, os.path.join(split_dir, filename))
                    print(f"Moved {filename} to {split_dir}")
        else:
            print(f"Image {filename} not found, skipping.")

# Organize train and test images
# Lay out both splits on disk, train first
for _split_name, _split_frame in (("train", train_data), ("test", test_data)):
    organize_images(_split_frame, _split_name)

print("Train-test split and organization completed.")


## Merge with skindiseasedataset

In [None]:
import tensorflow as tf

# Paths
# Dataset locations
train_dir = f"{dataset_path}/train"
test_dir = f"{dataset_path}/test"

# Loader settings
image_size = (128, 128)  # images are resized to this
batch_size = 32

# Build the train and test datasets with identical options; labels are
# integer class indices inferred from the sub-directory names
train_ds, test_ds = (
    tf.keras.preprocessing.image_dataset_from_directory(
        directory,
        labels="inferred",
        label_mode="int",
        batch_size=batch_size,
        image_size=image_size,
    )
    for directory in (train_dir, test_dir)
)

# Class names as discovered in the training directory
class_names = train_ds.class_names
print("Classes:", class_names)

# Classes that count as cancerous for the binary output
cancerous_classes = {"melanoma", "actinic_keratoses_and_bowens_disease", "basal_cell_carcinoma", "SkinCancer"}

# Tensor copy of the class names for use inside TensorFlow ops
class_names_tensor = tf.constant(class_names)

# Mapping function
def map_labels(image, label):
    """Attach a binary cancer label next to the integer disease label.

    Returns the image unchanged together with a dict holding
    ``disease_output`` (the original class index) and ``cancer_output``
    (int32, 1 when the class name is in ``cancerous_classes``, else 0).
    """
    # One 0/1 flag per class index, in the same order as class_names
    cancer_flag_by_class = tf.constant(
        [int(name in cancerous_classes) for name in class_names], dtype=tf.int32
    )

    # Look up the flag of every label in the batch
    binary_label = tf.gather(cancer_flag_by_class, label)

    return image, {"disease_output": label, "cancer_output": binary_label}

# Apply mapping to datasets
# Add the binary cancer label to every batch, then prefetch for performance
train_ds = train_ds.map(map_labels).prefetch(buffer_size=tf.data.AUTOTUNE)
test_ds = test_ds.map(map_labels).prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
# Model Definition
# Shared trunk: rescale to 0..1, two conv/pool stages, then a dense layer
inputs = tf.keras.Input(shape=(128, 128, 3))
x = inputs
for _layer in [
    tf.keras.layers.Rescaling(1.0 / 255),
    tf.keras.layers.Conv2D(32, (3, 3), activation="relu"),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation="relu"),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation="relu"),
]:
    x = _layer(x)

# Head 1: multiclass disease prediction, one unit per class
disease_output = tf.keras.layers.Dense(len(class_names), activation="softmax", name="disease_output")(x)

# Head 2: binary cancerous / non-cancerous prediction
cancer_output = tf.keras.layers.Dense(1, activation="sigmoid", name="cancer_output")(x)

# Two-output model sharing the trunk above
model = tf.keras.Model(inputs=inputs, outputs=[disease_output, cancer_output])

# Compile the model
# One loss and one accuracy metric per output head, keyed by layer name
model.compile(
    optimizer="adam",
    loss=dict(
        disease_output="sparse_categorical_crossentropy",
        cancer_output="binary_crossentropy",
    ),
    metrics=dict(
        disease_output="accuracy",
        cancer_output="accuracy",
    ),
)

# Train for 10 epochs; the test dataset doubles as validation data
history = model.fit(train_ds, validation_data=test_ds, epochs=10)

# Final evaluation on the test dataset
results = model.evaluate(test_ds)
print("Evaluation Results:", results)