### Preprocessing
Inspired by DSCC_Net

#### Imports

In [None]:
import kagglehub
import os
import pathlib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pandas as pd
import cv2
import numpy as np
import math
from imblearn.combine import SMOTETomek

#### Download ISIC-2019

In [4]:
# Fetch the ISIC-2019 dataset through kagglehub (no version pinned in the handle)
# and report the local directory that kagglehub returns.
base_path_isic = kagglehub.dataset_download(handle="andrewmvd/isic-2019")
print("Base path to ISIC-2019:", base_path_isic)

Base path to ISIC-2019: /Users/audreylu/.cache/kagglehub/datasets/andrewmvd/isic-2019/versions/1


#### Download HAM10000

In [3]:
# Fetch the HAM10000 dataset through kagglehub (no version pinned in the handle)
# and report the local directory that kagglehub returns.
base_path_ham = kagglehub.dataset_download(handle="kmader/skin-cancer-mnist-ham10000")
print("Base path to HAM10000:", base_path_ham)

Downloading from https://www.kaggle.com/api/v1/datasets/download/kmader/skin-cancer-mnist-ham10000?dataset_version_number=2...


100%|██████████| 5.20G/5.20G [02:30<00:00, 37.0MB/s]

Extracting model files...





Base path to HAM10000: /Users/audreylu/.cache/kagglehub/datasets/kmader/skin-cancer-mnist-ham10000/versions/2


#### Filter Functions
Hair Removal and Noise Reduction

In [None]:
def filter_all_images(input_folder: str, output_folder: str, image_files: "list[str] | None" = None) -> None:
    """Apply all filtering functions to jpg files in input_folder.

    input_folder: Folder containing the source images.
    output_folder: Folder the filtered images are written to; created if missing.
    image_files: Optional list of file names (relative to input_folder) to
        process, e.g. a small subset for a test run. When omitted, every
        entry returned by os.listdir(input_folder) is considered.

    Only names ending in ".jpg" are processed. A file that already exists in
    output_folder is skipped, so an interrupted run can be resumed. A failure
    on one image is reported and does not stop the rest of the batch.
    """
    os.makedirs(output_folder, exist_ok=True)
    if image_files is None:
        image_files = os.listdir(input_folder)

    for file_name in image_files:
        if not file_name.endswith(".jpg"):
            continue
        print(f"Processing {file_name}...")
        if os.path.exists(os.path.join(output_folder, file_name)):
            print(f"{file_name} already exists in {output_folder}. Skipping.")
            continue
        try:
            filter_image(input_folder, output_folder, file_name)
        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the next image
            print(f"Error processing image {file_name}: {e}")

def filter_image(input_folder: str, output_folder: str, file_name: str) -> None:
    """Filter one image and write the result under the same name in output_folder.

    The image is read from input_folder/file_name, passed through
    hair_removal and then noise_reduction, and saved to
    output_folder/file_name. If the file cannot be read, a message is
    printed and nothing is written.
    """
    source_path = os.path.join(input_folder, file_name)
    loaded = cv2.imread(source_path)
    # The None check covers the case where cv2.imread could not read the file
    if loaded is None:
        print(f"Error loading image from {source_path}")
        return

    cleaned = noise_reduction(hair_removal(loaded))
    destination_path = os.path.join(output_folder, file_name)
    cv2.imwrite(destination_path, cleaned)

def hair_removal(image: np.ndarray) -> np.ndarray:
    """Apply hair removal filter to the image using the Dull Razor algorithm.

    image: 8-bit image, either 3-channel BGR (as returned by cv2.imread)
        or single-channel grayscale.

    Returns a 3-channel BGR image in which the detected hair pixels have
    been inpainted from their surroundings.
    """
    # Hair detection runs on a single-channel image (Otsu thresholding needs
    # one), while inpainting is applied to the colour image.
    if image.ndim == 2 or image.shape[2] == 1:
        gray = image if image.ndim == 2 else image[:, :, 0]
        image_bgr = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
    else:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        image_bgr = image

    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (15, 15))  # creates a kernel for morphing
    blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, kernel)  # apply blackhat filter (highlights dark regions)
    # sigmaX=0 lets OpenCV derive sigma from the kernel size; the border mode
    # must be passed by keyword, otherwise it is taken as sigmaX.
    bhg = cv2.GaussianBlur(blackhat, (9, 9), 0, borderType=cv2.BORDER_REPLICATE)  # smooths image
    # With THRESH_OTSU the threshold is chosen automatically; the 50 is ignored.
    _, mask = cv2.threshold(bhg, 50, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)  # creates a binary mask to detect hair-like structure
    inpainted_image = cv2.inpaint(image_bgr, mask, 4, cv2.INPAINT_TELEA)
    return inpainted_image

def noise_reduction(image: np.ndarray) -> np.ndarray:
    """Denoise an image with a median filter followed by a bilateral filter.

    image: Image array to denoise.

    Returns the filtered image.
    """
    # Stage 1: median blur (aperture 5) against salt-and-pepper noise
    despeckled = cv2.medianBlur(image, 5)
    # Stage 2: bilateral filter, which smooths while preserving edges
    return cv2.bilateralFilter(despeckled, d=17, sigmaColor=100, sigmaSpace=100)

#### Preprocessing Functions
SMOTE-Tomek Oversampling

In [None]:
def _load_flattened_images(paths, image_size) -> list:
    """Read every readable image in paths, resize it to image_size and flatten it to 1-D."""
    flattened = []
    for image_path in paths:
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)  # 3-channel, BGR channel order
        if image is None:
            continue  # unreadable or missing file: skipped
        flattened.append(cv2.resize(image, image_size).flatten())
    return flattened


def oversample_melanoma_images(output_folder: str, data_frame: pd.DataFrame, output_csv_path: str, image_size=(224, 224)) -> None:
    """
    Oversample melanoma images using SMOTE-Tomek.

    Every image listed in data_frame is loaded, resized and flattened; the
    flattened pixel vectors are resampled with SMOTETomek; every resampled
    row is written to output_folder as synthetic_<i>.jpg and listed in the CSV.

    output_folder: Path to the folder with filtered images and where upsampled images will be saved.
    data_frame: DataFrame containing image file names, melanoma (0 or 1), and paths.
        The 'melanoma' and 'path' columns are the ones read here.
    output_csv_path: Path to save the updated CSV file. Rows are appended; a header
        (image_name, melanoma, path) is written only when the file is new or empty.
    image_size: Target size (width, height) for resized images.
    """
    # Load melanoma and non-melanoma images; images that fail to load are skipped
    is_melanoma = data_frame['melanoma'] == 1
    is_non_melanoma = data_frame['melanoma'] == 0
    melanoma_images = _load_flattened_images(data_frame.loc[is_melanoma, 'path'], image_size)
    non_melanoma_images = _load_flattened_images(data_frame.loc[is_non_melanoma, 'path'], image_size)

    # Combine melanoma and non-melanoma data (labels follow what was actually loaded)
    X = np.array(melanoma_images + non_melanoma_images)
    y = np.array([1] * len(melanoma_images) + [0] * len(non_melanoma_images))

    # Apply SMOTE-Tomek
    smote_tomek = SMOTETomek(random_state=42)
    X_resampled, y_resampled = smote_tomek.fit_resample(X, y)

    # cv2.resize takes (width, height) but yields an array of shape (height, width, 3)
    width, height = image_size

    synthetic_data = {"image_name": [], "melanoma": [], "path": []}
    for i, (image_data, label) in enumerate(zip(X_resampled, y_resampled)):
        # Interpolated rows may be non-integer; bring them back to valid 8-bit pixels
        pixels = np.clip(np.rint(image_data), 0, 255).astype(np.uint8)
        reshaped_image = pixels.reshape(height, width, 3)
        output_image_path = os.path.join(output_folder, f"synthetic_{i}.jpg")
        cv2.imwrite(output_image_path, reshaped_image)  # Save the image
        synthetic_data["image_name"].append(f"synthetic_{i}")
        synthetic_data["melanoma"].append(int(label))
        synthetic_data["path"].append(output_image_path)

    # Save the updated DataFrame. Write the header only for a new or empty file so
    # the CSV can be read back with named columns, and drop the positional index.
    synthetic_df = pd.DataFrame(synthetic_data)
    write_header = not os.path.exists(output_csv_path) or os.path.getsize(output_csv_path) == 0
    synthetic_df.to_csv(output_csv_path, mode='a', header=write_header, index=False)
    print(f"Updated CSV saved to {output_csv_path}")

#### View Functions

In [None]:
def num_melanoma(data_frame: pd.DataFrame, label_column: str = "MEL") -> int:
    """Print the melanoma / non-melanoma class balance and return the melanoma count.

    data_frame: DataFrame holding a binary label column (1 = melanoma, 0 = non-melanoma).
    label_column: Name of the label column; defaults to "MEL".

    Returns the number of rows whose label equals 1.
    """
    labels = data_frame[label_column]

    # Count number of melanoma vs non-melanoma
    melanoma_count = int((labels == 1).sum())
    non_melanoma_count = int((labels == 0).sum())

    print(f"\nMelanoma images: {melanoma_count}")
    print(f"Non-Melanoma images: {non_melanoma_count}")

    # Percent distribution
    print(f"\n{label_column} column class distribution (%):")
    print(labels.value_counts(normalize=True) * 100)

    return melanoma_count

def view_images(folder: str, image_files: list[str], num=0) -> None:
    """
    View a subset of images from the dataset.

    folder: Folder that contains the image files.
    image_files: List of image file names
    num: Number of images to display; 0 (or a negative value) displays all of
        them. Values larger than the list are limited to the list length.

    Images are laid out in a grid with 5 columns. Nothing is plotted when
    there are no images to show.
    """
    if num <= 0:
        num = len(image_files)
    selected = image_files[:num]
    if not selected:
        print("No images to display.")
        return

    n_cols = 5
    # True division before ceil, so a partially filled last row still gets a grid row
    n_rows = math.ceil(len(selected) / n_cols)
    plt.figure(figsize=(n_cols * 3, n_rows * 5))  # Adjust figure size dynamically
    for i, image_file in enumerate(selected):
        image_path = os.path.join(folder, image_file)
        img = mpimg.imread(image_path)

        plt.subplot(n_rows, n_cols, i + 1)
        # Check if the image is grayscale
        if len(img.shape) == 2 or (len(img.shape) == 3 and img.shape[2] == 1):
            plt.imshow(img, cmap='gray')
        else:
            plt.imshow(img)
        plt.title(image_file, fontsize=8)
        plt.axis('off')

    plt.tight_layout()
    plt.show()

#### Path Setup

In [None]:
# Local dataset roots (the kagglehub download locations printed by the download cells)
base_path_isic = '/Users/audreylu/.cache/kagglehub/datasets/andrewmvd/isic-2019/versions/1'  # Update this path as needed
base_path_ham = '/Users/audreylu/.cache/kagglehub/datasets/kmader/skin-cancer-mnist-ham10000/versions/2'  # Update this path as needed

# The ISIC-2019 training images and ground-truth CSV are located under the ISIC root
image_folder = os.path.join(base_path_isic, 'ISIC_2019_Training_Input', 'ISIC_2019_Training_Input')
labels_path = os.path.join(base_path_isic, 'ISIC_2019_Training_GroundTruth.csv')

# Destination for filtered images; created on first run
output_folder = '../../preprocessed_dataset'  # Update this path as needed
os.makedirs(output_folder, exist_ok=True)
balanced_csv_path = 'ISIC_2019_Training_Balanced.csv'

#### View Images

In [None]:
# === Check Images ===
# Keep only regular files whose name ends in .jpg (sub-folders are skipped)
image_files = list(
    filter(
        lambda name: name.endswith('.jpg') and os.path.isfile(os.path.join(image_folder, name)),
        os.listdir(image_folder),
    )
)

print("Total .jpg files found:", len(image_files))
print("Sample image names:", image_files[:5])

# === Visualize First 5 Images ===
print("\nImages before preprocessing:")
view_images(image_folder, image_files, 5)

#### Filter All Images

In [None]:
# === Filtering and Preprocessing ===

# For a quick test run, filter only the first 5 images instead:
# filter_all_images(image_folder, output_folder, image_files[:5])

filter_all_images(image_folder, output_folder, image_files)
print("\nImages after filtering:")
# Sorted names of the regular .jpg files now present in the output folder
result_files = sorted(
    f for f in os.listdir(output_folder)
    if f.endswith('.jpg') and os.path.isfile(os.path.join(output_folder, f))
)
view_images(output_folder, result_files, 5)
# To display every filtered image rather than the first 5:
# view_images(output_folder, result_files)

#### Load DataFrame

In [None]:
# === Load Labels CSV ===
# Read the ground-truth table and keep only the image and MEL columns
df = pd.read_csv(labels_path).loc[:, ['image', 'MEL']]
print("\nTraining labels preview:")
print(df.head(5))

# === Melanoma vs. Non-Melanoma ===
# Print the class counts and the percentage distribution of the MEL column
num_melanoma(df)

#### Data Augmentation

In [None]:
print("\nApplying data augmentation to balance the dataset...")
# oversample_melanoma_images reads the 'melanoma' and 'path' columns, so build
# a frame in that layout from the label table ('image', 'MEL').
# NOTE(review): assumes the filtered images are stored in output_folder as
# "<image>.jpg" — confirm against the names in the 'image' column.
sampling_df = pd.DataFrame({
    "image_name": df["image"],
    "melanoma": df["MEL"],
    "path": [
        os.path.join(output_folder, name if name.endswith(".jpg") else f"{name}.jpg")
        for name in df["image"]
    ],
})
oversample_melanoma_images(output_folder, sampling_df, balanced_csv_path)
print(f"Updated CSV saved to {balanced_csv_path}")

#### Preview Augmented Dataset

In [None]:
# Reload the CSV produced by the oversampling step and show its first rows
df_balanced = pd.read_csv(balanced_csv_path)
print("\nBalanced dataset preview:")
print(df_balanced.head(5))
# NOTE(review): num_melanoma reads the "MEL" column — confirm the balanced CSV
# has a header row that provides a column with that name.
num_melanoma(df_balanced)  # Display the number of melanoma vs non-melanoma images