# Skin Cancer detection and classification using HAM10000 dataset
Submitted by **Andrea DAVILA**



Link to the HAM10000 dataset's Kaggle page: https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000/data?select=HAM10000_images_part_2

In [None]:
import cv2
import os
import zipfile
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
from tqdm import tqdm

## I. Data Extraction

### Importing the dataset from Kaggle's API

In [None]:
# Make sure the Kaggle configuration directory exists.
# NOTE(review): no cell shown here copies the kaggle.json credentials into ~/.kaggle;
# presumably this is done manually before running the download - confirm.
!mkdir -p ~/.kaggle

# Download the HAM10000 archive; the cell output shows it is saved as
# skin-cancer-mnist-ham10000.zip under /content.
!kaggle datasets download -d kmader/skin-cancer-mnist-ham10000

# Unzip the downloaded archive into /content.
# Tip: right-click the zip file in the file browser -> "copy path" and paste it below.
# The context manager closes the archive even if the extraction fails part-way.
with zipfile.ZipFile('/content/skin-cancer-mnist-ham10000.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')  # destination of the unzipped files

Dataset URL: https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000
License(s): CC-BY-NC-SA-4.0
Downloading skin-cancer-mnist-ham10000.zip to /content
100% 5.20G/5.20G [02:16<00:00, 42.1MB/s]
100% 5.20G/5.20G [02:16<00:00, 41.0MB/s]


### Read metadata csv file

In [None]:
# Mount Google Drive at /content/drive; the processed arrays are saved under
# this mount point (/content/drive/MyDrive/...) later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the HAM10000 metadata table; its 'image_id' and 'dx' columns are used
# further down to attach a label to every processed image.
df = pd.read_csv('/content/HAM10000_metadata.csv')

In [None]:
# Preview the first rows of the metadata table
df.head()

## II. Data Preprocessing

### Hair removal: we'll use the DullRazor algorithm.
The function below is adapted from the BlueDokk/Dullrazor-algorithm repository: https://github.com/BlueDokk/Dullrazor-algorithm/blob/main/ISIC_0031023.jpg

In [None]:
def dullrazor(image_path):
    """
    Applies the DullRazor hair removal algorithm to a dermoscopy image.

    The image is cropped (rows 40:400, columns 40:550) to avoid the dermoscopy
    frame, hair-like structures are detected with a black-hat filter followed
    by a blur and a binary threshold, and the detected pixels are replaced by
    inpainting.

    Args:
        image_path (string): the path to the image file.

    Returns:
        numpy.ndarray: The cropped RGB image with the hair removed. The crop
            yields at most 360 rows and 510 columns, i.e. a (360, 510, 3)
            array for a 450x600 input image.

    Raises:
        IOError: if the image cannot be read from image_path.

    Example:
        dullrazor("path/to/image.jpg") returns an rgb image with the hair removed.
    """

    image = cv2.imread(image_path, cv2.IMREAD_COLOR)  # image is read in BGR
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None rather
        # than raising, so fail here with a clear, catchable error
        raise IOError(f"Could not read image: {image_path}")

    # to avoid the dermoscopy frame
    image = image[40:400, 40:550]

    # The image is still BGR at this point, so the BGR -> gray conversion is the
    # one that applies the luminance weights to the right channels
    gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Black hat filter (cross-shaped 9x9 kernel) highlights thin dark structures
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (9, 9))
    blackhat = cv2.morphologyEx(gray_scale, cv2.MORPH_BLACKHAT, kernel)

    # Gaussian filter
    # NOTE(review): the third positional argument of cv2.GaussianBlur is sigmaX,
    # so cv2.BORDER_DEFAULT acts here as the blur sigma, not as a border mode.
    # Kept unchanged to preserve the existing hair mask - confirm the intent.
    bhg = cv2.GaussianBlur(blackhat, (3, 3), cv2.BORDER_DEFAULT)

    # Binary thresholding (MASK): pixels above 10 are treated as hair
    _, mask = cv2.threshold(bhg, 10, 255, cv2.THRESH_BINARY)

    # Replace pixels of the mask using the surrounding skin (Telea inpainting)
    dst = cv2.inpaint(image, mask, 6, cv2.INPAINT_TELEA)

    # retransform the image to RGB
    return cv2.cvtColor(dst, cv2.COLOR_BGR2RGB)

### Resize image to 128x128 (DenseNet input format, good for computational efficiency)

In [None]:
def resize_image(image, target_size=(128, 128)):
    """
    Stretches the image to the target size (no padding, hence no black bars).

    Args:
        image (numpy.ndarray): the image to resize.
        target_size (tuple): output size, handed to cv2.resize as its size argument.

    Returns:
        numpy.ndarray: the resized image.
    """
    # Area interpolation is used for the resampling
    return cv2.resize(image, target_size, interpolation=cv2.INTER_AREA)

### Lesion segmentation using Otsu's thresholding method


In [None]:
def otsu(hair_removed):
    """
    Segments the lesion with Otsu's thresholding, cleans the result with
    morphological operations and keeps only the pixels inside the convex hull
    of the detected dark regions.

    Args:
        hair_removed (numpy.ndarray): the RGB image with the hair removed

    Returns:
        tuple: a tuple (mask, segmented_image) where
            - mask is the cleaned-up and eroded binary threshold image; the part
              of interest (the lesion) is BLACK (0) and the surroundings are
              WHITE (255);
            - segmented_image has the same shape as hair_removed, with every
              pixel outside the convex hull set to 0.

    Raises:
        ValueError: if no dark region is left after the morphological clean-up,
            so that no convex hull can be built.
    """
    # Get red channel (channel 0 because the input is RGB)
    red_channel = hair_removed[:, :, 0]

    ##---Otsu's Thresholding part---
    # Preparation: Gaussian blur to smooth the channel before thresholding
    blurred = cv2.GaussianBlur(red_channel, (5, 5), 0)

    # Otsu picks the threshold itself (the 0 passed as threshold is ignored)
    _, otsu_thresh = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    ##--Morphological operations to clean up the mask--
    # Remember that the part of interest is BLACK, not WHITE.

    # 1. Remove small noise objects: dilating the white surroundings erases the
    #    small black specks, eroding afterwards recovers the original outline.
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (30, 30))
    closing = cv2.morphologyEx(otsu_thresh, cv2.MORPH_CLOSE, kernel)

    # 2. Remove holes in the lesion (small white areas)
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (50, 50))
    opening = cv2.morphologyEx(closing, cv2.MORPH_OPEN, kernel)

    # 3. Erode the white surroundings; this is the mask that is returned
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (10, 10))
    erode = cv2.morphologyEx(opening, cv2.MORPH_ERODE, kernel)

    ##--Build convex hull of the remaining parts for weird moles--
    # findContours looks for WHITE objects, hence the inversion.
    # NOTE(review): the hull is built from `opening`, not from the returned
    # `erode` mask - kept as in the original, confirm this is intended.
    contours, _ = cv2.findContours(~opening, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        # np.vstack would otherwise fail on an empty list with an opaque message
        raise ValueError("Otsu segmentation found no lesion region to build a convex hull from")

    # Merge the points of all the contours and compute a single convex hull
    all_points = np.vstack([cont.reshape(-1, 2) for cont in contours])
    hull = cv2.convexHull(all_points)

    # Draw the filled convex hull (255) on an empty mask
    hull_mask = np.zeros_like(opening)
    cv2.drawContours(hull_mask, [hull], -1, (255, 255, 255), -1)

    ##--Apply the hull mask to the original image--
    # Keep the original pixels inside the hull, everything else stays black
    inside_hull = hull_mask == 255
    segmented_image = np.zeros_like(hair_removed)
    segmented_image[inside_hull] = hair_removed[inside_hull]

    return erode, segmented_image

## Applying the transformations to all the dataset

Delete the few problematic images (error when segmenting) in order to have the exact same splits at the end.

In [None]:
# Delete the problematic images (did not manage to process them, they generated
# errors) so that every preprocessing variant is built from the same images.
problematic_images = [
    "/content/ham10000_images_part_1/ISIC_0025610.jpg",
    "/content/ham10000_images_part_1/ISIC_0024947.jpg",
    "/content/ham10000_images_part_1/ISIC_0027979.jpg",
    "/content/ham10000_images_part_2/ISIC_0031449.jpg",
    "/content/ham10000_images_part_2/ISIC_0030655.jpg",
]

for image_path in problematic_images:
    try:
        os.remove(image_path)
    except FileNotFoundError:
        # Already deleted (e.g. the cell was run twice): nothing left to do
        pass

## Resize the whole dataset to 128x128x3 without DullRazor and Otsu

In [None]:
def resize_dataset(target_size, filepath_dict, df):
    """
    Crop and resize every image, without hair removal and without segmentation.

    Each image is read, cropped to rows 40:400 / columns 40:550 to avoid the
    dermoscopy frame, converted to RGB and resized with resize_image.

    Args:
        target_size (tuple): output size passed to resize_image
        filepath_dict (dict): Dictionary with filenames as keys and file paths as values
        df (pandas.DataFrame): DataFrame containing image labels ('dx' column),
            matched on its 'image_id' column

    Returns:
        tuple: Lists of processed images (X), corresponding labels (y) and the list of images we failed to process (failed_images)
    """

    X = []
    y = []
    failed_images = []

    for filename, path in tqdm(filepath_dict.items()):
        try:
            image = cv2.imread(path, cv2.IMREAD_COLOR)  # image is read in BGR
            if image is None:
                # cv2.imread returns None instead of raising on unreadable files
                raise IOError(f"could not read image at {path}")

            # to avoid the dermoscopy frame
            image = image[40:400, 40:550]

            # Convert to RGB so that X really is RGB, like the arrays built
            # from dullrazor (which returns RGB images)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            # resize img
            processed_img = resize_image(image, target_size)

            # Look the label up BEFORE touching X / y: if the lookup fails, no
            # image is stored without its label and X and y stay aligned
            label = df[df['image_id'] == filename.split('.')[0]]['dx'].values[0]

        except cv2.error as e:
            print(f"OpenCV error processing {filename}: {str(e)}")
            failed_images.append((filename, "OpenCV error"))
            continue

        except IOError as e:
            print(f"IO error processing {filename}: {str(e)}")
            failed_images.append((filename, "IO error"))
            continue

        except Exception as e:
            print(f"Unexpected error processing {filename}: {str(e)}")
            failed_images.append((filename, "Unexpected error"))
            continue

        # Only reached when both the image and its label are available
        X.append(processed_img)
        y.append(label)

    # Print summary
    print("\nProcessing completed:")
    print(f"Successfully processed: {len(X)} images")
    print(f"Failed to process: {len(failed_images)} images")

    if failed_images:
        print("\nFailed images:")
        for img, error_type in failed_images:
            print(f"- {img}: {error_type}")

    return X, y, failed_images

In [None]:
filepath = {}
directory1 = "/content/ham10000_images_part_1"
directory2 = "/content/ham10000_images_part_2"

# Get file list and path to each image.
# os.listdir returns the entries in arbitrary order, so they are sorted to get
# the same sample order on every run.
for directory in (directory1, directory2):
    for filename in sorted(os.listdir(directory)):
        filepath[filename] = os.path.join(directory, filename)

# Process images and get data
X, y, failed_images = resize_dataset(target_size=(128, 128), filepath_dict=filepath, df=df)

100%|██████████| 10010/10010 [02:31<00:00, 65.86it/s]


Processing completed:
Successfully processed: 10010 images
Failed to process: 0 images





In [None]:
# Write the images and their labels to Google Drive as .npy files
for data, destination in (
    (X, '/content/drive/MyDrive/Project 36100 - Andrea/Assignment Stage 2/X_NO_dullrazor_NO_segmentation_128.npy'),
    (y, '/content/drive/MyDrive/Project 36100 - Andrea/Assignment Stage 2/y_NO_dullrazor_NO_segmentation_128.npy'),
):
    np.save(destination, data)

## Apply DullRazor + resizing to the whole dataset

In [None]:
def hair_removal_and_resize(target_size, filepath_dict, df):
    """
    Apply hair removal (dullrazor) and resizing to every image, without segmentation.

    Args:
        target_size (tuple): output size passed to resize_image
        filepath_dict (dict): Dictionary with filenames as keys and file paths as values
        df (pandas.DataFrame): DataFrame containing image labels ('dx' column),
            matched on its 'image_id' column

    Returns:
        tuple: Lists of processed images (X), corresponding labels (y) and the list of images we failed to process (failed_images)
    """

    X = []
    y = []
    failed_images = []

    for filename, path in tqdm(filepath_dict.items()):
        try:
            # Process image (dullrazor returns an RGB image)
            processed_img = dullrazor(path)
            processed_img = resize_image(processed_img, target_size)

            # Look the label up BEFORE touching X / y: if the lookup fails, no
            # image is stored without its label and X and y stay aligned
            label = df[df['image_id'] == filename.split('.')[0]]['dx'].values[0]

        except cv2.error as e:
            print(f"OpenCV error processing {filename}: {str(e)}")
            failed_images.append((filename, "OpenCV error"))
            continue

        except IOError as e:
            print(f"IO error processing {filename}: {str(e)}")
            failed_images.append((filename, "IO error"))
            continue

        except Exception as e:
            print(f"Unexpected error processing {filename}: {str(e)}")
            failed_images.append((filename, "Unexpected error"))
            continue

        # Only reached when both the image and its label are available
        X.append(processed_img)  # RGB format
        y.append(label)

    # Print summary
    print("\nProcessing completed:")
    print(f"Successfully processed: {len(X)} images")
    print(f"Failed to process: {len(failed_images)} images")

    if failed_images:
        print("\nFailed images:")
        for img, error_type in failed_images:
            print(f"- {img}: {error_type}")

    return X, y, failed_images

In [None]:
filepath = {}
directory1 = "/content/ham10000_images_part_1"
directory2 = "/content/ham10000_images_part_2"

# Get file list and path to each image.
# os.listdir returns the entries in arbitrary order, so they are sorted to get
# the same sample order on every run.
for directory in (directory1, directory2):
    for filename in sorted(os.listdir(directory)):
        filepath[filename] = os.path.join(directory, filename)

# Process images and get data
X, y, failed_images = hair_removal_and_resize(target_size=(128, 128), filepath_dict=filepath, df=df)

In [None]:
# Write the images and their labels to Google Drive as .npy files
for data, destination in (
    (X, '/content/drive/MyDrive/Project 36100 - Andrea/Assignment Stage 2/X_hair_removal_NO_segmentation_128.npy'),
    (y, '/content/drive/MyDrive/Project 36100 - Andrea/Assignment Stage 2/y_hair_removal_NO_segmentation_128.npy'),
):
    np.save(destination, data)

## Apply DullRazor + Otsu + resizing (128x128) transformations to the whole dataset

### resize image and segment function

In [None]:
def segment_resize(file_path, target_size=(128, 128)):
    """
    Applies hair removal, lesion segmentation and resizing to the image at file_path.

    Args:
        file_path (string): the path to the image file.
        target_size (tuple): output size passed to resize_image. Defaults to
            (128, 128), the size that used to be hard-coded in this function.

    Returns:
        The segmented image produced by otsu, resized by resize_image.
    """
    hair_removed = dullrazor(file_path)
    # otsu also returns the binary mask, which is not needed here
    _, segmented_image = otsu(hair_removed)
    return resize_image(segmented_image, target_size)

## Save Data into usable format

In [None]:
def hair_removal_segment_images(filepath_dict, df):
    """
    Apply segment_resize (hair removal + segmentation + resizing) to every image.

    Unlike the other dataset builders of this notebook, the images stored in X
    are normalized: each processed image is divided by 255.

    Args:
        filepath_dict (dict): Dictionary with filenames as keys and file paths as values
        df (pandas.DataFrame): DataFrame containing image labels ('dx' column),
            matched on its 'image_id' column

    Returns:
        tuple: Lists of processed images (X), corresponding labels (y) and the list of images we failed to process (failed_images)
    """

    X = []
    y = []
    failed_images = []

    for filename, path in tqdm(filepath_dict.items()):
        try:
            # Process image
            processed_img = segment_resize(path)

            # Look the label up BEFORE touching X / y: if the lookup fails, no
            # image is stored without its label and X and y stay aligned
            label = df[df['image_id'] == filename.split('.')[0]]['dx'].values[0]

        except cv2.error as e:
            print(f"OpenCV error processing {filename}: {str(e)}")
            failed_images.append((filename, "OpenCV error"))
            continue

        except IOError as e:
            print(f"IO error processing {filename}: {str(e)}")
            failed_images.append((filename, "IO error"))
            continue

        except Exception as e:
            print(f"Unexpected error processing {filename}: {str(e)}")
            failed_images.append((filename, "Unexpected error"))
            continue

        # Only reached when both the image and its label are available
        X.append(processed_img/255)  # normalizing the image
        y.append(label)

    # Print summary
    print("\nProcessing completed:")
    print(f"Successfully processed: {len(X)} images")
    print(f"Failed to process: {len(failed_images)} images")

    if failed_images:
        print("\nFailed images:")
        for img, error_type in failed_images:
            print(f"- {img}: {error_type}")

    return X, y, failed_images

In [None]:
#Example usage:
filepath = {}
directory1 = "/content/ham10000_images_part_1"
directory2 = "/content/ham10000_images_part_2"

# Get file list and path to each image.
# os.listdir returns the entries in arbitrary order, so they are sorted to get
# the same sample order on every run.
for directory in (directory1, directory2):
    for filename in sorted(os.listdir(directory)):
        filepath[filename] = os.path.join(directory, filename)

# Process images and get data
X, y, failed_images = hair_removal_segment_images(filepath, df)

In [None]:
# Write the images and their labels to Google Drive as .npy files
for data, destination in (
    (X, '/content/drive/MyDrive/Project 36100 - Andrea/Assignment Stage 2/X_dullrazor_128_otsu.npy'),
    (y, '/content/drive/MyDrive/Project 36100 - Andrea/Assignment Stage 2/y_dullrazor_128_otsu.npy'),
):
    np.save(destination, data)