In [2]:
import os
import numpy as np # linear algebra
import matplotlib.pyplot as plt
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2

from sklearn.utils import resample
import albumentations as A



In [95]:
# Current Issues:
# 1. Add oversampling to augmentation
# 2. Add a combination of both oversampling and undersampling
# 3. Figure out some more preprocessing techniques/augmented images (if possible, if not then oh well)
# 4. Start creating the model and training it
# 5. Sleep

In [4]:
# -isic_2019
#   -ISIC_2019_Training_Input
#       -ISIC_2019_Training_GroundTruth.csv
#       -ISIC_2019_Training_Input
#           -images
#   -ISIC_2019_Test_Input
#       -ISIC_2019_Test_GroundTruth.csv
#       -ISIC_2019_Test_Input
#           -images

# Directories that are scanned for the .jpg images of each split
# (relative to the notebook's working directory, see the tree sketched above).
training_path = "../isic_2019/ISIC_2019_Training_Input/ISIC_2019_Training_Input"
testing_path = "../isic_2019/ISIC_2019_Test_Input/ISIC_2019_Test_Input"

# Ground-truth CSV files, one per split.
training_labels_path = "../isic_2019/ISIC_2019_Training_Input/ISIC_2019_Training_GroundTruth.csv"
testing_labels_path = "../isic_2019/ISIC_2019_Test_Input/ISIC_2019_Test_GroundTruth.csv"

# Load both label tables as DataFrames; each print only confirms that the
# preceding read_csv call returned without raising.
training_labels = pd.read_csv(training_labels_path)
print("Found training labels")
testing_labels = pd.read_csv(testing_labels_path)
print("Found testing labels")

def load_data(path):
    """Return the names of the ``.jpg`` files found directly inside ``path``.

    Args:
        path: Directory to scan. Sub-directories are not searched.

    Returns:
        list[str]: Bare file names (no directory part), sorted alphabetically.
        ``os.listdir`` yields entries in arbitrary order, so sorting keeps the
        list -- and any subset taken from it -- identical across runs and
        platforms.
    """
    return sorted(name for name in os.listdir(path) if name.endswith(".jpg"))

# List the image file names of each split and report how many were found.
training_data = load_data(training_path)
print("Training data: ", len(training_data))
testing_data = load_data(testing_path)
print("Testing data: ", len(testing_data))

# Row counts of the label tables, printed so they can be compared by eye
# with the image counts above.
print("Training labels: ", len(training_labels))
print("Testing labels: ", len(testing_labels))


Found training labels
Found testing labels
Training data:  25331
Testing data:  1592
Training labels:  25331
Testing labels:  8238


In [5]:
# Verify extracted data


# `df` is a shorter alias for the training label table (same object, not a copy).
df = training_labels
print("Training labels preview:")
print(df.head())


# Number of rows flagged as melanoma (MEL == 1) and as not melanoma (MEL == 0).
melanoma_count = len(df[df["MEL"] == 1])
non_melanoma_count = len(df[df["MEL"] == 0])

print(f"Melanoma images: {melanoma_count}")
print(f"Non-Melanoma images: {non_melanoma_count}")

# Share of each MEL value, expressed as a percentage of all rows.
print(100 * df["MEL"].value_counts(normalize=True))


Training labels preview:
          image  MEL   NV  BCC   AK  BKL   DF  VASC  SCC  UNK
0  ISIC_0000000  0.0  1.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0
1  ISIC_0000001  0.0  1.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0
2  ISIC_0000002  1.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0
3  ISIC_0000003  0.0  1.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0
4  ISIC_0000004  1.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0
Melanoma images: 4522
Non-Melanoma images: 20809
MEL
0.0    82.148356
1.0    17.851644
Name: proportion, dtype: float64


In [98]:
'''
MEL: Melanoma
NV: Melanocytic nevus
BCC: Basal cell carcinoma
AK: Actinic keratosis
BKL: Benign keratosis (solar lentigo / seborrheic keratosis / lichen planus-like keratosis)
DF: Dermatofibroma
VASC: Vascular lesion
SCC: Squamous cell carcinoma
UNK: None of the above
'''


'\nMEL: Melanoma\nNV: Melanocytic nevus\nBCC: Basal cell carcinoma\nAK: Actinic keratosis\nBKL: Benign keratosis (solar lentigo / seborrheic keratosis / lichen planus-like keratosis)\nDF: Dermatofibroma\nVASC: Vascular lesion\nSCC: Squamous cell carcinoma\nUNK: None of the above\n'

In [99]:
'''
English
MEL: Cancerous cells, what we are trying to detect
NV: mole on the skin, not cancerous
    - round, flat/raised, brown or black spots on the skin with smooth edges
BCC: type of skin cancer, not melanoma
    - rolled edges, pearly appearance, and visible blood vessels
AK: pre-cancerous skin condition, not melanoma
    - rough, scaly patches on sun-exposed skin
BKL: benign skin lesions/skin growth, not cancerous
    - brown or black spots on the skin, often with irregular borders
DF: firm skin bumps, not cancerous
    - small, raised, and firm nodules that are usually brown or skin-colored
VASC: abnormal malformation of blood vessels, not cancerous
    - red or purple lesions on the skin, often with a raised appearance
SCC: type of skin cancer, not melanoma
    - scaly, crusted patches or open sores that do not heal
UNK: unc status
'''

'\nEnglish\nMEL: Cancerous cells, what we are trying to detect\nNV: mole on the skin, not cancerous\n    - round, flat/raised, brown or black spots on the skin with smooth edges\nBCC: type of skin cancer, not melanoma\n    - rolled edges, pearly appearance, and visible blood vessels\nAK: pre-cancerous skin condition, not melanoma\n    - rough, scaly patches on sun-exposed skin\nBKL: benign skin lesions/skin growth, not cancerous\n    - brown or black spots on the skin, often with irregular borders\nDF: firm skin bumps, not cancerous\n    - small, raised, and firm nodules that are usually brown or skin-colored\nVASC: abnormal malformation of blood vessels, not cancerous\n    - red or purple lesions on the skin, often with a raised appearance\nSCC: type of skin cancer, not melanoma\n    - scaly, crusted patches or open sores that do not heal\nUNK: unc status\n'

In [100]:
# One (label column, message prefix) pair per diagnosis class; the loop prints
# how many rows of `df` have that column set to 1.
for column, message in (
    ("MEL", "Melanoma images: "),
    ("NV", "Melanocytic nevus images: "),
    ("BCC", "Basal cell carcinoma images: "),
    ("AK", "Actinic keratosis images: "),
    ("BKL", "Benign keratosis images: "),
    ("DF", "Dermatofibroma images: "),
    ("VASC", "Vascular lesion images: "),
    ("SCC", "Squamous cell carcinoma images: "),
    ("UNK", "None of the above images: "),
):
    print(message, len(df[df[column] == 1]))

Melanoma images:  4522
Melanocytic nevus images:  12875
Basal cell carcinoma images:  3323
Actinic keratosis images:  867
Benign keratosis images:  2624
Dermatofibroma images:  239
Vascular lesion images:  253
Squamous cell carcinoma images:  628
None of the above images:  0


In [6]:
'''FILTERING'''
# Hair removal
def remove_hair(img):
    """Inpaint thin dark structures (hair) out of an image.

    Pipeline:
      1. Black-hat morphology (closing minus image) with a 7x17 rectangular
         kernel to bring out dark details.
      2. 3x3 Gaussian blur of the black-hat response.
      3. Binary threshold at 10 to turn the response into a 0/255 mask.
      4. Telea inpainting (radius 7) of the masked pixels.

    Args:
        img: Image array accepted by OpenCV. The notebook's ``preprocessing``
            passes the grayscale image; NOTE(review): ``cv2.inpaint`` expects a
            single-channel 8-bit mask, so a colour input likely needs the mask
            reduced to one channel first -- confirm before using on RGB.

    Returns:
        The inpainted image returned by ``cv2.inpaint``.
    """
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (7, 17))
    blackhat = cv2.morphologyEx(img, cv2.MORPH_BLACKHAT, kernel)

    # The third positional argument of GaussianBlur is sigmaX, not the border
    # mode. Passing cv2.BORDER_DEFAULT there silently blurred with sigma 4.
    # sigmaX=0 lets OpenCV derive sigma from the kernel size; the border mode
    # goes in its own keyword.
    blurred = cv2.GaussianBlur(blackhat, (3, 3), 0, borderType=cv2.BORDER_DEFAULT)

    # Everything brighter than 10 in the blurred response becomes mask (255).
    _, mask = cv2.threshold(blurred, 10, 255, cv2.THRESH_BINARY)

    return cv2.inpaint(img, mask, 7, cv2.INPAINT_TELEA)
    
# Gray scale
def convert_to_grayscale(image):
    """Collapse an RGB image into a single grayscale channel.

    Args:
        image: Image array; treated as RGB channel order by the
            ``COLOR_RGB2GRAY`` conversion code used here.

    Returns:
        The grayscale array produced by ``cv2.cvtColor``.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    return gray
    
# Noise reduction
def reduce_noise(image):
    """Denoise an image with a bilateral filter followed by a median blur.

    The bilateral filter is called with the arguments ``9, 75, 75``
    (neighbourhood diameter, colour sigma, space sigma in OpenCV's signature)
    and its output is passed through a median blur of aperture 5.

    Args:
        image: Image array accepted by both OpenCV filters.

    Returns:
        The filtered image.
    """
    return cv2.medianBlur(cv2.bilateralFilter(image, 9, 75, 75), 5)
    
# Contrast enhancement
def enhance_contrast(image):
    """Boost local contrast with CLAHE (clip limit 2, 8x8 tile grid).

    Args:
        image: Image array to equalize. NOTE(review): OpenCV's CLAHE works on
            single-channel input; the notebook passes the grayscale image.

    Returns:
        The equalized image returned by the CLAHE object's ``apply``.
    """
    clahe = cv2.createCLAHE(clipLimit=2, tileGridSize=(8, 8))
    return clahe.apply(image)

# Resizing
def resize_img(img, size=(224, 224)):
    """Resize an image to ``size`` with OpenCV's default interpolation.

    Args:
        img: Image array to resize.
        size: Target size handed to ``cv2.resize`` as its ``dsize`` argument
            (OpenCV reads it as ``(width, height)``). Defaults to 224x224.

    Returns:
        The resized image.
    """
    return cv2.resize(img, size)

# Other filters (need to debug)

# Edge detection
def segment_lesion(image):
    """Build a binary mask with inverted Otsu thresholding plus morphological clean-up.

    Args:
        image: Image array to threshold. NOTE(review): Otsu thresholding in
            OpenCV expects a single-channel 8-bit image -- confirm callers
            pass grayscale.

    Returns:
        The 0/255 mask after a 5x5 morphological close followed by a 5x5 open.
    """
    # With THRESH_OTSU the threshold is chosen automatically, so the 0 passed
    # below is only a placeholder. THRESH_BINARY_INV makes pixels at or below
    # the threshold white (255) in the mask.
    # https://docs.opencv.org/4.x/d7/d4d/tutorial_py_thresholding.html
    otsu_inverted = cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    _, binary = cv2.threshold(image, 0, 255, otsu_inverted)

    structuring = np.ones((5, 5), np.uint8)
    closed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, structuring)
    return cv2.morphologyEx(closed, cv2.MORPH_OPEN, structuring)

def enhance_borders(image):
    """Return the Sobel gradient magnitude of ``image`` scaled to 0-255.

    Horizontal and vertical 3x3 Sobel derivatives are computed in float64,
    combined into a gradient magnitude and rescaled so that the strongest edge
    maps to 255.
    https://docs.opencv.org/4.x/d2/d2c/tutorial_sobel_derivatives.html

    Args:
        image: Image array accepted by ``cv2.Sobel``.

    Returns:
        ``uint8`` array of the same shape as the gradient. For a perfectly flat
        image (no gradient anywhere) an all-zero array is returned.
    """
    sobel_x = cv2.Sobel(image, cv2.CV_64F, 1, 0, ksize=3)
    sobel_y = cv2.Sobel(image, cv2.CV_64F, 0, 1, ksize=3)
    magnitude = np.sqrt(sobel_x**2 + sobel_y**2)

    peak = magnitude.max()
    if peak == 0:
        # A flat image would otherwise divide 0 by 0, giving NaNs whose cast
        # to uint8 is undefined.
        return np.zeros(magnitude.shape, dtype=np.uint8)

    return np.uint8(magnitude * 255 / peak)


In [102]:
'''PREPROCESSING'''
import shutil
def preprocessing(image):
    """Load one image from disk and run the grayscale preprocessing chain.

    Chain: read -> BGR to RGB -> grayscale -> hair removal -> noise reduction
    -> CLAHE contrast enhancement -> resize to 224x224.

    Args:
        image: Path of the image file to read (despite the name this is a
            path, not pixel data).

    Returns:
        The preprocessed, resized image.

    Raises:
        OSError: If ``cv2.imread`` cannot read the file. It returns ``None``
            instead of raising, which previously surfaced as an opaque error
            inside ``cv2.cvtColor``.
    """
    pixels = cv2.imread(image)
    if pixels is None:
        raise OSError(f"cv2.imread could not read image: {image!r}")

    rgb = cv2.cvtColor(pixels, cv2.COLOR_BGR2RGB)
    gray = convert_to_grayscale(rgb)
    hair_removed = remove_hair(gray)
    noise_reduced = reduce_noise(hair_removed)
    contrast = enhance_contrast(noise_reduced)

    # Lesion segmentation (segment_lesion) and border enhancement
    # (enhance_borders) are deliberately left out of the chain: their results
    # were too erratic on some images.

    return resize_img(contrast, (224, 224))

def preprocess_all_images(image_list, output_dir, input_dir=None):
    """Preprocess every listed image and save the results under ``output_dir``.

    WARNING: ``output_dir`` is recreated from scratch -- if it already exists
    it is deleted recursively first, together with anything stored in it.

    Args:
        image_list: File names (not paths) of the images to process. Pass a
            slice of the full list to run on a subset.
        output_dir: Directory that receives the preprocessed images, each
            saved under its original file name.
        input_dir: Directory containing the source images. Defaults to the
            notebook-level ``training_path`` so existing calls are unchanged;
            pass another directory (e.g. ``testing_path``) for other splits.

    Raises:
        OSError: If a preprocessed image cannot be written.
    """
    if input_dir is None:
        input_dir = training_path

    if os.path.exists(output_dir):
        print("Output directory already exists, deleting")
        shutil.rmtree(output_dir)
    else:
        print("Creating output directory")
    os.makedirs(output_dir)

    for image_name in image_list:
        preprocessed_image = preprocessing(os.path.join(input_dir, image_name))
        output_path = os.path.join(output_dir, image_name)
        # cv2.imwrite reports failure through its return value, not an exception.
        if not cv2.imwrite(output_path, preprocessed_image):
            raise OSError(f"Failed to write preprocessed image: {output_path}")

# Preprocess every training image into ../preprocessed_images
# (the directory is wiped and recreated by the call).
preprocess_all_images(training_data, "../preprocessed_images")

Output directory already exists, deleting


In [7]:
'''UPDATING LABELS'''

def update_labels(labels_path):
    """Read a ground-truth CSV and keep only the image id and melanoma flag.

    Args:
        labels_path: Path of the CSV file; it must contain the columns
            ``image`` and ``MEL``.

    Returns:
        DataFrame with exactly the columns ``image`` and ``MEL``, in that order.
    """
    wanted_columns = ['image', 'MEL']
    all_labels = pd.read_csv(labels_path)
    return all_labels[wanted_columns]


# Training labels reduced to the image id and the MEL (melanoma) flag.
df_train_labels_updated = update_labels(training_labels_path)

In [8]:
'''DATA AUGMENTATION'''

'''UNDERSAMPLING'''
# Simple random undersampling to achieve desired balance
# random_state is fixed at 88 so the same rows are sampled on every run

def undersample_data(df, majority_split, new_labels_path):
    """Randomly undersample the non-melanoma rows and save the balanced labels.

    All melanoma rows (``MEL == 1``) are kept. Non-melanoma rows
    (``MEL == 0``) are sampled without replacement so that they make up
    ``majority_split`` of the resulting table. The result is sorted by
    ``image`` and written to ``new_labels_path`` without the index column.

    Args:
        df: Label table with at least the columns ``image`` and ``MEL``.
        majority_split: Desired fraction of non-melanoma rows in the output,
            ``0 <= majority_split < 1`` (0.5 gives a 50/50 split).
        new_labels_path: Destination CSV path.

    Raises:
        ValueError: If ``majority_split`` is outside ``[0, 1)``. Also raised
            by ``resample`` when more majority rows are requested than exist.
    """
    if not 0 <= majority_split < 1:
        raise ValueError(
            f"majority_split must satisfy 0 <= majority_split < 1, got {majority_split!r}"
        )

    majority = df[df["MEL"] == 0]
    minority = df[df["MEL"] == 1]
    minority_count = len(minority)

    # majority / (majority + minority) == split
    #   =>  majority == minority * split / (1 - split)
    # Rounded rather than floor-divided: float floor division is off by one
    # for common inputs (e.g. 2 // (1 - 0.6) == 4.0, which yields 2 rows
    # instead of 3).
    size = round(minority_count * majority_split / (1 - majority_split))

    majority_undersampled = resample(majority,
                                     replace=False,
                                     n_samples=size,
                                     random_state=88)

    balanced = pd.concat([majority_undersampled, minority])
    ordered = balanced.sort_values(by="image").reset_index(drop=True)
    ordered.to_csv(new_labels_path, index=False)

# Fraction of the undersampled table made up of the majority class (non-melanoma);
# the remainder is the minority class (melanoma). 0.5 requests a 50/50 split.
majority_split = 0.5 # XX% majority (Non-Melanoma), XX% minority (Melanoma)
# The balanced label table is written next to the preprocessed images.
undersample_data(df_train_labels_updated, majority_split, "../preprocessed_images/ISIC_2019_Training_GroundTruth_preprocessed.csv")


In [9]:

def data_augmentation():
    """Build the albumentations pipeline used to augment images.

    The pipeline applies, in order and each with its own probability:
    90-degree rotation (0.5), horizontal flip (0.5), vertical flip (0.5),
    brightness/contrast jitter of up to 0.2 (0.5), gamma in the range 80-120
    (0.5) and a Gaussian blur with kernel limit 3-5 (0.3).

    Returns:
        The composed ``A.Compose`` transformation.
    """
    geometric = [
        A.RandomRotate90(p=0.5),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
    ]
    photometric = [
        A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
        A.RandomGamma(gamma_limit=(80, 120), p=0.5),
        A.GaussianBlur(blur_limit=(3, 5), p=0.3),
    ]
    return A.Compose(geometric + photometric)

def augment_image(image, transformation, num_augmentations=5):
    """Apply ``transformation`` to ``image`` several times.

    Args:
        image: Image handed to the transformation as the ``image`` keyword.
        transformation: Callable that accepts ``image=...`` and returns a
            mapping with the transformed image under the key ``"image"``.
        num_augmentations: Number of variants to produce.

    Returns:
        list: ``num_augmentations`` transformed images, in the order produced.
    """
    return [
        transformation(image=image)["image"]
        for _ in range(num_augmentations)
    ]

def augment_data(input, output, num_augmentations=5):
    """Write augmented variants of every image in ``input`` to ``output``.

    Each readable image yields ``num_augmentations`` files named
    ``<original stem>_aug_<k>.jpg`` with ``k`` starting at 1. Files ending in
    ``.csv`` are ignored, and files OpenCV cannot decode are skipped with a
    message.

    Args:
        input: Directory containing the source images.
        output: Directory for the augmented images; created if missing.
        num_augmentations: Number of variants written per source image.

    Raises:
        OSError: If an augmented image cannot be written.
    """
    os.makedirs(output, exist_ok=True)
    transformation = data_augmentation()

    # Sorted so the processing order does not depend on the file system.
    for image_name in sorted(os.listdir(input)):
        if image_name.endswith(".csv"):
            continue

        image_path = os.path.join(input, image_name)
        bgr = cv2.imread(image_path)
        if bgr is None:
            # imread signals failure with None (e.g. a non-image file).
            print(f"Skipping unreadable file: {image_path}")
            continue

        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        # splitext handles extensions of any length; slicing off four
        # characters only worked for three-letter extensions.
        base_name, _ = os.path.splitext(image_name)

        variants = augment_image(rgb, transformation, num_augmentations)
        for index, aug_img in enumerate(variants, start=1):
            aug_path = os.path.join(output, f"{base_name}_aug_{index}.jpg")
            # imwrite expects BGR; convert back so the red and blue channels
            # are not swapped in the saved file.
            if not cv2.imwrite(aug_path, cv2.cvtColor(aug_img, cv2.COLOR_RGB2BGR)):
                raise OSError(f"Failed to write augmented image: {aug_path}")

# Write 3 augmented variants of every image found in ../preprocessed_images
# into ../augmented_images.
augment_data("../preprocessed_images", "../augmented_images", num_augmentations=3)