In [5]:
import os
import numpy as np # linear algebra
import matplotlib.pyplot as plt
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2

from sklearn.utils import resample
import albumentations as A

In [6]:
# Image directories of the ISIC 2019 training and test splits.
training_path = "../isic_2019/ISIC_2019_Training_Input/ISIC_2019_Training_Input"
testing_path = "../isic_2019/ISIC_2019_Test_Input/ISIC_2019_Test_Input"

# Ground-truth label CSVs that accompany the two splits.
training_labels_path = "../isic_2019/ISIC_2019_Training_Input/ISIC_2019_Training_GroundTruth.csv"
testing_labels_path = "../isic_2019/ISIC_2019_Test_Input/ISIC_2019_Test_GroundTruth.csv"

# Load both label tables (training first, then testing).
training_labels, testing_labels = (
    pd.read_csv(labels_csv)
    for labels_csv in (training_labels_path, testing_labels_path)
)


def load_data(path):
    """Return the names of all ``.jpg`` files located directly in ``path``.

    Args:
        path: Directory to scan. Sub-directories are not searched.

    Returns:
        list[str]: File names (not full paths) ending in ``.jpg``, sorted
        alphabetically so the result is identical on every run and platform.
    """
    # os.listdir yields entries in arbitrary order; sort for reproducibility.
    return sorted(name for name in os.listdir(path) if name.endswith(".jpg"))

# Collect the image file names of each split and report how many were found.
training_data = load_data(training_path)
print("Training data: ", len(training_data))
testing_data = load_data(testing_path)
print("Testing data: ", len(testing_data))

# The label tables should contain one row per image, so these counts are
# expected to match the file counts printed above.
for caption, label_table in (
    ("Training labels: ", training_labels),
    ("Testing labels: ", testing_labels),
):
    print(caption, len(label_table))


Training data:  25331
Testing data:  8238
Training labels:  25331
Testing labels:  8238


In [7]:
# Sanity-check the loaded training labels.
df = training_labels
print("Training labels preview:", df.head(), sep="\n")

# Count melanoma (MEL == 1) versus non-melanoma (MEL == 0) rows.
melanoma_count = len(df.loc[df["MEL"] == 1])
non_melanoma_count = len(df.loc[df["MEL"] == 0])

print(f"Melanoma images: {melanoma_count}")
print(f"Non-Melanoma images: {non_melanoma_count}")

# Class balance of the MEL column, expressed in percent.
mel_share_percent = df["MEL"].value_counts(normalize=True) * 100
print(mel_share_percent)


Training labels preview:
          image  MEL   NV  BCC   AK  BKL   DF  VASC  SCC  UNK
0  ISIC_0000000  0.0  1.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0
1  ISIC_0000001  0.0  1.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0
2  ISIC_0000002  1.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0
3  ISIC_0000003  0.0  1.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0
4  ISIC_0000004  1.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0
Melanoma images: 4522
Non-Melanoma images: 20809
MEL
0.0    82.148356
1.0    17.851644
Name: proportion, dtype: float64


In [8]:
# Number of training images per diagnosis column (caption, column name).
diagnosis_captions = (
    ("Melanoma images: ", "MEL"),
    ("Melanocytic nevus images: ", "NV"),
    ("Basal cell carcinoma images: ", "BCC"),
    ("Actinic keratosis images: ", "AK"),
    ("Benign keratosis images: ", "BKL"),
    ("Dermatofibroma images: ", "DF"),
    ("Vascular lesion images: ", "VASC"),
    ("Squamous cell carcinoma images: ", "SCC"),
    ("None of the above images: ", "UNK"),
)

for caption, column in diagnosis_captions:
    print(caption, len(df[df[column] == 1]))

Melanoma images:  4522
Melanocytic nevus images:  12875
Basal cell carcinoma images:  3323
Actinic keratosis images:  867
Benign keratosis images:  2624
Dermatofibroma images:  239
Vascular lesion images:  253
Squamous cell carcinoma images:  628
None of the above images:  0


Goal:
Keep the different types of non-melanoma images in proportion to their original frequencies.
Apply data augmentation to the melanoma images to oversample them.
Apply undersampling to the non-melanoma set.

In [9]:
'''NEW UNDERSAMPLING'''

def undersample_data_proportional(df, target_size, new_labels_path):
    """Undersample the non-melanoma rows while preserving their class mix.

    Every melanoma row (``MEL == 1``) is kept. Each non-melanoma diagnosis is
    sampled without replacement so that its share of ``target_size`` matches
    its share of the non-melanoma rows in ``df``. The combined result is
    sorted by ``image``, written to ``new_labels_path`` and returned.

    Args:
        df: Ground-truth labels, given either as a DataFrame or as the path of
            a CSV file. Must contain the columns ``image``, ``MEL`` and the
            seven non-melanoma columns listed in ``non_mel_types`` below.
        target_size: Approximate number of non-melanoma rows to keep. The
            per-class counts are truncated to integers, so the actual total
            can be slightly smaller.
        new_labels_path: Destination CSV path for the undersampled labels.

    Returns:
        pandas.DataFrame: The undersampled labels that were written to disk.
    """
    # Honour the argument: load it when it is a path, use it directly when it
    # is already a DataFrame (it used to be ignored in favour of a global).
    if isinstance(df, (str, os.PathLike)):
        df = pd.read_csv(df)

    mel_df = df[df["MEL"] == 1]
    non_mel_df = df[df["MEL"] == 0]

    non_mel_types = ["NV", "BCC", "AK", "BKL", "DF", "VASC", "SCC"]
    rows_by_label = {
        label: non_mel_df[non_mel_df[label] == 1.0] for label in non_mel_types
    }
    non_mel_counts = {label: rows.shape[0] for label, rows in rows_by_label.items()}
    print("Non Mel count: ", non_mel_counts)
    total_non_mel = sum(non_mel_counts.values())
    print("Total non-mel: ", total_non_mel)

    # Share of each diagnosis among the non-melanoma rows (0 when there are none).
    sampling_props = {
        label: (count / total_non_mel if total_non_mel else 0.0)
        for label, count in non_mel_counts.items()
    }
    print("Sampling proportions: ", sampling_props)

    undersampled_non_mel = []
    for label in non_mel_types:
        label_df = rows_by_label[label]

        # Sample proportionally, but never ask for more rows than exist:
        # sampling without replacement would raise otherwise.
        target_label_count = min(
            int(sampling_props[label] * target_size), len(label_df)
        )
        if target_label_count > 0:
            undersampled_non_mel.append(
                label_df.sample(n=target_label_count, random_state=42)
            )

    # Combine all melanoma rows with the undersampled non-melanoma rows.
    df_undersampled = pd.concat([mel_df] + undersampled_non_mel)
    df_undersampled = df_undersampled.sort_values(by="image").reset_index(drop=True)

    print("Size: ", df_undersampled.shape[0])
    print("Total non-mel: ", len(df_undersampled[df_undersampled["MEL"] == 0]))
    print("Total mel: ", len(df_undersampled[df_undersampled["MEL"] == 1]))
    for label in non_mel_types:
        print(label, len(df_undersampled[df_undersampled[label] == 1]))

    # Save to CSV
    df_undersampled.to_csv(new_labels_path, index=False)
    return df_undersampled


# Write the undersampled label file, then read it back to learn which images
# survived the undersampling.
undersampled_training_labels = "../ISIC_2019_Training_GroundTruth_preprocessed.csv"
undersample_data_proportional(training_labels_path, 10000, undersampled_training_labels)

undersampled_training_labels_df = pd.read_csv(undersampled_training_labels)

# The label file stores image ids without extension; the files on disk are JPEGs.
undersampled_filenames = [
    image_id + ".jpg" for image_id in undersampled_training_labels_df["image"].tolist()
]

Non Mel count:  {'NV': 12875, 'BCC': 3323, 'AK': 867, 'BKL': 2624, 'DF': 239, 'VASC': 253, 'SCC': 628}
Total non-mel:  20809
Sampling proportions:  {'NV': 0.6187226680763132, 'BCC': 0.15969051852563795, 'AK': 0.04166466432793503, 'BKL': 0.12609928396366957, 'DF': 0.011485414964678744, 'VASC': 0.0121582007785093, 'SCC': 0.030179249363256284}
Size:  14517
Total non-mel:  9995
Total mel:  4522
NV 6187
BCC 1596
AK 416
BKL 1260
DF 114
VASC 121
SCC 301


In [10]:
'''FILTERING'''
# Hair removal
def remove_hair(img):
    """Mask thin dark structures in ``img`` and inpaint over them.

    Steps: black-hat morphology with a 7x17 rectangular kernel, a 3x3 Gaussian
    blur, a binary threshold at 10 to obtain the mask, then Telea inpainting
    (radius 7) of the masked pixels.

    Args:
        img: Input image as accepted by the OpenCV calls below.

    Returns:
        The inpainted image.
    """
    structuring_element = cv2.getStructuringElement(cv2.MORPH_RECT, (7, 17))
    dark_details = cv2.morphologyEx(img, cv2.MORPH_BLACKHAT, structuring_element)
    # NOTE(review): the third positional argument of GaussianBlur is sigmaX, so
    # cv2.BORDER_DEFAULT is passed as the sigma here, not as a border mode.
    # Left unchanged to preserve results; confirm the intent.
    smoothed = cv2.GaussianBlur(dark_details, (3, 3), cv2.BORDER_DEFAULT)
    _, hair_mask = cv2.threshold(smoothed, 10, 255, cv2.THRESH_BINARY)
    return cv2.inpaint(img, hair_mask, 7, cv2.INPAINT_TELEA)
    
# Gray scale
def convert_to_grayscale(image):
    """Convert an RGB image to a single-channel grayscale image."""
    grayscale = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    return grayscale
    
# Noise reduction
def reduce_noise(image):
    """Denoise ``image`` with a bilateral filter followed by a median blur.

    The bilateral filter uses a neighbourhood diameter of 9 with both sigma
    values set to 75; the median blur uses an aperture of 5.
    """
    return cv2.medianBlur(cv2.bilateralFilter(image, 9, 75, 75), 5)
    
# Contrast enhancement
def enhance_contrast(image):
    """Apply CLAHE (clip limit 2, 8x8 tile grid) to ``image`` and return the result."""
    clahe = cv2.createCLAHE(clipLimit=2, tileGridSize=(8,8))
    return clahe.apply(image)

# Resizing
def resize_img(img, size=(224, 224)):
    """Resize ``img`` to ``size`` (passed to ``cv2.resize`` as its target size)."""
    return cv2.resize(img, size)

# Other filters (need to debug)

# Edge detection
def segment_lesion(image):
    """Build a binary mask from ``image`` via inverted Otsu thresholding.

    The raw threshold result is cleaned with a morphological closing followed
    by an opening, both using a 5x5 all-ones kernel.

    Reference: https://docs.opencv.org/4.x/d7/d4d/tutorial_py_thresholding.html
    (section on Otsu's binarization).

    Returns:
        The cleaned mask.
    """
    otsu_inverted = cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    _, binary = cv2.threshold(image, 0, 255, otsu_inverted)

    structuring_element = np.ones((5,5), np.uint8)
    closed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, structuring_element)
    return cv2.morphologyEx(closed, cv2.MORPH_OPEN, structuring_element)

def enhance_borders(image):
    """Return the Sobel gradient magnitude of ``image`` scaled to 0-255.

    Reference: https://docs.opencv.org/4.x/d2/d2c/tutorial_sobel_derivatives.html

    Args:
        image: Input image as accepted by ``cv2.Sobel``.

    Returns:
        numpy.ndarray: ``uint8`` array with the same shape as the gradient.
        The strongest edge maps to 255. A completely flat image (no
        gradient anywhere) yields an all-zero array.
    """
    sobelx = cv2.Sobel(image, cv2.CV_64F, 1, 0, ksize=3)
    sobely = cv2.Sobel(image, cv2.CV_64F, 0, 1, ksize=3)
    gradient = np.sqrt(sobelx**2 + sobely**2)

    peak = gradient.max()
    if peak == 0:
        # A flat image has no edges; dividing by the zero maximum would
        # produce NaNs that cannot be cast to uint8 meaningfully.
        return np.zeros(gradient.shape, dtype=np.uint8)

    return np.uint8(gradient * 255 / peak)


In [11]:
'''PREPROCESSING'''
import shutil
def preprocessing(image):
    """Load one image from disk and run the full filtering pipeline on it.

    Pipeline: read -> BGR to RGB -> grayscale -> hair removal -> noise
    reduction -> contrast enhancement -> resize to 224x224.

    Args:
        image: Path of the image file to read.

    Returns:
        The preprocessed grayscale image, resized to 224x224.

    Raises:
        OSError: If OpenCV cannot read the file (missing or undecodable).
    """
    image_path = image
    loaded = cv2.imread(image_path)
    if loaded is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with the offending path rather than later inside cvtColor.
        raise OSError(f"Could not read image: {image_path}")

    rgb = cv2.cvtColor(loaded, cv2.COLOR_BGR2RGB)
    gray = convert_to_grayscale(rgb)
    hair_remove = remove_hair(gray)
    noise_reduced = reduce_noise(hair_remove)

    contrast = enhance_contrast(noise_reduced)

    return resize_img(contrast, (224, 224))

def preprocess_all_images(image_list, output_dir):
    """Preprocess every listed training image and save it under ``output_dir``.

    WARNING: an existing ``output_dir`` is deleted and recreated first, so the
    directory only ever contains the results of the current run.

    Args:
        image_list: File names (with extension) of images located in the
            module-level ``training_path`` directory.
        output_dir: Directory that receives the preprocessed images; each
            output keeps the file name of its input.
    """
    # Start from an empty directory so stale files from earlier runs never mix in.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    processed = 0
    for image_name in image_list:
        image_path = os.path.join(training_path, image_name)
        preprocessed_image = preprocessing(image_path)
        output_path = os.path.join(output_dir, image_name)
        # cv2.imwrite returns False instead of raising when the write fails.
        if cv2.imwrite(output_path, preprocessed_image):
            processed += 1
        else:
            print("Failed to write: ", output_path)

    print("Total Preprocessed: ", processed)

# Preprocess only the images that survived the undersampling step.
preprocess_all_images(
    image_list=undersampled_filenames,
    output_dir="../preprocessed_images",
)

KeyboardInterrupt: 

In [None]:
'''UPDATING LABELS'''

def update_labels(labels_path):
    """Reduce the label CSV at ``labels_path`` to the ``image`` and ``MEL`` columns.

    The file is overwritten in place with the reduced table.

    Args:
        labels_path: Path of the label CSV to rewrite.

    Returns:
        pandas.DataFrame: The two-column table that was written.
    """
    binary_labels = pd.read_csv(labels_path).loc[:, ['image', 'MEL']]
    binary_labels.to_csv(labels_path, index=False)
    return binary_labels


# Collapse the undersampled label file to a binary melanoma label, then show
# a preview and the resulting shape.
df_train_labels_updated = update_labels(undersampled_training_labels)
print(df_train_labels_updated.head(), df_train_labels_updated.shape, sep="\n")

          image  MEL
0  ISIC_0000000  0.0
1  ISIC_0000002  1.0
2  ISIC_0000004  1.0
3  ISIC_0000006  0.0
4  ISIC_0000011  0.0
(14517, 2)


In [None]:
'''DATA AUGMENTATION and OVER-SAMPLING'''
# Augmentation pipeline
def data_augmentation():
    """Build the albumentations pipeline used to oversample melanoma images.

    Returns:
        An ``A.Compose`` of, in order: random 90-degree rotation, horizontal
        flip, vertical flip, random brightness/contrast, random gamma and
        Gaussian blur, each applied with the probability given below.
    """
    geometric_transforms = [
        A.RandomRotate90(p=0.5),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
    ]
    intensity_transforms = [
        A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
        A.RandomGamma(gamma_limit=(80, 120), p=0.5),
        A.GaussianBlur(blur_limit=(3, 5), p=0.3),
    ]
    return A.Compose(geometric_transforms + intensity_transforms)

def augment_image(image, transformation, num_augmentations):
    """Apply ``transformation`` to ``image`` ``num_augmentations`` times.

    Args:
        image: Image passed to the transformation as the ``image`` keyword.
        transformation: Callable returning a mapping with an ``"image"`` entry.
        num_augmentations: Number of augmented variants to produce.

    Returns:
        list: The ``"image"`` entry of each call, in call order.
    """
    return [
        transformation(image=image)["image"]
        for _ in range(num_augmentations)
    ]

def augment_data(input_labels, input_dir, output_dir, new_file_num, num_augmentations):
    """Oversample melanoma images by writing augmented copies and new label rows.

    For every melanoma row (``MEL == 1``) in the label CSV, the matching
    ``<image>.jpg`` is read from ``input_dir``, augmented
    ``num_augmentations`` times and written to ``output_dir`` under a fresh
    ``ISIC_<number>`` id. A ``MEL == 1.0`` row is appended to the label CSV
    for each file that was actually written.

    Args:
        input_labels: Path of the label CSV; it is updated in place.
        input_dir: Directory holding the source images.
        output_dir: Directory receiving the augmented images (created if
            missing).
        new_file_num: First number to use for the new image ids.
        num_augmentations: Augmented copies to create per melanoma image.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Read labels
    df = pd.read_csv(input_labels)

    # Find melanoma images in the training set
    all_melanoma = df[df["MEL"] == 1]["image"].tolist()
    print("Total mel:", len(all_melanoma))

    # Create transformation pipeline
    transformation = data_augmentation()

    new_label_rows = []
    unreadable_paths = []

    for image_id in all_melanoma:
        image_path = os.path.join(input_dir, image_id + ".jpg").replace("\\", "/")

        # cv2.imread signals failure by returning None rather than raising.
        # Record the failure instead of silently skipping it, otherwise a
        # missing input directory looks like a successful run with no output.
        image = cv2.imread(image_path)
        if image is None:
            unreadable_paths.append(image_path)
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        for aug_img in augment_image(image, transformation, num_augmentations):
            # Zero-pad to the 7-digit width of the original ISIC ids.
            new_image_name = f"ISIC_{new_file_num:07d}"
            new_file_num += 1

            new_filepath = os.path.join(output_dir, new_image_name + ".jpg").replace("\\", "/")

            # cv2.imwrite expects BGR channel order, so undo the RGB conversion.
            written = cv2.imwrite(new_filepath, cv2.cvtColor(aug_img, cv2.COLOR_RGB2BGR))
            if not written:
                print("Failed to write: ", new_filepath)
                continue

            # Only label files that really exist on disk.
            new_label_rows.append({"image": new_image_name, "MEL": 1.0})

    if unreadable_paths:
        print("Skipped unreadable images: ", len(unreadable_paths))

    print("New labels: ", len(new_label_rows))
    if new_label_rows:
        df = pd.concat([df, pd.DataFrame(new_label_rows)], ignore_index=True)
        df.to_csv(input_labels, index=False)
        
        

# Continue the image numbering right after the last id listed in the original
# training labels file (ids look like "ISIC_" followed by the number).
df = pd.read_csv(training_labels_path)
last_image_id = df["image"].iloc[-1]
new_file_num = int(last_image_id[5:]) + 1

preprocessed_directory = "../preprocessed_images"
augmented_directory = "../augmented_images"

df = pd.read_csv(undersampled_training_labels)
print("Size before :", df.shape)

# Augmented images are written next to the preprocessed ones (same directory).
augment_data(
    input_labels=undersampled_training_labels,
    input_dir=preprocessed_directory,
    output_dir=preprocessed_directory,
    new_file_num=new_file_num,
    num_augmentations=1,
)

df = pd.read_csv(undersampled_training_labels)
print("Size after :", df.shape)




Size before : (14517, 2)
Size after : (14517, 2)
