In [6]:
import os
import cv2
import numpy as np
import torch
import torchvision
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path

# Constants
# Image size in pixels as a 2-tuple.
# NOTE(review): not referenced by any cell visible here — preprocess_image
# carries its own (224, 224) default; confirm whether this is meant to feed it.
IMAGE_SIZE = (224, 224)
# Lowercase, dot-prefixed file extensions that process_and_organize_dataset
# uses to pick input images out of each class folder.
SUPPORTED_FORMATS = ['.jpg', '.jpeg', '.png']

In [2]:
class ImagePreprocessor:
    """Skin Cancer Image preprocessing pipeline.

    A stateless namespace of OpenCV helpers. Every method is a
    ``staticmethod``, so it can be called on the class itself or on an
    instance. All tuning values are keyword parameters whose defaults
    reproduce the previously hard-coded behaviour.
    """

    @staticmethod
    def hair_remove(image, kernel_size=17, threshold=10, inpaint_radius=1):
        """Remove hair from skin images.

        Builds a mask from a black-hat transform of the grayscale image
        (black-hat = morphological closing minus the image, so it responds
        to dark details narrower than the kernel) and inpaints the masked
        pixels from their surroundings.

        Args:
            image: 3-channel image array. It is converted with
                ``cv2.COLOR_RGB2GRAY``, so RGB channel order is expected.
            kernel_size: Side length, in pixels, of the square structuring
                element used for the black-hat transform.
            threshold: Black-hat response above which a pixel is put into
                the inpainting mask.
            inpaint_radius: Neighbourhood radius handed to ``cv2.inpaint``.

        Returns:
            The inpainted image. If any step raises, the error is printed
            and the input ``image`` is returned unchanged (best-effort).
        """
        try:
            gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
            kernel = cv2.getStructuringElement(
                cv2.MORPH_RECT, (kernel_size, kernel_size)
            )
            blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, kernel)
            _, hair_mask = cv2.threshold(
                blackhat, threshold, 255, cv2.THRESH_BINARY
            )
            return cv2.inpaint(
                image, hair_mask, inpaint_radius, cv2.INPAINT_TELEA
            )
        except Exception as e:
            # Deliberate best-effort: a failure here must not stop a long
            # batch run, so report it and fall back to the untouched input.
            print(f"Error in hair removal: {str(e)}")
            return image

    @staticmethod
    def sharpen_image(image, sigma=2.0, amount=0.5):
        """Sharpen image using unsharp masking.

        Computes ``(1 + amount) * image - amount * blurred``, where
        ``blurred`` is a Gaussian blur of ``image``.

        Args:
            image: Image array accepted by ``cv2.GaussianBlur``.
            sigma: Gaussian standard deviation. The kernel size is passed
                as ``(0, 0)`` so OpenCV derives it from ``sigma``.
            amount: Strength of the sharpening; ``0`` returns the image
                unchanged (up to saturation/rounding in ``addWeighted``).

        Returns:
            The sharpened image, same shape and dtype as the input.
        """
        blurred = cv2.GaussianBlur(image, (0, 0), sigma)
        return cv2.addWeighted(image, 1.0 + amount, blurred, -amount, 0)

In [4]:
def preprocess_image(image, target_size=(224, 224), interpolation=None):
    """Apply all preprocessing steps to an image.

    The steps run in this order: hair removal, unsharp-mask sharpening,
    then resizing to ``target_size``.

    Args:
        image: Image array (NumPy, as produced by ``cv2.imread``).
            ``ImagePreprocessor.hair_remove`` converts it with
            ``COLOR_RGB2GRAY``, so RGB channel order is expected.
        target_size: ``(width, height)`` passed to ``cv2.resize``.
        interpolation: OpenCV interpolation flag for the resize. When
            ``None`` (default) it is chosen from the resize direction:
            ``cv2.INTER_AREA`` if the image is not enlarged in either
            dimension, otherwise ``cv2.INTER_LINEAR``. Pass
            ``cv2.INTER_NEAREST`` explicitly to get the earlier behaviour.

    Returns:
        The preprocessed image with spatial size ``target_size``.
    """
    # The helpers are static methods, so no instance is needed.
    image = ImagePreprocessor.hair_remove(image)
    image = ImagePreprocessor.sharpen_image(image)

    if interpolation is None:
        src_height, src_width = image.shape[:2]
        dst_width, dst_height = target_size
        is_shrinking = dst_width <= src_width and dst_height <= src_height
        # Nearest-neighbour just drops pixels when shrinking, which aliases
        # badly (more so right after sharpening); area averaging does not.
        interpolation = cv2.INTER_AREA if is_shrinking else cv2.INTER_LINEAR

    return cv2.resize(image, target_size, interpolation=interpolation)

In [7]:
# Create the output tree  original_images/<split>/<class>/  relative to the
# current working directory.
# os.makedirs(..., exist_ok=True) is used instead of os.mkdir so this cell can
# be re-run (e.g. after Restart & Run All) without raising FileExistsError.
base_directory = 'original_images'
os.makedirs(base_directory, exist_ok=True)

train_directory = os.path.join(base_directory, 'train_directory')
os.makedirs(train_directory, exist_ok=True)

validation_directory = os.path.join(base_directory, 'validation_directory')
os.makedirs(validation_directory, exist_ok=True)

test_directory = os.path.join(base_directory, 'test_directory')
os.makedirs(test_directory, exist_ok=True)

# Split folders (as paths) and the seven class sub-folders created in each.
directory = [train_directory, validation_directory, test_directory]
classes = ['nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df']

for dirc in directory:
    for cls in classes:
        path = os.path.join(dirc, cls)
        os.makedirs(path, exist_ok=True)

In [8]:
def process_and_organize_dataset(source_path, destination_path):
    """Process images and organize them into the new structure.

    For every ``<split>/<class>/`` folder found under ``source_path``, each
    image whose extension is in ``SUPPORTED_FORMATS`` (compared
    case-insensitively) is read, run through ``preprocess_image`` and saved
    as ``<destination_path>/<split>/<class>/<stem>.jpg``.

    Args:
        source_path: Root folder holding the split folders
            (``train_directory``, ``test_directory``,
            ``validation_directory``). Missing split/class folders are
            skipped silently.
        destination_path: Root folder for the processed copies. The
            ``<split>/<class>/`` folders are created on demand.

    Returns:
        None. Progress is printed per class folder; files that could not be
        read or written are listed after that folder has been processed.

    Note:
        Output names use only the file stem, so two inputs in the same
        folder that differ only by extension map to the same ``.jpg`` and
        the later one overwrites the earlier one.
    """
    source_path = Path(source_path)
    destination_path = Path(destination_path)

    # Process each split (train, test, val)
    splits = ['train_directory', 'test_directory', 'validation_directory']
    categories = ['nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df']

    for split in splits:
        for category in categories:
            category_path = source_path / split / category
            if not category_path.is_dir():
                continue

            print(f"Processing {split}/{category}...")

            # cv2.imwrite does not create folders and only signals failure
            # through its return value, so make sure the target exists.
            output_dir = destination_path / split / category
            output_dir.mkdir(parents=True, exist_ok=True)

            # Match extensions case-insensitively ('.JPG' counts too) and
            # sort so the processing order is reproducible.
            image_files = sorted(
                entry for entry in category_path.iterdir()
                if entry.is_file() and entry.suffix.lower() in SUPPORTED_FORMATS
            )

            unreadable = []
            unwritten = []

            # Process each image in the category
            for img_file in tqdm(image_files):
                img = cv2.imread(str(img_file))
                if img is None:
                    # imread returns None for corrupt/unsupported files;
                    # skip it rather than abort the whole run.
                    unreadable.append(img_file)
                    continue
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

                # Apply preprocessing
                processed_img = preprocess_image(img, target_size=(224, 224))

                # Save preprocessed image under the original stem as JPEG
                output_path = output_dir / f"{img_file.stem}.jpg"
                saved = cv2.imwrite(
                    str(output_path),
                    cv2.cvtColor(processed_img, cv2.COLOR_RGB2BGR),
                )
                if not saved:
                    unwritten.append(img_file)

            # Report after the loop so messages do not break the progress bar.
            for bad_file in unreadable:
                print(f"Skipped unreadable image: {bad_file}")
            for bad_file in unwritten:
                print(f"Failed to write processed image for: {bad_file}")

In [9]:
# Root of the input dataset; process_and_organize_dataset looks for
# <split>/<class>/ image folders directly beneath it.
source_path = '/kaggle/input/multiclassskincancer'
# Root for the processed copies (<split>/<class>/<stem>.jpg).
# NOTE(review): presumably the same tree the directory-setup cell creates as
# the relative path 'original_images' — that only holds when the working
# directory is /kaggle/working; confirm.
destination_path = '/kaggle/working/original_images'
# Long-running: in the recorded output below the larger training classes
# took between one and four minutes each.
process_and_organize_dataset(source_path, destination_path)

Processing train_directory/nv...


100%|██████████| 5115/5115 [03:48<00:00, 22.34it/s]


Processing train_directory/mel...


100%|██████████| 5950/5950 [02:08<00:00, 46.21it/s]


Processing train_directory/bkl...


100%|██████████| 5990/5990 [02:09<00:00, 46.42it/s]


Processing train_directory/bcc...


100%|██████████| 5462/5462 [01:39<00:00, 55.08it/s]


Processing train_directory/akiec...


100%|██████████| 5510/5510 [01:40<00:00, 54.77it/s]


Processing train_directory/vasc...


100%|██████████| 4810/4810 [01:01<00:00, 78.38it/s]


Processing train_directory/df...


100%|██████████| 4090/4090 [01:10<00:00, 58.35it/s]


Processing test_directory/nv...


100%|██████████| 883/883 [00:41<00:00, 21.24it/s]


Processing test_directory/mel...


100%|██████████| 46/46 [00:02<00:00, 15.64it/s]


Processing test_directory/bkl...


100%|██████████| 88/88 [00:05<00:00, 16.64it/s]


Processing test_directory/bcc...


100%|██████████| 35/35 [00:01<00:00, 21.54it/s]


Processing test_directory/akiec...


100%|██████████| 30/30 [00:01<00:00, 18.42it/s]


Processing test_directory/vasc...


100%|██████████| 13/13 [00:00<00:00, 23.23it/s]


Processing test_directory/df...


100%|██████████| 8/8 [00:00<00:00, 14.28it/s]


Processing validation_directory/nv...


100%|██████████| 707/707 [00:34<00:00, 20.66it/s]


Processing validation_directory/mel...


100%|██████████| 37/37 [00:02<00:00, 18.31it/s]


Processing validation_directory/bkl...


100%|██████████| 71/71 [00:04<00:00, 16.47it/s]


Processing validation_directory/bcc...


100%|██████████| 28/28 [00:01<00:00, 21.73it/s]


Processing validation_directory/akiec...


100%|██████████| 24/24 [00:01<00:00, 17.73it/s]


Processing validation_directory/vasc...


100%|██████████| 10/10 [00:00<00:00, 19.30it/s]


Processing validation_directory/df...


100%|██████████| 6/6 [00:00<00:00, 16.74it/s]


In [10]:
# Sanity check: print how many entries each <split>/<class>/ output folder holds.
base_dir = '/kaggle/working/original_images'
# NOTE: this rebinds `directory` and `classes` from the directory-setup cell;
# here `directory` holds bare split names that are joined onto base_dir below,
# not ready-made paths.
directory = ['train_directory', 'validation_directory', 'test_directory']
classes = ['nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df']

for dirc in directory:
    print(dirc)
    for cls in classes:
        path = os.path.join(base_dir, dirc, cls)
        # os.listdir counts every entry in the folder, not only image files,
        # and raises if the folder does not exist.
        print(f"{cls}    : ", len(os.listdir(path)))

train_directory
nv    :  5115
mel    :  5950
bkl    :  5990
bcc    :  5462
akiec    :  5510
vasc    :  4810
df    :  4090
validation_directory
nv    :  707
mel    :  37
bkl    :  71
bcc    :  28
akiec    :  24
vasc    :  10
df    :  6
test_directory
nv    :  883
mel    :  46
bkl    :  88
bcc    :  35
akiec    :  30
vasc    :  13
df    :  8
