## Decisions
- There are 254 images with missing metadata. These will be kept, as the metadata is not necessary for my case of predicting the diagnosis from the image alone. For further analysis it could be interesting to use the metadata (see documentation for discussion).

**Preprocessing for ResNet-50**<br>

- Using PIL instead of Tensorflow for resizing
- Stratified splitting into train/validation/test datasets (60/20/20)
- Organizing images for training, validation, and testing into per-diagnosis subfolders
- no one-hot-encoding, will use keras "datagen.flow_from_dataframe()"
- no resampling (using class weights)

In [1]:
import os
from PIL import Image, ImageOps
import pandas as pd
import os
import shutil
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Directory Paths
# image_dir:       original images, read by process_images()
# resized_dir:     resized images written by process_images(); organize_images() moves files out of it
# destination_dir: root under which organize_images() creates <subset>/<diagnosis>/ folders
image_dir = r"D:\cancer\images"
resized_dir = r'D:\cancer\resized_images'
destination_dir = r'D:\cancer\organized'

In [3]:
# Load the metadata table; its 'image_id' and 'dx' columns are used below to locate and label each image
df = pd.read_csv(r"D:\cancer\HAM10000_metadata.csv")

In [4]:
# Preview the first rows to check which columns were loaded
df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


## Resizing images

In [5]:
def resize_and_pad_image(img, desired_size=224):
    """
    Resize an image so it fits inside a desired_size x desired_size square
    while maintaining its original aspect ratio, then pad the shorter side
    with black pixels so the result is exactly square.

    The longer side is scaled to `desired_size`; the padding is split evenly
    between both sides of the shorter dimension so the content stays centered.

    Parameters:
        img: PIL image to resize (not modified).
        desired_size: edge length in pixels of the square output.

    Returns:
        A new PIL image of size (desired_size, desired_size).
    """
    # Scale by the longer side so neither dimension can exceed desired_size.
    # (Scaling by height alone makes landscape images wider than the target,
    # which turns the padding negative and cuts off the right-hand side.)
    scale = desired_size / max(img.width, img.height)
    new_width = max(1, min(desired_size, round(img.width * scale)))
    new_height = max(1, min(desired_size, round(img.height * scale)))

    # Resize the image (maintaining the aspect ratio)
    img_resized = img.resize((new_width, new_height), Image.BICUBIC)

    # Pad the image to make it square, splitting the padding evenly
    pad_width = desired_size - new_width
    pad_height = desired_size - new_height
    left = pad_width // 2
    top = pad_height // 2
    border = (left, top, pad_width - left, pad_height - top)
    img_padded = ImageOps.expand(img_resized, border, fill='black')

    return img_padded

def process_images(input_dir, output_dir, desired_size=224):
    """
    Resize and pad every '.jpg' file found directly in `input_dir`, saving
    each result under the same file name in `output_dir` (created if missing).

    A file that fails to open, resize, or save is reported with a printed
    message and skipped; the remaining files are still processed.
    """
    os.makedirs(output_dir, exist_ok=True)

    jpg_names = [name for name in os.listdir(input_dir) if name.endswith('.jpg')]

    for name in jpg_names:
        src = os.path.join(input_dir, name)
        dst = os.path.join(output_dir, name)

        try:
            # The context manager closes the source file once the copy is saved
            with Image.open(src) as picture:
                resize_and_pad_image(picture, desired_size).save(dst)
        except Exception as e:
            # Best effort: report the failure and carry on with the next file
            print(f"Error processing {name}: {e}")


In [6]:
# Resize every .jpg in image_dir (default desired_size of 224) and save the results to resized_dir
process_images(image_dir, resized_dir)

## Reorganizing the images into separate Diagnosis directories

In [7]:
# Split data into training, validation, and testing sets (60% train, 20% validation, 20% test)
# First hold out 40% of the rows, then halve that hold-out into validation and test.
# Both splits are stratified on 'dx' and seeded (random_state=42) so they are reproducible.
# NOTE(review): rows are split per image, but the metadata preview shows a single lesion_id
# with several image_ids, so images of the same lesion may land in different subsets --
# confirm whether the split should be grouped by lesion_id instead.
train_df, temp_df = train_test_split(df, test_size=0.4, stratify=df['dx'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['dx'], random_state=42)

In [8]:
# Function to organize images
def organize_images(dataframe, subset_name):
    """
    Move the resized images listed in `dataframe` into
    destination_dir/<subset_name>/<dx>/.

    Reads the 'dx' and 'image_id' columns of `dataframe`; each file is looked
    up in the module-level `resized_dir` as image_id + '.jpg'.

    A missing source file no longer aborts the run half-way: if the image is
    already at its destination (for example because this was run before) it is
    left alone, otherwise a message is printed and the image is skipped.
    """
    subset_dir = os.path.join(destination_dir, subset_name)
    os.makedirs(subset_dir, exist_ok=True)

    # Create each diagnosis directory once instead of once per row
    for diagnosis in dataframe['dx'].unique():
        os.makedirs(os.path.join(subset_dir, diagnosis), exist_ok=True)

    for diagnosis, image_id in zip(dataframe['dx'], dataframe['image_id']):
        image_file = image_id + '.jpg'

        # Move the image to the corresponding directory
        source_path = os.path.join(resized_dir, image_file)
        destination_path = os.path.join(subset_dir, diagnosis, image_file)
        try:
            shutil.move(source_path, destination_path)
        except FileNotFoundError:
            # Already moved on an earlier run -> nothing to do; otherwise report it
            if not os.path.exists(destination_path):
                print(f"Missing image, skipped: {image_file}")

In [9]:
# NOTE: organize_images() moves (does not copy) each file, so the images
# no longer exist in resized_dir after these calls.

# Organize training images
organize_images(train_df, 'train')

# Organize validation images
organize_images(val_df, 'validation')

# Organize testing images
organize_images(test_df, 'test')

In [10]:
##End##