In [23]:
import os
import kagglehub
import pandas as pd
import tensorflow as tf

# Fetch the dataset through kagglehub. The returned location is kept in `path`
# because the cells below build their folder paths from it.
print("Downloading dataset from Kaggle...")
path = kagglehub.dataset_download(
    "joydippaul/mpox-skin-lesion-dataset-version-20-msld-v20"
)
print("Path to dataset files:", path)


Downloading dataset from Kaggle...
Path to dataset files: /Users/vasu7400/.cache/kagglehub/datasets/joydippaul/mpox-skin-lesion-dataset-version-20-msld-v20/versions/4


In [24]:
# Root folder of the downloaded dataset.
data_path = path

# Per-fold image folders. Note the repeated directory component in each path.
original_images_path = os.path.join(
    data_path, "Original Images", "Original Images", "FOLDS"
)
augmented_images_path = os.path.join(
    data_path, "Augmented Images", "Augmented Images", "FOLDS_AUG"
)

# Target image size (pixels) and batch size used when building tf.data pipelines.
IMG_HEIGHT = IMG_WIDTH = 224
BATCH_SIZE = 32

# Class folder names; a class's position in this list is its integer label.
CLASS_NAMES = [
    "Chickenpox",
    "Cowpox",
    "Healthy",
    "HFMD",
    "Measles",
    "Monkeypox",
]

# Helper function to load image paths and labels
def load_images_from_directory(base_path, fold, dataset_type, augmented=False, class_names=None):
    """Collect ``(image_path, class_index)`` pairs for one split of one fold.

    Args:
        base_path: Directory that contains the per-fold folders.
        fold: Fold number, interpolated into the folder name
            (``fold{fold}`` or ``fold{fold}_AUG``).
        dataset_type: Split sub-folder to read in the non-augmented layout
            (the notebook passes "train", "valid" and "test"). Ignored when
            ``augmented`` is true, because that layout always reads ``train``.
        augmented: When true, read from ``fold{fold}_AUG/train/<class>``
            instead of ``fold{fold}/<dataset_type>/<class>``.
        class_names: Optional sequence of class folder names; the position of
            a name is the label assigned to its images. Defaults to the
            module-level ``CLASS_NAMES``.

    Returns:
        A list of ``(path, class_index)`` tuples, grouped by class in
        ``class_names`` order and sorted by file name within each class.
        Class folders that are missing are reported with a printed warning
        and skipped.
    """
    if class_names is None:
        class_names = CLASS_NAMES

    # The augmented layout has its own folder name and only a "train" split.
    if augmented:
        fold_parts = (f"fold{fold}_AUG", "train")
    else:
        fold_parts = (f"fold{fold}", dataset_type)

    images = []
    for class_index, class_name in enumerate(class_names):
        folder_path = os.path.join(base_path, *fold_parts, class_name)
        # isdir (rather than exists) also rejects a stray regular file, which
        # would otherwise make os.listdir raise.
        if not os.path.isdir(folder_path):
            print(f"Warning: Path does not exist: {folder_path}")
            continue

        # os.listdir returns entries in arbitrary order; sort so the sample
        # list is identical from run to run.
        for img_name in sorted(os.listdir(folder_path)):
            # Skip hidden files (e.g. macOS ".DS_Store"); they are not images
            # and would fail later when decoded.
            if img_name.startswith("."):
                continue
            img_path = os.path.join(folder_path, img_name)
            if os.path.isfile(img_path):
                images.append((img_path, class_index))
    return images

# Load original and augmented data
def load_data():
    """Gather the image lists for folds 1 through 5.

    For every fold this reads the "train", "valid" and "test" splits under
    ``original_images_path`` and the augmented training split under
    ``augmented_images_path``.

    Returns:
        A dict keyed ``fold{n}_train``, ``fold{n}_val``, ``fold{n}_test`` and
        ``fold{n}_train_aug``; each value is whatever
        ``load_images_from_directory`` returns for that split.
    """
    per_fold = {}
    for fold in range(1, 6):
        print(f"Loading data for fold {fold}...")
        per_fold.update({
            # Original (non-augmented) splits
            f"fold{fold}_train": load_images_from_directory(original_images_path, fold, "train"),
            f"fold{fold}_val": load_images_from_directory(original_images_path, fold, "valid"),
            f"fold{fold}_test": load_images_from_directory(original_images_path, fold, "test"),
            # Augmented training images
            f"fold{fold}_train_aug": load_images_from_directory(
                augmented_images_path, fold, "train", augmented=True
            ),
        })
    return per_fold

# Define function to preprocess and create TensorFlow datasets
def preprocess_data(data, batch_size=BATCH_SIZE, shuffle=True):
    """Build a batched ``tf.data.Dataset`` of normalized images.

    Args:
        data: Non-empty sequence of ``(file_path, label)`` pairs.
        batch_size: Number of examples per batch.
        shuffle: Whether to shuffle the examples. Defaults to True, which
            matches the previous behavior; pass False for validation/test
            splits where a stable order is preferable.

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, labels)`` batches. Each
        image is decoded with 3 channels, resized to
        ``IMG_HEIGHT`` x ``IMG_WIDTH`` and divided by 255.

    Raises:
        ValueError: If ``data`` is empty.
    """
    if not data:
        raise ValueError("Dataset is empty. Ensure the data paths and structure are correct.")

    file_paths, labels = zip(*data)
    dataset = tf.data.Dataset.from_tensor_slices((list(file_paths), list(labels)))

    if shuffle:
        # Shuffle the lightweight (path, label) pairs *before* decoding. A
        # full-size shuffle buffer placed after `map` would hold every decoded
        # float32 image in memory at once.
        dataset = dataset.shuffle(buffer_size=len(data))

    def parse_image(file_path, label):
        """Read one file and return it as a resized, scaled image with its label."""
        image = tf.io.read_file(file_path)
        image = tf.image.decode_jpeg(image, channels=3)
        image = tf.image.resize(image, [IMG_HEIGHT, IMG_WIDTH])
        image = image / 255.0  # Normalize to [0,1]
        return image, label

    dataset = dataset.map(parse_image, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)

# Build the per-fold image lists.
datasets = load_data()

# Example: turn fold 1's train and validation lists into tf.data pipelines.
# preprocess_data raises ValueError for an empty list; the message is printed
# instead of propagating.
try:
    fold1_train = preprocess_data(datasets["fold1_train"])
    fold1_val = preprocess_data(datasets["fold1_val"])
except ValueError as err:
    print(err)
else:
    print("Data loading and preprocessing complete.")


Loading data for fold 1...
Loading data for fold 2...
Loading data for fold 3...
Loading data for fold 4...
Loading data for fold 5...
Data loading and preprocessing complete.
