In [7]:
import os
import pandas as pd
import torch
from torchvision import transforms
from PIL import Image
from tqdm import tqdm

# Paths
metadata_path = "ham_metadata.csv"
images_path = "HAMImages/"
output_file = "HAM10000Skin_Cancer.pt"

# Load the metadata
metadata = pd.read_csv(metadata_path)# Dictionary for mapping target labels
dx_mapping = {
    "bkl": 0,
    "bcc": 1,
    "df": 2,
    "mel": 3,
    "nv": 4,
    "vasc": 5,
    "akiec": 6,
}

# Map 'dx' column to numerical labels
metadata["dx"] = metadata["dx"].map(dx_mapping)

# One-hot encode dx_type, sex, and localization columns
metadata = pd.get_dummies(metadata, columns=["dx_type", "sex", "localization"], prefix=["dx_type", "sex", "local"])
metadata = metadata.replace({True: 1, False: 0})



from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression 
imputer = IterativeImputer(estimator=LinearRegression(), 
                          max_iter=10, 
                          random_state=0) 


metadata['age'] = imputer.fit_transform(metadata[['age']])

In [9]:
metadata = pd.read_csv(metadata_path)# Dictionary for mapping target labels
metadata.columns 

Index(['lesion_id', 'image_id', 'dx', 'dx_type', 'age', 'sex', 'localization'], dtype='object')

In [3]:
transform = transforms.Compose([
    transforms.Resize((256, 256)),  # Resize images to a fixed size
    transforms.ToTensor(),         # Convert to tensor
])

# Initialize lists for storing processed data
images = []
metadata_list = []
nfc = 0
fc = 0
# Iterate through the metadata
for _, row in tqdm(metadata.iterrows(), total=len(metadata)):
    image_path = os.path.join(images_path, row["image_id"] + ".jpg")
    
    # Check if the image file exists
    if os.path.exists(image_path):
        # Load and preprocess the image
        try:
            image = Image.open(image_path).convert("RGB")
            image = transform(image)
            images.append(image)

            # Append metadata (excluding 'image_id' and 'lesion_id')
            metadata_list.append(row.drop(["image_id", "lesion_id"]).values)
            fc += 1
        except Exception as e:
            print(f"Error processing {image_path}: {e}")
    else:
        nfc += 1
        

# Convert metadata list to tensor
metadata_tensor = torch.tensor(metadata_list, dtype=torch.float32)

# Stack images into a single tensor
images_tensor = torch.stack(images)

# Save the dataset
dataset = {
    "images": images_tensor,
    "metadata": metadata_tensor,
}
torch.save(dataset, output_file)

print(f"Dataset saved to {output_file}")

100%|██████████| 10015/10015 [07:36<00:00, 21.95it/s]
  metadata_tensor = torch.tensor(metadata_list, dtype=torch.float32)


Dataset saved to HAM10000Skin_Cancer.pt


In [3]:
import os
import pandas as pd
import torch
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
import random

# Paths
metadata_path = "ham_metadata.csv"
images_path = "HAMImages/"
output_file = "HAM10000Skin_Cancer_Balanced.pt"

# Load the metadata
metadata = pd.read_csv(metadata_path)

# Dictionary for mapping target labels
dx_mapping = {
    "bkl": 0,
    "bcc": 1,
    "df": 2,
    "mel": 3,
    "nv": 4,
    "vasc": 5,
    "akiec": 6,
}

# Map 'dx' column to numerical labels
metadata["dx"] = metadata["dx"].map(dx_mapping)

# One-hot encode dx_type, sex, and localization columns
metadata = pd.get_dummies(metadata, columns=["dx_type", "sex", "localization"], prefix=["dx_type", "sex", "local"])
metadata = metadata.replace({True: 1, False: 0})

# Handle missing age values using Iterative Imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
imputer = IterativeImputer(estimator=LinearRegression(), max_iter=10, random_state=0)
metadata['age'] = imputer.fit_transform(metadata[['age']])

# Define image transformations
transform = transforms.Compose([
    transforms.Resize((256, 256)),  # Resize images to a fixed size
    transforms.ToTensor(),         # Convert to tensor
])

# Initialize lists for storing processed data
images = []
metadata_list = []
class_counts = {k: 0 for k in dx_mapping.values()}  # Track class counts
nfc = 0
fc = 0

# Iterate through the metadata
for _, row in tqdm(metadata.iterrows(), total=len(metadata)):
    image_path = os.path.join(images_path, row["image_id"] + ".jpg")
    if os.path.exists(image_path):
        try:
            image = Image.open(image_path).convert("RGB")
            image = transform(image)
            images.append(image)
            metadata_list.append(row.drop(["image_id", "lesion_id"]).values)
            class_counts[row["dx"]] += 1
            fc += 1
        except Exception as e:
            print(f"Error processing {image_path}: {e}")
    else:
        nfc += 1

# Convert to tensors
metadata_tensor = torch.tensor(metadata_list, dtype=torch.float32)
images_tensor = torch.stack(images)

print(f"Original class distribution: {class_counts}")

# Balance classes
balanced_images = []
balanced_metadata = []

for cls, count in class_counts.items():
    # Get indices of the class
    cls_indices = [i for i, row in enumerate(metadata_tensor) if int(row[0]) == cls]
    if count > 3000:
        # Randomly select 1000 samples if the class has more
        selected_indices = random.sample(cls_indices, 3000)
    else:
        # Oversample to 1000 if the class has fewer samples
        selected_indices = resample(cls_indices, replace=True, n_samples=3000, random_state=0)
    balanced_images.extend(images_tensor[selected_indices])
    balanced_metadata.extend(metadata_tensor[selected_indices])

# Convert balanced data back to tensors
balanced_images_tensor = torch.stack(balanced_images)
balanced_metadata_tensor = torch.stack(balanced_metadata)

# Save the dataset
dataset = {
    "images": balanced_images_tensor,
    "metadata": balanced_metadata_tensor,
}
torch.save(dataset, output_file)

print(f"Balanced dataset saved to {output_file}")
print(f"Final dataset size: {len(balanced_images_tensor)} images and {len(balanced_metadata_tensor)} metadata entries")


100%|██████████| 10015/10015 [02:01<00:00, 82.25it/s]


Original class distribution: {0: 1024, 1: 484, 2: 109, 3: 1074, 4: 5954, 5: 131, 6: 301}
Balanced dataset saved to HAM10000Skin_Cancer_Balanced.pt
Final dataset size: 21000 images and 21000 metadata entries


In [1]:
import torch

# Load the dataset
dataset = torch.load("HAM10000Skin_Cancer_Balanced.pt")

# Extract images and metadata
images = dataset["images"]
metadata = dataset["metadata"]

# The target labels are the first column of metadata (assuming dx was mapped first)
labels = metadata[:, 0].long()

# Get the total number of samples
total_samples = len(labels)

# Count the number of samples in each class
class_counts = torch.bincount(labels)

# Get the size of the image tensor
image_size = images.size()

# Display results
print(f"Total samples: {total_samples}")
print(f"Image size (C x H x W): {image_size[1:]}")
print("Samples per class:")
for class_id, count in enumerate(class_counts):
    print(f"  Class {class_id}: {count.item()} samples")


  dataset = torch.load("HAM10000Skin_Cancer_Balanced.pt")


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 16515072000 bytes.

In [None]:
import os
import pandas as pd
import torch
from torchvision import transforms, utils
from PIL import Image
from tqdm import tqdm
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
import random
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
import numpy as np

# Paths
metadata_path = "ham_metadata.csv"
images_path = "HAMImages/"
output_file = "HAM10000Skin_Cancer_Balanced_with_SMOTE.pt"

# Load the metadata
metadata = pd.read_csv(metadata_path)

# Dictionary for mapping target labels
dx_mapping = {
    "bkl": 0,
    "bcc": 1,
    "df": 2,
    "mel": 3,
    "nv": 4,
    "vasc": 5,
    "akiec": 6,
}

# Map 'dx' column to numerical labels
metadata["dx"] = metadata["dx"].map(dx_mapping)

# One-hot encode dx_type, sex, and localization columns
metadata = pd.get_dummies(metadata, columns=["dx_type", "sex", "localization"], prefix=["dx_type", "sex", "local"])
metadata = metadata.replace({True: 1, False: 0})

# Handle missing age values using Iterative Imputer
imputer = IterativeImputer(estimator=LinearRegression(), max_iter=10, random_state=0)
metadata['age'] = imputer.fit_transform(metadata[['age']])

# Define image transformations
transform = transforms.Compose([
    transforms.Resize((256, 256)),  # Resize images to a fixed size
    transforms.ToTensor(),         # Convert to tensor
])

# Separate out the image_id for later handling
# Separate out the image_id for later handling
image_ids = metadata['image_id']

# Drop non-numeric columns (e.g., image_id, and any categorical columns)
# Keep only numeric columns for SMOTE
metadata_no_id = metadata.drop(columns=['image_id'])

# Ensure only numeric columns are used in SMOTE (exclude non-numeric columns like 'dx', 'image_id', etc.)
numeric_columns = metadata_no_id.select_dtypes(include=[np.number]).columns
metadata_numeric = metadata_no_id[numeric_columns]

# Apply SMOTE to clinical data (excluding 'image_id')
smote = SMOTE(random_state=0)
synthetic_data, _ = smote.fit_resample(metadata_numeric, metadata['dx'])

# Add 'image_id' back to the synthetic data, keeping the original data intact
synthetic_data = pd.DataFrame(synthetic_data, columns=numeric_columns)
synthetic_data['image_id'] = np.nan  # Mark synthetic samples with NaN

# Append the synthetic data at the end of the original data
balanced_metadata = pd.concat([metadata, synthetic_data], ignore_index=True)


# Track the class distribution
class_counts = balanced_metadata['dx'].value_counts()

# Initialize lists for processed images and their metadata
images = []
metadata_list = []

# Process original (real) images
for _, row in tqdm(metadata.iterrows(), total=len(metadata)):
    image_path = os.path.join(images_path, row["image_id"] + ".jpg")
    if os.path.exists(image_path):
        try:
            image = Image.open(image_path).convert("RGB")
            image = transform(image)
            images.append(image)
            metadata_list.append(row.drop(["image_id"]).values)
        except Exception as e:
            print(f"Error processing {image_path}: {e}")

# Process synthetic images by augmenting real images
for _, row in tqdm(synthetic_data.iterrows(), total=len(synthetic_data)):
    if pd.isna(row['image_id']):  # Check if it's a synthetic row
        # Find matching class in the original metadata
        matching_rows = metadata[metadata['dx'] == row['dx']]
        real_row = matching_rows.sample(1).iloc[0]  # Randomly select a real sample from the matching class
        
        # Load the corresponding image and apply random augmentation
        real_image_path = os.path.join(images_path, real_row["image_id"] + ".jpg")
        if os.path.exists(real_image_path):
            try:
                real_image = Image.open(real_image_path).convert("RGB")
                augmented_image = transform(real_image)
                
                # Apply random augmentation (random horizontal flip and color jitter for diversity)
                if random.random() > 0.5:
                    augmented_image = transforms.RandomHorizontalFlip()(augmented_image)
                augmented_image = transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2)(augmented_image)
                
                # Append the augmented image and the synthetic metadata
                images.append(augmented_image)
                metadata_list.append(row.drop(["image_id"]).values)

            except Exception as e:
                print(f"Error processing augmented image: {e}")

# Convert to tensors
metadata_tensor = torch.tensor(metadata_list, dtype=torch.float32)
images_tensor = torch.stack(images)

# Balance the dataset to 3000 samples per class
final_images = []
final_metadata = []

# Ensuring every class has exactly 3000 samples
for cls in class_counts.index:
    cls_indices = [i for i, row in enumerate(metadata_tensor) if int(row[0]) == cls]
    
    # Oversample and undersample to make sure each class has 3000 samples
    if len(cls_indices) > 3000:
        selected_indices = random.sample(cls_indices, 3000)
    else:
        selected_indices = resample(cls_indices, replace=True, n_samples=3000, random_state=0)
    
    final_images.extend(images_tensor[selected_indices])
    final_metadata.extend(metadata_tensor[selected_indices])

# Convert final balanced dataset to tensors
final_images_tensor = torch.stack(final_images)
final_metadata_tensor = torch.stack(final_metadata)

# Save the dataset
dataset = {
    "images": final_images_tensor,
    "metadata": final_metadata_tensor,
}
torch.save(dataset, output_file)

print(f"Balanced dataset saved to {output_file}")
print(f"Final dataset size: {len(final_images_tensor)} images and {len(final_metadata_tensor)} metadata entries")


100%|██████████| 10015/10015 [02:43<00:00, 61.09it/s]
 93%|█████████▎| 43650/46935 [11:05<27:43,  1.98it/s]