In [9]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

In [10]:
# Set paths
data_dir = "/kaggle/input/ham1000-segmentation-and-classification"
images_dir = os.path.join(data_dir, "images")
filtered_dir = "./filtered"

In [11]:
# Make the directory
os.makedirs(filtered_dir, exist_ok=True)

In [12]:
def filter_dataset():
    # Load metadata
    metadata_path = os.path.join(data_dir, "GroundTruth.csv")
    df = pd.read_csv(metadata_path)
    
    # The groundTruth.csv is a one-hot encoding table
    # Adding MEL and BCC to a new column "dx"
    df['dx'] = df[['MEL', 'BCC']].idxmax(axis=1)
    filtered_df = df[df['dx'].isin(['MEL', 'BCC'])]
    
    # a1000 images subset,500 from each
    mel_df = filtered_df[filtered_df['dx'] == 'MEL'].sample(n=500, random_state=42, replace=False)
    bcc_df = filtered_df[filtered_df['dx'] == 'BCC'].sample(n=500, random_state=42, replace=False)
    new_df = pd.concat([mel_df, bcc_df])
    
    # # Copy images to filtered directory
    missing_counter = 0
    for _, row in new_df.iterrows():
        src = os.path.join(images_dir, row['image'] + ".jpg")
        if os.path.exists(src):
            dest_folder = os.path.join(filtered_dir, row['dx'])
            os.makedirs(dest_folder, exist_ok=True)
            dest = os.path.join(dest_folder, row['image'] + ".jpg")
            shutil.copyfile(src, dest)
        else:
            missing_counter += 1
    
    print(f"{missing_counter} images are missing")

    return new_df

In [14]:
filtered_data = filter_dataset()
print("Subset of HAM10000 dataset prepared.")

0 images are missing
Filtered dataset created successfully.
Subset of HAM10000 dataset prepared.


In [15]:
filtered_data.count()

image    1000
MEL      1000
NV       1000
BCC      1000
AKIEC    1000
BKL      1000
DF       1000
VASC     1000
dx       1000
dtype: int64

In [8]:
filtered_data.head()

Unnamed: 0,image,MEL,NV,BCC,AKIEC,BKL,DF,VASC,dx
8483,ISIC_0032789,0.0,1.0,0.0,0.0,0.0,0.0,0.0,MEL
4892,ISIC_0029198,0.0,1.0,0.0,0.0,0.0,0.0,0.0,MEL
8021,ISIC_0032327,0.0,1.0,0.0,0.0,0.0,0.0,0.0,MEL
5602,ISIC_0029908,0.0,1.0,0.0,0.0,0.0,0.0,0.0,MEL
7134,ISIC_0031440,0.0,1.0,0.0,0.0,0.0,0.0,0.0,MEL


In [19]:
final_df = filtered_data[['image', 'dx']]
final_df.head(10)

Unnamed: 0,image,dx
8483,ISIC_0032789,MEL
4892,ISIC_0029198,MEL
8021,ISIC_0032327,MEL
5602,ISIC_0029908,MEL
7134,ISIC_0031440,MEL
5569,ISIC_0029875,MEL
7817,ISIC_0032123,MEL
3517,ISIC_0027823,MEL
555,ISIC_0024861,MEL
5112,ISIC_0029418,MEL
