In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('dataset/ham10000_metadata_2024-05-05.csv')

In [4]:
df.shape

(11720, 13)

In [5]:
df.columns.to_list()

['isic_id',
 'attribution',
 'copyright_license',
 'age_approx',
 'anatom_site_general',
 'benign_malignant',
 'concomitant_biopsy',
 'diagnosis',
 'diagnosis_confirm_type',
 'image_type',
 'lesion_id',
 'melanocytic',
 'sex']

In [6]:
df.head()

Unnamed: 0,isic_id,attribution,copyright_license,age_approx,anatom_site_general,benign_malignant,concomitant_biopsy,diagnosis,diagnosis_confirm_type,image_type,lesion_id,melanocytic,sex
0,ISIC_0024306,"ViDIR Group, Department of Dermatology, Medica...",CC-BY-NC,45.0,,benign,False,nevus,serial imaging showing no change,dermoscopic,IL_7252831,True,male
1,ISIC_0024307,"ViDIR Group, Department of Dermatology, Medica...",CC-BY-NC,50.0,lower extremity,benign,False,nevus,serial imaging showing no change,dermoscopic,IL_6125741,True,male
2,ISIC_0024308,"ViDIR Group, Department of Dermatology, Medica...",CC-BY-NC,55.0,,benign,False,nevus,serial imaging showing no change,dermoscopic,IL_3692653,True,female
3,ISIC_0024309,"ViDIR Group, Department of Dermatology, Medica...",CC-BY-NC,40.0,,benign,False,nevus,serial imaging showing no change,dermoscopic,IL_0959663,True,male
4,ISIC_0024310,"ViDIR Group, Department of Dermatology, Medica...",CC-BY-NC,60.0,anterior torso,malignant,True,melanoma,histopathology,dermoscopic,IL_8194852,True,male


In [7]:
df.diagnosis.nunique()

8

In [8]:
df.benign_malignant.nunique()

2

benign: Non-cancerous lesion

malignant: Cancerous lesion

In [9]:
df.benign_malignant.value_counts()

benign_malignant
benign       7737
malignant    1305
Name: count, dtype: int64

In [10]:
df.diagnosis.value_counts()

diagnosis
nevus                         7737
pigmented benign keratosis    1338
melanoma                      1305
basal cell carcinoma           622
squamous cell carcinoma        229
vascular lesion                180
dermatofibroma                 160
actinic keratosis              149
Name: count, dtype: int64

In [11]:
df.lesion_id.nunique()

8838

In [12]:
target_df = df[['isic_id', 'diagnosis']]

In [13]:
target_df

Unnamed: 0,isic_id,diagnosis
0,ISIC_0024306,nevus
1,ISIC_0024307,nevus
2,ISIC_0024308,nevus
3,ISIC_0024309,nevus
4,ISIC_0024310,melanoma
...,...,...
11715,ISIC_0036060,pigmented benign keratosis
11716,ISIC_0036061,nevus
11717,ISIC_0036062,actinic keratosis
11718,ISIC_0036063,pigmented benign keratosis


# Train Test

In [14]:
# Splitting the data into training and testing data
from sklearn.model_selection import train_test_split

train, test = train_test_split(target_df, test_size=0.2, stratify=target_df['diagnosis'], random_state=42)

In [15]:
train.shape, test.shape

((9376, 2), (2344, 2))

In [16]:
train.to_csv('dataset/train.csv', index=False)
test.to_csv('dataset/test.csv', index=False)

In [19]:
import shutil
from tqdm import tqdm 
import os
for img in tqdm(train['isic_id']):
    image_class = train['diagnosis'][train['isic_id'] == img].values[0]
    image_path = f'dataset/ISIC-images (1)/{img}.jpg'

    # Create a folder with the name of the class
    if not os.path.exists(f'dataset/train/{image_class}'):
        os.makedirs(f'dataset/train/{image_class}')
        
    # Copy the image to 'dataset/train' folder
    shutil.copy(image_path, f'dataset/train/{image_class}/')
    


100%|██████████| 9376/9376 [00:13<00:00, 671.85it/s]


In [20]:
import shutil
from tqdm import tqdm 
for img in tqdm(test['isic_id']):
    image_path = f'dataset/ISIC-images (1)/{img}.jpg'
    image_class = test['diagnosis'][test['isic_id'] == img].values[0]

    # Create a folder with the name of the class
    if not os.path.exists(f'dataset/test/{image_class}'):
        os.makedirs(f'dataset/test/{image_class}')

    # Copy the image to 'dataset/test' folder
    shutil.copy(image_path, f'dataset/test/{image_class}/')
    
    


100%|██████████| 2344/2344 [00:03<00:00, 741.08it/s]
