### PAD-UFES-20: a skin lesion dataset composed of patient data and clinical images collected from smartphones 

In [1]:
import pandas as pd 

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning) — confirm that is intended.
warnings.filterwarnings('ignore')

# Show all columns when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

In [2]:
# Load the metadata table from metadata.csv in the current working directory.
metadata = pd.read_csv("metadata.csv")

In [3]:
# Disabled: load the predefined train/val/test image-id lists from tabular_data/.
# Each file is read as a single line holding a Python list literal (parsed with ast.literal_eval).
# The resulting lists are only referenced by the commented-out cell under "Follow target paper".

# cv_train_img_idxs.txt 
# cv_val_img_idxs.txt
# cv_test_img_idxs.txt

# import ast

# train_file_path = "tabular_data/cv_train_img_idxs.txt"
# val_file_path = "tabular_data/cv_val_img_idxs.txt"
# test_file_path = "tabular_data/cv_test_img_idxs.txt"

# with open(train_file_path, "r") as f:
#     train_data_list = ast.literal_eval(f.readline().strip())

# with open(val_file_path, "r") as f:
#     val_data_list = ast.literal_eval(f.readline().strip())

# with open(test_file_path, "r") as f:
#     test_data_list = ast.literal_eval(f.readline().strip())

# print(len(train_data_list)) 
# print(len(val_data_list)) 
# print(len(test_data_list)) 

In [4]:
# List the column names available in the metadata table.
metadata.columns 

Index(['patient_id', 'lesion_id', 'smoke', 'drink', 'background_father',
       'background_mother', 'age', 'pesticide', 'gender',
       'skin_cancer_history', 'cancer_history', 'has_piped_water',
       'has_sewage_system', 'fitspatrick', 'region', 'diameter_1',
       'diameter_2', 'diagnostic', 'itch', 'grew', 'hurt', 'changed', 'bleed',
       'elevation', 'img_id', 'biopsed'],
      dtype='object')

In [5]:
# (rows, columns) of the metadata table.
metadata.shape

(2298, 26)

In [6]:
# Number of missing values per column.
metadata.isnull().sum()

patient_id               0
lesion_id                0
smoke                  804
drink                  804
background_father      818
background_mother      822
age                      0
pesticide              804
gender                 804
skin_cancer_history    804
cancer_history         804
has_piped_water        804
has_sewage_system      804
fitspatrick            804
region                   0
diameter_1             804
diameter_2             804
diagnostic               0
itch                     0
grew                     0
hurt                     0
changed                  0
bleed                    0
elevation                0
img_id                   0
biopsed                  0
dtype: int64

In [7]:
# Display the metadata frame (the rendered output is truncated to the first and last rows).
metadata

Unnamed: 0,patient_id,lesion_id,smoke,drink,background_father,background_mother,age,pesticide,gender,skin_cancer_history,cancer_history,has_piped_water,has_sewage_system,fitspatrick,region,diameter_1,diameter_2,diagnostic,itch,grew,hurt,changed,bleed,elevation,img_id,biopsed
0,PAT_1516,1765,,,,,8,,,,,,,,ARM,,,NEV,False,False,False,False,False,False,PAT_1516_1765_530.png,False
1,PAT_46,881,False,False,POMERANIA,POMERANIA,55,False,FEMALE,True,True,True,True,3.0,NECK,6.0,5.0,BCC,True,True,False,True,True,True,PAT_46_881_939.png,True
2,PAT_1545,1867,,,,,77,,,,,,,,FACE,,,ACK,True,False,False,False,False,False,PAT_1545_1867_547.png,False
3,PAT_1989,4061,,,,,75,,,,,,,,HAND,,,ACK,True,False,False,False,False,False,PAT_1989_4061_934.png,False
4,PAT_684,1302,False,True,POMERANIA,POMERANIA,79,False,MALE,True,False,False,False,1.0,FOREARM,5.0,5.0,BCC,True,True,False,False,True,True,PAT_684_1302_588.png,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2293,PAT_1708,3156,,,,,73,,,,,,,,HAND,,,ACK,True,False,False,False,False,False,PAT_1708_3156_175.png,False
2294,PAT_46,880,False,False,POMERANIA,POMERANIA,55,False,FEMALE,True,True,True,True,3.0,CHEST,13.0,12.0,BCC,True,True,False,True,False,False,PAT_46_880_140.png,True
2295,PAT_1343,1217,,,,,74,,,,,,,,FOREARM,,,SEK,False,False,False,False,False,False,PAT_1343_1217_404.png,False
2296,PAT_326,690,False,False,POMERANIA,POMERANIA,58,True,FEMALE,True,True,False,False,3.0,FACE,5.0,4.0,BCC,True,False,False,False,False,True,PAT_326_690_823.png,True


In [8]:
# Identifier columns to remove from the tabular features.
# Nothing uses this list until the cell that performs the drop, which declares it again.
drop_cols = ['lesion_id', 'patient_id']

### Splitting the images into train / DSEL / test sets

In [9]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

# --------------------------
# Paths (all relative to the notebook's working directory)
# --------------------------
CSV_PATH = "metadata.csv"     # metadata table; read into `df` for the split
IMAGE_DIR = "./images"        # folder where all images are stored
OUTPUT_DIR = "./output"       # root of the <split>/<class>/ folder tree that receives the copied images

In [10]:
# Read the metadata into the frame that will be split.
df = pd.read_csv(CSV_PATH)

# The split needs the image file name (`img_id`) and the class label (`diagnostic`).
# Fail early, naming the missing column(s), instead of with a bare AssertionError.
required_cols = ("img_id", "diagnostic")
missing_cols = [col for col in required_cols if col not in df.columns]
assert not missing_cols, f"{CSV_PATH} is missing required column(s): {missing_cols}"

In [11]:
# --------------------------
# Step 1: Train 60%, Temp 40%
# --------------------------
# Stratify on the label so the class proportions of `df` are kept in both parts;
# the fixed random_state makes the split repeatable.
# NOTE(review): rows are split individually, but `patient_id` repeats across rows
# of metadata.csv (e.g. PAT_46), so images of one patient may end up in different
# splits — confirm whether a patient-level (grouped) split is required.
train_df, temp_df = train_test_split(
    df,
    test_size=0.4,
    stratify=df["diagnostic"],
    random_state=42
)

# --------------------------
# Step 2: Split temp -> DSEL 20%, Test 20%
# --------------------------
# Halving the 40% hold-out gives two parts of about 20% of `df` each.
dsel_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["diagnostic"],
    random_state=42
)

### Follow the target paper's split (currently disabled)

In [12]:
# Disabled alternative to the random split: select the rows whose img_id appears in the
# predefined lists. Requires train_data_list / val_data_list / test_data_list, which are
# only created by the commented-out loading cell near the top of the notebook.

# print(len(train_data_list)) 
# print(len(val_data_list)) 
# print(len(test_data_list)) 


# train_df = df[df.img_id.isin(train_data_list)]
# dsel_df = df[df.img_id.isin(val_data_list)]
# test_df = df[df.img_id.isin(test_data_list)]

In [13]:
# --------------------------
# Create folder structure
# --------------------------
splits = ["train", "dsel", "test"]

# One sub-folder per diagnostic label, in sorted order; computed once for all splits.
class_names = sorted(df["diagnostic"].unique())

for split in splits:
    for cls in class_names:
        target_dir = f"{OUTPUT_DIR}/{split}/{cls}"
        # exist_ok keeps the cell re-runnable when the tree is already in place.
        os.makedirs(target_dir, exist_ok=True)

In [14]:
# --------------------------
# Function to copy images
# --------------------------
def copy_images(subset_df, split_name):
    """Copy the images listed in ``subset_df`` into ``OUTPUT_DIR/<split_name>/<label>/``.

    Parameters
    ----------
    subset_df : pandas.DataFrame
        Must contain the columns ``img_id`` (file name inside ``IMAGE_DIR``)
        and ``diagnostic`` (class label used as the sub-folder name).
    split_name : str
        Name of the split folder, e.g. ``"train"``.

    Returns
    -------
    tuple of int
        ``(copied, missing)`` — number of files copied and number of source
        files that were not found. A warning is printed for each missing file.
    """
    copied = 0
    missing = 0

    # Only two columns are needed, so iterate over them directly instead of
    # materialising a Series per row with iterrows().
    for img_name, label in zip(subset_df["img_id"], subset_df["diagnostic"]):
        src = os.path.join(IMAGE_DIR, img_name)

        if not os.path.exists(src):
            print(f"WARNING: Missing file {src}")
            missing += 1
            continue

        # Create the class folder on demand so the copy does not depend on a
        # previous cell having built the directory tree.
        dst_dir = os.path.join(OUTPUT_DIR, split_name, str(label))
        os.makedirs(dst_dir, exist_ok=True)

        shutil.copy(src, os.path.join(dst_dir, img_name))
        copied += 1

    return copied, missing

# --------------------------
# Copy images for each split
# --------------------------
# Copy every split into its own sub-tree of OUTPUT_DIR.
for split_frame, split_label in ((train_df, "train"), (dsel_df, "dsel"), (test_df, "test")):
    copy_images(split_frame, split_label)

# The emoji in this message was mis-encoded (UTF-8 bytes shown as Latin-1); restored here.
print("🎉 Done! Dataset prepared successfully.")

🎉 Done! Dataset prepared successfully.


### Tabular data 

In [15]:
# Sentinel used for each column that contains missing values.
# Categorical / boolean-like columns get the string "Not Given"; numeric columns get -99.
MISSING_FILL_VALUES = {
    "smoke": "Not Given",
    "drink": "Not Given",
    "background_father": "Not Given",
    "background_mother": "Not Given",
    "pesticide": "Not Given",
    "gender": "Not Given",
    "skin_cancer_history": "Not Given",
    "cancer_history": "Not Given",
    "has_piped_water": "Not Given",
    "has_sewage_system": "Not Given",
    "fitspatrick": -99,
    "diameter_1": -99,
    "diameter_2": -99,
}


def fill_missing_metadata(split_df, fill_values=None):
    """Return a new frame with missing values replaced column by column.

    Parameters
    ----------
    split_df : pandas.DataFrame
        One of the split frames (train / dsel / test).
    fill_values : dict, optional
        Mapping ``column -> replacement``. Defaults to ``MISSING_FILL_VALUES``.
        Columns that are not in the mapping are left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``split_df`` itself is not modified, which avoids assigning
        into a frame that was produced by slicing another one.
    """
    if fill_values is None:
        fill_values = MISSING_FILL_VALUES
    return split_df.fillna(value=fill_values)


# Apply the same fill rules to every split (train_df, dsel_df, test_df).
train_df = fill_missing_metadata(train_df)
dsel_df = fill_missing_metadata(dsel_df)
test_df = fill_missing_metadata(test_df)

In [16]:
# Sanity check: no missing values should remain in the test split after filling.
test_df.isnull().sum()

patient_id             0
lesion_id              0
smoke                  0
drink                  0
background_father      0
background_mother      0
age                    0
pesticide              0
gender                 0
skin_cancer_history    0
cancer_history         0
has_piped_water        0
has_sewage_system      0
fitspatrick            0
region                 0
diameter_1             0
diameter_2             0
diagnostic             0
itch                   0
grew                   0
hurt                   0
changed                0
bleed                  0
elevation              0
img_id                 0
biopsed                0
dtype: int64

In [17]:
drop_cols = ['lesion_id', 'patient_id']

# Rebind to a new frame instead of dropping in place, and ignore columns that are
# already gone, so re-running this cell does not raise a KeyError.
train_df = train_df.drop(columns=drop_cols, errors="ignore")
dsel_df = dsel_df.drop(columns=drop_cols, errors="ignore")
test_df = test_df.drop(columns=drop_cols, errors="ignore")

#### Label Encoding 

In [18]:
# Preview the training split before encoding (filled sentinels "Not Given" / -99 are visible).
train_df.head()

Unnamed: 0,smoke,drink,background_father,background_mother,age,pesticide,gender,skin_cancer_history,cancer_history,has_piped_water,has_sewage_system,fitspatrick,region,diameter_1,diameter_2,diagnostic,itch,grew,hurt,changed,bleed,elevation,img_id,biopsed
2186,False,True,POMERANIA,POMERANIA,51,True,MALE,True,False,True,True,3.0,CHEST,9.0,7.0,BCC,False,UNK,False,UNK,False,True,PAT_730_1385_585.png,True
1378,False,False,Not Given,Not Given,73,False,FEMALE,False,False,True,True,2.0,FOREARM,18.0,14.0,ACK,False,False,False,False,False,True,PAT_632_3700_890.png,False
2051,True,True,POMERANIA,POMERANIA,67,True,MALE,True,False,True,True,4.0,FACE,13.0,10.0,ACK,True,False,False,False,False,False,PAT_942_1792_411.png,True
2254,Not Given,Not Given,Not Given,Not Given,58,Not Given,Not Given,Not Given,Not Given,Not Given,Not Given,-99.0,FOREARM,-99.0,-99.0,ACK,True,False,False,False,False,False,PAT_1429_1491_451.png,False
728,False,False,POMERANIA,POMERANIA,68,True,MALE,False,False,True,True,2.0,FOREARM,10.0,10.0,ACK,True,True,False,False,True,True,PAT_735_1391_683.png,True


In [19]:
# Columns that remain after the identifier columns were dropped.
train_df.columns

Index(['smoke', 'drink', 'background_father', 'background_mother', 'age',
       'pesticide', 'gender', 'skin_cancer_history', 'cancer_history',
       'has_piped_water', 'has_sewage_system', 'fitspatrick', 'region',
       'diameter_1', 'diameter_2', 'diagnostic', 'itch', 'grew', 'hurt',
       'changed', 'bleed', 'elevation', 'img_id', 'biopsed'],
      dtype='object')

In [20]:
from sklearn.preprocessing import LabelEncoder

# Columns that are label-encoded. The list includes the target column 'diagnostic'
# and the boolean-like symptom columns; 'age', 'fitspatrick', 'diameter_1',
# 'diameter_2' and 'img_id' are left as they are.
categorical_cols = ['smoke', 'drink', 'background_father', 'background_mother',
       'pesticide', 'gender', 'skin_cancer_history', 'cancer_history',
       'has_piped_water', 'has_sewage_system', 'region',
       'diagnostic', 'itch', 'grew', 'hurt',
       'changed', 'bleed', 'elevation', 'biopsed']
le_dict = {}  # to store encoders for later use (e.g., test/val set)

# Fit one encoder per column on the training split and replace the column with
# its integer codes. Values are cast to str first, so True/False become the
# strings 'True'/'False' and share one code space with "Not Given".
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))  # convert NaN to string if needed
    le_dict[col] = le

# Check the transformed DataFrame
print(train_df.head())

# Optional: see mapping for each column
# (le.classes_ holds the original labels; transforming them yields their codes)
for col, le in le_dict.items():
    mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    print(f"{col} mapping: {mapping}")

      smoke  drink  background_father  background_mother  age  pesticide  \
2186      0      2                  9                  8   51          2   
1378      0      0                  7                  6   73          0   
2051      2      2                  9                  8   67          2   
2254      1      1                  7                  6   58          1   
728       0      0                  9                  8   68          2   

      gender  skin_cancer_history  cancer_history  has_piped_water  \
2186       1                    2               0                2   
1378       0                    0               0                2   
2051       1                    2               0                2   
2254       2                    1               1                1   
728        1                    0               0                2   

      has_sewage_system  fitspatrick  region  diameter_1  diameter_2  \
2186                  2          3.0       3      

In [21]:
def encode_with_fitted_encoders(split_df, encoders, columns, unseen_code=-1):
    """Encode ``columns`` of ``split_df`` with encoders that were already fitted.

    Fitting a new LabelEncoder per split would give each split its own
    label -> code mapping, so the same category (or diagnostic class) could get
    different integers in train, dsel and test. Reusing the fitted encoders
    keeps the codes identical across splits.

    Parameters
    ----------
    split_df : pandas.DataFrame
        Split to encode (not modified; a copy is returned).
    encoders : dict
        Mapping ``column -> fitted encoder`` exposing ``classes_`` (the labels
        in code order), e.g. the ``le_dict`` built on the training split.
    columns : iterable of str
        Columns to encode.
    unseen_code : int, default -1
        Code given to labels that the fitted encoder has never seen. A warning
        is printed when this happens instead of raising.

    Returns
    -------
    pandas.DataFrame
        Copy of ``split_df`` with the selected columns replaced by integer codes.
    """
    encoded = split_df.copy()
    for col in columns:
        # Position in classes_ is the code the fitted encoder assigns to a label.
        code_of = {str(label): code for code, label in enumerate(encoders[col].classes_)}
        codes = encoded[col].astype(str).map(code_of)

        n_unseen = int(codes.isna().sum())
        if n_unseen:
            print(
                f"WARNING: {n_unseen} value(s) in '{col}' were not seen when the "
                f"encoder was fitted; encoded as {unseen_code}"
            )

        encoded[col] = codes.fillna(unseen_code).astype(int)
    return encoded


# Encode both held-out splits with the encoders stored in le_dict.
# le_dict is left untouched so it keeps describing the training-set mapping.
dsel_df = encode_with_fitted_encoders(dsel_df, le_dict, categorical_cols)
test_df = encode_with_fitted_encoders(test_df, le_dict, categorical_cols)

In [23]:
# to_csv does not create missing directories, so make sure the target folder exists.
TABULAR_DIR = "tabular_data"
os.makedirs(TABULAR_DIR, exist_ok=True)

# Write each encoded split without the index column.
train_df.to_csv(os.path.join(TABULAR_DIR, "train.csv"), index=False)
dsel_df.to_csv(os.path.join(TABULAR_DIR, "dsel.csv"), index=False)
test_df.to_csv(os.path.join(TABULAR_DIR, "test.csv"), index=False)

In [24]:
# Sanity check: the exported DSEL split should contain no missing values.
dsel_df.isnull().sum()

smoke                  0
drink                  0
background_father      0
background_mother      0
age                    0
pesticide              0
gender                 0
skin_cancer_history    0
cancer_history         0
has_piped_water        0
has_sewage_system      0
fitspatrick            0
region                 0
diameter_1             0
diameter_2             0
diagnostic             0
itch                   0
grew                   0
hurt                   0
changed                0
bleed                  0
elevation              0
img_id                 0
biopsed                0
dtype: int64