# Validate, prepare and split datasets


In [2]:
from glob import glob
import hashlib
import json
import os
import shutil
import pandas as pd
import requests
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

In [3]:
seed = 19
split_ratio_1 = 0.2  # (validation+test) / (train+validation+test)
split_ratio_2 = 0.5  # (test) / (validation+test)

# --------------------------------------------------------------------------------------------------------

# NOTE: More quality issues are explained in: https://github.com/Digital-Dermatology/SelfClean-Revised-Benchmarks

csv_paths = ["../datasets/PAD-UFES-20/metadata.csv"]
images_root_paths = ["../datasets/PAD-UFES-20/images/"]
target_column = "diagnostic"
file_column = "img_id"
group_columns = ["patient_id"]  # , "lesion_id"

csv_paths = ["../datasets/ddi-diverse-dermatology-images/ddi_metadata.csv"]
images_root_paths = ["../datasets/ddi-diverse-dermatology-images/"]
target_column = "malignant"  # "disease"
file_column = "DDI_file"
group_columns = []

csv_paths = [
    "../datasets/HAM10000/HAM10000_metadata",
    "../datasets/HAM10000/ISIC2018_Task3_Test_GroundTruth.csv",
]  # Given test dataset
images_root_paths = ["../datasets/HAM10000/images/"]
target_column = "dx"
file_column = "image_id"
group_columns = ["lesion_id"]
# NOTE: The image 'ISIC_0035068' (known as the 'easter egg') is corrupted and was therefore excluded manually from the test dataset!

csv_paths = ["../datasets/fitzpatrick17k/fitzpatrick17k.csv"]
images_root_paths = ["../datasets/fitzpatrick17k/images/"]
target_column = "three_partition_label"  # "nine_partition_label"
file_column = "md5hash"  # "url"
group_columns = []

csv_paths = []
images_root_paths = ["../datasets/PlantDataset/"]
target_column = None
file_column = None
group_columns = []

csv_paths = []
images_root_paths = [
    "../datasets/plantdoc-dataset/train/",
    "../datasets/plantdoc-dataset/test/",
]
target_column = None
file_column = None
group_columns = []
# The class "Tomato two spotted spider mites leaf" only appears in the train, but not in the test set and was removed manually (2 images)
# The following files are originally included in the train and test set and were moved manually to the duplicates subdirectory:
# ../datasets/plantdoc-dataset/train/Corn Gray leaf spot/2013Corn_GrayLeafSpot_0815_0003.JPG.jpg
# ../datasets/plantdoc-dataset/test/Corn leaf blight/2013Corn_GrayLeafSpot_0815_0003.JPG.jpg
# ../datasets/plantdoc-dataset/train/Potato leaf early blight/early-blight-or-target-spot-alternaria-solani-lesions-on-a-tomato-by9j8r.jpg
# ../datasets/plantdoc-dataset/train/Tomato Early blight leaf/early-blight-or-target-spot-alternaria-solani-lesions-on-a-tomato-BY9J8R.jpg

csv_paths = ["../datasets/cassava-leaf-disease-classification/train.csv"]
images_root_paths = ["../datasets/cassava-leaf-disease-classification/train_images/"]
target_column = "label"
file_column = "image_id"
group_columns = []

csv_paths = []
images_root_paths = ["../datasets/PlantVillage-Dataset/raw/color/"]
target_column = None
file_column = None
group_columns = []

In [4]:
if target_column is None:
    target_column = "."
    # Use directory structure instead of target column

if file_column is None:
    file_column = "filepath"
    # Use default value

if 0 == len(csv_paths):
    csv_paths = [None]

In [5]:
def move_duplicates(src_path, duplicates_path):
    dst_dir = os.path.join(duplicates_path, os.path.basename(os.path.dirname(src_path)))
    os.makedirs(dst_dir, exist_ok=True)
    dst_path = os.path.join(dst_dir, os.path.basename(src_path))
    shutil.move(src_path, dst_path)
    return dst_path


# move_duplicates(
#     "../datasets/plantdoc-dataset/train/Corn Gray leaf spot/2013Corn_GrayLeafSpot_0815_0003.JPG.jpg",
#     "../datasets/plantdoc-dataset/train/duplicates",
# )
# move_duplicates(
#     "../datasets/plantdoc-dataset/test/Corn leaf blight/2013Corn_GrayLeafSpot_0815_0003.JPG.jpg",
#     "../datasets/plantdoc-dataset/test/duplicates",
# )
# move_duplicates(
#     "../datasets/plantdoc-dataset/train/Potato leaf early blight/early-blight-or-target-spot-alternaria-solani-lesions-on-a-tomato-by9j8r.jpg",
#     "../datasets/plantdoc-dataset/train/duplicates",
# )
# move_duplicates(
#     "../datasets/plantdoc-dataset/train/Tomato Early blight leaf/early-blight-or-target-spot-alternaria-solani-lesions-on-a-tomato-BY9J8R.jpg",
#     "../datasets/plantdoc-dataset/train/duplicates",
# )
# move_duplicates(
#     "../datasets/plantdoc-dataset/train/Potato leaf early blight/potato-blight-phytophora-infestans-close-up-of-infected-leaf-top-surface-a60hxg.jpg",
#     "../datasets/plantdoc-dataset/train/duplicates",
# )
# move_duplicates(
#     "../datasets/plantdoc-dataset/train/Potato leaf late blight/potato-blight-phytophora-infestans-close-up-of-infected-leaf-top-surface-A60HXG.jpg",
#     "../datasets/plantdoc-dataset/train/duplicates",
# )

# NOTE: In some cases the file hash is not enough to check...
# hashlib.md5(open("../datasets/plantdoc-dataset/train/Tomato Early blight leaf/earlyblightpotato.jpg", "rb").read()).hexdigest() # d2c45e81c5de5a2f731829ed491e8df5
# hashlib.md5(open("../datasets/plantdoc-dataset/test/Potato leaf early blight/earlyblightpotato.jpg", "rb").read()).hexdigest() # be11519577cb929d21a68170b2ec88f1
# hashlib.md5(Image.open('../datasets/plantdoc-dataset/test/Potato leaf early blight/earlyblightpotato.jpg', "r").tobytes()).hexdigest() # 8fec6255afb5003f62de6989e9b40721
# hashlib.md5(Image.open('../datasets/plantdoc-dataset/train/Tomato Early blight leaf/earlyblightpotato.jpg', "r").tobytes()).hexdigest() # 8fec6255afb5003f62de6989e9b40721

In [6]:
file_hash_dict = {}
duplicates = set()
skip_extensions = [".csv", ".db"]


def check_root_dir(root_dir):
    print(f"Checking {root_dir}")
    duplicates_path = os.path.normpath(os.path.join(root_dir, "duplicates/"))
    for sub_path, _, filenames in os.walk(os.path.normpath(root_dir)):
        sub_path = os.path.normpath(sub_path)
        if duplicates_path in sub_path:
            print(f"Skip: {sub_path}")
            continue

        print(f"Found {len(filenames)} files")
        for filename in filenames:
            file_extension = os.path.splitext(filename)[1].lower()
            if file_extension in skip_extensions:
                print(f"Skip non image file: {filename}")
                continue

            file_path = os.path.normpath(os.path.join(sub_path, filename))
            # file_hash = os.path.basename(file_path).lower()
            file_hash = hashlib.md5(Image.open(file_path, "r").tobytes()).hexdigest()
            # file_hash = hashlib.md5(open(file_path, "rb").read()).hexdigest()
            if file_hash in file_hash_dict:
                print(
                    f"File with equal hash ({file_hash}) found: {file_hash_dict[file_hash]}, {file_path}"
                )
                move_duplicates(file_path, duplicates_path)
                file_hash_dict[file_hash] = move_duplicates(
                    file_hash_dict[file_hash], duplicates_path
                )
            else:
                file_hash_dict[file_hash] = file_path

    for sub_path, _, filenames in os.walk(os.path.normpath(duplicates_path)):
        for filename in filenames:
            file_path = os.path.normpath(os.path.join(sub_path, filename))
            duplicates.add(file_path)


for root_dir in images_root_paths:
    check_root_dir(root_dir)
print(f"Found {len(duplicates)} duplicates")

existing_file_paths = set(file_hash_dict.values()) - duplicates
print(f"Total number of files: {len(existing_file_paths)}")

Checking ../datasets/plantdoc-dataset/train/
Found 0 files
Found 82 files
Found 79 files
Found 83 files
Found 53 files
Found 62 files
Found 105 files
Found 47 files
Found 61 files
Found 179 files
Found 106 files
Skip: ..\datasets\plantdoc-dataset\train\duplicates
Skip: ..\datasets\plantdoc-dataset\train\duplicates\Blueberry leaf
Skip: ..\datasets\plantdoc-dataset\train\duplicates\Corn Gray leaf spot
Skip: ..\datasets\plantdoc-dataset\train\duplicates\Potato leaf early blight
Skip: ..\datasets\plantdoc-dataset\train\duplicates\Potato leaf late blight
Skip: ..\datasets\plantdoc-dataset\train\duplicates\Tomato Early blight leaf
Skip: ..\datasets\plantdoc-dataset\train\duplicates\Tomato leaf late blight
Skip: ..\datasets\plantdoc-dataset\train\duplicates\Tomato leaf yellow virus
Skip: ..\datasets\plantdoc-dataset\train\duplicates\Tomato mold leaf
Skip: ..\datasets\plantdoc-dataset\train\duplicates\Tomato Septoria leaf spot
Found 57 files
Found 56 files
Found 103 files
Found 102 files
Found

In [7]:
def init_dataframes(csv_file, rootpath):
    df_return = None
    if csv_file is None:
        file_column_values = []
        target_column_values = []
        target_dirs = [
            name
            for name in os.listdir(path=rootpath)
            if os.path.isdir(os.path.join(rootpath, name))
        ]
        for target_dir in target_dirs:
            image_file_paths = [
                os.path.normpath(os.path.join(rootpath, target_dir, name))
                for name in os.listdir(path=os.path.join(rootpath, target_dir))
                if os.path.isfile(os.path.join(rootpath, target_dir, name))
            ]
            length_previous = len(file_column_values)
            file_column_values.extend(image_file_paths)
            target_column_values.extend([target_dir] * len(image_file_paths))
        df_return = pd.DataFrame(
            list(zip(target_column_values, file_column_values)),
            columns=[target_column, file_column],
        )
    else:
        df_return = pd.read_csv(csv_file)
        assert target_column in df_return.columns.values
        assert file_column in df_return.columns.values
    return df_return


df_primary = init_dataframes(csv_paths[0], images_root_paths[0])
df_secondary = pd.DataFrame(columns=df_primary.columns)
print(f"Columns: {list(df_primary.columns)}")
if 1 < len(images_root_paths) or 1 < len(csv_paths):
    print("Using predefined testset")
    df_secondary = init_dataframes(csv_paths[-1], images_root_paths[-1])
    assert list(df_primary.columns) == list(df_secondary.columns)
print(f"Total number of rows: {len(df_primary) + len(df_secondary)}")

Columns: ['.', 'filepath']
Using predefined testset
Total number of rows: 2536


In [8]:
# df_primary["filepath"] = df_primary
sample = df_primary.iloc[0]
sample[file_column]

# sample_path = os.path.normpath(os.path.join(images_root_paths[0], sample[target_column], sample[file_column]))
# if not os.path.exists(sample_path):
print(sample[file_column])


def merge_filepaths():
    df_file = pd.DataFrame(
        {
            "filepath": list(existing_file_paths) + list(duplicates),
            "included": np.concatenate(
                [np.ones(len(existing_file_paths)), np.zeros(len(duplicates))]
            ).astype(bool),
        }
    )
    if (df_file["filepath"] == sample[file_column]).any():
        print("Merge by unique filepath")

        df_primary_extended = pd.merge(
            df_primary, df_file, left_on=file_column, right_on="filepath"
        )
        print(f"{len(df_primary) - len(df_primary_extended)} rows could not be found")

        df_secondary_extended = pd.merge(
            df_secondary, df_file, left_on=file_column, right_on="filepath"
        )
        print(
            f"{len(df_secondary) - len(df_secondary_extended)} rows could not be found"
        )
        return df_primary_extended, df_secondary_extended

    df_file["filename"] = df_file["filepath"].apply(lambda x: os.path.basename(x))
    if not df_file["filename"].is_unique:
        print("A bit tricky case")
        print(df_file[df_file["filename"].duplicated(False)])
        return

    if (df_file["filename"] == sample[file_column]).any():
        print("Merge by unique filename")

    elif df_file["filename"].str.contains(sample[file_column]).sum():
        print("Merge by substring")
        matching_filenames = df_file[
            df_file["filename"].str.contains(sample[file_column])
        ]["filename"].values
        if 1 < len(matching_filenames):
            print(f"Too many matches: {matching_filenames}")
            return
        start_idx = matching_filenames[0].find(sample[file_column])
        prefix = matching_filenames[0][:start_idx]
        postfix = matching_filenames[0][start_idx + len(sample[file_column]) :]
        df_primary[file_column] = prefix + df_primary[file_column] + postfix
        df_secondary[file_column] = prefix + df_secondary[file_column] + postfix
    else:
        print("No merge found!")
        return

    df_primary_extended = pd.merge(
        df_primary, df_file, left_on=file_column, right_on="filename"
    )
    print(f"{len(df_primary) - len(df_primary_extended)} rows could not be found")

    df_secondary_extended = pd.merge(
        df_secondary, df_file, left_on=file_column, right_on="filename"
    )
    print(f"{len(df_secondary) - len(df_secondary_extended)} rows could not be found")
    return df_primary_extended, df_secondary_extended


df_primary_extended, df_secondary_extended = merge_filepaths()

..\datasets\plantdoc-dataset\train\Apple leaf\20130519yellowingappleleaves.jpg
Merge by unique filepath
0 rows could not be found
0 rows could not be found


In [9]:
df_primary = df_primary_extended[df_primary_extended["included"]]
df_secondary = df_secondary_extended[df_secondary_extended["included"]]
print(f"Total number of rows: {len(df_primary) + len(df_secondary)}")

Total number of rows: 2536


In [10]:
def check_stratify(groups, verbose=False):
    for name, group in groups:
        sub_groups = group.groupby(target_column)
        for sub_name, sub_group in sub_groups:
            if verbose:
                print(f"{name}: {sub_name}")
            if 1 < len(sub_groups):
                print("Stratify not possible")
                print(group[[*group_columns, target_column]])
                return False
    return True


groups = None
groupby_columns = [df_primary.index]
if group_columns == []:
    groups = df_primary.groupby(groupby_columns)
    assert len(groups) == len(df_primary)
else:
    groupby_columns = group_columns
    groups = df_primary.groupby(groupby_columns)
    assert len(groups) < len(df_primary)
stratify_possible = check_stratify(groups)

if stratify_possible:
    previous_length = len(groups)
    groups = df_primary.groupby([*groupby_columns, target_column])
    assert previous_length == len(groups)

df_grouped = groups.size().reset_index()
df_grouped.index = df_grouped["level_0"]
assert len(df_grouped.index) == len(groups.groups.keys())

In [11]:
def split_sets(set_combined, split_ratio):
    if 0 == split_ratio:
        return set_combined, set_combined.iloc[:0, :].copy()

    stratify_series = None
    if stratify_possible:
        stratify_series = set_combined[target_column]

    return train_test_split(
        set_combined, test_size=split_ratio, random_state=seed, stratify=stratify_series
    )


if 0 < len(df_secondary):
    split_ratio_1 = (
        split_ratio_1 * (1 - split_ratio_2) / (1 - split_ratio_1 * split_ratio_2)
    )
    split_ratio_2 = 0

df_train, df_valid_test = split_sets(df_grouped, split_ratio_1)
df_valid, df_test = split_sets(df_valid_test, split_ratio_2)

train_ids = df_train[group_columns].values
valid_ids = df_valid[group_columns].values
test_ids = df_test[group_columns].values

if group_columns == []:
    train_ids = df_train.index.values
    valid_ids = df_valid.index.values
    test_ids = df_test.index.values

print(f"Training: {len(train_ids)}")
print(f"Validation: {len(valid_ids)}")
print(f"Test (without predefined): {len(test_ids)}")

Training: 2056
Validation: 257
Test (without predefined): 0


In [12]:
assert 0 == len(np.intersect1d(train_ids, valid_ids))
assert 0 == len(np.intersect1d(valid_ids, test_ids))
assert 0 == len(np.intersect1d(test_ids, train_ids))

In [13]:
def get_set(values):
    group_id = tuple(values)
    if 1 == len(group_id):
        group_id = group_id[0]

    if group_id in train_ids:
        return "train"
    elif group_id in valid_ids:
        return "valid"
    elif group_id in test_ids:
        return "test"
    else:
        print(f"Group_id '{group_id}' cannot be assigned")
        return None


df_split = df_primary[[target_column, "filepath"]].copy()
original_columns = df_split.columns
df_split.columns = ["target_code", "filepath"]

if group_columns == []:
    df_split["set"] = df_primary.index.to_frame().apply(get_set, axis=1)
else:
    df_split["set"] = df_primary[groupby_columns].apply(get_set, axis=1)

if 0 < len(df_secondary):
    pd.options.mode.chained_assignment = None
    df_split_test = df_secondary[original_columns]
    df_split_test["set"] = "test"
    df_split_test.columns = df_split.columns
    assert 0 == (df_split["set"] == "test").sum()
    df_split = pd.concat([df_split, df_split_test])

print(set(df_split.index.unique()) ^ set(df_grouped.index.unique()))
df_split[df_split["set"].isnull()]["filepath"].values

set()


array([], dtype=object)

In [14]:
df_test = df_split[df_split["set"] == "train"]
df_test.groupby("target_code")["filepath"].count() / len(df_test)

target_code
Apple Scab Leaf               0.035992
Apple leaf                    0.035506
Apple rust leaf               0.034047
Bell_pepper leaf              0.022860
Bell_pepper leaf spot         0.026751
Blueberry leaf                0.045233
Cherry leaf                   0.020428
Corn Gray leaf spot           0.026265
Corn leaf blight              0.077335
Corn rust leaf                0.045720
Peach leaf                    0.044261
Potato leaf early blight      0.044261
Potato leaf late blight       0.040856
Raspberry leaf                0.048152
Soyabean leaf                 0.024805
Squash Powdery mildew leaf    0.053502
Strawberry leaf               0.037938
Tomato Early blight leaf      0.033074
Tomato Septoria leaf spot     0.058852
Tomato leaf                   0.023833
Tomato leaf bacterial spot    0.043774
Tomato leaf late blight       0.042802
Tomato leaf mosaic virus      0.018969
Tomato leaf yellow virus      0.029183
Tomato mold leaf              0.036479
grape leaf   

In [15]:
df_valid = df_split[df_split["set"] == "valid"]
df_valid.groupby("target_code")["filepath"].count() / len(df_valid)

target_code
Apple Scab Leaf               0.035019
Apple leaf                    0.035019
Apple rust leaf               0.035019
Bell_pepper leaf              0.023346
Bell_pepper leaf spot         0.027237
Blueberry leaf                0.046693
Cherry leaf                   0.019455
Corn Gray leaf spot           0.027237
Corn leaf blight              0.077821
Corn rust leaf                0.046693
Peach leaf                    0.046693
Potato leaf early blight      0.042802
Potato leaf late blight       0.038911
Raspberry leaf                0.050584
Soyabean leaf                 0.023346
Squash Powdery mildew leaf    0.054475
Strawberry leaf               0.038911
Tomato Early blight leaf      0.031128
Tomato Septoria leaf spot     0.058366
Tomato leaf                   0.023346
Tomato leaf bacterial spot    0.042802
Tomato leaf late blight       0.042802
Tomato leaf mosaic virus      0.019455
Tomato leaf yellow virus      0.031128
Tomato mold leaf              0.035019
grape leaf   

In [16]:
df_test = df_split[df_split["set"] == "test"]
df_test.groupby("target_code")["filepath"].count() / len(df_test)

target_code
Apple Scab Leaf               0.044843
Apple leaf                    0.040359
Apple rust leaf               0.044843
Bell_pepper leaf              0.035874
Bell_pepper leaf spot         0.040359
Blueberry leaf                0.044843
Cherry leaf                   0.044843
Corn Gray leaf spot           0.013453
Corn leaf blight              0.040359
Corn rust leaf                0.044843
Peach leaf                    0.040359
Potato leaf early blight      0.026906
Potato leaf late blight       0.022422
Raspberry leaf                0.031390
Soyabean leaf                 0.035874
Squash Powdery mildew leaf    0.026906
Strawberry leaf               0.035874
Tomato Early blight leaf      0.040359
Tomato Septoria leaf spot     0.044843
Tomato leaf                   0.035874
Tomato leaf bacterial spot    0.035874
Tomato leaf late blight       0.044843
Tomato leaf mosaic virus      0.044843
Tomato leaf yellow virus      0.022422
Tomato mold leaf              0.026906
grape leaf   

In [17]:
images_root_path = images_root_paths[0]
if 1 < len(images_root_paths):
    images_root_path = os.path.commonpath(images_root_paths)
df_split.to_csv(os.path.join(images_root_path, "split.csv"))

In [42]:
# files_with_case_deviations = df_merged[df_merged[file_column].str.lower().duplicated(False)][file_column]
# if 0 < len(files_with_case_deviations):
#     print(files_with_case_deviations)
#     assert False

In [43]:
# def copy_wrapper(src, dst):
#     if os.path.exists(dst):
#         print(f"Cannot copy from {src} to {dst}")
#         return
#     return shutil.copy(src, dst)

# if type(images_root_paths) is tuple:
#     common_path = os.path.join(os.path.commonpath(images_root_paths), "all/")
#     os.makedirs(common_path, exist_ok=False)
#     shutil.copytree(images_root_paths[0], common_path, dirs_exist_ok=True, copy_function = copy_wrapper)
#     shutil.copytree(images_root_paths[1], common_path, dirs_exist_ok=True, copy_function = copy_wrapper)
#     images_root_paths = common_path

In [44]:
# if not df_merged[file_column].is_unique:
#     assert df_merged[[target_column,file_column]].apply(lambda x: os.path.join(*x), axis=1).is_unique
#     df_primary[file_column] = df_primary[[target_column,file_column]].apply(lambda x: os.path.join("..", *x), axis=1)
#     assert df_primary[file_column].is_unique
#     df_secondary[file_column] = df_secondary[[target_column,file_column]].apply(lambda x: os.path.join("..", *x), axis=1)
#     assert df_secondary[file_column].is_unique
#     df_merged = pd.concat([df_primary, df_secondary])
#     assert df_merged[file_column].is_unique

# df_merged[[file_column, target_column]].groupby(target_column).count()

In [45]:
# def check_existing_file(df):
#     for image_path in df[df[target_column] == value][file_column]:
#         image_name = os.path.basename(image_path)

#         if not image_name in file_name_list:
#             if not image_name in file_name_list:
#             if image_name = image_name.replace("?", "") # windows characters


#         file_extension = os.path.splitext(image_name)[1]

#         if file_extension == "":
#             found_image_names = [
#                 image_file_name
#                 for image_file_name in image_file_names
#                 if image_file_name.startswith(image_name)
#             ]

#             if 1 != len(found_image_names):
#                 print(
#                     f"Image name '{image_name}' cannot be assigned to existing files: {found_image_names}"
#                 )
#             assert 1 == len(found_image_names)
#             image_name_new = found_image_names[0]
#             df.loc[df[file_column] == image_name, [file_column]] = image_name_new
#             image_name = image_name_new

# if os.path.exists(source_path):
#             else:
#                 print(f"Missing file: {source_path}")

In [46]:
# def copy_images_to_target_subdirectory(df):
#     if csv_paths is None:
#         return

#     image_file_names = [
#         name
#         for name in os.listdir(path=images_root_paths)
#         if os.path.isfile(os.path.join(images_root_paths, name))
#     ]

#     for value in df[target_column].unique():
#         subdirectory = os.path.join(images_root_paths, target_column, str(value))
#         print(f"Copying to {subdirectory}")
#         os.makedirs(subdirectory, exist_ok=True)
#         for image_path in df[df[target_column] == value][file_column]:
#             image_name = os.path.basename(image_path)
#             source_path = os.path.join(images_root_paths, image_name)
#             if not os.path.exists(os.path.join(subdirectory, image_name)):
#                 shutil.copy(source_path, f"{subdirectory}/")

# copy_images_to_target_subdirectory(df_primary)

# if df_secondary is not None:
#     copy_images_to_target_subdirectory(df_secondary)

In [52]:
def plot_example_images(title, image_paths):
    number_of_columns = len(image_paths)
    number_of_rows = len(image_paths[0])
    fig, ax = plt.subplots(
        number_of_columns,
        number_of_rows,
        figsize=(3 * number_of_rows, 3 * number_of_columns),
        squeeze=False,
    )

    fig.suptitle(title)
    for row in range(len(image_paths)):
        for column in range(len(image_paths[0])):
            image_path = image_paths[row][column]
            pil_im = Image.open(image_path, "r")
            title = os.path.basename(image_path)
            ax[row][column].imshow(pil_im)
            ax[row][column].set_title(title)
            ax[row][column].axis("off")

    fig.tight_layout()
    plt.show()

In [51]:
image_paths = [
    [
        "../datasets/HAM10000/images/ISIC_0033084.jpg",
        "../datasets/HAM10000/images/ISIC_0033550.jpg",
        "../datasets/HAM10000/images/ISIC_0033536.jpg",
    ],
]
# plot_example_images("HAM10000 examples", image_paths)

In [53]:
# NOTE: grouping is tricky. Examples:
# PAT_1064_272_668, PAT_1064_273_980
# PAT_759_1538_566, PAT_759_1433_914
# PAT_1216_759_365, PAT_1216_759_542

# Different lesions should not get mixed up
# Sometimes different lesions have the same image

# <patient_id>_<lesion_id>_<image_number>.png
image_paths = [
    [
        "../datasets/PAD-UFES-20/images/PAT_1064_273_980.png",
        "../datasets/PAD-UFES-20/images/PAT_1064_272_668.png",
    ],
    [
        "../datasets/PAD-UFES-20/images/PAT_1288_1003_553.png",
        "../datasets/PAD-UFES-20/images/PAT_1288_1003_969.png",
    ],
    [
        "../datasets/PAD-UFES-20/images/duplicates/images/PAT_38_1003_68.png",
        "../datasets/PAD-UFES-20/images/duplicates/images/PAT_38_1003_226.png",
    ],
    [
        "../datasets/PAD-UFES-20/images/duplicates/images/PAT_759_1538_566.png",
        "../datasets/PAD-UFES-20/images/duplicates/images/PAT_759_1433_914.png",
    ],
    [
        "../datasets/PAD-UFES-20/images/PAT_1216_759_365.png",
        "../datasets/PAD-UFES-20/images/PAT_1216_759_542.png",
    ],
]
# plot_example_images("PAD-UFES-20 examples", image_paths)

In [50]:
# NOTE: Only used to download Fitzpatrick17k images from original source, but many links are dead. It is better to download the dataset from the Google Drive.
#
# if "url" in df_primary.columns:
#     for image_url in df_primary["url"]:
#         file_name = None
#         if str(image_url) == "nan":
#             # print(f"image_url: {image_url}")
#             continue
#         elif str(image_url).startswith("https://www.dermaamin.com"):
#             continue
#         elif str(image_url).startswith("http://atlasdermatologico.com.br/img"):
#             continue

#         file_name = os.path.basename(image_url)
#         file_name = file_name.replace("?", "")
#         file_path = os.path.join(images_root_paths, file_name)

#         if not os.path.exists(file_path):
#             response = requests.get(
#                 image_url, stream=True, headers={"User-Agent": "XY"}
#             )
#             if not response.ok:
#                 print(f"image_url: {image_url}")
#                 print(response)
#                 continue

#             with open(file_path, "wb") as handle:
#                 for block in response.iter_content(1024):
#                     if not block:
#                         break
#                     handle.write(block)
# else:
#     print("Skip download")