In [1]:
import os
CWD = os.getcwd()
import numpy as np
import pandas as pd 
import cv2
import h5py
import random
# print(CWD)

In [2]:
def load(dataset, abbrev):
    metadata = pd.read_excel(f'{dataset}/metadata_{abbrev}.xlsx', index_col = 'index')
    num_imgs = len(metadata['id'])

    images = np.empty(shape=[num_imgs, 64, 64, 3], dtype=np.uint8)
    for i in range(num_imgs):
        id = metadata['id'][i]
        # print(id)
        image = cv2.imread(f'{dataset}/all_images/{id}')
        image = cv2.resize(image, (64, 64))
        images[i] = image
    
    if not (len(images) == num_imgs):
        return ValueError
    
    if not (images.shape == (num_imgs, 64, 64, 3)):
        return ValueError
    
    return images, metadata

In [3]:
# Path for abbrev: ISIC
ISIC = CWD + "/datasets/ISIC"
# print(ISIC)
images_ISIC, metadata_ISIC = load(ISIC, "ISIC")

In [4]:
# Path for abbrev: AGI
ArGI = CWD + "/datasets/ArGI"
# print(ArGI)
images_ArGI, metadata_ArGI = load(ArGI, "ArGI")

In [5]:
def split(images, labels, ids, train_split, test_split, val_split):
    n = len(images)
    n_train = round(train_split*n)
    n_test = round(test_split*n)
    n_val = round(val_split*n)
    
    train_idx, test_idx, val_idx = 0,0,0

    train = np.empty(shape=[n_train, 64, 64, 3], dtype=np.uint8)
    test = np.empty(shape=[n_test, 64, 64, 3], dtype=np.uint8)
    val = np.empty(shape=[n_val, 64, 64, 3], dtype=np.uint8)
    
    train_ids = []
    test_ids = []
    val_ids = []

    train_labels = np.empty(shape=[n_train, 1])
    test_labels = np.empty(shape=[n_test, 1])
    val_labels = np.empty(shape=[n_val, 1])
    
    test_indices = random.sample(range(n), n_test)
    val_indices = random.sample(range(n), n_val)

    for i in range(n):
        if i in test_indices:
            test[test_idx] = images[i]
            test_labels[test_idx] = labels[i]
            test_ids.append(ids[test_idx])
            test_idx += 1
        elif i in val_indices:
            val[val_idx] = images[i]
            val_labels[val_idx] = labels[i]
            val_ids.append(ids[val_idx])
            val_idx += 1
        else:
            train[train_idx] = images[i]
            train_labels[train_idx] = labels[i]
            train_ids.append(ids[train_idx])
            train_idx += 1
    
    if not (len(train) == len(train_labels) == n_train == len(train_ids)):
        return ValueError
    if not (len(test) == len(test_labels) == n_test == len(test_ids)):
        return ValueError
    if not (len(val) == len(val_labels) == n_val == len(val_ids)):
        return ValueError

    return train, train_labels, train_ids, test, test_labels, test_ids, val, val_labels, val_ids

In [6]:
train_ISIC, train_labels_ISIC, train_ids_ISIC, _, _, _, val_ISIC, val_labels_ISIC, val_ids_ISIC = split(images_ISIC, 
                                                                                                           metadata_ISIC['malignance'], 
                                                                                                           metadata_ISIC['id'], 
                                                                                                           0.8, 
                                                                                                           0.0, 
                                                                                                           0.2
                                                                                                           )

In [7]:
train_ArGI, train_labels_ArGI, train_ids_ArGI, _, _, _, _, _, _ = split(images_ArGI, 
                                                                        metadata_ArGI['malignance'],
                                                                        metadata_ArGI['id'], 
                                                                        1.0, 
                                                                        0.0, 
                                                                        0.0
                                                                        )

In [8]:
# remove ArGI images that used content from validation set
badids = []
num_imgs_ArGI = len(metadata_ArGI['id'])
for i in range(num_imgs_ArGI):
    if metadata_ArGI['content'][i] in val_ids_ISIC:
        badids.append(i)
print(badids)
print(len(badids))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126]
127


In [9]:
metadata_ArGI.drop(labels=badids, inplace=True)
print(metadata_ArGI)

                    id      label  malignance  skin_tone          content  \
index                                                                       
127    ArGI_000128.jpg     benign           0         34  ISIC_000460.jpg   
128    ArGI_000129.jpg  malignant           1         34  ISIC_000469.jpg   
129    ArGI_000130.jpg     benign           0         56  ISIC_000477.jpg   
130    ArGI_000131.jpg     benign           0         56  ISIC_000480.jpg   
131    ArGI_000132.jpg     benign           0         56  ISIC_000485.jpg   
...                ...        ...         ...        ...              ...   
651    ArGI_000652.jpg  malignant           1         56  ISIC_015185.jpg   
652    ArGI_000653.jpg  malignant           1         34  ISIC_015204.jpg   
653    ArGI_000654.jpg     benign           0         56  ISIC_015233.jpg   
654    ArGI_000655.jpg  malignant           1         56  ISIC_015256.jpg   
655    ArGI_000656.jpg  malignant           1         34  ISIC_015284.jpg   

In [10]:
def local_export(images, labels, ids, subfolder, badids=[]):
    n_imgs = len(labels)
    for i in range(n_imgs):
        if i in badids:
            continue
        print(ids[i])
        if labels[i] == 0:
            cv2.imwrite(f'{subfolder}/benign/{ids[i]}', images[i])
        elif labels[i] == 1:
            cv2.imwrite(f'{subfolder}/malignant/{ids[i]}', images[i])

In [11]:
local_export(train_ISIC, train_labels_ISIC, train_ids_ISIC, f'{CWD}/datasets/ISIC-80')
local_export(val_ISIC, val_labels_ISIC, val_ids_ISIC, f'{CWD}/datasets/ISIC-20')

ISIC_000000.jpg
ISIC_000001.jpg
ISIC_000002.jpg
ISIC_000003.jpg
ISIC_000004.jpg
ISIC_000006.jpg
ISIC_000007.jpg
ISIC_000008.jpg
ISIC_000009.jpg
ISIC_000010.jpg
ISIC_000011.jpg
ISIC_000012.jpg
ISIC_000013.jpg
ISIC_000014.jpg
ISIC_000015.jpg
ISIC_000016.jpg
ISIC_000017.jpg
ISIC_000018.jpg
ISIC_000019.jpg
ISIC_000020.jpg
ISIC_000021.jpg
ISIC_000022.jpg
ISIC_000023.jpg
ISIC_000024.jpg
ISIC_000025.jpg
ISIC_000026.jpg
ISIC_000027.jpg
ISIC_000028.jpg
ISIC_000029.jpg
ISIC_000030.jpg
ISIC_000031.jpg
ISIC_000032.jpg
ISIC_000034.jpg
ISIC_000035.jpg
ISIC_000036.jpg
ISIC_000037.jpg
ISIC_000038.jpg
ISIC_000039.jpg
ISIC_000040.jpg
ISIC_000041.jpg
ISIC_000042.jpg
ISIC_000043.jpg
ISIC_000044.jpg
ISIC_000045.jpg
ISIC_000046.jpg
ISIC_000047.jpg
ISIC_000048.jpg
ISIC_000049.jpg
ISIC_000050.jpg
ISIC_000051.jpg
ISIC_000052.jpg
ISIC_000053.jpg
ISIC_000054.jpg
ISIC_000055.jpg
ISIC_000056.jpg
ISIC_000057.jpg
ISIC_000058.jpg
ISIC_000059.jpg
ISIC_000060.jpg
ISIC_000061.jpg
ISIC_000062.jpg
ISIC_000063.jpg
ISIC_000

In [12]:
local_export(train_ArGI, train_labels_ArGI, train_ids_ArGI, f'{CWD}/datasets/ArGI-refined', badids=badids)

ArGI_000128.jpg
ArGI_000129.jpg
ArGI_000130.jpg
ArGI_000131.jpg
ArGI_000132.jpg
ArGI_000133.jpg
ArGI_000134.jpg
ArGI_000135.jpg
ArGI_000136.jpg
ArGI_000137.jpg
ArGI_000138.jpg
ArGI_000139.jpg
ArGI_000140.jpg
ArGI_000141.jpg
ArGI_000142.jpg
ArGI_000143.jpg
ArGI_000144.jpg
ArGI_000145.jpg
ArGI_000146.jpg
ArGI_000147.jpg
ArGI_000148.jpg
ArGI_000149.jpg
ArGI_000150.jpg
ArGI_000151.jpg
ArGI_000152.jpg
ArGI_000153.jpg
ArGI_000154.jpg
ArGI_000155.jpg
ArGI_000156.jpg
ArGI_000157.jpg
ArGI_000158.jpg
ArGI_000159.jpg
ArGI_000160.jpg
ArGI_000161.jpg
ArGI_000162.jpg
ArGI_000163.jpg
ArGI_000164.jpg
ArGI_000165.jpg
ArGI_000166.jpg
ArGI_000167.jpg
ArGI_000168.jpg
ArGI_000169.jpg
ArGI_000170.jpg
ArGI_000171.jpg
ArGI_000172.jpg
ArGI_000173.jpg
ArGI_000174.jpg
ArGI_000175.jpg
ArGI_000176.jpg
ArGI_000177.jpg
ArGI_000178.jpg
ArGI_000179.jpg
ArGI_000180.jpg
ArGI_000181.jpg
ArGI_000182.jpg
ArGI_000183.jpg
ArGI_000184.jpg
ArGI_000185.jpg
ArGI_000186.jpg
ArGI_000187.jpg
ArGI_000188.jpg
ArGI_000189.jpg
ArGI_000

In [13]:
# should end up with ~1500 in train ~400 in val and ~500 in ArGI.