# Splits
80/20 train/test

90/10 train/validation

whole (no split: 100% of each dataset)

In [1]:
import os
CWD = os.getcwd()
import numpy as np
import pandas as pd 
import cv2
import h5py
import random

In [None]:
# Root folder of every dataset, one variable per dataset abbreviation.
# All of them live under "<current working directory>/datasets".

# Path for abbrev: ISIC
ISIC = f"{CWD}/datasets/ISIC"

# Path for abbrev: DiDI
DiDI = f"{CWD}/datasets/DiDI"

# Path for abbrev: ArGI
ArGI = f"{CWD}/datasets/ArGI"

# Path for abbrev: BrAD
BrAD = f"{CWD}/datasets/BrAD"

In [None]:
def split(images, labels, train_split, test_split, val_split):
    """Randomly partition images and labels into train, test and validation sets.

    The test and validation indices are drawn together in one
    ``random.sample`` call, so the two sets can never share a sample. Every
    sample that is not held out goes to train, which guarantees that each
    input sample ends up in exactly one subset. Inside each subset the samples
    keep their original relative order.

    Args:
        images: Array-like of shape ``(n, ...)`` (e.g. ``(n, 64, 64, 3)``).
        labels: Array-like holding one label per image, shape ``(n,)`` or
            ``(n, 1)``.
        train_split: Fraction of samples intended for training.
        test_split: Fraction of samples for the test set; its size is
            ``round(test_split * n)``.
        val_split: Fraction of samples for the validation set; its size is
            ``round(val_split * n)``.

    Returns:
        The tuple ``(train, train_labels, test, test_labels, val,
        val_labels)``. Image arrays are ``uint8`` and keep the per-image shape
        of ``images``; label arrays are ``float64`` columns of shape
        ``(k, 1)``.

    Raises:
        ValueError: If a fraction is negative, the fractions do not sum to 1,
            or ``images`` and ``labels`` differ in length.
    """
    fractions = (train_split, test_split, val_split)
    if min(fractions) < 0 or abs(sum(fractions) - 1.0) > 1e-6:
        raise ValueError(
            f"split fractions must be non-negative and sum to 1, got {fractions}"
        )

    images = np.asarray(images)
    labels = np.asarray(labels)
    n = len(images)
    if len(labels) != n:
        raise ValueError(
            f"images and labels differ in length: {n} != {len(labels)}"
        )

    # Clamp so that two fractions which both round up can never ask for more
    # samples than exist.
    n_test = min(round(test_split * n), n)
    n_val = min(round(val_split * n), n - n_test)

    # One draw without replacement -> test and validation are disjoint.
    held_out = random.sample(range(n), n_test + n_val)
    test_idx = np.array(sorted(held_out[:n_test]), dtype=np.intp)
    val_idx = np.array(sorted(held_out[n_test:]), dtype=np.intp)

    # Train is the remainder, so rounding can neither drop a sample nor
    # overflow the training array.
    in_train = np.ones(n, dtype=bool)
    in_train[test_idx] = False
    in_train[val_idx] = False
    train_idx = np.flatnonzero(in_train)

    def take(idx):
        # Output dtypes match what this function has always produced:
        # uint8 images and float64 (k, 1) label columns.
        subset = images[idx].astype(np.uint8)
        subset_labels = labels[idx].astype(np.float64).reshape(len(idx), 1)
        return subset, subset_labels

    train, train_labels = take(train_idx)
    test, test_labels = take(test_idx)
    val, val_labels = take(val_idx)

    return train, train_labels, test, test_labels, val, val_labels

In [2]:
def load(dataset, abbrev):
    """Load every image of a dataset, resized to 64x64, together with its label.

    Reads ``{dataset}/metadata_{abbrev}.xlsx`` (using its ``index`` column as
    the row index) and, for each entry of the ``id`` column, loads
    ``{dataset}/all_images/{id}`` with ``cv2.imread`` and resizes it to
    64x64. Each file name is printed as it is processed.

    Args:
        dataset: Path of the dataset's root folder.
        abbrev: Dataset abbreviation used in the metadata file name.

    Returns:
        ``(images, labels)`` where ``images`` is a ``uint8`` array of shape
        ``(num_imgs, 64, 64, 3)`` and ``labels`` is a ``uint8`` array of shape
        ``(num_imgs, 1)`` taken from the ``malignance`` column. Row ``k`` of
        both arrays refers to the ``k``-th metadata row.

    Raises:
        ValueError: If an image file cannot be read.
    """
    metadata = pd.read_excel(f'{dataset}/metadata_{abbrev}.xlsx', index_col = 'index')

    # Read the ids positionally: the labels below are positional too, so this
    # keeps image k and label k aligned whatever values the 'index' column has.
    image_ids = metadata['id'].to_numpy()
    num_imgs = len(image_ids)

    labels = np.array(metadata['malignance'], dtype=np.uint8).reshape(num_imgs, 1)
    images = np.empty(shape=[num_imgs, 64, 64, 3], dtype=np.uint8)

    for row, image_id in enumerate(image_ids):
        print(image_id)
        path = f'{dataset}/all_images/{image_id}'
        image = cv2.imread(path)
        if image is None:
            # cv2.imread signals a missing or undecodable file by returning
            # None instead of raising; fail here with the offending path.
            raise ValueError(f'could not read image: {path}')
        images[row] = cv2.resize(image, (64, 64))

    return images, labels

In [3]:
def export(images, labels, filename, subfolder):
    """Write ``images`` and ``labels`` to ``{CWD}/datasets/{subfolder}/{filename}.h5``.

    The HDF5 file is opened in ``'w'`` mode and receives two datasets named
    ``images`` and ``labels``.
    """
    target = f'{CWD}/datasets/{subfolder}/{filename}.h5'
    with h5py.File(target, 'w') as h5_file:
        for key, payload in (('images', images), ('labels', labels)):
            h5_file.create_dataset(key, data=payload)

In [None]:
# Read each base dataset into memory as an (images, labels) pair.
images_ISIC, labels_ISIC = load(dataset=ISIC, abbrev="ISIC")
images_DiDI, labels_DiDI = load(dataset=DiDI, abbrev="DiDI")
images_ArGI, labels_ArGI = load(dataset=ArGI, abbrev="ArGI")
images_BrAD, labels_BrAD = load(dataset=BrAD, abbrev="BrAD")

In [None]:
# Save each dataset unsplit as datasets/<abbrev>/<abbrev>.h5.
for _name, _images, _labels in (
    ('ISIC', images_ISIC, labels_ISIC),
    ('DiDI', images_DiDI, labels_DiDI),
    ('ArGI', images_ArGI, labels_ArGI),
    ('BrAD', images_BrAD, labels_BrAD),
):
    export(_images, _labels, _name, _name)


In [5]:
# REZK: same treatment as the other datasets - load it, then save it unsplit
# as datasets/REZK/REZK.h5.
REZK = f"{CWD}/datasets/REZK"
images_REZK, labels_REZK = load(dataset=REZK, abbrev="REZK")
export(images=images_REZK, labels=labels_REZK, filename='REZK', subfolder='REZK')

REZK_000001.jpg
REZK_010603.jpg
REZK_010604.jpg
REZK_010605.jpg
REZK_002348.jpg
REZK_002349.jpg
REZK_002350.jpg
REZK_002351.jpg
REZK_002352.jpg
REZK_002353.jpg
REZK_002354.jpg
REZK_002357.jpg
REZK_002358.jpg
REZK_002359.jpg
REZK_002360.jpg
REZK_002361.jpg
REZK_002362.jpg
REZK_002363.jpg
REZK_002364.jpg
REZK_002365.jpg
REZK_003054.jpg
REZK_003055.jpg
REZK_003056.jpg
REZK_003057.jpg
REZK_003058.jpg
REZK_003059.jpg
REZK_003060.jpg
REZK_003061.jpg
REZK_003062.jpg
REZK_003063.jpg
REZK_003064.jpg
REZK_003065.jpg
REZK_003066.jpg
REZK_003067.jpg
REZK_003068.jpg
REZK_003069.jpg
REZK_003070.jpg
REZK_003071.jpg
REZK_003072.jpg
REZK_003073.jpg
REZK_003074.jpg
REZK_003075.jpg
REZK_003076.jpg
REZK_003077.jpg
REZK_003078.jpg
REZK_003079.jpg
REZK_003080.jpg
REZK_003081.jpg
REZK_003082.jpg
REZK_003083.jpg
REZK_003084.jpg
REZK_003085.jpg
REZK_003086.jpg
REZK_004657.jpg
REZK_004658.jpg
REZK_004659.jpg
REZK_004684.jpg
REZK_004685.jpg
REZK_004686.jpg
REZK_004687.jpg
REZK_004688.jpg
REZK_004689.jpg
REZK_004

## whole (1.00 - 0.00 - 0.00 split)

In [None]:
# ISIC on its own: report the sample count and label shape when they line up.
if len(labels_ISIC) == len(images_ISIC):
    print(len(images_ISIC))
    print(labels_ISIC.shape)

# ISIC + DiDI: stack the images along the sample axis and rebuild the labels
# as a single (n, 1) column.
images_ISIC_DiDI = np.concatenate((images_ISIC, images_DiDI), axis=0)
labels_ISIC_DiDI = np.concatenate(
    (np.ravel(labels_ISIC), np.ravel(labels_DiDI))
).reshape(-1, 1)
if len(labels_ISIC_DiDI) == len(images_ISIC_DiDI):
    print(len(images_ISIC_DiDI))
    print(labels_ISIC_DiDI.shape)

# ISIC + ArGI: same construction.
images_ISIC_ArGI = np.concatenate((images_ISIC, images_ArGI), axis=0)
labels_ISIC_ArGI = np.concatenate(
    (np.ravel(labels_ISIC), np.ravel(labels_ArGI))
).reshape(-1, 1)
if len(labels_ISIC_ArGI) == len(images_ISIC_ArGI):
    print(len(images_ISIC_ArGI))
    print(labels_ISIC_ArGI.shape)

In [None]:
# Save the unsplit ISIC set and both merged sets under datasets/whole.
for _name, _images, _labels in (
    ('ISIC', images_ISIC, labels_ISIC),
    ('ISIC_DiDI', images_ISIC_DiDI, labels_ISIC_DiDI),
    ('ISIC_ArGI', images_ISIC_ArGI, labels_ISIC_ArGI),
):
    export(_images, _labels, _name, 'whole')

## train/test (0.80 - 0.20 - 0.00 split)

In [None]:
# 80/20 train/test split per dataset. val_split is 0, so the two validation
# outputs are discarded.
train_ISIC, train_labels_ISIC, test_ISIC, test_labels_ISIC, _, _ = split(
    images_ISIC, labels_ISIC, train_split=0.8, test_split=0.2, val_split=0
)
for _part in (train_ISIC, train_labels_ISIC, test_ISIC, test_labels_ISIC):
    print(len(_part))

train_DiDI, train_labels_DiDI, test_DiDI, test_labels_DiDI, _, _ = split(
    images_DiDI, labels_DiDI, train_split=0.8, test_split=0.2, val_split=0
)
for _part in (train_DiDI, train_labels_DiDI, test_DiDI, test_labels_DiDI):
    print(len(_part))

# For ArGI only the training portion is kept; its test portion is discarded.
train_ArGI, train_labels_ArGI, _, _, _, _ = split(
    images_ArGI, labels_ArGI, train_split=0.8, test_split=0.2, val_split=0
)
for _part in (train_ArGI, train_labels_ArGI):
    print(len(_part))

In [None]:
# ISIC + DiDI training set: stack the images along the sample axis, then
# rebuild the labels as one column with a row per stacked image.
train_ISIC_DiDI = np.concatenate((train_ISIC, train_DiDI), axis=0)
train_labels_ISIC_DiDI = np.concatenate(
    (np.ravel(train_labels_ISIC), np.ravel(train_labels_DiDI))
).reshape(len(train_ISIC_DiDI), 1)
if len(train_labels_ISIC_DiDI) == len(train_ISIC_DiDI):
    print(len(train_ISIC_DiDI))
    print(train_labels_ISIC_DiDI.shape)

# ISIC + DiDI test set.
test_ISIC_DiDI = np.concatenate((test_ISIC, test_DiDI), axis=0)
test_labels_ISIC_DiDI = np.concatenate(
    (np.ravel(test_labels_ISIC), np.ravel(test_labels_DiDI))
).reshape(len(test_ISIC_DiDI), 1)
if len(test_labels_ISIC_DiDI) == len(test_ISIC_DiDI):
    print(len(test_ISIC_DiDI))
    print(test_labels_ISIC_DiDI.shape)

# ISIC + ArGI training set.
train_ISIC_ArGI = np.concatenate((train_ISIC, train_ArGI), axis=0)
train_labels_ISIC_ArGI = np.concatenate(
    (np.ravel(train_labels_ISIC), np.ravel(train_labels_ArGI))
).reshape(len(train_ISIC_ArGI), 1)
if len(train_labels_ISIC_ArGI) == len(train_ISIC_ArGI):
    print(len(train_ISIC_ArGI))
    print(train_labels_ISIC_ArGI.shape)

In [None]:
# Save the 80/20 results under datasets/split-80train-20test.
for _name, _images, _labels in (
    ('train_ISIC', train_ISIC, train_labels_ISIC),
    ('train_ISIC_DiDI', train_ISIC_DiDI, train_labels_ISIC_DiDI),
    ('train_ISIC_ArGI', train_ISIC_ArGI, train_labels_ISIC_ArGI),
    ('test_ISIC_DiDI', test_ISIC_DiDI, test_labels_ISIC_DiDI),
):
    export(_images, _labels, _name, 'split-80train-20test')

## train/val (0.90 - 0.00 - 0.10 split)

In [None]:
# 90/10 train/validation split per dataset. test_split is 0, so the two test
# outputs are discarded. This rebinds the train_* names of the 80/20 section.
train_ISIC, train_labels_ISIC, _, _, val_ISIC, val_labels_ISIC = split(
    images_ISIC, labels_ISIC, train_split=0.9, test_split=0, val_split=0.1
)
for _part in (train_ISIC, train_labels_ISIC, val_ISIC, val_labels_ISIC):
    print(len(_part))

train_DiDI, train_labels_DiDI, _, _, val_DiDI, val_labels_DiDI = split(
    images_DiDI, labels_DiDI, train_split=0.9, test_split=0, val_split=0.1
)
for _part in (train_DiDI, train_labels_DiDI, val_DiDI, val_labels_DiDI):
    print(len(_part))

train_ArGI, train_labels_ArGI, _, _, val_ArGI, val_labels_ArGI = split(
    images_ArGI, labels_ArGI, train_split=0.9, test_split=0, val_split=0.1
)
for _part in (train_ArGI, train_labels_ArGI, val_ArGI, val_labels_ArGI):
    print(len(_part))

In [None]:
# ISIC + DiDI training set: stack the images along the sample axis, then
# rebuild the labels as one column with a row per stacked image.
train_ISIC_DiDI = np.concatenate((train_ISIC, train_DiDI), axis=0)
train_labels_ISIC_DiDI = np.concatenate(
    (np.ravel(train_labels_ISIC), np.ravel(train_labels_DiDI))
).reshape(len(train_ISIC_DiDI), 1)
if len(train_labels_ISIC_DiDI) == len(train_ISIC_DiDI):
    print(len(train_ISIC_DiDI))
    print(train_labels_ISIC_DiDI.shape)

# ISIC + ArGI training set.
train_ISIC_ArGI = np.concatenate((train_ISIC, train_ArGI), axis=0)
train_labels_ISIC_ArGI = np.concatenate(
    (np.ravel(train_labels_ISIC), np.ravel(train_labels_ArGI))
).reshape(len(train_ISIC_ArGI), 1)
if len(train_labels_ISIC_ArGI) == len(train_ISIC_ArGI):
    print(len(train_ISIC_ArGI))
    print(train_labels_ISIC_ArGI.shape)

# ISIC + DiDI validation set.
val_ISIC_DiDI = np.concatenate((val_ISIC, val_DiDI), axis=0)
val_labels_ISIC_DiDI = np.concatenate(
    (np.ravel(val_labels_ISIC), np.ravel(val_labels_DiDI))
).reshape(len(val_ISIC_DiDI), 1)
if len(val_labels_ISIC_DiDI) == len(val_ISIC_DiDI):
    print(len(val_ISIC_DiDI))
    print(val_labels_ISIC_DiDI.shape)

# ISIC + ArGI validation set.
val_ISIC_ArGI = np.concatenate((val_ISIC, val_ArGI), axis=0)
val_labels_ISIC_ArGI = np.concatenate(
    (np.ravel(val_labels_ISIC), np.ravel(val_labels_ArGI))
).reshape(len(val_ISIC_ArGI), 1)
if len(val_labels_ISIC_ArGI) == len(val_ISIC_ArGI):
    print(len(val_ISIC_ArGI))
    print(val_labels_ISIC_ArGI.shape)

In [None]:
# Save the 90/10 training sets under datasets/split-90train-10val.
for _name, _images, _labels in (
    ('train_ISIC', train_ISIC, train_labels_ISIC),
    ('train_ISIC_DiDI', train_ISIC_DiDI, train_labels_ISIC_DiDI),
    ('train_ISIC_ArGI', train_ISIC_ArGI, train_labels_ISIC_ArGI),
):
    export(_images, _labels, _name, 'split-90train-10val')

In [None]:
# Save the 90/10 validation sets under datasets/split-90train-10val.
for _name, _images, _labels in (
    ('val_ISIC', val_ISIC, val_labels_ISIC),
    ('val_ISIC_DiDI', val_ISIC_DiDI, val_labels_ISIC_DiDI),
    ('val_ISIC_ArGI', val_ISIC_ArGI, val_labels_ISIC_ArGI),
):
    export(_images, _labels, _name, 'split-90train-10val')