## Split original datasets into train and test

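Each image is a 3-channel colour image resized to 64 by 64 pixels. Note: the images are read with `cv2.imread`, which returns channels in BGR order, and no conversion to RGB is applied before export.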
Training datasets:
1. ISIC 
2. ISIC + DiDI
3. ISIC + ArGI

Testing Dataset:
ISIC + DiDI
Exclude ArGI because these images are not guaranteed to be real

80% of ISIC will be used in training
80% of DiDI will be used in training
80% of ArGI will be used in training
20% of ISIC will go to testing
20% of DiDI will go to testing

Data Sources:
ISIC: N = 1972
DiDI: N = 656
ArGI: N = 656

80/20 train/test split

In [None]:
import os
CWD = os.getcwd()
import numpy as np
import pandas as pd 
import cv2
import tensorflow as tf
from tensorflow import image
from tensorflow.image import ResizeMethod
import h5py

In [None]:
# Working directory from which every path below is derived.
print(CWD)

# All locations are plain string paths rooted at CWD.
NEW = f"{CWD}/newdatasets"      # output path
ISIC = f"{CWD}/datasets/ISIC"   # path for abbrev: ISIC
DIDI = f"{CWD}/datasets/DiDI"   # path for abbrev: DiDI
ARGI = f"{CWD}/datasets/ArGI"   # path for abbrev: ArGI

# Echo the resolved paths, one per line, so a wrong working directory is spotted early.
print(NEW, ISIC, DIDI, ARGI, sep="\n")

Processing
Create functions that load, split, combine, and export the right images for each dataset:

1. Create ISIC_Train and ISIC_Test
2. Export ISIC_Train to .h5
3. Create DiDI_Train and DiDI_Test
4. Combine ISIC_Train and DiDI_Train
5. Export ISIC_DiDI_Train
6. Combine ISIC_Test and DiDI_Test
7. Export ISIC_DiDI_Test
8. Create ArGI_Train
9. Combine ISIC_Train and ArGI_Train
10. Export ISIC_ArGI_Train

Each output .h5 file contains exactly two datasets: `images` and `labels`.

In [None]:
import random
def split(images, labels, seed=None):
    """Randomly partition images and their labels into an 80/20 train/test split.

    20% of the samples (rounded) are drawn at random for the test partition;
    the remainder form the training partition. Within each partition the
    samples keep their original relative order, and every image stays paired
    with its own label.

    Parameters
    ----------
    images : array-like, shape (n, ...)
        One image per entry along the first axis. Any per-image shape is
        accepted (e.g. (64, 64, 3)).
    labels : array-like with n entries, shape (n,) or (n, 1)
        One label per image.
    seed : int, optional
        Seed for a private random generator, giving a reproducible split.
        When None (the default) the module-level ``random`` state is used.

    Returns
    -------
    train, train_labels, test, test_labels : tuple of np.ndarray
        Image arrays are uint8; label arrays are float64 with shape (k, 1).

    Raises
    ------
    ValueError
        If ``images`` and ``labels`` do not have the same number of entries.
    """
    images = np.asarray(images)
    labels = np.asarray(labels)
    n = len(images)
    if len(labels) != n:
        raise ValueError(
            f"images and labels differ in length: {n} != {len(labels)}"
        )

    n_test = round(0.2 * n)
    # Derive the train size from the test size so the two always add up to n.
    n_train = n - n_test

    # A private generator keeps a seeded split from disturbing global random state.
    rng = random if seed is None else random.Random(seed)
    test_indices = np.array(rng.sample(range(n), n_test), dtype=np.intp)

    # Boolean mask selection preserves the original sample order and avoids
    # a per-sample membership scan over the list of test indices.
    test_mask = np.zeros(n, dtype=bool)
    test_mask[test_indices] = True
    train_mask = ~test_mask

    train = images[train_mask].astype(np.uint8, copy=False)
    test = images[test_mask].astype(np.uint8, copy=False)
    train_labels = labels[train_mask].reshape(n_train, 1).astype(np.float64, copy=False)
    test_labels = labels[test_mask].reshape(n_test, 1).astype(np.float64, copy=False)

    return train, train_labels, test, test_labels

In [None]:
def load(dataset, abbrev, size=(64, 64)):
    """Load every image listed in a dataset's metadata sheet, with its label.

    Reads ``{dataset}/metadata_{abbrev}.xlsx`` (using its ``index`` column as
    the frame index), then reads each file named in the ``id`` column from
    ``{dataset}/all_images/`` and resizes it. Labels come from the
    ``malignance`` column.

    Parameters
    ----------
    dataset : str
        Path of the dataset directory.
    abbrev : str
        Dataset abbreviation used in the metadata file name.
    size : tuple of int, optional
        Target ``(width, height)`` passed to ``cv2.resize``. Defaults to (64, 64).

    Returns
    -------
    images : np.ndarray, uint8, shape (n, height, width, 3)
        Channels are in BGR order, as returned by ``cv2.imread``.
    labels : np.ndarray, uint8, shape (n, 1)

    Raises
    ------
    OSError
        If an image listed in the metadata is missing or cannot be decoded.
    """
    metadata = pd.read_excel(f'{dataset}/metadata_{abbrev}.xlsx', index_col = 'index')

    # Walk the rows positionally so image i is always paired with label i,
    # regardless of what values the 'index' column holds.
    image_ids = metadata['id'].tolist()
    num_imgs = len(image_ids)
    labels = np.array(metadata['malignance'], dtype=np.uint8).reshape(num_imgs, 1)

    width, height = size
    images = np.empty(shape=[num_imgs, height, width, 3], dtype=np.uint8)
    for i, image_id in enumerate(image_ids):
        path = f'{dataset}/all_images/{image_id}'
        image = cv2.imread(path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError(f'image is missing or unreadable: {path}')
        images[i] = cv2.resize(image, (width, height))

    return images, labels

In [None]:
# Read all ISIC images with their labels, then divide them into train and test parts.
images_ISIC, labels_ISIC = load(ISIC, "ISIC")
(train_ISIC, train_labels_ISIC,
 test_ISIC, test_labels_ISIC) = split(images_ISIC, labels_ISIC)

# Sizes, one per line: train images, train labels, test images, test labels.
print(
    len(train_ISIC),
    len(train_labels_ISIC),
    len(test_ISIC),
    len(test_labels_ISIC),
    sep="\n",
)

In [None]:
# Read all DiDI images with their labels, then divide them into train and test parts.
images_DIDI, labels_DIDI = load(DIDI, "DiDI")
(train_DIDI, train_labels_DIDI,
 test_DIDI, test_labels_DIDI) = split(images_DIDI, labels_DIDI)

# Sizes, one per line: train images, train labels, test images, test labels.
print(
    len(train_DIDI),
    len(train_labels_DIDI),
    len(test_DIDI),
    len(test_labels_DIDI),
    sep="\n",
)

In [None]:
# Read all ArGI images with their labels. Only the training part of the split
# is kept; the test part is discarded.
images_ArGI, labels_ArGI = load(ARGI, "ArGI")
train_ArGI, train_labels_ArGI, _, _ = split(images_ArGI, labels_ArGI)

# Sizes, one per line: train images, train labels.
print(len(train_ArGI), len(train_labels_ArGI), sep="\n")
In [None]:
def export(images, labels, filename):
    """Write an image array and its label array to one HDF5 file.

    The file is written to ``{CWD}/datasets/split/{filename}.h5`` and holds
    two datasets named ``images`` and ``labels``. An existing file with the
    same name is overwritten (mode ``'w'``).

    Parameters
    ----------
    images : array-like
        Image data stored under the ``images`` dataset.
    labels : array-like
        Label data stored under the ``labels`` dataset.
    filename : str
        Output file name without the ``.h5`` extension.
    """
    out_dir = f'{CWD}/datasets/split'
    # h5py does not create missing parent directories, so make sure it exists.
    os.makedirs(out_dir, exist_ok=True)
    with h5py.File(f'{out_dir}/{filename}.h5','w') as f:
        f.create_dataset('images', data = images)
        f.create_dataset('labels', data = labels)


# """ Execute on Exports
# Processing
# Create function to store the right images

# 1. Create ISIC_Train and ISIC_Test
# 2. Export ISIC_Train to .h5
# 3. Create DiDI_Train and DiDI_Test
# 4. Combine ISIC_Train and DiDI_Train
# 5. Export ISIC_DiDI_Train
# 6. Combine ISIC_Test and DiDI_Test
# 7. Export ISIC_DiDI_Test
# 8. Create ArGI_Train
# 9. Combine ISIC_Train and ArGI_Train
# 10. Export ISIC_ArGI_Train

# Output .h5 files only contain two cols: images and labels
# """

In [None]:
# Merging to create compound datasets

def _merge_pairs(images_a, labels_a, images_b, labels_b):
    """Stack two (images, labels) pairs into one, with labels shaped (n, 1).

    Each pair is validated on its own before merging: a mismatch inside one
    pair could otherwise be hidden when the combined totals happen to agree,
    silently shifting labels onto the wrong images.

    Raises
    ------
    ValueError
        If either pair has a different number of images and labels.
    """
    for imgs, labs in ((images_a, labels_a), (images_b, labels_b)):
        if len(imgs) != len(labs):
            raise ValueError(
                f"image/label count mismatch: {len(imgs)} != {len(labs)}"
            )
    images = np.concatenate([images_a, images_b], axis=0)
    # Flatten both label arrays before joining, then restore the column shape.
    labels = np.concatenate([np.ravel(labels_a), np.ravel(labels_b)])
    labels = labels.reshape(len(images), 1)
    return images, labels


# ISIC + DiDI training set
train_ISIC_DiDI, train_labels_ISIC_DiDI = _merge_pairs(
    train_ISIC, train_labels_ISIC, train_DIDI, train_labels_DIDI
)
print(len(train_ISIC_DiDI))
print(train_labels_ISIC_DiDI.shape)

# ISIC + DiDI test set
test_ISIC_DiDI, test_labels_ISIC_DiDI = _merge_pairs(
    test_ISIC, test_labels_ISIC, test_DIDI, test_labels_DIDI
)
print(len(test_ISIC_DiDI))
print(test_labels_ISIC_DiDI.shape)

# ISIC + ArGI training set
train_ISIC_ArGI, train_labels_ISIC_ArGI = _merge_pairs(
    train_ISIC, train_labels_ISIC, train_ArGI, train_labels_ArGI
)
print(len(train_ISIC_ArGI))
print(train_labels_ISIC_ArGI.shape)

In [None]:
# Persist the three training sets and the shared test set, one .h5 file each.
export(images=train_ISIC, labels=train_labels_ISIC, filename='train_ISIC')
export(images=train_ISIC_DiDI, labels=train_labels_ISIC_DiDI, filename='train_ISIC_DiDI')
export(images=train_ISIC_ArGI, labels=train_labels_ISIC_ArGI, filename='train_ISIC_ArGI')
export(images=test_ISIC_DiDI, labels=test_labels_ISIC_DiDI, filename='test_ISIC_DiDI')