### Import the needed packages

In [23]:
# packages for handling files and directories
import os
import sys

sys.path.insert(0, "./../")
import utils_preprocess as utils

# packages for handling data
import pandas as pd
# packages for handling the splitting of data
from sklearn.model_selection import StratifiedShuffleSplit

### Global variables

In [24]:
# root folder of the dataset; each of its sub-directories is listed further down
path_dataset = './../../../dataset_used/'

# subset fractions, keyed by splitting strategy
split_ratio = dict(
    # 'direct': cut the data into 3 subsets in a single step
    direct=dict(train=0.7, val=0.1, test=0.2),
    # 'indirect': cut off the test subset first, then divide the remainder
    # into train / val (so train and val are fractions of that remainder)
    indirect=dict(train=0.875, val=0.125, test=0.2),
)

# filled later with {sub-directory name: [file names]}
images = {}

### Get the list of fundus and mask images

In [25]:
# Build {sub-directory name: [file names]} for every folder under the dataset root.
# os.listdir returns entries in arbitrary order, so both levels are sorted:
# the per-folder lists are combined row-by-row into one DataFrame afterwards,
# and a deterministic order keeps that pairing (and the seeded split that is
# built on top of it) reproducible across runs and machines.
for directory in sorted(os.listdir(path_dataset)):
    directory_path = os.path.join(path_dataset, directory)
    if not os.path.isdir(directory_path):
        # skip stray files sitting next to the image folders; listing them
        # would raise NotADirectoryError
        continue
    images[directory] = sorted(os.listdir(directory_path))

In [26]:
# turn the {folder: file names} mapping into a table (one column per folder)
listing = pd.DataFrame(images)

# name of each fundus image up to its first '.', and of each mask up to '_mask.'
fundus_stem = listing['fundus_image'].map(lambda name: name.partition(".")[0])
mask_stem = listing['mask_image'].map(lambda name: name.partition("_mask.")[0])

# the second '_'-separated field of the fundus file name encodes the class
class_code = listing['fundus_image'].map(lambda name: name.split("_")[1])

images = listing.assign(
    # True when the fundus image and the mask on the same row share a stem
    sts_validate=fundus_stem == mask_stem,
    classes=class_code.map({'0': 'non_glaucoma', '1': 'glaucoma'}),
)

images.head(2)

Unnamed: 0,aug_image,fundus_image,mask_image,sts_validate,classes
0,fff_0_111784_l_y_aug.jpg,fff_0_111784_l_y.jpg,fff_0_111784_l_y_mask.png,True,non_glaucoma
1,fff_0_111784_r_y_aug.jpg,fff_0_111784_r_y.jpg,fff_0_111784_r_y_mask.png,True,non_glaucoma


### Split the data using stratified method

In [27]:
# Two single-shot stratified splitters sharing one seed:
#   sss_temp_test -> holds out the 'test' fraction
#   sss_train_val -> holds out the 'val' fraction
indirect_ratio = split_ratio['indirect']
split_seed = 191502

sss_temp_test = StratifiedShuffleSplit(
    n_splits=1,
    test_size=indirect_ratio['test'],
    random_state=split_seed,
)
sss_train_val = StratifiedShuffleSplit(
    n_splits=1,
    test_size=indirect_ratio['val'],
    random_state=split_seed,
)

In [28]:
# Two-stage stratified split. Stage 1 separates test from "temp" (train + val)
# on the full frame; stage 2 separates train from val inside the temp subset.
#
# StratifiedShuffleSplit.split yields positions relative to the array it was
# given. Stage 2 only sees the temp subset, so its raw output indexes into that
# subset, not into `images`. The positions are therefore mapped back through
# `temp_*_index`, so that every *_index variable produced here refers to rows
# of `images` and train / val can never overlap with test.

# split the original images
temp_ori_index, test_ori_index = next(sss_temp_test.split(images.fundus_image, images.classes))
temp_ori = images.iloc[temp_ori_index]
train_ori_pos, val_ori_pos = next(sss_train_val.split(temp_ori.fundus_image, temp_ori.classes))
train_ori_index = temp_ori_index[train_ori_pos]  # positions in `images`
val_ori_index = temp_ori_index[val_ori_pos]      # positions in `images`

# split the augmented images
temp_aug_index, test_aug_index = next(sss_temp_test.split(images.aug_image, images.classes))
temp_aug = images.iloc[temp_aug_index]
train_aug_pos, val_aug_pos = next(sss_train_val.split(temp_aug.aug_image, temp_aug.classes))
train_aug_index = temp_aug_index[train_aug_pos]  # positions in `images`
val_aug_index = temp_aug_index[val_aug_pos]      # positions in `images`

In [29]:
def _print_subset_sizes(title, subsets):
    """Print `title`, then one line per (label, indices) pair in `subsets`.

    Each line shows the number of indices and that number as a rounded
    percentage of len(images). A blank line is printed after the report.
    """
    total = len(images)
    report = [title]
    for label, indices in subsets:
        share = round(len(indices) / total * 100)
        report.append(f'{label} data size\t: {len(indices)} -> {share}%')
    print('\n'.join(report), end='\n\n')


# check the distribution of the data in each subset using percentage
_print_subset_sizes("Original Images", [
    ('temp', temp_ori_index),
    ('test', test_ori_index),
    ('train', train_ori_index),
    ('val', val_ori_index),
])
_print_subset_sizes("Augmented Images", [
    ('temp', temp_aug_index),
    ('test', test_aug_index),
    ('train', train_aug_index),
    ('val', val_aug_index),
])

Original Images
temp data size	: 282 -> 80%
test data size	: 71 -> 20%
train data size	: 246 -> 70%
val data size	: 36 -> 10%

Augmented Images
temp data size	: 282 -> 80%
test data size	: 71 -> 20%
train data size	: 246 -> 70%
val data size	: 36 -> 10%



In [30]:
# Materialise one dataframe per subset by selecting rows of `images`
# by position with the index arrays computed above.

# original images
train_ori_set, val_ori_set, test_ori_set = (
    images.iloc[positions]
    for positions in (train_ori_index, val_ori_index, test_ori_index)
)

# augmented images
train_aug_set, val_aug_set, test_aug_set = (
    images.iloc[positions]
    for positions in (train_aug_index, val_aug_index, test_aug_index)
)

### Create directories to store the split dataset

In [31]:
# Output roots for the split dataset (original and augmented images).
new_ori_dir = './../../../dataset_used_split_ori/'
new_aug_dir = './../../../dataset_used_split_aug/'

# Create <root>/<subset> under both roots. Each directory is handled on its own:
# with a single try block around both calls, an already existing original-image
# folder would raise before the augmented-image folder was ever created.
for subset in ['train', 'val', 'test']:
    already_exists = False
    for root_dir in (new_ori_dir, new_aug_dir):
        try:
            os.makedirs(os.path.join(root_dir, subset))
        except FileExistsError:
            already_exists = True
    if already_exists:
        print(f'{subset} directory already exists')

### Copy the split images

In [32]:
# One entry per output dataset:
#   (label used in the progress message, destination root,
#    image columns to copy, {subset name: dataframe})
copy_plan = [
    ('original', new_ori_dir, ['fundus_image', 'mask_image'],
     {'train': train_ori_set, 'val': val_ori_set, 'test': test_ori_set}),
    ('augmented', new_aug_dir, ['aug_image', 'mask_image'],
     {'train': train_aug_set, 'val': val_aug_set, 'test': test_aug_set}),
]

for label, destination, img_types, subsets in copy_plan:
    for img_type in img_types:
        print(f'copying {img_type} {label} images')
        # files are read from the dataset folder named after the column
        source_dir = os.path.join(path_dataset, img_type)
        for subset, df in subsets.items():
            # print whatever utils.copy_images returns for this subset
            print(utils.copy_images(list(df[img_type]), subset,
                                    source_dir,
                                    destination))

copying fundus_image original images


train done
val done
test done
copying mask_image original images
train done
val done
test done
copying aug_image augmented images
train done
val done
test done
copying mask_image augmented images
train done
val done
test done
