### Import the needed packages

In [1]:
# packages for handling files and directories
import os
import sys

sys.path.insert(0, "./../")
import utils_preprocess as utils

# packages for handling data
import pandas as pd
# packages for handling the splitting of data
from sklearn.model_selection import StratifiedShuffleSplit

### Global variables

In [2]:
# root folder holding one sub-directory per image type
path_dataset = './../../../dataset_used/'

# fraction of samples assigned to each subset, for two splitting strategies
split_ratio = {
    # 'direct': ratios relative to the whole dataset (split into 3 subsets at once)
    'direct': dict(train=0.7, val=0.1, test=0.2),
    # 'indirect': 'test' is relative to the whole dataset; 'train'/'val' are
    # relative to what remains after the test subset has been removed
    'indirect': dict(train=0.875, val=0.125, test=0.2),
}

# filled in later with the file names found under `path_dataset`
images = dict()

### Get the fundus and mask images list

In [3]:
# get a list of used fundus and mask images:
# images[<sub-directory name>] = sorted list of the file names it contains.
# os.listdir returns entries in arbitrary order, so both listings are sorted to
# keep the row order stable from run to run (the seeded split is only
# reproducible when its input order is).
for directory in sorted(os.listdir(path_dataset)):
    directory_path = os.path.join(path_dataset, directory)
    # skip stray files (e.g. OS metadata) sitting next to the image folders;
    # calling os.listdir on them would raise NotADirectoryError
    if not os.path.isdir(directory_path):
        continue
    images[directory] = sorted(os.listdir(directory_path))

In [4]:
# store the data in a dataframe: one column per image folder, and row i pairs
# the i-th file name of each folder
images = pd.DataFrame(images)

# a fundus image and its mask are expected to share the same stem
# (<stem>.<ext> vs <stem>_mask.<ext>); only the trailing extension is stripped
# so a stem that itself contains a dot is not truncated
fundus_stem = images.fundus_image.apply(lambda x: x.rsplit(".", 1)[0])
mask_stem = images.mask_image.apply(lambda x: x.rsplit("_mask.", 1)[0])
images['sts_validate'] = fundus_stem == mask_stem

# the class is encoded as the second underscore-separated token of the fundus file name
images['classes'] = images.fundus_image.apply(lambda x: x.split("_")[1]).map({'0': 'non_glaucoma', '1': 'glaucoma'})

# fail fast instead of silently splitting and copying mismatched or unlabeled rows
assert images['sts_validate'].all(), \
    f"{(~images['sts_validate']).sum()} row(s) pair a fundus image with a mask of a different name"
assert images['classes'].notna().all(), \
    f"{images['classes'].isna().sum()} fundus image(s) carry a class code other than '0' or '1'"

images.head(2)

Unnamed: 0,fundus_image,mask_image,sts_validate,classes
0,fff_0_111784_l_y.jpg,fff_0_111784_l_y_mask.png,True,non_glaucoma
1,fff_0_111784_r_y.jpg,fff_0_111784_r_y_mask.png,True,non_glaucoma


### Split the data using a stratified method

In [5]:
# prepare the splitting tools: two single-shot stratified splitters sharing one seed
#   - sss_temp_test holds out the 'test' fraction
#   - sss_train_val holds out the 'val' fraction
indirect_ratio = split_ratio['indirect']
sss_temp_test, sss_train_val = (
    StratifiedShuffleSplit(n_splits=1, test_size=indirect_ratio[held_out], random_state=191502)
    for held_out in ('test', 'val')
)

In [6]:
# split the data
# 1) hold out the test subset; both arrays are row positions into `images`
temp_index, test_index = next(sss_temp_test.split(images.fundus_image, images.classes))

# 2) split the remaining (temp) rows into train and validation.
#    The splitter only sees the temp subset, so the positions it returns are
#    relative to that subset, not to `images`.
temp_set = images.iloc[temp_index]
train_pos, val_pos = next(sss_train_val.split(temp_set.fundus_image, temp_set.classes))

# 3) translate the subset-relative positions back into positions of `images`,
#    so that images.iloc[train_index] / images.iloc[val_index] select rows of
#    the temp subset only and can never overlap the test rows
train_index, val_index = temp_index[train_pos], temp_index[val_pos]

In [7]:
# check the distribution of the data in each subset:
# sample count and rounded percentage of the whole dataset
n_images = len(images)
report_lines = [
    f'temp data size\t: {len(temp_index)} -> {round(len(temp_index) / n_images * 100)}%',
    f'test data size\t: {len(test_index)} -> {round(len(test_index) / n_images * 100)}%',
    f'train data size\t: {len(train_index)} -> {round(len(train_index) / n_images * 100)}%',
    f'val data size\t: {len(val_index)} -> {round(len(val_index) / n_images * 100)}%',
]
print('\n'.join(report_lines))

temp data size	: 282 -> 80%
test data size	: 71 -> 20%
train data size	: 246 -> 70%
val data size	: 36 -> 10%


In [8]:
# store the split data in separate dataframes, one per subset
# NOTE(review): .iloc is positional, so this assumes all three index arrays hold
# row positions of `images` itself - confirm this for train_index / val_index
train_set, val_set, test_set = (
    images.iloc[positions] for positions in (train_index, val_index, test_index)
)

### Create directories to store the split dataset

In [9]:
# root folder of the split dataset; one sub-directory is created per subset
new_dir = './../../../dataset_used_split/'
for subset in ('train', 'val', 'test'):
    subset_dir = os.path.join(new_dir, subset)
    try:
        # also creates `new_dir` itself when it is missing
        os.makedirs(subset_dir)
    except FileExistsError:
        # a previous run already created it: report and carry on
        print(f'{subset} directory already exists')

### Copy the split images

In [10]:
# for each image type, hand the file names of every subset to utils.copy_images
# together with the source folder and the split-dataset root, and print what it returns
subsets = {'train': train_set, 'val': val_set, 'test': test_set}
for img_type in ('fundus_image', 'mask_image'):
    print(f'copying {img_type} images')
    source_dir = os.path.join(path_dataset, img_type)
    for subset, df in subsets.items():
        file_names = df[img_type].tolist()
        status = utils.copy_images(file_names, subset, source_dir, new_dir)
        print(status)

copying fundus_image images


train done
val done
test done
copying mask_image images
train done
val done
test done
