In [None]:
"""
Step 1: randomly split all images and their masks into the following:
a training set (90%): 1070 images;
a validation set (5%): 59 images;
a test set (5%): 61 images.

Step 2: randomly select 120 images from the training set above 
and parse them into 3000 smaller pieces (25 pieces for each image).
The following sets of split images are created:
a training set (95%): 2850 split images;
a validation set (5%): 150 split images.

Step 3: parse all images from the test set created in Step 1 into smaller
pieces (25 pieces for each image) to generate
a test set of split images: 1525 split images.

The package 'splitfolders' is used to split images and their masks into 
different set folders. Self-defined functions in the 'image_split.py' file 
are used for parsing whole images and their masks into smaller pieces.

Parent folders for storing files are created manually. See split-folders 
documentation for specific directory formats. 
https://pypi.org/project/split-folders/
"""

In [None]:
""" Step 1 """

In [None]:
!pip install split-folders
import splitfolders

In [None]:
# Random seed for Step 1. The image split and the mask split below both pass
# this same value to splitfolders.
seed: int = 789

In [None]:
# Step 1: distribute the whole images over a training, a validation and a
# test set using a 90% / 5% / 5% ratio.
input_img_path = '/content/drive/MyDrive/School/UPenn/Spring_2022/CPLN_680/image_all'
output_img_path = '/content/drive/MyDrive/School/UPenn/Spring_2022/CPLN_680/sh_all_image'
splitfolders.ratio(
    input_img_path,
    output=output_img_path,
    seed=seed,
    ratio=(0.9, 0.05, 0.05),
)

In [None]:
# Step 1: split the masks with the same seed and ratio as the images so that
# every mask is expected to end up in the same subset as its image.
# NOTE(review): this presumably relies on image and mask filenames sorting in
# the same order inside their folders — verify against the generated folders.
input_mask_path = '/content/drive/MyDrive/School/UPenn/Spring_2022/CPLN_680/mask_all'
output_mask_path = '/content/drive/MyDrive/School/UPenn/Spring_2022/CPLN_680/sh_all_mask'
splitfolders.ratio(
    input_mask_path,
    output=output_mask_path,
    seed=seed,
    ratio=(0.9, 0.05, 0.05),
)

In [None]:
""" Step 2 """

In [2]:
# Copy the helper module from Drive into /content (presumably the notebook's
# working directory — confirm) so that the import below can find it.
!cp /content/drive/MyDrive/Colab\ Notebooks/codes/modules/image_split.py /content
# Self-defined helpers for selecting and parsing images; the copy above must
# have run before this import.
from image_split import crop, randimg, get_mask, split

In [None]:
# Random seed for Step 2 (random image selection and the split of the parsed
# pieces). This rebinds `seed`, which held 789 during Step 1.
seed: int = 246

In [None]:
# Step 2: pick 120 random images from the training set created in Step 1.
# They are later parsed into 3000 pieces of (256, 256) and stored.
input_image_all_dir = '/content/drive/MyDrive/School/UPenn/Spring_2022/CPLN_680/sh_all_image/train/images/'
input_mask_all_dir = '/content/drive/MyDrive/School/UPenn/Spring_2022/CPLN_680/sh_all_mask/train/masks_xbuffer/'
img_list = randimg(
    input_image_all_dir,
    120,
    seed=seed,
)  # list of filenames of the selected images

In [None]:
# Get the masks that correspond to the 120 randomly selected images.
# get_mask is imported from image_split.py; here it receives only the list of
# selected image filenames.
mask_list = get_mask(img_list) ## a list of filenames of corresponding masks

In [None]:
# Parse every selected image and its mask into 25 pieces of (256, 256):
# 25 x 120 = 3000 pieces in total.
# NOTE(review): the positional arguments 120 and 0 are interpreted by
# split() in image_split.py — confirm their meaning there.
output_image_all_dir = '/content/drive/MyDrive/School/UPenn/Spring_2022/CPLN_680/split_image_all/images/'
output_mask_all_dir = '/content/drive/MyDrive/School/UPenn/Spring_2022/CPLN_680/split_mask_all/masks_xbuffer/'

# images first ...
split(
    input_image_all_dir,
    output_image_all_dir,
    256,
    256,
    120,
    0,
    img_list=img_list,
)
# ... then their masks, with identical parsing parameters
split(
    input_mask_all_dir,
    output_mask_all_dir,
    256,
    256,
    120,
    0,
    img_list=mask_list,
)

In [None]:
# Divide the parsed image pieces into a training set (95%) and a
# validation set (5%).
input_img_path_split = '/content/drive/MyDrive/School/UPenn/Spring_2022/CPLN_680/split_image_all/'
output_img_path_split = '/content/drive/MyDrive/School/UPenn/Spring_2022/CPLN_680/sh_all_image_split/'
splitfolders.ratio(
    input_img_path_split,
    output=output_img_path_split,
    seed=seed,
    ratio=(0.95, 0.05),
)

In [None]:
# Divide the parsed mask pieces with the same seed and ratio that were used
# for the parsed image pieces.
input_mask_path_split = '/content/drive/MyDrive/School/UPenn/Spring_2022/CPLN_680/split_mask_all/'
output_mask_path_split = '/content/drive/MyDrive/School/UPenn/Spring_2022/CPLN_680/sh_all_mask_split/'
splitfolders.ratio(
    input_mask_path_split,
    output=output_mask_path_split,
    seed=seed,
    ratio=(0.95, 0.05),
)

In [None]:
""" Step 3 """

In [None]:
# Step 3 paths: every image of the Step 1 test set and its mask is parsed
# into 25 pieces of (256, 256), 25 x 61 = 1525 pieces in total.

# whole test images -> parsed test image pieces
input_test_img_dir = '/content/drive/MyDrive/School/UPenn/Spring_2022/CPLN_680/sh_all_image/test/images/'
output_test_img_dir = '/content/drive/MyDrive/School/UPenn/Spring_2022/CPLN_680/sh_all_image_split/test/images/'

# whole test masks -> parsed test mask pieces
input_test_mask_dir = '/content/drive/MyDrive/School/UPenn/Spring_2022/CPLN_680/sh_all_mask/test/masks_xbuffer/'
output_test_mask_dir = '/content/drive/MyDrive/School/UPenn/Spring_2022/CPLN_680/sh_all_mask_split/test/masks_xbuffer/'

In [None]:
# Parse every whole image of the test set into (256, 256) pieces.
# `os` is imported here because no earlier cell imports it; without it
# os.listdir raises a NameError on a fresh kernel.
import os

test_img_list = os.listdir(input_test_img_dir)  # all entries in the test image folder
# Same parsing parameters as for the training images in Step 2.
split(input_test_img_dir, output_test_img_dir, 256, 256, 120, 0, img_list=test_img_list)

In [None]:
# Parse every whole mask of the test set into (256, 256) pieces.
# `os` is imported here so this cell does not depend on an import that the
# notebook never performs elsewhere.
import os

test_mask_list = os.listdir(input_test_mask_dir)  # all entries in the test mask folder
# Same parsing parameters as for the test images.
split(input_test_mask_dir, output_test_mask_dir, 256, 256, 120, 0, img_list=test_mask_list)