In [1]:
import os, math
from xml.etree import ElementTree
import shutil

In [2]:
# Instance variables
image_dir_p    = "archive/Images"
image_file_ext = r".tif"
image_xml_ext  = r".xml"
masks_dir_p    = "archive/Masks"
masks_file_ext = r".TIF"
split_ratio    = [60,20,20]
train_dir_p    = "datasets-multi/train"
val_dir_p      = "datasets-multi/validation"
test_dir_p     = "datasets-multi/test"

### Read folders

In [3]:
images_list     = [os.path.join(image_dir_p, _) for _ in os.listdir(image_dir_p) if _.endswith(image_file_ext)]
images_xml_list = [os.path.join(image_dir_p, _) for _ in os.listdir(image_dir_p) if _.endswith(image_xml_ext)]
masks_list      = [os.path.join(masks_dir_p, _) for _ in os.listdir(masks_dir_p) if _.endswith(masks_file_ext)]

# Split validation
total_items = len(images_list)

no_of_train_items = math.ceil(total_items * split_ratio[0] / 100)
no_of_val_items   = math.ceil(total_items * split_ratio[1] / 100)
no_of_test_items  = total_items * split_ratio[2] / 100

print(no_of_train_items, no_of_val_items, no_of_test_items)

35 12 11.6


In [4]:
# Remove exisiting directories and create new directories
shutil.rmtree(train_dir_p, ignore_errors=True)
shutil.rmtree(val_dir_p, ignore_errors=True)
shutil.rmtree(test_dir_p, ignore_errors=True)
os.mkdir(train_dir_p)
os.mkdir(val_dir_p)
os.mkdir(test_dir_p)

### Split Training images and masks set

In [5]:
for i in range( 0, no_of_train_items):
    # xml_fn   = os.path.split(images_xml_list[i])[1]
    
    # Read xml
    status = "unknown"
    tree = ElementTree.parse(images_xml_list[i])
    root  = tree.getroot()
    find_node = root.find("./*[@name='status']")
    status = find_node.attrib["value"]
    
    image_fn = os.path.split(images_list[i])[1]
    image_dir = train_dir_p + "/images/"
    if not os.path.exists(image_dir):
        os.makedirs(image_dir)
    image_dest_p = image_dir + image_fn
    
    mask_fn  = os.path.split(masks_list[i])[1]
    # xml_dest_p = train_dir_p + "/images/" + xml_fn
    
    mask_benign_dir    = train_dir_p + "/masks/benign/"
    mask_malignant_dir = train_dir_p + "/masks/malignant/"
    if not os.path.exists(mask_benign_dir):
        os.makedirs(mask_benign_dir)
    if not os.path.exists(mask_malignant_dir):
        os.makedirs(mask_malignant_dir)
        
    if status == 'benign':
        mask_dest_p = mask_benign_dir + mask_fn
    elif status == "malignant":
        mask_dest_p = mask_malignant_dir + mask_fn
    
    shutil.copyfile(images_list[i], image_dest_p)
    # shutil.copyfile(images_xml_list[i], xml_dest_p)
    shutil.copyfile(masks_list[i], mask_dest_p)
    
train_list = [os.path.join(image_dir, _) for _ in os.listdir(image_dir) if _.endswith(image_file_ext)]
for index,item in enumerate(train_list):
    print(index + 1, item)

1 datasets-multi/train/images/ytma10_010704_benign1_ccd.tif
2 datasets-multi/train/images/ytma10_010704_benign2_ccd.tif
3 datasets-multi/train/images/ytma10_010704_benign3_ccd.tif
4 datasets-multi/train/images/ytma10_010704_malignant1_ccd.tif
5 datasets-multi/train/images/ytma10_010704_malignant2_ccd.tif
6 datasets-multi/train/images/ytma10_010704_malignant3_ccd.tif
7 datasets-multi/train/images/ytma12_010804_benign1_ccd.tif
8 datasets-multi/train/images/ytma12_010804_benign2_ccd.tif
9 datasets-multi/train/images/ytma12_010804_benign3_ccd.tif
10 datasets-multi/train/images/ytma12_010804_malignant1_ccd.tif
11 datasets-multi/train/images/ytma12_010804_malignant2_ccd.tif
12 datasets-multi/train/images/ytma12_010804_malignant3_ccd.tif
13 datasets-multi/train/images/ytma23_022103_benign1_ccd.tif
14 datasets-multi/train/images/ytma23_022103_benign2_ccd.tif
15 datasets-multi/train/images/ytma23_022103_benign3_ccd.tif
16 datasets-multi/train/images/ytma23_022103_malignant1_ccd.tif
17 datasets-

### Split Validation images and masks set

In [6]:
# Split Validation sets
for i in range( no_of_train_items, no_of_train_items + no_of_val_items):
    # xml_fn   = os.path.split(images_xml_list[i])[1]
    
    # Read xml
    status = "unknown"
    tree = ElementTree.parse(images_xml_list[i])
    root  = tree.getroot()
    find_node = root.find("./*[@name='status']")
    status = find_node.attrib["value"]    
    
    image_fn = os.path.split(images_list[i])[1]
    image_dir = val_dir_p + "/images/"
    if not os.path.exists(image_dir):
        os.makedirs(image_dir)
    image_dest_p = image_dir + image_fn    
    
    mask_fn  = os.path.split(masks_list[i])[1]
    mask_benign_dir    = val_dir_p + "/masks/benign/"
    mask_malignant_dir = val_dir_p + "/masks/malignant/"
    if not os.path.exists(mask_benign_dir):
        os.makedirs(mask_benign_dir)
    if not os.path.exists(mask_malignant_dir):
        os.makedirs(mask_malignant_dir)    
    
    if status == 'benign':
        mask_dest_p = mask_benign_dir + mask_fn
    elif status == "malignant":
        mask_dest_p = mask_malignant_dir + mask_fn


    # xml_dest_p = val_dir_p + "/" + xml_fn
    shutil.copyfile(images_list[i], image_dest_p)
    # shutil.copyfile(images_xml_list[i], xml_dest_p)
    shutil.copyfile(masks_list[i], mask_dest_p)
    
val_list = [os.path.join(image_dir, _) for _ in os.listdir(image_dir) if _.endswith(image_file_ext)]
for index,item in enumerate(val_list):
    print(index + 1, item)

1 datasets-multi/validation/images/ytma49_042403_malignant3_ccd.tif
2 datasets-multi/validation/images/ytma49_072303_benign1_ccd.tif
3 datasets-multi/validation/images/ytma49_072303_benign2_ccd.tif
4 datasets-multi/validation/images/ytma49_072303_malignant1_ccd.tif
5 datasets-multi/validation/images/ytma49_072303_malignant2_ccd.tif
6 datasets-multi/validation/images/ytma49_111003_benign1_ccd.tif
7 datasets-multi/validation/images/ytma49_111003_benign2_ccd.tif
8 datasets-multi/validation/images/ytma49_111003_benign3_ccd.tif
9 datasets-multi/validation/images/ytma49_111003_malignant1_ccd.tif
10 datasets-multi/validation/images/ytma49_111003_malignant2_ccd.tif
11 datasets-multi/validation/images/ytma49_111003_malignant3_ccd.tif
12 datasets-multi/validation/images/ytma49_111303_benign1_ccd.tif


### Split Testing images and masks set

In [7]:
# Split Testing sets
for i in range( no_of_train_items + no_of_val_items, total_items):
    # xml_fn   = os.path.split(images_xml_list[i])[1]
    
    # Read xml
    status = "unknown"
    tree = ElementTree.parse(images_xml_list[i])
    root  = tree.getroot()
    find_node = root.find("./*[@name='status']")
    status = find_node.attrib["value"]    
    
    image_fn = os.path.split(images_list[i])[1]
    image_dir = test_dir_p + "/images/"
    if not os.path.exists(image_dir):
        os.makedirs(image_dir)
    image_dest_p = image_dir + image_fn    
    
    mask_fn  = os.path.split(masks_list[i])[1]
    mask_benign_dir    = test_dir_p + "/masks/benign/"
    mask_malignant_dir = test_dir_p + "/masks/malignant/"
    if not os.path.exists(mask_benign_dir):
        os.makedirs(mask_benign_dir)
    if not os.path.exists(mask_malignant_dir):
        os.makedirs(mask_malignant_dir)    
    
    if status == 'benign':
        mask_dest_p = mask_benign_dir + mask_fn
    elif status == "malignant":
        mask_dest_p = mask_malignant_dir + mask_fn


    # xml_dest_p = val_dir_p + "/" + xml_fn
    shutil.copyfile(images_list[i], image_dest_p)
    # shutil.copyfile(images_xml_list[i], xml_dest_p)
    shutil.copyfile(masks_list[i], mask_dest_p)
test_list = [os.path.join(image_dir, _) for _ in os.listdir(image_dir) if _.endswith(image_file_ext)]
for index,item in enumerate(test_list):
    print(index + 1, item)

1 datasets-multi/test/images/ytma49_111303_benign2_ccd.tif
2 datasets-multi/test/images/ytma49_111303_benign3_ccd.tif
3 datasets-multi/test/images/ytma49_111303_malignant1_ccd.tif
4 datasets-multi/test/images/ytma49_111303_malignant2_ccd.tif
5 datasets-multi/test/images/ytma49_111303_malignant3_ccd.tif
6 datasets-multi/test/images/ytma55_030603_benign1_ccd.tif
7 datasets-multi/test/images/ytma55_030603_benign2_ccd.tif
8 datasets-multi/test/images/ytma55_030603_benign3_ccd.tif
9 datasets-multi/test/images/ytma55_030603_benign4_ccd.tif
10 datasets-multi/test/images/ytma55_030603_benign5_ccd.tif
11 datasets-multi/test/images/ytma55_030603_benign6_ccd.tif
