In [14]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import PIL
from PIL import Image
import shutil
import math

In [15]:
def copy_file_to_folder(source_file, dest_folder):
    """Copy ``source_file`` into ``dest_folder``, keeping its base name.

    Args:
        source_file: Path of the file to copy.
        dest_folder: Destination directory; it is created (including any
            missing parents) when it does not exist yet.

    Returns:
        None. An existing file with the same name in ``dest_folder`` is
        overwritten by ``shutil.copy``.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the same folder in between.
    os.makedirs(dest_folder, exist_ok=True)

    dest_path = os.path.join(dest_folder, os.path.basename(source_file))
    shutil.copy(source_file, dest_path)

In [16]:
# Input locations of the original HAM10000 data.
origin_groundtruth_csv = "./HAM10000/origin/groundtruth/HAM10000_groundtruth.csv"   # ground-truth table (CSV)
origin_images_path = "./HAM10000/origin/images/"    # folder holding the source images
origin_masks_path = "./HAM10000/origin/masks/"      # folder holding the source masks

# Load the ground-truth table and summarise it.
origin_groundtruth = pd.read_csv(origin_groundtruth_csv)
groundtruth_length = origin_groundtruth.shape[0]    # number of rows in the table
groudtruth_categories = origin_groundtruth.loc[:, 'dx'].unique()   # distinct values of the 'dx' column
print("Unique categories:", groudtruth_categories)

# Output roots for the generated train/test split.
trainset_images_path = "./HAM10000/train/images/"     # train images
trainset_masks_path = "./HAM10000/train/masks/"     # train masks
testset_images_path = "./HAM10000/test/images/"     # test images
testset_masks_path = "./HAM10000/test/masks/"      # test masks

# Integer label written into the masks for each category (1..7, in this order).
category_dictionary = dict(zip(
    ('akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc'),
    range(1, 8),
))

# Create one sub-folder per category under each of the train/test image and
# mask roots. The test folders are per-category as well: the generation loop
# writes test files to "<test root>/<category>", and saving a mask there fails
# if that folder does not exist.
for category in groudtruth_categories:
    category_images_train_path = trainset_images_path+category+"/"
    category_masks_train_path = trainset_masks_path+category+"/"
    category_images_test_path = testset_images_path+category+"/"
    category_masks_test_path = testset_masks_path+category+"/"
    os.makedirs(category_images_train_path, exist_ok=True)
    os.makedirs(category_masks_train_path, exist_ok=True)
    os.makedirs(category_images_test_path, exist_ok=True)
    os.makedirs(category_masks_test_path, exist_ok=True)

# Generate the train/test split. For every category, the rows are taken in the
# order they appear in the ground-truth table (no shuffling): the first
# floor(n * ratio) go to the train folders and the remainder to the test folders.
images_resources_path = "./HAM10000/origin/images/"
masks_resources_path = "./HAM10000/origin/masks/"
ratio = 0.8
for category in groudtruth_categories:      # each category
    dest_folder_images = "./HAM10000/train/images/"+category    # train destination for the images
    dest_folder_masks = "./HAM10000/train/masks/"+category    # train destination for the masks
    dest_folder_images_change = "./HAM10000/test/images/"+category     # test destination for the images
    dest_folder_masks_change = "./HAM10000/test/masks/"+category      # test destination for the masks
    # Image.save() does not create directories, so make sure both mask
    # destinations exist before anything is written into them.
    os.makedirs(dest_folder_masks, exist_ok=True)
    os.makedirs(dest_folder_masks_change, exist_ok=True)
    # Looked up once per category so an unknown category fails here, before
    # any file of that category has been copied.
    category_label = category_dictionary[category]
    data_categories = origin_groundtruth[origin_groundtruth['dx'] == category]      # rows of this category
    length_categories = len(data_categories)
    chaneg_folder_point = math.floor(length_categories * ratio)     # index at which output switches from train to test
    elements_count = 0
    print(category+" length:",length_categories)
    print("change folder point:",chaneg_folder_point)

    for image_name in data_categories['image_id']:      # each image_id of this category
        if elements_count == chaneg_folder_point:
            # Train quota reached: everything from here on goes to the test folders.
            dest_folder_images = dest_folder_images_change
            dest_folder_masks = dest_folder_masks_change
        images_file = image_name+".jpg"
        masks_file = image_name+"_segmentation.png"
        source_image = images_resources_path+images_file    # full path of the source image
        source_mask = masks_resources_path+masks_file       # full path of the source mask
        copy_file_to_folder(source_image,dest_folder_images)
        # Masks are rewritten so that they carry the integer category label.
        # The context manager closes the source file as soon as its pixels
        # have been copied into the array.
        with Image.open(source_mask) as mask_image:
            image_array = np.array(mask_image)
        # NOTE(review): pixels that are 0 in the source receive the category
        # label and pixels that are 255 become 0; any other value is left
        # unchanged. If the source masks mark the lesion as 255, the label ends
        # up on the background - confirm this polarity is intended.
        image_array[image_array == 0] = category_label
        image_array[image_array == 255] = 0
        image = Image.fromarray(image_array)
        image.save(os.path.join(dest_folder_masks, masks_file))
        elements_count +=1
print("Dataset generation finished.")

Unique categories: ['bkl' 'nv' 'df' 'mel' 'vasc' 'bcc' 'akiec']
bkl length: 1099
change folder point: 879
nv length: 6705
change folder point: 5364
df length: 115
change folder point: 92
mel length: 1113
change folder point: 890
vasc length: 142
change folder point: 113
bcc length: 514
change folder point: 411
akiec length: 327
change folder point: 261
Dataset generation finished.
