In [1]:
import os
import os.path
import shutil
import random
from shutil import copyfile

In [3]:
# DEFINE DIRECTORIES
dataset_dir = os.path.join(os.getcwd(), 'sehatinV1')
split_dir = os.path.join(os.getcwd(), 'sehatinV1splitted')
trainset_dir = os.path.join(split_dir, 'training')
validationset_dir = os.path.join(split_dir, 'validation')
testset_dir = os.path.join(split_dir, 'testing')

# One label per sub-directory of the dataset folder. Plain files that may sit
# next to the class folders are not labels, so they are skipped.
labels = sorted(
    entry for entry in os.listdir(dataset_dir)
    if os.path.isdir(os.path.join(dataset_dir, entry))
)

# CREATE LABELS FOR EACH TRAINING, VALIDATION, TESTING
for label in labels:
    label_dirs = [
        os.path.join(subset_dir, label)
        for subset_dir in (trainset_dir, validationset_dir, testset_dir)
    ]
    if any(os.path.isdir(path) for path in label_dirs):
        print("{} already exist!".format(label))
    # exist_ok=True creates whichever of the three folders is still missing
    # (a partial earlier run no longer blocks the others), and real failures
    # such as permission errors propagate instead of being reported as
    # "already exist".
    for path in label_dirs:
        os.makedirs(path, exist_ok=True)

In [4]:
# DEFINE FUNCTION FOR SPLITTING DATA INTO THREE SETS
def split_data_into_three(SOURCE, TRAINING, VALIDATION, TESTING, SPLIT_SIZE):
    """Randomly copy the files in SOURCE into training/validation/testing folders.

    SOURCE: path of the directory holding the files to split
    TRAINING: path of an existing directory for the training set
    VALIDATION: path of an existing directory for the validation set
    TESTING: path of an existing directory for the testing set
    SPLIT_SIZE: sequence of floats; SPLIT_SIZE[0] is the training fraction and
        SPLIT_SIZE[1] the validation fraction. Whatever is left over
        (including the remainder lost to int() truncation) goes to testing.

    Zero-length files are reported and ignored; entries that are not regular
    files are skipped. Every file already present in the three target
    directories is deleted before copying.

    The shuffle uses the module-level `random` generator, so calling
    random.seed(...) beforehand makes the split repeatable.

    Raises ValueError when the fractions are negative or ask for more files
    than are available; nothing is deleted or copied in that case.
    """
    # Sort first so the outcome depends only on the RNG state, not on the
    # arbitrary order in which os.listdir returns entries.
    candidates = []
    for img in sorted(os.listdir(SOURCE)):
        img_path = os.path.join(SOURCE, img)
        if not os.path.isfile(img_path):
            # copyfile cannot copy directories (e.g. .ipynb_checkpoints)
            continue
        if os.path.getsize(img_path) == 0:
            print(img + " is zero length, so ignoring.")
            continue
        candidates.append(img)

    # Randomize the list in place. random.sample() returns a NEW list, so
    # calling it without using the result leaves the order untouched.
    random.shuffle(candidates)

    # Set the portion
    training_size = int(SPLIT_SIZE[0] * len(candidates))
    validation_size = int(SPLIT_SIZE[1] * len(candidates))
    if (training_size < 0 or validation_size < 0
            or training_size + validation_size > len(candidates)):
        raise ValueError(
            "SPLIT_SIZE fractions must be non-negative and sum to at most 1"
        )
    validation_end = training_size + validation_size

    # Empty the target directories so files from a previous split do not mix
    # with the new one.
    for target in (TRAINING, VALIDATION, TESTING):
        for sfile in os.listdir(target):
            os.remove(os.path.join(target, sfile))

    # Copy each consecutive slice of the shuffled list to its target
    portions = (
        (TRAINING, candidates[:training_size]),
        (VALIDATION, candidates[training_size:validation_end]),
        (TESTING, candidates[validation_end:]),
    )
    for target, chunk in portions:
        for img in chunk:
            copyfile(os.path.join(SOURCE, img), os.path.join(target, img))


In [5]:
# DEFINE FUNCTION FOR SPLITTING DATA INTO TWO SETS
def split_data_into_two(SOURCE, TRAINING, VALIDATION, SPLIT_SIZE):
    """Randomly copy the files in SOURCE into training and validation folders.

    SOURCE: path of the directory holding the files to split
    TRAINING: path of an existing directory for the training set
    VALIDATION: path of an existing directory for the validation set
    SPLIT_SIZE: float, fraction of the usable files copied to TRAINING; the
        remaining files go to VALIDATION

    Zero-length files are reported and ignored; entries that are not regular
    files are skipped. Every file already present in the two target
    directories is deleted before copying.

    The shuffle uses the module-level `random` generator, so calling
    random.seed(...) beforehand makes the split repeatable.

    Raises ValueError when SPLIT_SIZE is negative or asks for more files than
    are available; nothing is deleted or copied in that case.
    """
    # Sort first so the outcome depends only on the RNG state, not on the
    # arbitrary order in which os.listdir returns entries.
    candidates = []
    for img in sorted(os.listdir(SOURCE)):
        img_path = os.path.join(SOURCE, img)
        if not os.path.isfile(img_path):
            # copyfile cannot copy directories (e.g. .ipynb_checkpoints)
            continue
        if os.path.getsize(img_path) == 0:
            print(img + " is zero length, so ignoring.")
            continue
        candidates.append(img)

    # Randomize the list in place. random.sample() returns a NEW list, so
    # calling it without using the result leaves the order untouched.
    random.shuffle(candidates)

    # Set the portion
    training_size = int(SPLIT_SIZE * len(candidates))
    if training_size < 0 or training_size > len(candidates):
        raise ValueError("SPLIT_SIZE must be a fraction between 0 and 1")

    # Empty the target directories so files from a previous split do not mix
    # with the new one.
    for target in (TRAINING, VALIDATION):
        for sfile in os.listdir(target):
            os.remove(os.path.join(target, sfile))

    # Copy each slice of the shuffled list to its target
    portions = (
        (TRAINING, candidates[:training_size]),
        (VALIDATION, candidates[training_size:]),
    )
    for target, chunk in portions:
        for img in chunk:
            copyfile(os.path.join(SOURCE, img), os.path.join(target, img))

In [7]:
# SPLITTING DATASET
dataset_dir = os.path.join(os.getcwd(), 'sehatinV1')
split_dir = os.path.join(os.getcwd(), 'sehatinV1splitted')
trainset_dir = os.path.join(split_dir, 'training')
validationset_dir = os.path.join(split_dir, 'validation')
testset_dir = os.path.join(split_dir, 'testing')

# One label per sub-directory of the dataset folder. Stray files next to the
# class folders (which would make os.listdir(source) fail) are skipped, and
# the labels are processed in sorted order.
labels = sorted(
    entry for entry in os.listdir(dataset_dir)
    if os.path.isdir(os.path.join(dataset_dir, entry))
)

# Fractions for [training, validation, testing] and for training only
split_size_three = [.8, .1, .1]
split_size_two = .85

for label in labels:
    source = os.path.join(dataset_dir, label)
    training = os.path.join(trainset_dir, label)
    validation = os.path.join(validationset_dir, label)
    testing = os.path.join(testset_dir, label)
    # The split helpers list their target folders, so make sure they exist
    # even for a class folder added after the folders were first created.
    for target in (training, validation, testing):
        os.makedirs(target, exist_ok=True)
    # MAKE SURE USE THE RIGHT ONE: (into three or two sets)
    split_data_into_three(source, training, validation, testing, split_size_three)
    print('splitting {} success'.format(label))

splitting apple success
splitting apple-pie success
splitting asparagus success
splitting avocado success
splitting bakso success
splitting barley success
splitting bayam success
splitting beef success
splitting beverage drinks success
splitting blackberry success
splitting bluefin-tuna success
splitting bread success
splitting broccoli success
splitting burger success
splitting cake success
splitting cherry success
splitting chicken-curry success
splitting chicken-wings success
splitting corn success
splitting cucumber success
splitting donut success
splitting dragon-fruit success
splitting dumpling success
splitting egg-yolk success
splitting eggplant success
splitting french-fries success
splitting fried-chicken success
splitting fried-mushroom success
splitting fried-rice success
splitting grape success
splitting gulai success
splitting hotdog success
splitting ice-cream success
splitting jus alpukat success
splitting jus mangga success
splitting kacang camilan success
splitting la

In [17]:
# ADDITIONAL: CREATE TXT FILES TO DEFINE LABELS IN THE SPLITTED DATASET
import csv

# Collect the name of every directory found while walking the dataset folder
# (os.walk descends recursively, so nested folders would be listed too).
classes = []
for rootdir, dirs, files in os.walk('sehatinV1'):
    classes.extend(dirs)

# Write the header and all class names in one pass instead of reopening the
# file in append mode for every class.
with open('classes.txt', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    # writerow expects a sequence of fields; a bare string would be split
    # into one column per character ("i,n,i, ,l,...").
    writer.writerow(['ini labelnya:'])
    # write each data
    writer.writerows([aclass] for aclass in classes)