In [1]:
import itertools
import os
import shutil
from tensorflow.keras.preprocessing.image import ImageDataGenerator

### Configurations for Data Preparation

In [2]:
def _augmented_flow(from_path, to_path, batch_size, ext, classes):
    """Create the directory iterator used to generate augmented images.

    Builds an ImageDataGenerator with the random rotation / shift / shear /
    zoom / horizontal-flip settings below and returns the result of its
    ``flow_from_directory`` call.

    Args:
        from_path: passed as ``directory`` (where images are read from).
        to_path: passed as ``save_to_dir`` (where generated images are saved).
        batch_size: passed as ``batch_size``.
        ext: passed as ``save_format``.
        classes: passed as ``classes`` (sub-folders of ``from_path`` to read).
    """
    generator = ImageDataGenerator(
        rotation_range=40,
        height_shift_range=0.2,
        width_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest',
    )
    return generator.flow_from_directory(
        directory=from_path,
        classes=classes,
        batch_size=batch_size,
        save_to_dir=to_path,
        save_format=ext,
    )


# Every tunable of the data-preparation step lives in this one dict.
# Key order of 'split' matters: images are dealt out to the splits in this order.
config = {
    'setup_dir': True,
    'split': {'training': 2000, 'validation': 500, 'testing': 800},
    'class_path': {'cat': 'PetImages/Cat', 'dog': 'PetImages/Dog'},
    'dst_path': None,
    'dataset_label': 'PetImages',
    'data_augmentation': True,
    'classes': ['cat', 'dog'],
    'ext': 'jpg',
    # Factory called with keyword arguments: from_path, to_path, batch_size, ext, classes.
    'augmented_images': _augmented_flow,
}

'''
setup_dir
--- whether the Training, Validation, and Testing folders need to be built from a source folder


if setup_dir is True, properly set up the following:

    split
    --- number of samples for Training, Validation, and Testing
    
    class_path 
    --- directory path of each class relative to "data" folder
    
else, set up:

    dst_path
    --- directory path of where the Training, Validation, and Testing folders are located
    

dataset_label
--- label for the split folder
--- output format: <dataset_label>_<training_count>-<validation_count>-<testing_count>

data_augmentation
--- whether to apply data augmentation

classes
--- label for each class

ext
--- extension name of the file to be extracted

augmented_images
--- set intended data augmentation configuration here
'''

# Unpack the configuration into module-level names used by the cells below.
setup_dir = config['setup_dir']
split_dict = config['split']
data_augmentation  = config['data_augmentation']
class_path  = config['class_path']
classes = config['classes']
dataset_label = config['dataset_label']
ext = config['ext']
dst_path = config['dst_path']

# Checks if classes and class_path are aligned (every class has a non-empty
# source path, and every source path belongs to a declared class).
for C in classes:
    # .get() instead of [C]: a class missing from class_path must trip the
    # assertion and show its message, not escape as a bare KeyError.
    assert class_path.get(C), 'The list "class_path" is missing a class found in list "classes"'
for key in class_path:
    assert key in classes, 'The list "classes" is missing a class found in list "class_path"'

### Make directory for the Training, Validation, and Testing dataset

In [3]:
os.chdir('data')

if setup_dir:
    dataset_dir = f'{dataset_label}_{split_dict["training"]}-{split_dict["validation"]}-{split_dict["testing"]}'

    if os.path.isdir(dataset_dir) is False:
        os.makedirs(dataset_dir)

        os.chdir(dataset_dir)
        for C in classes:
            os.makedirs(f'training/{C}')
            os.makedirs(f'validation/{C}')
            os.makedirs(f'testing/{C}')

        os.chdir('../')
        
        for C, path in class_path.items():
            index = 0
            # train, valid, test
            for key, value in split_dict.items():
                # copy pasting of images
                for i in range(value):
                    shutil.copy(f'{path}/{index}.{ext}',f'{dataset_dir}/{key}/{C}/')
                    index += 1
    else:
        print('Directory with this split configuration already exists')
    
    dst_path = dataset_dir 
else:
    assert dst_path, 'Path "dst_path" is empty. Nothing to read from!'

### Apply Data Augmentation

In [4]:
if data_augmentation:
    # Folder names inside dst_path: the original training images are parked in
    # from_path, and the augmented images are written to to_path.
    from_path = 'nonaugmented_training'
    to_path = 'training'

    # Paths are built from dst_path instead of os.chdir()-ing into it, so a failed
    # assertion or I/O error cannot strand the kernel inside a sub-folder, and a
    # multi-level dst_path does not break the final change of directory.
    source_dir = os.path.join(dst_path, from_path)
    output_dir = os.path.join(dst_path, to_path)

    batch_size = 10
    # Number of batches needed to go through the training images of a class once.
    range_size = split_dict['training']//batch_size

    assert range_size*batch_size == split_dict['training'], \
    'Please set "batch_size" so that it evenly divides the Training sample size'

    # First run only: move the originals aside and start an empty "training" folder.
    if not os.path.isdir(source_dir):
        os.rename(output_dir, source_dir)
        os.makedirs(output_dir)

    # Check every class folder before creating any of them, so a failure does not
    # leave a half-created layout behind.
    for C in classes:
        assert not os.path.isdir(os.path.join(output_dir, C)), \
            'Please delete all class folders in the "training" folder'
    for C in classes:
        os.makedirs(os.path.join(output_dir, C))

    fn = config['augmented_images']
    for C in classes:
        # One iterator per class, so each class's images are saved to its own folder.
        augmented_images = fn(from_path=source_dir,
                              to_path=os.path.join(output_dir, C),
                              ext=ext,
                              classes=[C],
                              batch_size=batch_size)

        # Drawing a batch is what makes the iterator write images to save_to_dir;
        # the returned arrays themselves are not needed.
        for _ in range(range_size):
            next(augmented_images)

# Step back out of the "data" folder entered by os.chdir('data') earlier.
os.chdir('../')

Found 2000 images belonging to 1 classes.
Found 2000 images belonging to 1 classes.
