In [None]:
# default_exp dataset.core

In [None]:
# hide
from nbdev.showdoc import *

In [None]:
# export
import argparse
import logging
from abc import ABC
from os.path import join, basename, dirname
from aiforce.core import assign_arg_prefix
from aiforce.annotation.core import AnnotationAdapter, SubsetType
from aiforce.image.pillow_tools import assign_exif_orientation, write_exif_metadata
from aiforce.io.core import create_folder

In [None]:
# hide
%reload_ext autoreload
%autoreload 2

In [None]:
# export
# module-level logger, named after this module via `__name__`
logger = logging.getLogger(__name__)

# Dataset

> Dataset Notes.

This module creates a dataset for a classification or segmentation task. If an annotation file is present, the annotations are prepared as well.
The dataset is created based on an imageset.

## Imageset

An imageset is a collection of images from which a dataset is built. Imagesets are stored in the `imagesets` folder.
The `imagesets` folder contains the following folder structure:
- imagesets/*[imageset_type]*/*[imageset_name]*

Inside the `[imageset_name]` folder are the following files / folders:
- `test/`: test images (benchmark)
- `trainval/`: training and validation images for [cross validation](https://pdc-pj.backlog.jp/wiki/RAD_RAD/Neural+Network+-+Training)
- `categories.txt`: all categories (classes) the imageset contains

## Dataset Folders

Datasets are stored in the `datasets` base folder.
The `datasets` folder contains the following folder structure:
- datasets/*[dataset_type]*/*[dataset_name]*
where `[dataset_type]` is the same as the corresponding `[imageset_type]` and `[dataset_name]` is the same as the corresponding `[imageset_name]`.

Inside the `[dataset_name]` folder are the following files / folders:
- `test/`: test set (benchmark)
- `train/`: training set
- `val/`: validation set
- `categories.txt`: all categories (classes) the dataset contains

In [None]:
# export
class Dataset(ABC):
    """
    Dataset base class to build datasets.
    Categories and annotations are read with the input adapter, the train / val / test subsets
    (and an optional sample set) are written with the output adapter.
    `input_adapter`: the annotation adapter to read categories, annotations and files from
    `output_adapter`: the annotation adapter to write the dataset with
    `split`: percentage of the data which belongs to validation set, `DEFAULT_SPLIT` if None
    `seed`: a random seed to reproduce splits
    `sample`: percentage of the data which will be copied as a sample set, no sample set if not set
    """

    DEFAULT_SPLIT = 0.2

    # the adapter annotations are quoted, so they are not evaluated when the class is defined
    def __init__(self, input_adapter: "AnnotationAdapter", output_adapter: "AnnotationAdapter", split=None,
                 seed=None, sample=None):
        self.input_adapter = input_adapter
        self.output_adapter = output_adapter
        self.split = self.DEFAULT_SPLIT if split is None else split
        self.seed = seed
        self.sample = sample
        self.categories = input_adapter.read_categories()
        self.annotations = input_adapter.read_annotations()

    @classmethod
    def argparse(cls, prefix=None):
        """
        Returns the argument parser containing argument definition for command line use.
        `prefix`: a parameter prefix to set, if needed
        return: the argument parser
        """
        parser = argparse.ArgumentParser()
        parser.add_argument(assign_arg_prefix('--split', prefix),
                            dest="split",
                            help="Percentage of the data which belongs to validation set.",
                            type=float,
                            # keep the command line default in sync with the constructor default
                            default=cls.DEFAULT_SPLIT)
        parser.add_argument(assign_arg_prefix('--seed', prefix),
                            dest="seed",
                            help="A random seed to reproduce splits.",
                            type=int,
                            default=None)
        parser.add_argument(assign_arg_prefix('--sample', prefix),
                            dest="sample",
                            help="Percentage of the data which will be copied as a sample set.",
                            type=float,
                            default=0)

        return parser

    @staticmethod
    def _percent(fraction):
        """
        Converts a fraction into a whole percentage for log output.
        `fraction`: the fraction to convert, e.g. 0.29
        return: the rounded percentage, e.g. 29
        """
        # round instead of truncate, as 0.29 * 100 is 28.999999999999996 in floating point
        return int(round(fraction * 100))

    @staticmethod
    def _labels_valid(labels, categories):
        """
        Checks the labels of a region against the known categories.
        `labels`: the labels of the region
        `categories`: the collection of known categories
        return: True if at least one label exist and every label is a known category, else False
        """
        # test label by label, so a repeated (but known) label does not invalidate the region
        return len(labels) > 0 and all(label in categories for label in labels)

    def create_folders(self):
        """
        Creates the dataset output folder at the path of the output adapter.
        The folder is created with `clear=True`
        (presumably an existing folder is emptied - confirm against `create_folder`).
        """
        output_folder = create_folder(self.output_adapter.path, clear=True)
        logger.info("Created folder {}".format(output_folder))

    def build_info(self):
        """
        Log build information
        """
        logger.info('Build configuration:')
        logger.info('input_adapter: {}'.format(type(self.input_adapter).__name__))
        logger.info('input_path: {}'.format(self.input_adapter.path))
        logger.info('output_adapter: {}'.format(type(self.output_adapter).__name__))
        logger.info('output_path: {}'.format(self.output_adapter.path))
        logger.info('split: {}'.format(self.split))
        logger.info('seed: {}'.format(self.seed))
        logger.info('sample: {}'.format(self.sample))

    def validate(self):
        """
        Validates the annotations.
        Regions with no label or with a label outside the category list are removed from their annotation.
        Annotations without remaining regions or without an existing file are removed from `self.annotations`.
        return: The skipped annotations, as dictionary with the annotation ID as key and True as value
        """
        # validate only the trainval images, the test images have no annotations to validate
        logger.info('Start validate data at {}'.format(self.input_adapter.path))

        files = self.input_adapter.list_files()

        logger.info('Found {} files at {}'.format(len(files), self.input_adapter.path))

        # insertion ordered lookup of the files not yet matched by an annotation,
        # gives constant time membership tests and leaves the list of the adapter untouched
        unmatched_files = dict.fromkeys(files)
        # build the category lookup once instead of once per region
        categories = frozenset(self.categories)

        delete_annotations = {}
        used_categories = set()

        for annotation_id, annotation in self.annotations.items():

            invalid_region_indices = []
            for index, region in enumerate(annotation.regions):
                if self._labels_valid(region.labels, categories):
                    # update the used categories
                    used_categories.update(region.labels)
                else:
                    message = '{} : Region {} with category {} is not in category list, skip region.'
                    logger.info(message.format(annotation.file_path, index, ','.join(region.labels)))
                    invalid_region_indices.append(index)

            # delete regions after iteration is finished, back to front to keep the indices valid
            for index in reversed(invalid_region_indices):
                del annotation.regions[index]

            # validate for empty region
            if not annotation.regions:
                logger.info('{} : Has empty regions, skip annotation.'.format(annotation.file_path))
                delete_annotations[annotation_id] = True
            # validate for file exist
            elif annotation.file_path not in unmatched_files:
                logger.info('{} : File of annotations do not exist, skip annotations.'.format(annotation.file_path))
                delete_annotations[annotation_id] = True
            else:
                del unmatched_files[annotation.file_path]

        for index, file in enumerate(unmatched_files):
            logger.info('[{}] -> {} : File has no annotations, skip file.'.format(index, file))

        # list unused categories, sorted to get a reproducible log message
        empty_categories = categories - used_categories
        if empty_categories:
            logger.info('The following categories have no images: {}'.format(" , ".join(sorted(empty_categories))))

        # delete annotations after iteration is finished
        for annotation_id in delete_annotations:
            del self.annotations[annotation_id]

        logger.info('Finished validate image set at {}'.format(self.input_adapter.path))
        return delete_annotations

    def copy(self, train_annotation_keys, val_annotation_keys, test_files=None):
        """
        Copy the images to the dataset.
        Writes the categories, the train and val annotations and, if given, the test files
        with the output adapter.
        `train_annotation_keys`: The list of training annotation keys
        `val_annotation_keys`: The list of validation annotation keys
        `test_files`: The list of test file paths
        return: A tuple containing train, val and test target file paths
        """

        logger.info('Start copy annotations from {} to {}'.format(self.input_adapter.path,
                                                                  self.output_adapter.path))

        # copy the categories files
        logger.info('Write categories to {}'.format(self.output_adapter.path))
        self.output_adapter.write_categories(self.categories)

        logger.info('Write {} annotations to {}'.format(str(SubsetType.TRAIN), self.output_adapter.path))
        annotations_train = {key: self.annotations[key] for key in train_annotation_keys}
        train_targets = self.output_adapter.write_annotations(annotations_train, SubsetType.TRAIN)

        logger.info('Write {} annotations to {}'.format(str(SubsetType.VAL), self.output_adapter.path))
        annotations_val = {key: self.annotations[key] for key in val_annotation_keys}
        val_targets = self.output_adapter.write_annotations(annotations_val, SubsetType.VAL)

        logger.info('Write {} files to {}'.format(str(SubsetType.TEST), self.output_adapter.path))
        # without test files nothing is written and the test targets are empty
        test_targets = self.output_adapter.write_files(test_files, SubsetType.TEST) if test_files else []

        return train_targets, val_targets, test_targets

    def build(self, validate=True):
        """
        Build the data-set. This is the main logic.
        This method validates the images against the annotations,
        split the image-set into train and val on given split percentage
        and copies the images with the output adapter.
        If a sample percentage is given, a sub-set is created as sample
        in a `[dataset_name]_sample` folder next to the dataset folder.
        `validate`: True if annotations should be validate, else False
        """
        logger.info('Validation set contains {}% of the images.'.format(self._percent(self.split)))

        # validate the image set, invalid annotations are removed from self.annotations
        if validate:
            self.validate()

        # split the annotations into train & val
        annotation_keys = list(self.annotations.keys())
        if self.split == 0:
            train_annotation_keys, val_annotation_keys = annotation_keys, []
        elif self.split == 1:
            train_annotation_keys, val_annotation_keys = [], annotation_keys
        else:
            train, val = self.split_train_val_data(annotation_keys, self.split, self.seed)
            train_annotation_keys, val_annotation_keys = list(train), list(val)

        # if a sample data set should be created, create the splits
        sample_train_annotation_keys = []
        sample_val_annotation_keys = []
        if self.sample:
            _, sample_train = self.split_train_val_data(train_annotation_keys, self.sample, self.seed)
            _, sample_val = self.split_train_val_data(val_annotation_keys, self.sample, self.seed)
            sample_train_annotation_keys.extend(sample_train)
            sample_val_annotation_keys.extend(sample_val)

        # if test files exist
        test_files = self.input_adapter.list_files(SubsetType.TEST)
        if test_files and self.sample:
            _, sample_test_files = self.split_train_val_data(test_files, self.sample, self.seed)
        else:
            sample_test_files = None

        # copy the annotations
        self.copy(train_annotation_keys, val_annotation_keys, test_files)

        if self.sample:
            # backup original output path
            output_path = self.output_adapter.path
            sample_name = "{}_sample".format(basename(output_path))
            sample_percent = self._percent(self.sample)
            # set output path to sample set
            self.output_adapter.path = join(dirname(output_path), sample_name)
            try:
                logger.info('Start build {} dataset containing {}% of images at {}'.format(sample_name,
                                                                                           sample_percent,
                                                                                           self.output_adapter.path))
                # create the sample data set folder
                create_folder(self.output_adapter.path)
                # copy the sample data
                self.copy(sample_train_annotation_keys, sample_val_annotation_keys, sample_test_files)

                logger.info('Finished build {} dataset containing {}% of images at {}'.format(sample_name,
                                                                                              sample_percent,
                                                                                              self.output_adapter.path))
            finally:
                # restore original output path, even if building the sample set failed
                self.output_adapter.path = output_path

    @classmethod
    def split_train_val_data(cls, data, val_size=DEFAULT_SPLIT, seed=None):
        """
        Splits the images in train and validation set
        `data`: the data to split
        `val_size`: the size of the validation set in percentage
        `seed`: A random seed to reproduce splits.
        return: the split train, validation images
        """
        # nothing to split, all data (if any) belongs to the train set
        if len(data) <= 1:
            return data, []

        # imported on demand, so sklearn is only required if a split is really done
        from sklearn.model_selection import train_test_split
        train, val = train_test_split(data, random_state=seed, test_size=val_size)
        return train, val

    @classmethod
    def assign_orientation(cls, file_path):
        """
        Assign the EXIF metadata orientation to an image.
        The image file is only rewritten, if `assign_exif_orientation` reports a rotation.
        `file_path`: the path to the image file
        """

        # rotate image by EXIF orientation metadata and remove them
        image, exif_data, rotated = assign_exif_orientation(file_path)
        if rotated:
            write_exif_metadata(image, exif_data, file_path)


In [None]:
# hide

# for generating scripts from notebook directly
from nbdev.export import notebook2script
notebook2script()

Converted annotation-core.ipynb.
Converted annotation-folder_category_adapter.ipynb.
Converted annotation-multi_category_adapter.ipynb.
Converted annotation-via_adapter.ipynb.
Converted annotation-yolo_adapter.ipynb.
Converted annotation_converter.ipynb.
Converted annotation_viewer.ipynb.
Converted category_tools.ipynb.
Converted core.ipynb.
Converted dataset-core.ipynb.
Converted dataset-image_classification.ipynb.
Converted dataset-image_object_detection.ipynb.
Converted dataset-image_segmentation.ipynb.
Converted dataset-type.ipynb.
Converted dataset_generator.ipynb.
Converted evaluation-core.ipynb.
Converted geometry.ipynb.
Converted image-color_palette.ipynb.
Converted image-inference.ipynb.
Converted image-opencv_tools.ipynb.
Converted image-pillow_tools.ipynb.
Converted image-tools.ipynb.
Converted index.ipynb.
Converted io-core.ipynb.
Converted tensorflow-tflite_converter.ipynb.
Converted tensorflow-tflite_metadata.ipynb.
Converted tensorflow-tfrecord_builder.ipynb.
Converted t