In [None]:
# default_exp dataset_generator

In [None]:
# hide
from nbdev.showdoc import *

In [None]:
# export
import sys
import argparse
import logging
from os.path import join
from datetime import datetime
from logging.handlers import MemoryHandler
from aiforce.core import list_subclasses, parse_known_args_with_help
from aiforce import annotation as annotation_package
from aiforce.annotation.core import AnnotationAdapter
from aiforce import dataset as dataset_package
from aiforce.dataset.core import Dataset

In [None]:
# hide
%reload_ext autoreload
%autoreload 2

In [None]:
# export
# Module-level logger used by every function in this module; it is registered
# under the name 'aiforce.dataset'.
logger = logging.getLogger('aiforce.dataset')

# Dataset Generator

> Dataset generator Notes.

Creates a dataset for a classification or segmentation task. If an annotation file is present, the annotations are prepared as well.
The dataset is created based on an imageset.

## Imageset

Imagesets are collected images to build a data-set from, stored in the `imagesets` folder.
The `imagesets` folder contains the following folder structure:
- imagesets/*[imageset_type]*/*[imageset_name]*

Inside the `[imageset_name]` folder are the following files / folders
- `test/`: test images (benchmark)
- `trainval/`: training and validation images for [cross validation](https://pdc-pj.backlog.jp/wiki/RAD_RAD/Neural+Network+-+Training)
- `categories.txt`: all categories (classes) the imageset contains

## Dataset Folders

Datasets are stored in the `datasets` base folder.
The `datasets` folder contains the following folder structure:
- datasets/*[dataset_type]*/*[dataset_name]*
where `[dataset_type]` is the same as the corresponding `[imageset_type]` and `[dataset_name]` is the same as the corresponding `[imageset_name]`.

Inside the `[dataset_name]` folder are the following files / folders
- `test/`: test set (benchmark)
- `train/`: training set
- `val/`: validation set
- `categories.txt`: all categories (classes) the dataset contains

## Helper Methods

In [None]:
# export
def configure_logging(logging_level=logging.INFO):
    """
    Set up console and in-memory logging for the module logger.
    `logging_level`: the level applied to the basic logging configuration, the module logger and both handlers
    return: the memory handler, which is created without a target
    """
    logging.basicConfig(level=logging_level)
    logger.setLevel(logging_level)

    # capacity of 1 and a flush level equal to the logging level: the handler
    # attempts a flush on every record it receives
    memory_handler = MemoryHandler(1, flushLevel=logging_level)
    console_handler = logging.StreamHandler(sys.stdout)

    # attach the memory handler first, then the stdout handler
    for handler in (memory_handler, console_handler):
        handler.setLevel(logging_level)
        logger.addHandler(handler)

    return memory_handler

## Build a data-set

Builds a data-set from an image-set. Currently handles classification and segmentation image-sets; the type is taken from the image-set type, i.e. the parent folder in which the image-set folder is located.

In [None]:
# export
def generate(dataset: Dataset, log_memory_handler):
    """
    Generate a dataset and write a build log into its output folder.
    `dataset`: the dataset to build
    `log_memory_handler`: the log handler for the build log; the build log file is set as its
    target for the duration of the build and detached again afterwards
    """
    dataset.build_info()

    dataset_name = type(dataset).__name__
    output_path = dataset.output_adapter.path

    logger.info('Start build %s at %s', dataset_name, output_path)

    dataset.create_folders()

    # create the build log file (only possible after the output folders exist)
    log_file_name = datetime.now().strftime("build_%Y.%m.%d-%H.%M.%S.log")
    file_handler = logging.FileHandler(join(output_path, log_file_name), encoding="utf-8")
    log_memory_handler.setTarget(file_handler)

    try:
        dataset.build()

        logger.info('Finished build %s at %s', dataset_name, output_path)
    finally:
        # Write out whatever is still buffered, then release the log file even
        # if the build failed, so repeated calls do not leak open file handles
        # or keep writing into the log file of an earlier build.
        log_memory_handler.flush()
        log_memory_handler.setTarget(None)
        file_handler.close()

## Run from command line

To run the data-set builder from command line, use the following command:
`python -m aiforce.dataset_generator [parameters]`

The following parameters are supported:
- `[categories]`: The path to the categories file. (e.g.: *imagesets/segmentation/car_damage/categories.txt*)
- `--annotation`: The path to the image-set annotation file the data-set is built from. (e.g.: *imagesets/classification/car_damage/annotations.csv* for classification, *imagesets/segmentation/car_damage/via_region_data.json* for segmentation)
- `--split`: The percentage of the data which belongs to the validation set, defaults to *0.2* (=20%)
- `--seed`: A random seed to reproduce splits, defaults to None
- `--category-label-key`: The key under which the category name can be found in the annotation file, defaults to *category*.
- `--sample`: The percentage of the data which will be copied as a sample set into a separate folder with "_sample" suffix. If not set, no sample data-set will be created.
- `--type`: The type of the data-set; if not explicitly set, it is inferred from the categories file path.
- `--tfrecord`: Also create .tfrecord files.
- `--join-overlapping-regions`: Whether overlapping regions of same category should be joined.
- `--annotation-area-thresh`: Keep only annotations with minimum size (width or height) related to image size.
- `--output`: The path of the dataset folder, defaults to *../datasets*.
- `--name`: The name of the data-set; if not explicitly set, it is inferred from the categories file path.

In [None]:
# export
if __name__ == '__main__' and '__file__' in globals():
    # Direct shell execution only. NOTE(review): the `__file__` check presumably
    # keeps this from running inside a notebook kernel - confirm.
    build_log_handler = configure_logging()

    # discover the selectable annotation adapters and dataset types
    available_adapters = list_subclasses(annotation_package, AnnotationAdapter)
    available_datasets = list_subclasses(dataset_package, Dataset)

    # the three selector options differ only in flags, help text and choices
    selector_options = (
        ("-i", "--input", "The annotation input adapter.", available_adapters),
        ("-d", "--dataset", "The dataset to generate.", available_datasets),
        ("-o", "--output", "The annotation output adapter.", available_adapters),
    )
    selector_parser = argparse.ArgumentParser()
    for short_flag, long_flag, help_text, registry in selector_options:
        selector_parser.add_argument(short_flag,
                                     long_flag,
                                     help=help_text,
                                     type=str,
                                     choices=registry.keys(),
                                     required=True)

    # each parsing step hands the arguments it did not consume to the next one
    selection, remaining_argv = parse_known_args_with_help(selector_parser, sys.argv)
    input_adapter_class = available_adapters[selection.input]
    dataset_class = available_datasets[selection.dataset]
    output_adapter_class = available_adapters[selection.output]

    # input adapter arguments
    input_args, remaining_argv = parse_known_args_with_help(
        input_adapter_class.argparse(prefix='input'), remaining_argv)

    # dataset arguments
    dataset_args, remaining_argv = parse_known_args_with_help(
        dataset_class.argparse(), remaining_argv)

    # output adapter arguments
    output_args, remaining_argv = parse_known_args_with_help(
        output_adapter_class.argparse(prefix='output'), remaining_argv)

    # build the adapters first, then pass them to the dataset along with its own arguments
    source_adapter = input_adapter_class(**vars(input_args))
    target_adapter = output_adapter_class(**vars(output_args))
    dataset_kwargs = vars(dataset_args)
    dataset_kwargs.update(input_adapter=source_adapter, output_adapter=target_adapter)
    target_dataset = dataset_class(**dataset_kwargs)

    # record the command line used for this build
    logger.info('Build parameters:')
    logger.info(' '.join(sys.argv[1:]))

    generate(target_dataset, build_log_handler)

In [None]:
# hide

# for generating scripts from notebook directly
# Runs the nbdev export; each processed notebook is reported with a
# "Converted <name>.ipynb." line in the cell output.
from nbdev.export import notebook2script
notebook2script()

Converted annotation-core.ipynb.
Converted annotation-folder_category_adapter.ipynb.
Converted annotation-multi_category_adapter.ipynb.
Converted annotation-via_adapter.ipynb.
Converted annotation-yolo_adapter.ipynb.
Converted annotation_converter.ipynb.
Converted annotation_viewer.ipynb.
Converted category_tools.ipynb.
Converted core.ipynb.
Converted dataset-core.ipynb.
Converted dataset-image_classification.ipynb.
Converted dataset-image_object_detection.ipynb.
Converted dataset-image_segmentation.ipynb.
Converted dataset-type.ipynb.
Converted dataset_generator.ipynb.
Converted evaluation-core.ipynb.
Converted geometry.ipynb.
Converted image-color_palette.ipynb.
Converted image-inference.ipynb.
Converted image-opencv_tools.ipynb.
Converted image-pillow_tools.ipynb.
Converted image-tools.ipynb.
Converted index.ipynb.
Converted io-core.ipynb.
Converted tensorflow-tflite_converter.ipynb.
Converted tensorflow-tflite_metadata.ipynb.
Converted tensorflow-tfrecord_builder.ipynb.
Converted t