In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# specify parameters
# Three parameter dicts are handed to NotebookSubstep in the next cell:
# pipeline-level, step-level and substep-level. The first two are empty here.
pipeline_params={
}
step_params={
}
# NOTE(review): neither FILTER_EMPTY_GT nor MIN_OBJECT_SIZE is read by any active
# code in this notebook (substep_params only appears in a commented-out CONFIG
# block) - confirm they are consumed elsewhere, or wire them into the filtering.
substep_params={
    "FILTER_EMPTY_GT"    : False,
    "MIN_OBJECT_SIZE"    : 5
}

In [None]:
# define substep interface
from sinara.substep import NotebookSubstep, default_param_values, ENV_NAME, PIPELINE_NAME, ZONE_NAME, STEP_NAME, RUN_ID, ENTITY_NAME, ENTITY_PATH, SUBSTEP_NAME

# Build the substep from the three parameter dicts defined above, plus the keyword
# arguments returned by default_param_values("params/step_params.json").
substep = NotebookSubstep(pipeline_params, step_params, substep_params, **default_param_values("params/step_params.json"))

# Declare the entities this substep works with:
#   inputs       - entities taken from the "data_load" step
#   tmp_entities - temporary working entities used inside this notebook
#   outputs      - entities this substep produces
substep.interface(
    inputs =
    [
        {STEP_NAME: "data_load", ENTITY_NAME: "images"}, # images from data_load step
        {STEP_NAME: "data_load", ENTITY_NAME: "annotations"} # coco annotations from data_load step
    ],
    tmp_entities =
    [    
        { ENTITY_NAME: "images"}, # extracted temporary images from Sinara Archive
        { ENTITY_NAME: "annotations"}, # extracted temporary annotations from Sinara Archive
        { ENTITY_NAME: "train_data"}, # temporary coco dataset for object detector train
        { ENTITY_NAME: "eval_data"}, # temporary coco dataset for object detector eval
        { ENTITY_NAME: "test_data"}, # temporary coco dataset for object detector test
        #{ ENTITY_NAME: "dataset_config"} # information about all datasets and classes used for pipeline
    ],
    outputs = 
    [
        { ENTITY_NAME: "train_data"}, # coco dataset archived for object detector train
        { ENTITY_NAME: "eval_data"}, # coco dataset archived  for object detector eval
        { ENTITY_NAME: "test_data"}, # coco dataset archived  for object detector test
        #{ ENTITY_NAME: "dataset_config"} # information about all datasets and classes used for pipeline
    ]
)

# Print a summary of the declared interface.
substep.print_interface_info()

# NOTE(review): presumably ends execution here when the notebook is run only to
# visualize the interface - confirm against the Sinara documentation.
substep.exit_in_visualize_mode()

In [None]:
# run spark
from sinara.spark import SinaraSpark

# Start the Spark session used by this notebook; it is stopped in the last cell.
# NOTE(review): the meaning of the positional argument 0 is not visible here -
# confirm against SinaraSpark.run_session.
spark = SinaraSpark.run_session(0)
# Last expression of the cell, so its return value (the Spark UI URL, judging by
# the method name) is displayed as the cell output.
SinaraSpark.ui_url()

### Loading dataset and annotation files (from the previous component data_load)

In [None]:
from sinara.store import SinaraStore

# Handles to the entities declared in substep.interface(): the inputs coming from
# the "data_load" step and this substep's temporary entities.
inputs = substep.inputs(step_name = "data_load")
tmp_entities = substep.tmp_entities()

# copy data from the previous step to tmp_entities:
# images and annotations are de-archived from the store into the temporary
# "images" and "annotations" folders used by the following cells
SinaraStore.dearchive_store_files_to_tmp(store_path=inputs.images, tmp_dir=tmp_entities.images)
SinaraStore.dearchive_store_files_to_tmp(store_path=inputs.annotations, tmp_dir=tmp_entities.annotations)

### Selecting object categories from general annotation

In [None]:
from utils.coco import join_coco_files, load as load_coco
from utils.coco import preview_coco_file
from utils.coco import show_item
from utils.coco import get_dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np
import os.path as osp
import os

# Read the COCO annotation file extracted into the temporary annotations folder
coco_annotation = load_coco(osp.join(tmp_entities.annotations, "instances_val2017.json"))

# Object classes kept for the subsequent neural network training
select_object_names = ["person", "bicycle", "car", "motorcycle", "bus", "truck"]
# CLASSES = select_object_names

# Keep only the wanted categories (in file order) and renumber them 1..N;
# the id each category had in the source file is preserved under "old_id".
kept_categories = [
    category
    for category in coco_annotation["categories"]
    if category["name"] in select_object_names
]
select_categories = [
    {**category, "old_id": category["id"], "id": position}
    for position, category in enumerate(kept_categories, 1)
]

# Lookup table: category id in the source file -> renumbered category id
reid_categories_ids = {category["old_id"]: category["id"] for category in select_categories}

# Keep only annotations of the selected categories, as copies that carry the new id
new_coco_annotations = [
    {**annot, "category_id": reid_categories_ids[annot["category_id"]]}
    for annot in coco_annotation["annotations"]
    if annot["category_id"] in reid_categories_ids
]

# Replace the categories and annotations of the loaded dataset with the filtered ones
coco_annotation["categories"] = list(select_categories)
coco_annotation["annotations"] = list(new_coco_annotations)

### Split the dataset into train, validation and test parts

In [None]:
# Split the image records into train, validation and test parts.
# 33% of the images are held out first; 10% of that hold-out becomes the test part
# and the remainder the validation part. Both calls use a fixed random_state so the
# partition is reproducible between runs.
train_coco_images, holdout_coco_images = train_test_split(coco_annotation["images"], test_size=0.33, random_state=42)
val_coco_images, test_coco_images = train_test_split(holdout_coco_images, test_size=0.1, random_state=42)

# Image ids and file names of every part (kept as lists, in split order)
train_images_ids = [img_info["id"] for img_info in train_coco_images]
val_images_ids = [img_info["id"] for img_info in val_coco_images]
test_images_ids = [img_info["id"] for img_info in test_coco_images]

train_images_names = [img_info["file_name"] for img_info in train_coco_images]
val_images_names = [img_info["file_name"] for img_info in val_coco_images]
test_images_names = [img_info["file_name"] for img_info in test_coco_images]

# Membership tests against a list rescan the list for every annotation, which
# makes the filtering below quadratic. Sets give the same result in one pass.
train_images_id_set = set(train_images_ids)
val_images_id_set = set(val_images_ids)
test_images_id_set = set(test_images_ids)

# Route a copy of every annotation to the part that owns its image
train_coco_annotations = [annot.copy() for annot in coco_annotation["annotations"] if annot["image_id"] in train_images_id_set]
val_coco_annotations = [annot.copy() for annot in coco_annotation["annotations"] if annot["image_id"] in val_images_id_set]
test_coco_annotations = [annot.copy() for annot in coco_annotation["annotations"] if annot["image_id"] in test_images_id_set]


# Each part starts as a shallow copy of the full dataset dict, so all other
# top-level keys are shared, and then gets its own "images" and "annotations".

# create coco annotation for train dataset
train_coco = coco_annotation.copy()
train_coco["images"] = train_coco_images
train_coco["annotations"] = train_coco_annotations

# create coco annotation for validation dataset
val_coco = coco_annotation.copy()
val_coco["images"] = val_coco_images
val_coco["annotations"] = val_coco_annotations

# create coco annotation for test dataset
test_coco = coco_annotation.copy()
test_coco["images"] = test_coco_images
test_coco["annotations"] = test_coco_annotations

### Review Datasets

In [None]:
# Show two sample items from each of the train, validation and test datasets
for split_coco in (train_coco, val_coco, test_coco):
    preview_coco_file(split_coco, img_folder=tmp_entities.images, count=2)

In [None]:
# overview of the distribution of labeled data (detection)
import matplotlib.pyplot as plt
import plotly.express as px

areas  = []                 # area of every annotated object
counts = []                 # number of annotated objects per image
categories = []             # category id of every annotated object
categories_annotation = []

anns = coco_annotation.get('annotations', [])

# Group the annotations by image id once, instead of rescanning the whole
# annotation list for every image (which is quadratic in dataset size).
anns_by_image_id = {}
for ann in anns:
    anns_by_image_id.setdefault(ann['image_id'], []).append(ann)

for image in coco_annotation.get('images', []):
    image_anns = anns_by_image_id.get(image['id'], [])
    counts.append(len(image_anns))

    for ann in image_anns:
        areas.append(ann.get('area'))
        categories.append(ann.get('category_id'))

    # NOTE(review): this appends the full category list once per image and nothing
    # in this notebook reads the result - confirm whether a single copy was intended.
    categories_annotation += coco_annotation['categories']

counts = np.array(counts)
areas  = np.array(areas)

# overview of the distribution of annotated object areas throughout the entire dataset
fig = px.histogram(areas, title='Area of objects at dataset images')
fig.layout.yaxis.title = 'Objects count'
fig.layout.xaxis.title = 'Area'
fig.show()

# overview of the distribution of the number of annotated objects per image
# x axis: how many objects an image contains; y axis: how many images fall in that bin
fig = px.histogram(counts, title='Objects count at dataset images')
fig.layout.yaxis.title = 'Images count'
fig.layout.xaxis.title = 'Objects count'
fig.show()

### Copy train, validation and test images to temporary dataset folders

In [None]:
# Save images for train, validation and test datasets to tmp_entities
import shutil
from tqdm import tqdm

def prepare_dataset_images(_coco_data, source_img_folder:str, dest_img_folder: str):
    """Copy every image listed in a COCO dataset from one folder to another.

    Args:
        _coco_data: COCO-style dict. Only its "images" list is read, and from
            each entry only the "file_name" value.
        source_img_folder: folder that currently holds the image files.
        dest_img_folder: folder that receives the copies. It is created on
            demand, together with any sub-folders contained in "file_name".

    Returns:
        None. The function is used for its side effect of copying files.

    Raises:
        KeyError: if "images" or an entry's "file_name" is missing.
        OSError: propagated from shutil.copyfile / os.makedirs, e.g. when a
            listed image does not exist in source_img_folder.
    """
    for img_info in tqdm(_coco_data["images"]):
        source_file_name = osp.join(source_img_folder, img_info["file_name"])
        dest_file_name = osp.join(dest_img_folder, img_info["file_name"])
        # shutil.copyfile does not create missing folders, so make sure the
        # target folder exists (file_name may itself contain sub-folders).
        dest_dir = osp.dirname(dest_file_name)
        if dest_dir:
            os.makedirs(dest_dir, exist_ok=True)
        shutil.copyfile(source_file_name, dest_file_name)
        
# Fill each split's temporary folder with the images listed in its dataset
for split_coco, entity_name in (
    (train_coco, "train_data"),
    (val_coco, "eval_data"),
    (test_coco, "test_data"),
):
    prepare_dataset_images(
        split_coco,
        source_img_folder=tmp_entities.images,
        dest_img_folder=getattr(tmp_entities, entity_name),
    )

### Save train, validation and test annotations to json

In [None]:
# save annotations for every datasets
import json
# Target file of every split, placed next to that split's images
train_annotation_path = osp.join(tmp_entities.train_data, "train_coco_annotations.json")
val_annotation_path = osp.join(tmp_entities.eval_data, "val_coco_annotations.json")
test_annotation_path = osp.join(tmp_entities.test_data, "test_coco_annotations.json")

# Write each split's COCO dict as indented JSON
for annotation_path, split_coco in (
    (train_annotation_path, train_coco),
    (val_annotation_path, val_coco),
    (test_annotation_path, test_coco),
):
    with open(annotation_path, 'w') as annotation_file:
        json.dump(split_coco, annotation_file, indent=4)

    
# CONFIG = dict(**substep_params)
# CONFIG["train_coco_annotation"] = "train_coco_annotations.json"
# CONFIG["val_coco_annotation"] = "val_coco_annotations.json"
# CONFIG["test_coco_annotation"] = "test_coco_annotations.json"
# CONFIG["train_images"] = "train_data"
# CONFIG["val_images"] = "eval_data"
# CONFIG["test_images"] = "test_data"
# TODO:
# CONFIG["CLASSES"] = CLASSES

# config_path = osp.join(tmp_entities.dataset_config, "config.json")
# with open(config_path, 'w') as f:
#     json.dump(CONFIG, f, indent=4) 

### Archiving train, validation and test coco datasets

In [None]:
# save tmp_entities (train_data,eval_data,test_data) to outputs of step data_prep
from sinara.store import SinaraStore

outputs = substep.outputs()

# Archive every temporary dataset folder into the output entity of the same name
for entity_name in ("train_data", "eval_data", "test_data"):
    SinaraStore.archive_tmp_files_to_store(
        tmp_dir=getattr(tmp_entities, entity_name),
        store_path=getattr(outputs, entity_name),
    )
#SinaraStore.archive_tmp_files_to_store(tmp_dir=tmp_entities.dataset_config, store_path=outputs.dataset_config)

In [None]:
# stop spark: ends the session started earlier with SinaraSpark.run_session
SinaraSpark.stop_session()