In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# specify substep parameters for interactive run
# this cell will be replaced during job run with the parameters from json within params subfolder
# NOTE(review): neither key below is referenced anywhere else in this notebook as shown —
# confirm whether they are meant to be applied here or can be removed.
substep_params={
    "FILTER_EMPTY_GT"    : False,  # presumably: drop images that have no ground-truth annotations — TODO confirm
    "MIN_OBJECT_SIZE"    : 5       # presumably: minimum object size to keep (units not shown here) — TODO confirm
}

In [None]:
# load pipeline and step parameters - do not edit
# Both values are passed to NotebookSubstep in the next cell together with substep_params.
# pprint=True is presumably there to print the loaded values in the cell output — confirm against sinara docs.
from sinara.substep import get_pipeline_params, get_step_params
pipeline_params = get_pipeline_params(pprint=True)
step_params = get_step_params(pprint=True)

In [None]:
# define substep interface
# Declares which entities this substep reads (inputs), which scratch entities it uses while
# running (tmp_entities) and which entities it publishes (outputs). The entity names declared
# here are the attribute names used later on substep.inputs(), substep.tmp_entities() and
# substep.outputs().
from sinara.substep import NotebookSubstep, ENV_NAME, PIPELINE_NAME, ZONE_NAME, STEP_NAME, RUN_ID, ENTITY_NAME, ENTITY_PATH, SUBSTEP_NAME

substep = NotebookSubstep(pipeline_params, step_params, substep_params)

substep.interface(
    # entities produced by the upstream "data_load" step
    inputs =
    [
        {STEP_NAME: "data_load", ENTITY_NAME: "coco_datasets_images"}, # images from data_load step
        {STEP_NAME: "data_load", ENTITY_NAME: "coco_datasets_annotations"} # coco annotations from data_load step
    ],
    # working entities: the first two receive the unpacked inputs, the last three are staged
    # datasets that are later packed into the outputs of the same name
    tmp_entities =
    [    
        { ENTITY_NAME: "coco_datasets_images"}, # extracted temporary images from Sinara Archive
        { ENTITY_NAME: "coco_datasets_annotations"}, # extracted temporary annotations from Sinara Archive
        { ENTITY_NAME: "coco_train_dataset"}, # temporary coco dataset for object detector train
        { ENTITY_NAME: "coco_eval_dataset"}, # temporary coco dataset for object detector eval
        { ENTITY_NAME: "coco_test_dataset"}, # temporary coco dataset for object detector test
    ],
    # entities published by this substep
    outputs = 
    [
        { ENTITY_NAME: "coco_train_dataset"}, # coco dataset archived for object detector train
        { ENTITY_NAME: "coco_eval_dataset"}, # coco dataset archived for object detector eval
        { ENTITY_NAME: "coco_test_dataset"}, # coco dataset archived for object detector test
    ]
)

substep.print_interface_info()

# NOTE(review): judging by the name, this stops the notebook here when it is run only to
# visualize the interface — confirm against sinara docs.
substep.exit_in_visualize_mode()

In [None]:
# specify all notebook wide libraries imports here
# Sinara lib imports is left in the place of their usage
from utils.coco import preview_coco_file, load_coco_file
from sklearn.model_selection import train_test_split
import numpy as np
import os.path as osp
import os
import matplotlib.pyplot as plt
import plotly.express as px
from utils.coco.utils import prepare_coco_dataset_images
import json

In [None]:
# run spark
# The session is needed by SinaraArchive, which is used below to unpack the inputs and to
# pack the outputs of this substep.
from sinara.spark import SinaraSpark
from sinara.archive import SinaraArchive

# NOTE(review): the meaning of the positional argument 0 is not visible here — confirm against sinara docs
spark = SinaraSpark.run_session(0)
archive = SinaraArchive(spark)
# kept as the last expression of the cell so that its value is shown in the cell output
SinaraSpark.ui_url()

### Loading coco_datasets_images and annotation files (from the previous step data_load)

In [None]:
# resolve the entities declared in substep.interface(): inputs published by the data_load step
# and the temporary entities of this substep
inputs = substep.inputs(step_name = "data_load")
tmp_entities = substep.tmp_entities()

# copy data from previous step to tmp_entities
# (images and annotations are unpacked into tmp entities of the same name)
archive.unpack_files_from_store_to_tmp(store_path=inputs.coco_datasets_images, tmp_entity_dir=tmp_entities.coco_datasets_images)
archive.unpack_files_from_store_to_tmp(store_path=inputs.coco_datasets_annotations, tmp_entity_dir=tmp_entities.coco_datasets_annotations)

### Selecting object categories from general annotation

In [None]:
# Read the annotation file that was unpacked from the data_load step
annotation_file = osp.join(tmp_entities.coco_datasets_annotations, "instances_val2017.json")
coco_annotation = load_coco_file(annotation_file)

# Object types the detector will be trained on
select_object_names = ["person", "bicycle", "car", "motorcycle", "bus", "truck"]

# Keep only the wanted categories (in file order) and renumber them 1..N;
# the identifier from the source file is preserved under "old_id"
select_categories = []
for source_category in coco_annotation["categories"]:
    if source_category["name"] not in select_object_names:
        continue
    renumbered = dict(source_category)
    renumbered["old_id"] = source_category["id"]
    renumbered["id"] = len(select_categories) + 1
    select_categories.append(renumbered)

# Lookup table: category id in the source file -> renumbered category id
reid_categories_ids = {category["old_id"]: category["id"] for category in select_categories}

# Keep only annotations of the selected categories, rewritten to the new ids.
# Each kept annotation is a new dict, so the loaded annotation dicts are not modified.
new_coco_annotations = [
    {**annot, "category_id": reid_categories_ids[annot["category_id"]]}
    for annot in coco_annotation["annotations"]
    if annot["category_id"] in reid_categories_ids
]

# Replace categories and annotations of the loaded dataset with the filtered versions
coco_annotation["categories"] = list(select_categories)
coco_annotation["annotations"] = list(new_coco_annotations)

### Split the COCO dataset into train, validation and test sets

In [None]:
# Split the images into train, validation and test parts.
# 33% of the images are held out first; 10% of that hold-out becomes the test part and the
# rest the validation part (roughly 67% / 30% / 3% of all images).
# random_state is fixed so that the split is reproducible between runs.
train_coco_images, val_coco_images = train_test_split(coco_annotation["images"], test_size=0.33, random_state=42)
val_coco_images, test_coco_images = train_test_split(val_coco_images, test_size=0.1, random_state=42)

train_images_ids = [img_info["id"] for img_info in train_coco_images]
val_images_ids = [img_info["id"] for img_info in val_coco_images]
test_images_ids = [img_info["id"] for img_info in test_coco_images]

train_images_names = [img_info["file_name"] for img_info in train_coco_images]
val_images_names = [img_info["file_name"] for img_info in val_coco_images]
test_images_names = [img_info["file_name"] for img_info in test_coco_images]

# Sets give constant-time membership tests; testing against the id lists would rescan a
# list of image ids for every single annotation.
train_images_ids_set = set(train_images_ids)
val_images_ids_set = set(val_images_ids)
test_images_ids_set = set(test_images_ids)

# Route every annotation to the part that owns its image (annotation order is preserved)
train_coco_annotations = [annot.copy() for annot in coco_annotation["annotations"] if annot["image_id"] in train_images_ids_set]
val_coco_annotations = [annot.copy() for annot in coco_annotation["annotations"] if annot["image_id"] in val_images_ids_set]
test_coco_annotations = [annot.copy() for annot in coco_annotation["annotations"] if annot["image_id"] in test_images_ids_set]

# The shallow copies below share every top-level entry of coco_annotation (e.g. "categories")
# except "images" and "annotations", which are replaced per part.

# create coco annotation for train dataset
train_coco = coco_annotation.copy()
train_coco["images"] = train_coco_images
train_coco["annotations"] = train_coco_annotations

# create coco annotation for validation dataset
val_coco = coco_annotation.copy()
val_coco["images"] = val_coco_images
val_coco["annotations"] = val_coco_annotations

# create coco annotation for test dataset
test_coco = coco_annotation.copy()
test_coco["images"] = test_coco_images
test_coco["annotations"] = test_coco_annotations

### Review Coco Datasets

In [None]:
# preview examples of data from train, valid and test dataset
# All three parts read their images from the same unpacked image folder; the images are
# copied into per-dataset folders only later in the notebook.
# count=2 is presumably the number of examples shown per dataset — confirm in utils.coco.
preview_coco_file(train_coco, img_folder=tmp_entities.coco_datasets_images, count=2)
preview_coco_file(val_coco, img_folder=tmp_entities.coco_datasets_images, count=2)
preview_coco_file(test_coco, img_folder=tmp_entities.coco_datasets_images, count=2)

In [None]:
# overview of the distribution of labeled data (detection)
# Collect per-object and per-image statistics over the whole (not yet split) dataset:
#   areas      - 'area' value of every annotation that belongs to a listed image
#   counts     - number of annotations per image, in image order
#   categories - 'category_id' of every annotation that belongs to a listed image
areas  = []
counts = []
categories = []

anns = coco_annotation.get('annotations', [])

# Group the annotations by image in a single pass, so that the loop over images below
# does not have to rescan the full annotation list for every image.
anns_by_image = {}
for ann in anns:
    anns_by_image.setdefault(ann['image_id'], []).append(ann)

for image in coco_annotation.get('images', []):
    # images without any annotation contribute a count of 0
    image_anns = anns_by_image.get(image['id'], [])
    counts.append(len(image_anns))

    for ann in image_anns:
        areas.append(ann.get('area'))
        categories.append(ann.get('category_id'))

counts = np.array(counts)
areas  = np.array(areas)

# overview of the distribution of annotated object areas throughout the entire dataset
fig = px.histogram(areas, title='Area of objects at dataset images')
fig.layout.yaxis.title = 'Objects count'
fig.layout.xaxis.title = 'Area'
fig.show()

# overview of the distribution of the number of annotated objects per image
# (x axis: how many objects an image has, y axis: how many images have that many objects)
fig = px.histogram(counts, title='Objects count at dataset images')
fig.layout.xaxis.title = 'Objects per image'
fig.layout.yaxis.title = 'Images count'
fig.show()

### Save temporarily train, validation and test coco datasets to parquets

In [None]:
# Save images for train, validation and test coco datasets to tmp_entities
# Each call takes one split, the shared folder with the unpacked source images and the
# temporary folder of the matching dataset entity as destination.
# NOTE(review): presumably only the images listed in the given split are written to the
# destination folder — confirm in utils.coco.utils.
prepare_coco_dataset_images(train_coco, source_img_folder=tmp_entities.coco_datasets_images, dest_img_folder=tmp_entities.coco_train_dataset)
prepare_coco_dataset_images(val_coco, source_img_folder=tmp_entities.coco_datasets_images, dest_img_folder=tmp_entities.coco_eval_dataset)
prepare_coco_dataset_images(test_coco, source_img_folder=tmp_entities.coco_datasets_images, dest_img_folder=tmp_entities.coco_test_dataset)

### Temporarily save train, validation and test annotations to JSON

In [None]:
# Each annotation file is written into the temporary folder of its own dataset entity
train_annotation_path = osp.join(tmp_entities.coco_train_dataset, "train_coco_annotations.json")
val_annotation_path = osp.join(tmp_entities.coco_eval_dataset, "val_coco_annotations.json")
test_annotation_path = osp.join(tmp_entities.coco_test_dataset, "test_coco_annotations.json")

# Serialise the three splits the same way: JSON indented by 4 spaces
for annotation_path, coco_split in (
    (train_annotation_path, train_coco),
    (val_annotation_path, val_coco),
    (test_annotation_path, test_coco),
):
    with open(annotation_path, 'w') as f:
        json.dump(coco_split, f, indent=4)

### Archiving train, validation and test coco datasets to Sinara Storage

In [None]:
# save tmp_entities (coco_train_dataset, coco_eval_dataset, coco_test_dataset) to outputs of step data_prep
# Each temporary dataset folder (images plus its annotation json written above) is packed
# into the output entity of the same name.
outputs = substep.outputs()

archive.pack_files_from_tmp_to_store(tmp_entity_dir=tmp_entities.coco_train_dataset, store_path=outputs.coco_train_dataset)
archive.pack_files_from_tmp_to_store(tmp_entity_dir=tmp_entities.coco_eval_dataset, store_path=outputs.coco_eval_dataset)
archive.pack_files_from_tmp_to_store(tmp_entity_dir=tmp_entities.coco_test_dataset, store_path=outputs.coco_test_dataset)

In [None]:
# stop spark
# Last cell of the notebook: the session started by SinaraSpark.run_session() is no longer
# needed once the outputs have been packed.
SinaraSpark.stop_session()