In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell execution and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
#2. specify parameters
# Three parameter dicts that are handed to NotebookSubstep in the next cell.
pipeline_params={
}
step_params={
}
# substep_params is also copied verbatim into the CONFIG dict that is written
# to config.json when the annotations are saved.
# NOTE(review): neither value is read by the visible cells of this notebook;
# presumably they are consumed by a later pipeline step — confirm.
substep_params={
    "FILTER_EMPTY_GT"    : False,
    "MIN_OBJECT_SIZE"    : 5
}

In [None]:
#3 define substep interface
from sinara.substep import NotebookSubstep, default_param_values, ENV_NAME, PIPELINE_NAME, ZONE_NAME, STEP_NAME, RUN_ID, ENTITY_NAME, ENTITY_PATH, SUBSTEP_NAME

# Build the substep from the three parameter dicts plus whatever
# default_param_values returns for "params/step_params.json".
substep = NotebookSubstep(pipeline_params, step_params, substep_params, **default_param_values("params/step_params.json"))

# Declare the entities this notebook reads, stages locally and publishes.
substep.interface(
    # Both inputs come from the "data_load" step.
    inputs =
    [
        {STEP_NAME: "data_load", ENTITY_NAME: "images"},
        {STEP_NAME: "data_load", ENTITY_NAME: "annotations"}
    ],
    # Temporary entities: the two inputs are unpacked into "images" and
    # "annotations"; the remaining five stage the data that is later copied
    # to the outputs of the same name.
    tmp_entities =
    [    
        { ENTITY_NAME: "images"},
        { ENTITY_NAME: "annotations"},
        { ENTITY_NAME: "train_data"},
        { ENTITY_NAME: "eval_data"},
        { ENTITY_NAME: "test_data"},
        # TODO: annotations should be stored with corresponding images
        { ENTITY_NAME: "train_eval_config"},
        { ENTITY_NAME: "test_config"}
    ],
    # Published entities: one image set per split, plus the annotation/config
    # folders for train+eval and for test.
    outputs = 
    [
        { ENTITY_NAME: "train_data"},
        { ENTITY_NAME: "eval_data"},
        { ENTITY_NAME: "test_data"},
        # TODO: annotations should be stored with corresponding images
        { ENTITY_NAME: "train_eval_config"},
        { ENTITY_NAME: "test_config"}
    ]
    
)

substep.print_interface_info()

# NOTE(review): presumably stops the notebook here when it is executed only to
# visualize the interface — confirm against the Sinara documentation.
substep.exit_in_visualize_mode()

![interface data_prep](./imgs/data_prep_inteface.drawio.png)

In [None]:
#5 run spark
from sinara.spark import SinaraSpark

# NOTE(review): the meaning of the positional argument 0 is not visible in this
# notebook — confirm against SinaraSpark.run_session.
spark = SinaraSpark.run_session(0)
# Kept as the last expression of the cell so the notebook displays its result.
SinaraSpark.ui_url()

### Loading dataset and annotation files (from the previous component data_load)

In [None]:
from sinara.store import SinaraStore

# Look up the entities published by the "data_load" step and the temporary
# folders declared for this substep.
inputs = substep.inputs(step_name = "data_load")
tmp_entities = substep.tmp_entities()

# Copy data from the previous step into the tmp entity of the same name.
for entity in ("images", "annotations"):
    SinaraStore.dearchive_store_files_to_tmp(
        store_path=getattr(inputs, entity),
        tmp_dir=getattr(tmp_entities, entity),
    )

Checking annotations for empty objects, selecting object categories

In [None]:
from utils.coco import join_coco_files, load as load_coco
from utils.coco import preview_coco_file
from utils.coco import show_item
from utils.coco import get_dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np
import os.path as osp
import os

In [None]:
# Load annotation from json
# If using the full coco_dataset then use the full annotation file instances_train2017.json
# The file is looked up in the tmp folder that the "annotations" input was
# unpacked into earlier in this notebook.
coco_annotation = load_coco(osp.join(tmp_entities.annotations, "instances_val2017.json"))

In [None]:
# Selection of object types for subsequent neural network training
select_object_names = ["person", "bicycle", "car", "motorcycle", "bus", "truck"]

# Work on copies of the matching category records so the id rewrite below does
# not touch the records inside the loaded annotation.
select_categories = [
    cat_info.copy()
    for cat_info in coco_annotation["categories"]
    if cat_info["name"] in select_object_names
]

# Renumber the kept categories 1..N in annotation-file order; the original id
# is preserved under "old_id" so annotations can be remapped afterwards.
for new_id, cat_info in enumerate(select_categories, 1):
    cat_info["old_id"] = cat_info["id"]
    cat_info["id"] = new_id

# Derive the class list from the renumbered categories, so CLASSES[i] is always
# the name of category id i + 1. Aliasing select_object_names would silently
# mislabel classes whenever that list is ordered differently from the
# annotation file or names a category the file does not contain.
CLASSES = [cat_info["name"] for cat_info in select_categories]
select_categories

In [None]:
# Keep only the annotations whose category was selected, rewriting their
# category id on a copy of the annotation record.
# Mapping: original category id -> renumbered category id.
reid_categories_ids = {
    cat_info["old_id"]: cat_info["id"] for cat_info in select_categories
}

new_coco_annotations = [
    {**annot, "category_id": reid_categories_ids[annot["category_id"]]}
    for annot in coco_annotation["annotations"]
    if annot["category_id"] in reid_categories_ids
]

In [None]:
# Swap the filtered categories and annotations into the loaded COCO dict;
# list copies are stored, not the working lists themselves.
coco_annotation.update(
    categories=select_categories.copy(),
    annotations=new_coco_annotations.copy(),
)

In [None]:
# Display the top-level sections present in the annotation dict.
coco_annotation.keys()

### Split the dataset into train, validation and test sets

In [None]:
# Two-stage split with a fixed seed: 33% of the images are held out first,
# then 10% of that hold-out becomes the test set and the rest stays as the
# validation set.
train_coco_images, val_coco_images = train_test_split(
    coco_annotation["images"], test_size=0.33, random_state=42
)
val_coco_images, test_coco_images = train_test_split(
    val_coco_images.copy(), test_size=0.1, random_state=42
)

image_splits = (train_coco_images, val_coco_images, test_coco_images)

# Image ids of every split, in split order.
train_images_ids, val_images_ids, test_images_ids = (
    [img_info["id"] for img_info in split] for split in image_splits
)

# Image file names of every split, in split order.
train_images_names, val_images_names, test_images_names = (
    [img_info["file_name"] for img_info in split] for split in image_splits
)

In [None]:
# Report how many images ended up in each of the three splits.
for label, split_ids in (
    ("Count images for train datasets: ", train_images_ids),
    ("Count images for valid datasets: ", val_images_ids),
    ("Count images for test datasets: ", test_images_ids),
):
    print(label, len(split_ids))

In [None]:
# Collect a copy of every annotation that belongs to an image of each split.
# Membership is tested against sets: testing against the id lists would rescan
# a whole list for every annotation (images x annotations comparisons).
train_ids_lookup = set(train_images_ids)
val_ids_lookup = set(val_images_ids)
test_ids_lookup = set(test_images_ids)

train_coco_annotations = [annot.copy() for annot in coco_annotation["annotations"] if annot["image_id"] in train_ids_lookup]
val_coco_annotations = [annot.copy() for annot in coco_annotation["annotations"] if annot["image_id"] in val_ids_lookup]
test_coco_annotations = [annot.copy() for annot in coco_annotation["annotations"] if annot["image_id"] in test_ids_lookup]

In [None]:
# Every split starts as a shallow copy of the filtered annotation dict (the
# remaining top-level sections are shared) and then gets its own image and
# annotation lists.
train_coco, val_coco, test_coco = (coco_annotation.copy() for _ in range(3))

train_coco.update(images=train_coco_images, annotations=train_coco_annotations)
val_coco.update(images=val_coco_images, annotations=val_coco_annotations)
test_coco.update(images=test_coco_images, annotations=test_coco_annotations)

### Review Datasets

In [None]:
# Preview two items from each of the train, validation and test datasets.
# All three splits still read their images from the shared tmp image folder.
preview_coco_file(train_coco, img_folder=tmp_entities.images, count=2)
preview_coco_file(val_coco, img_folder=tmp_entities.images, count=2)
preview_coco_file(test_coco, img_folder=tmp_entities.images, count=2)

In [None]:
import matplotlib.pyplot as plt

from collections import defaultdict

# Per-dataset statistics used by the histograms in the following cells:
#   areas      - "area" of every annotation, grouped by image in image order
#   counts     - number of annotations per image, in image order
#   categories - "category_id" of every annotation, same order as areas
areas  = []
counts = []
categories = []
categories_annotation = []

# Index the annotations by image id once, instead of rescanning the whole
# annotation list for every image; the per-image order is unchanged.
anns = coco_annotation.get('annotations', [])
anns_by_image = defaultdict(list)
for ann in anns:
    anns_by_image[ann['image_id']].append(ann)

for image in coco_annotation.get('images', []):
    # .get() keeps images without annotations from adding keys to the index.
    image_anns = anns_by_image.get(image['id'], [])
    counts.append(len(image_anns))

    for ann in image_anns:
        areas.append(ann.get('area'))
        categories.append(ann.get('category_id'))

    # NOTE(review): this appends the full category list once per image and the
    # result is not used by the visible cells — kept as-is, confirm the intent.
    categories_annotation += coco_annotation['categories']

counts = np.array(counts)
areas  = np.array(areas)

In [None]:
import plotly.express as px

# Distribution of annotation areas: x axis is the area, y axis is the number
# of objects falling into each bin.
fig = px.histogram(areas, title='Area of objects at dataset images')
fig.update_layout(xaxis_title='Area', yaxis_title='Objects count')
fig.show()

In [None]:
# Distribution of per-image object counts. The histogram bins the number of
# objects found on an image (x axis) and counts how many images fall into each
# bin (y axis), so the axis titles must be assigned in that order.
fig = px.histogram(counts, title='Objects count at dataset images')
fig.layout.xaxis.title = 'Objects count'
fig.layout.yaxis.title = 'Images count'
fig.show()

### Copy train, validation and test images to their dataset folders

In [None]:
### Copy the images of each split (train, validation, test) into its own folder
import shutil
from tqdm import tqdm

def prepare_dataset_images(_coco_data, source_img_folder:str, dest_img_folder: str):
    """Copy every image listed in a COCO dict from one folder to another.

    Args:
        _coco_data: COCO-style mapping; only ``_coco_data["images"]`` is read
            and every entry must provide a ``"file_name"``.
        source_img_folder: folder that currently holds the image files.
        dest_img_folder: folder the images are copied into. It is created on
            demand, together with any sub-folder contained in a file name.

    Returns:
        None. The copied files are the only result.

    Raises:
        KeyError: if ``"images"`` or a ``"file_name"`` entry is missing.
        OSError: if a source image cannot be read or the destination cannot
            be written.
    """
    for img_info in tqdm(_coco_data["images"]):
        file_name = img_info["file_name"]
        source_file_name = osp.join(source_img_folder, file_name)
        dest_file_name = osp.join(dest_img_folder, file_name)

        # shutil.copyfile does not create missing folders, so make sure the
        # target folder exists (covers a fresh destination as well as file
        # names that contain a sub-folder).
        dest_dir = osp.dirname(dest_file_name)
        if dest_dir:
            os.makedirs(dest_dir, exist_ok=True)

        shutil.copyfile(source_file_name, dest_file_name)

In [None]:
# Copy the images of each split from the shared tmp image folder into the
# tmp folder that belongs to that split.
for split_coco, split_folder in (
    (train_coco, tmp_entities.train_data),
    (val_coco, tmp_entities.eval_data),
    (test_coco, tmp_entities.test_data),
):
    prepare_dataset_images(
        split_coco,
        source_img_folder=tmp_entities.images,
        dest_img_folder=split_folder,
    )

### Save train, validation and test annotations to json

In [None]:
# json is not imported anywhere else in this notebook; without this import the
# cell fails with a NameError on a fresh kernel.
import json

# save train and valid annotations as json to tmp_entities.train_eval_config
# save test annotations as json to tmp_entities.test_config
train_annotation_path = osp.join(tmp_entities.train_eval_config, "train_coco_annotations.json")
val_annotation_path = osp.join(tmp_entities.train_eval_config, "val_coco_annotations.json")
test_annotation_path = osp.join(tmp_entities.test_config, "test_coco_annotations.json")

for annotation_path, coco_data in (
    (train_annotation_path, train_coco),
    (val_annotation_path, val_coco),
    (test_annotation_path, test_coco),
):
    with open(annotation_path, 'w') as f:
        json.dump(coco_data, f, indent=4)

# The config starts from the substep parameters and adds the annotation file
# names, the image entity names of each split and the class list.
CONFIG = dict(**substep_params)
CONFIG["train_coco_annotation"] = "train_coco_annotations.json"
CONFIG["val_coco_annotation"] = "val_coco_annotations.json"
CONFIG["test_coco_annotation"] = "test_coco_annotations.json"
CONFIG["train_images"] = "train_data"
CONFIG["val_images"] = "eval_data"
CONFIG["test_images"] = "test_data"
CONFIG["CLASSES"] = CLASSES

# The same config.json is written next to both annotation sets.
for config_dir in (tmp_entities.train_eval_config, tmp_entities.test_config):
    config_path = osp.join(config_dir, "config.json")
    with open(config_path, 'w') as f:
        json.dump(CONFIG, f, indent=4)


### Send train, validation, test dataset and annotations to outputs

In [None]:
from sinara.store import SinaraStore

outputs = substep.outputs()

# Publish each staged tmp folder to the output entity that carries the same name.
for output_entity in (
    "train_data",
    "eval_data",
    "test_data",
    "train_eval_config",
    "test_config",
):
    SinaraStore.copy_tmp_files_to_store(
        tmp_dir=getattr(tmp_entities, output_entity),
        store_dir=getattr(outputs, output_entity),
    )

In [None]:
#7 stop spark
# Counterpart of the SinaraSpark.run_session call made near the top of the notebook.
SinaraSpark.stop_session()