In [8]:
# Install the logging dependency into the kernel's own environment.
# The version is pinned so a re-run reproduces the same environment.
%pip install loguru==0.7.2

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting loguru
  Downloading loguru-0.7.2-py3-none-any.whl.metadata (23 kB)
Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: loguru
Successfully installed loguru-0.7.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
import json
from typing import Any, Dict, Optional

from loguru import logger


@logger.catch(reraise=True)
def coco_merge(
    input_extend: str,
    input_add: str,
    output_file: str,
    indent: Optional[int] = None,
) -> str:
    """Merge two COCO annotation files into a single file.

    Every top-level key of ``input_extend`` other than ``"images"`` and
    ``"annotations"`` is copied to the output unchanged. Images and
    annotations of both inputs are appended in order (``input_extend``
    first) and renumbered with consecutive ids starting at 0. Categories are
    matched by ``"name"``: a category of ``input_add`` whose name already
    exists in the output reuses the existing id, otherwise it is appended
    with the id ``max(existing ids) + 1`` (or 1 when there are no categories
    yet). Annotation ``image_id`` / ``category_id`` values are rewritten to
    the new ids.

    A missing ``"categories"``, ``"images"`` or ``"annotations"`` key is
    treated as an empty list.

    Args:
        input_extend: Path to input file to be extended.
        input_add: Path to input file to be added.
        output_file: Path to output file with merged annotations. May be the
            same path as one of the inputs; both inputs are fully read before
            the output is opened for writing.
        indent: Argument passed to `json.dump`. See https://docs.python.org/3/library/json.html#json.dump.

    Returns:
        ``output_file``, unchanged.

    Raises:
        KeyError: If an annotation references an ``image_id`` or
            ``category_id`` that is not declared in the same input file.
    """
    # Read explicitly as UTF-8: the result is written as UTF-8 with
    # ensure_ascii=False, so relying on the locale default here could make the
    # function unable to re-read its own output.
    with open(input_extend, "r", encoding="utf-8") as f:
        data_extend = json.load(f)
    with open(input_add, "r", encoding="utf-8") as f:
        data_add = json.load(f)

    output: Dict[str, Any] = {
        k: v for k, v in data_extend.items() if k not in ("images", "annotations")
    }
    output["images"], output["annotations"] = [], []
    categories = output.setdefault("categories", [])

    # Category name -> id in the output. setdefault keeps the first occurrence,
    # so duplicated names resolve to the earliest category in the list.
    name_to_id: Dict[Any, Any] = {}
    for cat in categories:
        name_to_id.setdefault(cat["name"], cat["id"])

    for i, data in enumerate([data_extend, data_add]):
        images = data.get("images", [])
        annotations = data.get("annotations", [])
        logger.info(
            "Input {}: {} images, {} annotations",
            i + 1,
            len(images),
            len(annotations),
        )

        # Old category id (within this input) -> category id in the output.
        cat_id_map = {}
        for new_cat in data.get("categories", []):
            name = new_cat["name"]
            if name in name_to_id:
                cat_id_map[new_cat["id"]] = name_to_id[name]
            else:
                # default=0 covers an output that has no categories yet.
                new_cat_id = max((c["id"] for c in categories), default=0) + 1
                cat_id_map[new_cat["id"]] = new_cat_id
                new_cat["id"] = new_cat_id
                categories.append(new_cat)
                name_to_id[name] = new_cat_id

        # Old image id (within this input) -> image id in the output.
        img_id_map = {}
        for image in images:
            n_imgs = len(output["images"])
            img_id_map[image["id"]] = n_imgs
            image["id"] = n_imgs
            output["images"].append(image)

        for annotation in annotations:
            annotation["id"] = len(output["annotations"])
            annotation["image_id"] = img_id_map[annotation["image_id"]]
            annotation["category_id"] = cat_id_map[annotation["category_id"]]
            output["annotations"].append(annotation)

    logger.info(
        "Result: {} images, {} annotations",
        len(output["images"]),
        len(output["annotations"]),
    )

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=indent, ensure_ascii=False)

    return output_file

In [4]:
import os

In [23]:
# Collect the COCO annotation files of the labelled directory in sorted order.
# An entry qualifies when the dot-separated piece right before its last one
# contains 'coco'. Entries without any '.' (e.g. sub-directories) are skipped
# instead of raising IndexError on split('.')[-2].
coco_paths = sorted(
    path
    for path in os.listdir('../data/Уставные грамоты/Обработка/размеченные')
    if '.' in path and 'coco' in path.split('.')[-2]
)

In [24]:
# Number of annotation files collected in coco_paths (shown as the cell output).
len(coco_paths)

78

In [25]:
# Training split: merge every annotation file except the last 14 into one file.
base_path = '/home/admin01/vadim/historic_docs_19/data/Уставные грамоты/Обработка/размеченные/'
output_file = 'train_annotation.json'

# Seed the output by merging the first two annotation files.
input_extend = os.path.join(base_path, coco_paths[0])
input_add = os.path.join(base_path, coco_paths[1])
coco_merge(input_extend, input_add, output_file, indent=None)

# Fold each remaining training file into the accumulated output, in place.
for coco_path in coco_paths[2:-14]:
    input_add = os.path.join(base_path, coco_path)
    coco_merge(output_file, input_add, output_file, indent=None)
    

[32m2023-12-05 05:08:10.628[0m | [1mINFO    [0m | [36m__main__[0m:[36mcoco_merge[0m:[36m36[0m - [1mInput 1: 2 images, 60 annotations[0m
[32m2023-12-05 05:08:10.630[0m | [1mINFO    [0m | [36m__main__[0m:[36mcoco_merge[0m:[36m36[0m - [1mInput 2: 4 images, 79 annotations[0m
[32m2023-12-05 05:08:10.631[0m | [1mINFO    [0m | [36m__main__[0m:[36mcoco_merge[0m:[36m74[0m - [1mResult: 6 images, 139 annotations[0m
[32m2023-12-05 05:08:10.646[0m | [1mINFO    [0m | [36m__main__[0m:[36mcoco_merge[0m:[36m36[0m - [1mInput 1: 6 images, 139 annotations[0m
[32m2023-12-05 05:08:10.647[0m | [1mINFO    [0m | [36m__main__[0m:[36mcoco_merge[0m:[36m36[0m - [1mInput 2: 4 images, 78 annotations[0m
[32m2023-12-05 05:08:10.649[0m | [1mINFO    [0m | [36m__main__[0m:[36mcoco_merge[0m:[36m74[0m - [1mResult: 10 images, 217 annotations[0m
[32m2023-12-05 05:08:10.668[0m | [1mINFO    [0m | [36m__main__[0m:[36mcoco_merge[0m:[36m36[0m - [1m

Итого для обучения можно использовать 502 изображения и 13282 аннотации (возможно, строки).

In [26]:
# Validation split: merge the last 14 annotation files into one file.
base_path = '/home/admin01/vadim/historic_docs_19/data/Уставные грамоты/Обработка/размеченные/'
output_file = 'val_annotation.json'

# Seed the output by merging the first two files of the validation tail.
input_extend = os.path.join(base_path, coco_paths[-14])
input_add = os.path.join(base_path, coco_paths[-13])
coco_merge(input_extend, input_add, output_file, indent=None)

# Fold each remaining validation file into the accumulated output, in place.
for coco_path in coco_paths[-12:]:
    input_add = os.path.join(base_path, coco_path)
    coco_merge(output_file, input_add, output_file, indent=None)

[32m2023-12-05 05:08:19.945[0m | [1mINFO    [0m | [36m__main__[0m:[36mcoco_merge[0m:[36m36[0m - [1mInput 1: 5 images, 96 annotations[0m
[32m2023-12-05 05:08:19.947[0m | [1mINFO    [0m | [36m__main__[0m:[36mcoco_merge[0m:[36m36[0m - [1mInput 2: 6 images, 115 annotations[0m
[32m2023-12-05 05:08:19.949[0m | [1mINFO    [0m | [36m__main__[0m:[36mcoco_merge[0m:[36m74[0m - [1mResult: 11 images, 211 annotations[0m
[32m2023-12-05 05:08:19.964[0m | [1mINFO    [0m | [36m__main__[0m:[36mcoco_merge[0m:[36m36[0m - [1mInput 1: 11 images, 211 annotations[0m
[32m2023-12-05 05:08:19.964[0m | [1mINFO    [0m | [36m__main__[0m:[36mcoco_merge[0m:[36m36[0m - [1mInput 2: 7 images, 137 annotations[0m
[32m2023-12-05 05:08:19.965[0m | [1mINFO    [0m | [36m__main__[0m:[36mcoco_merge[0m:[36m74[0m - [1mResult: 18 images, 348 annotations[0m
[32m2023-12-05 05:08:19.977[0m | [1mINFO    [0m | [36m__main__[0m:[36mcoco_merge[0m:[36m36[0m - 

Итого для валидации можно использовать 58 изображений и 1327 аннотаций (возможно, строк).