In [2]:
import json
import os

def merge_coco_files(file1_path, file2_path, output_path, *,
                     target_folder="dataset_for_labeling/"):
    """Merge two COCO-style annotation JSON files and write the result.

    File 1 is the base: its image, annotation and category ids are kept.
    Everything from file 2 is appended to it with fresh ids:

    * categories are matched by ``name``; a category of file 2 whose name
      is unknown to file 1 is appended with the next free category id;
    * images get ids continuing after the largest image id of file 1;
    * annotations get ids continuing after the largest annotation id of
      file 1, and their ``image_id`` / ``category_id`` are rewritten to
      the new ids.

    Every ``file_name`` (from both files) is rewritten to
    ``<target_folder><basename>``. Progress and a final summary are printed.

    Args:
        file1_path: Path of the base JSON file.
        file2_path: Path of the JSON file merged into the base.
        output_path: Path the merged JSON is written to (UTF-8).
        target_folder: Prefix put in front of every image basename. A
            trailing ``/`` is added if a non-empty value lacks one.

    Returns:
        None. The merged data is written to ``output_path``.

    Raises:
        KeyError: If an annotation of file 2 references an image id or a
            category id that file 2 does not define, or if an entry lacks
            a required ``id`` / ``name`` field.
    """
    # Guarantee exactly one separator between the folder and the basename.
    if target_folder and not target_folder.endswith("/"):
        target_folder += "/"

    def normalize_file_name(path_str: str) -> str:
        """
        Always return: <target_folder><basename>
        Works for paths like:
        - "image_302.png"
        - "dataset_for_labeling/image_302.png"
        - "C:\\something\\dataset_for_labeling\\image_302.png"
        - "some/other/folder/image_302.png"
        """
        # Convert backslashes first so Windows-style paths are split
        # correctly even when this runs on a POSIX system.
        base = os.path.basename(path_str.replace("\\", "/"))
        return target_folder + base

    # 1. Load both JSON files
    with open(file1_path, "r", encoding="utf-8") as f:
        data1 = json.load(f)

    with open(file2_path, "r", encoding="utf-8") as f:
        data2 = json.load(f)

    # A base file missing one of the top-level lists is treated as having
    # an empty one instead of failing later with a KeyError.
    categories = data1.setdefault("categories", [])
    images = data1.setdefault("images", [])
    annotations = data1.setdefault("annotations", [])

    # 2. Verify / map categories by name
    cat_map = {}  # file-2 category id -> category id in the merged file
    name_to_id_1 = {cat["name"]: cat["id"] for cat in categories}
    # Track the next free id instead of rescanning the list per new category.
    next_cat_id = max((cat["id"] for cat in categories), default=0) + 1

    print("Mapping Categories...")
    for cat2 in data2.get("categories", []):
        name = cat2["name"]
        if name not in name_to_id_1:
            categories.append({**cat2, "id": next_cat_id})
            name_to_id_1[name] = next_cat_id
            print(f"  - Added new category: {name} (ID: {next_cat_id})")
            next_cat_id += 1
        cat_map[cat2["id"]] = name_to_id_1[name]

    # 3. Find max IDs in file1
    max_img_id = max((img["id"] for img in images), default=0)
    max_ann_id = max((ann["id"] for ann in annotations), default=0)

    print(f"Base Max Image ID: {max_img_id}")
    print(f"Base Max Ann ID: {max_ann_id}")

    # 4. Standardize paths for file1 images
    print("Standardizing file_name for file 1...")
    for img in images:
        img["file_name"] = normalize_file_name(img.get("file_name", ""))

    # 5. Process Images from File 2 (re-id + normalize file_name)
    img_id_map = {}  # file-2 image id -> image id in the merged file

    print("Merging and standardizing file_name for file 2...")
    for img in data2.get("images", []):
        max_img_id += 1
        img_id_map[img["id"]] = max_img_id

        img["id"] = max_img_id
        img["file_name"] = normalize_file_name(img.get("file_name", ""))

        images.append(img)

    # 6. Process Annotations from File 2
    for ann in data2.get("annotations", []):
        max_ann_id += 1
        ann["id"] = max_ann_id
        # Both lookups raise KeyError on a dangling reference, so a broken
        # input file is never silently written out.
        ann["image_id"] = img_id_map[ann["image_id"]]
        ann["category_id"] = cat_map[ann["category_id"]]
        annotations.append(ann)

    # 7. Save Result
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data1, f, ensure_ascii=False)

    print(f"✅ Success! Merged file saved to: {output_path}")
    print(f"   Total Images: {len(images)}")
    print(f"   Total Annotations: {len(annotations)}")

if __name__ == "__main__":
    # Script entry point: merge the already-merged 0-300 annotation file
    # with the 301-460 export and write the combined 0-460 file.
    # Raw strings keep the Windows backslashes literal.
    file1, file2, output = (
        r"C:\Users\adiha\Desktop\GenAi\annotations\merged_annotations0-300.json",
        r"C:\Users\adiha\Desktop\GenAi\annotations\instances_default301-460.json",
        r"C:\Users\adiha\Desktop\GenAi\annotations\merged_annotations0-460.json",
    )

    merge_coco_files(file1_path=file1, file2_path=file2, output_path=output)


Mapping Categories...
Base Max Image ID: 301
Base Max Ann ID: 2980
Standardizing file_name for file 1...
Merging and standardizing file_name for file 2...
✅ Success! Merged file saved to: C:\Users\adiha\Desktop\GenAi\annotations\merged_annotations0-460.json
   Total Images: 461
   Total Annotations: 5192
