### Import Libraries

In [45]:
# library for cap_f1
from cap_f1 import *
from datetime import datetime
from multiprocessing import Pool
import math
import json
import glob
import os


# Automatically reload edited Python modules so the kernel does not need to be restarted
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load Data

In [46]:
print("Load caption file...")

# Fields requested from each record of the original dataset.
keys = ["file_name", "human_captions", "model_captions"]
org_caption_dataset = read_json(
    "combined-caption-output_7304-images2025-03-29_21_40_00.json",
    keys,
)

# Human captions equal to this text are excluded below.
unusable_caption = "Quality issues are too severe to recognize visual content."

# One list of retained human caption strings per dataset entry, in dataset order.
all_human_captions = []
for item in org_caption_dataset:
    caption_texts = (hc["caption"] for hc in item["human_captions"])
    human_captions = [text for text in caption_texts if text != unusable_caption]
    all_human_captions.append(human_captions)



Load caption file...


In [40]:
# # This is for testing
# print(len(org_caption_dataset))
# org_caption_dataset = org_caption_dataset[:128]
# all_human_captions = all_human_captions[:128]

### Run with Multiple Processes

In [47]:
def process_batch(start_idx, end_idx, org_caption_dataset, all_human_captions, folder_path, timestamp, chunk_id):
    """Run the parse -> match -> Cap F1 pipeline on one slice of the dataset.

    The slice ``[start_idx:end_idx]`` is taken from both ``org_caption_dataset``
    and ``all_human_captions``. Each of the three stages saves a JSON file under
    ``folder_path`` whose name embeds ``timestamp`` and ``chunk_id``; the second
    and third saves pass the previous stage's file as ``update_existing``.

    Returns:
        None. Results are only written through ``save_results_json``.
    """
    dataset_chunk = org_caption_dataset[start_idx:end_idx]
    chunk_len = len(dataset_chunk)
    human_chunk = all_human_captions[start_idx:end_idx]

    # Output file of each stage for this chunk.
    parsed_path = f"{folder_path}/parsed_caption_{timestamp}_chunk{chunk_id}.json"
    matched_path = f"{folder_path}/recall_precision_{timestamp}_chunk{chunk_id}.json"
    final_path = f"{folder_path}/final_{timestamp}_chunk{chunk_id}.json"

    # Stage 1: parse the captions of this chunk into atomic statements.
    T_atomics, g_atomics = generate_atomic_statement(dataset_chunk, limit=chunk_len)
    save_results_json(
        output_path=parsed_path,
        org_dataset=dataset_chunk,
        T_atomics=T_atomics,
        g_atomics=g_atomics,
        limit=chunk_len,
    )

    # Stage 2: match human captions against the parsed statements.
    metadata = evaluate_matching(human_chunk, T_atomics, g_atomics)
    save_results_json(
        output_path=matched_path,
        update_existing=parsed_path,
        metadata=metadata,
        limit=chunk_len,
    )

    # Stage 3: compute Cap F1 from the matching metadata.
    evaluation = calculate_cap_f1(metadata)
    save_results_json(
        output_path=final_path,
        update_existing=matched_path,
        evaluations=evaluation,
        limit=chunk_len,
    )

def run_parallel_processing(org_caption_dataset, all_human_captions, folder_path, timestamp, num_workers=32):
    """Split the dataset into contiguous chunks and run ``process_batch`` on each in a process pool.

    Args:
        org_caption_dataset: Sequence of dataset entries; it is sliced into chunks.
        all_human_captions: Sequence sliced with the same index ranges as the dataset.
        folder_path: Forwarded unchanged to ``process_batch``.
        timestamp: Forwarded unchanged to ``process_batch``.
        num_workers: Maximum number of chunks (and worker processes) to use.

    Returns:
        None. Blocks until every submitted chunk has finished.

    Raises:
        ValueError: If ``num_workers`` is smaller than 1.
        Exception: Any exception raised inside a worker is re-raised by ``job.get()``.
    """
    if num_workers < 1:
        raise ValueError(f"num_workers must be at least 1, got {num_workers}")

    total = len(org_caption_dataset)
    if total == 0:
        # Nothing to process; avoid starting a pool just to run empty jobs.
        return

    chunk_size = math.ceil(total / num_workers)
    # Because chunk_size is rounded up, fewer than num_workers chunks may be
    # needed (e.g. 10 items with 32 workers). Iterating over real start offsets
    # never produces an empty chunk, and chunk ids still equal start // chunk_size.
    chunk_starts = range(0, total, chunk_size)

    with Pool(processes=min(num_workers, len(chunk_starts))) as pool:
        jobs = []
        for chunk_id, start_idx in enumerate(chunk_starts):
            end_idx = min(start_idx + chunk_size, total)
            # Ship only this chunk's data to the worker instead of pickling the
            # whole dataset for every job; process_batch then slices [0:len].
            dataset_chunk = org_caption_dataset[start_idx:end_idx]
            human_chunk = all_human_captions[start_idx:end_idx]
            job_args = (0, len(dataset_chunk), dataset_chunk, human_chunk, folder_path, timestamp, chunk_id)
            jobs.append(pool.apply_async(process_batch, job_args))

        # Wait for completion; get() re-raises the first worker failure.
        for job in jobs:
            job.get()

In [42]:
# for filename
now = datetime.now()
timestamp = now.strftime("%Y-%m-%d_%H-%M")

#create folder to save the results
folder_path = f"results/{timestamp}"
os.makedirs(folder_path, exist_ok=True)

run_parallel_processing(org_caption_dataset, all_human_captions, folder_path, timestamp, num_workers=32)

100%|██████████| 4/4 [00:39<00:00,  9.77s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk8.json


100%|██████████| 4/4 [00:44<00:00, 11.00s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk17.json


100%|██████████| 4/4 [00:44<00:00, 11.02s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk16.json


100%|██████████| 4/4 [00:44<00:00, 11.17s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk21.json


100%|██████████| 4/4 [00:44<00:00, 11.25s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk23.json


100%|██████████| 4/4 [00:46<00:00, 11.59s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk18.json


100%|██████████| 4/4 [00:46<00:00, 11.75s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk10.json


100%|██████████| 4/4 [00:47<00:00, 11.82s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk1.json


100%|██████████| 4/4 [00:48<00:00, 12.05s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk20.json


100%|██████████| 4/4 [00:48<00:00, 12.07s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk29.json


100%|██████████| 4/4 [00:48<00:00, 12.20s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk31.json

100%|██████████| 4/4 [00:48<00:00, 12.39s/it]




100%|██████████| 4/4 [00:48<00:00, 12.21s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk3.json


100%|██████████| 4/4 [00:48<00:00, 12.22s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk2.json


100%|██████████| 4/4 [00:48<00:00, 12.22s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk25.json


100%|██████████| 4/4 [00:49<00:00, 12.43s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk7.json


100%|██████████| 4/4 [00:49<00:00, 12.48s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk11.json


100%|██████████| 4/4 [00:50<00:00, 12.62s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk15.json


100%|██████████| 4/4 [00:51<00:00, 12.79s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk19.json


100%|██████████| 4/4 [00:51<00:00, 12.90s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk4.json


100%|██████████| 4/4 [00:51<00:00, 12.93s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk28.json


100%|██████████| 4/4 [00:53<00:00, 13.42s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk0.json


100%|██████████| 4/4 [00:54<00:00, 13.53s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk30.json


100%|██████████| 4/4 [00:55<00:00, 13.87s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk24.json


100%|██████████| 4/4 [00:56<00:00, 14.14s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk12.json


100%|██████████| 4/4 [00:58<00:00, 14.52s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk6.json


100%|██████████| 4/4 [00:58<00:00, 14.53s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk5.json


100%|██████████| 4/4 [01:02<00:00, 15.57s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk9.json


100%|██████████| 4/4 [01:03<00:00, 15.87s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk26.json


100%|██████████| 4/4 [01:05<00:00, 16.34s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk14.json


100%|██████████| 4/4 [01:08<00:00, 17.04s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk22.json


100%|██████████| 4/4 [01:08<00:00, 17.07s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk27.json


100%|██████████| 4/4 [00:42<00:00, 10.64s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk8.json


100%|██████████| 4/4 [00:00<00:00, 66841.50it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk8.json


100%|██████████| 4/4 [00:39<00:00,  9.83s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk16.json


100%|██████████| 4/4 [00:00<00:00, 64776.90it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk16.json


100%|██████████| 4/4 [00:38<00:00,  9.73s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk21.json


100%|██████████| 4/4 [00:00<00:00, 38746.46it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk21.json


100%|██████████| 4/4 [01:25<00:00, 21.46s/it]


Saved JSON to: results/2025-04-03_15-10/parsed_caption_2025-04-03_15-10_chunk13.json


100%|██████████| 4/4 [00:37<00:00,  9.29s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk19.json


100%|██████████| 4/4 [00:00<00:00, 38130.04it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk19.json


100%|██████████| 4/4 [00:45<00:00, 11.26s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk17.json


100%|██████████| 4/4 [00:00<00:00, 81049.35it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk17.json


100%|██████████| 4/4 [00:40<00:00, 10.23s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk29.json


100%|██████████| 4/4 [00:00<00:00, 61908.55it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk29.json

 75%|███████▌  | 3/4 [00:32<00:10, 10.75s/it]




100%|██████████| 4/4 [00:42<00:00, 10.52s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk15.json


100%|██████████| 4/4 [00:00<00:00, 69905.07it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk15.json


100%|██████████| 4/4 [00:43<00:00, 10.80s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk11.json


100%|██████████| 4/4 [00:00<00:00, 66313.11it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk11.json


100%|██████████| 4/4 [00:45<00:00, 11.30s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk20.json


100%|██████████| 4/4 [00:00<00:00, 66576.25it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk20.json


100%|██████████| 4/4 [00:47<00:00, 11.75s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk18.json


100%|██████████| 4/4 [00:00<00:00, 81442.80it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk18.json


100%|██████████| 4/4 [00:46<00:00, 11.63s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk1.json


100%|██████████| 4/4 [00:00<00:00, 67923.95it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk1.json


100%|██████████| 4/4 [00:45<00:00, 11.31s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk3.json


100%|██████████| 4/4 [00:00<00:00, 37957.50it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk3.json


100%|██████████| 4/4 [00:44<00:00, 11.02s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk4.json


100%|██████████| 4/4 [00:00<00:00, 36002.61it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk4.json


100%|██████████| 4/4 [00:51<00:00, 12.89s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk23.json


100%|██████████| 4/4 [00:00<00:00, 46603.38it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk23.json


100%|██████████| 4/4 [00:50<00:00, 12.72s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk10.json


100%|██████████| 4/4 [00:00<00:00, 68478.43it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk10.json


100%|██████████| 4/4 [00:49<00:00, 12.30s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk7.json


100%|██████████| 4/4 [00:00<00:00, 41838.44it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk7.json


100%|██████████| 4/4 [00:50<00:00, 12.66s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk2.json


100%|██████████| 4/4 [00:00<00:00, 62137.84it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk2.json


100%|██████████| 4/4 [00:41<00:00, 10.35s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk5.json


100%|██████████| 4/4 [00:00<00:00, 66576.25it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk5.json


100%|██████████| 4/4 [00:50<00:00, 12.70s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk25.json


100%|██████████| 4/4 [00:00<00:00, 38391.80it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk25.json


100%|██████████| 4/4 [00:36<00:00,  9.21s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk26.json


100%|██████████| 4/4 [00:00<00:00, 34879.87it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk26.json


100%|██████████| 4/4 [00:44<00:00, 11.18s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk12.json


100%|██████████| 4/4 [00:00<00:00, 67108.86it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk12.json


100%|██████████| 4/4 [00:47<00:00, 11.77s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk24.json


100%|██████████| 4/4 [00:00<00:00, 32140.26it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk24.json


100%|██████████| 4/4 [00:50<00:00, 12.52s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk30.json


100%|██████████| 4/4 [00:00<00:00, 53601.33it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk30.json


100%|██████████| 4/4 [00:53<00:00, 13.34s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk28.json


100%|██████████| 4/4 [00:00<00:00, 37365.74it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk28.json


100%|██████████| 4/4 [00:51<00:00, 12.91s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk0.json


100%|██████████| 4/4 [00:00<00:00, 38657.18it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk0.json


100%|██████████| 4/4 [00:56<00:00, 14.15s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk31.json


100%|██████████| 4/4 [00:00<00:00, 80659.69it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk31.json


100%|██████████| 4/4 [00:47<00:00, 11.93s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk6.json


100%|██████████| 4/4 [00:00<00:00, 82241.25it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk6.json


100%|██████████| 4/4 [00:43<00:00, 10.94s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk9.json


100%|██████████| 4/4 [00:00<00:00, 83468.74it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk9.json


100%|██████████| 4/4 [00:50<00:00, 12.53s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk14.json


100%|██████████| 4/4 [00:00<00:00, 58661.59it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk14.json


100%|██████████| 4/4 [00:50<00:00, 12.57s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk27.json


100%|██████████| 4/4 [00:00<00:00, 38391.80it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk27.json


100%|██████████| 4/4 [01:00<00:00, 15.05s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk22.json


100%|██████████| 4/4 [00:00<00:00, 41120.63it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk22.json


100%|██████████| 4/4 [00:45<00:00, 11.37s/it]


Saved JSON to: results/2025-04-03_15-10/recall_precision_2025-04-03_15-10_chunk13.json


100%|██████████| 4/4 [00:00<00:00, 33893.37it/s]


Saved JSON to: results/2025-04-03_15-10/final_2025-04-03_15-10_chunk13.json


In [44]:

def merge_json_chunks(output_file, file_pattern):
    """Merge every JSON file matching ``file_pattern`` into one JSON list at ``output_file``.

    Files are visited in natural (numeric-aware) order, so ``..._chunk2.json``
    comes before ``..._chunk10.json``. A plain lexicographic sort would place
    ``chunk10`` ahead of ``chunk2`` and scramble the merged order.

    Args:
        output_file: Path of the merged JSON file to write (UTF-8, indent 2).
        file_pattern: Glob pattern selecting the chunk files to merge.

    Returns:
        None. A summary line is printed after writing.

    Notes:
        A file whose top-level JSON value is a list contributes all its items;
        a dict contributes one entry. Files that fail to parse, or hold any
        other top-level type, are reported and skipped.
    """
    import re  # local import so this function does not depend on the notebook's import cell

    def _natural_key(path):
        # re.split with a capturing group alternates text / digit-run tokens,
        # so odd positions are always digit runs and compare as integers.
        tokens = re.split(r"(\d+)", path)
        return [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]

    merged_data = []

    for filename in sorted(glob.glob(file_pattern), key=_natural_key):
        with open(filename, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except Exception as e:
                # Best effort: report the unreadable chunk and keep merging the rest.
                print(f"Failed to read {filename}: {e}")
                continue

        if isinstance(data, list):
            merged_data.extend(data)
        elif isinstance(data, dict):
            merged_data.append(data)
        else:
            # Previously dropped silently; make the skip visible.
            print(f"Skipped {filename}: top-level JSON value is {type(data).__name__}, expected list or dict")

    with open(output_file, "w", encoding="utf-8") as out_f:
        json.dump(merged_data, out_f, indent=2, ensure_ascii=False)

    print(f"Merged {len(merged_data)} entries into {output_file}")

# Combine the per-chunk final files of this run into one merged file in the same folder.
merged_output_file = f"{folder_path}/__final_{timestamp}_merged.json"
chunk_file_pattern = f"{folder_path}/final_{timestamp}_chunk*.json"
merge_json_chunks(output_file=merged_output_file, file_pattern=chunk_file_pattern)


Merged 128 entries into results/2025-04-03_15-10/final_2025-04-03_15-10_merged.json
