### Import Libraries

In [1]:
# library for cap_f1
# NOTE: the star import stays first so the explicit imports below win on any name clash.
from cap_f1 import *
from datetime import datetime
from multiprocessing import Pool
import glob
import json
import math
import os
import re


# Auto-reload edited Python modules so the kernel does not need to be restarted
%load_ext autoreload
%autoreload 2


### Load Data

In [2]:
print("Load caption file...")

# Fields to keep from each record of the original dataset.
keys = ["file_name", "human_captions", "model_captions"]
org_caption_dataset = read_json("combined-caption-output_7304-images2025-03-29_21_40_00.json", keys)

# One list of human caption strings per record, in dataset order.
# Captions equal to the "quality issues" placeholder text are dropped.
all_human_captions = [
    [
        annotation["caption"]
        for annotation in record["human_captions"]
        if annotation["caption"] != "Quality issues are too severe to recognize visual content."
    ]
    for record in org_caption_dataset
]



Load caption file...


In [3]:
# Testing only: report the current dataset size, then keep just the first 128
# records (and their human captions) so a trial run stays short.
print(len(org_caption_dataset))
org_caption_dataset, all_human_captions = org_caption_dataset[:128], all_human_captions[:128]

7304


### Run with Multiple Processes

In [4]:
def process_batch(start_idx, end_idx, org_caption_dataset, all_human_captions, folder_path, timestamp, chunk_id):
    """Run the three evaluation stages on one slice of the dataset.

    The slice ``[start_idx:end_idx]`` is taken from both ``org_caption_dataset``
    and ``all_human_captions``. Each stage saves a JSON file under
    ``folder_path`` whose name carries ``timestamp`` and ``chunk_id``; the second
    and third stages pass the previous stage's file as ``update_existing``.

    Args:
        start_idx: Inclusive start of the slice.
        end_idx: Exclusive end of the slice.
        org_caption_dataset: Full list of dataset records.
        all_human_captions: Per-record human captions, sliced with the same indices.
        folder_path: Directory that receives the output files.
        timestamp: Run identifier embedded in every output filename.
        chunk_id: Chunk number embedded in every output filename.

    Returns:
        None. Results are only written through ``save_results_json``.
    """
    chunk_records = org_caption_dataset[start_idx:end_idx]
    chunk_human_captions = all_human_captions[start_idx:end_idx]
    chunk_len = len(chunk_records)

    # Every stage file shares this suffix; only the stage prefix differs.
    file_suffix = f"{timestamp}_chunk{chunk_id}.json"
    parsed_path = f"{folder_path}/parsed_caption_{file_suffix}"
    recall_precision_path = f"{folder_path}/recall_precision_{file_suffix}"
    final_path = f"{folder_path}/final_{file_suffix}"

    # Stage 1: parse captions into atomic statements.
    T_atomics, g_atomics = generate_atomic_statement(chunk_records, limit=chunk_len)
    save_results_json(
        output_path=parsed_path,
        org_dataset=chunk_records,
        T_atomics=T_atomics,
        g_atomics=g_atomics,
        limit=chunk_len,
    )

    # Stage 2: match human and generated statements.
    metadata = evaluate_matching(chunk_human_captions, T_atomics, g_atomics)
    save_results_json(
        output_path=recall_precision_path,
        update_existing=parsed_path,
        metadata=metadata,
        limit=chunk_len,
    )

    # Stage 3: compute Cap F1 from the matching metadata.
    evaluation = calculate_cap_f1(metadata)
    save_results_json(
        output_path=final_path,
        update_existing=recall_precision_path,
        evaluations=evaluation,
        limit=chunk_len,
    )

def run_parallel_processing(org_caption_dataset, all_human_captions, folder_path, timestamp, num_workers=32):
    """Split the dataset into contiguous chunks and process them in a worker pool.

    The chunk size is ``ceil(total / num_workers)``. Only non-empty chunks are
    submitted: when the rounded-up chunk size covers the dataset in fewer than
    ``num_workers`` chunks (or the dataset is empty), no worker is started on an
    empty ``[start:end]`` range. Chunk ids are numbered from 0 in dataset order.

    Args:
        org_caption_dataset: Full list of dataset records.
        all_human_captions: Per-record human captions, aligned with the dataset.
        folder_path: Directory that receives the per-chunk output files.
        timestamp: Run identifier embedded in the output filenames.
        num_workers: Maximum number of worker processes / chunks (must be >= 1).

    Raises:
        ValueError: If ``num_workers`` is smaller than 1.
        Exception: Any exception raised inside a worker is re-raised here by
            ``AsyncResult.get()``.
    """
    if num_workers < 1:
        raise ValueError(f"num_workers must be >= 1, got {num_workers}")

    total = len(org_caption_dataset)
    if total == 0:
        # Nothing to process; avoid spawning a pool for empty chunks.
        return

    chunk_size = math.ceil(total / num_workers)
    # Stepping by chunk_size yields only ranges that contain at least one record.
    chunk_bounds = [
        (start_idx, min(start_idx + chunk_size, total))
        for start_idx in range(0, total, chunk_size)
    ]

    # No point in starting more processes than there are chunks.
    with Pool(processes=min(num_workers, len(chunk_bounds))) as pool:
        jobs = [
            pool.apply_async(
                process_batch,
                (start_idx, end_idx, org_caption_dataset, all_human_captions, folder_path, timestamp, chunk_id),
            )
            for chunk_id, (start_idx, end_idx) in enumerate(chunk_bounds)
        ]

        # Block until every chunk is done; get() also surfaces worker exceptions.
        for job in jobs:
            job.get()

In [6]:
# Minute-resolution timestamp, reused in the results folder name and every output filename.
now = datetime.now()
timestamp = f"{now:%Y-%m-%d_%H-%M}"

# Make sure this run's results folder exists before the workers write into it.
folder_path = f"results/{timestamp}"
os.makedirs(folder_path, exist_ok=True)

run_parallel_processing(
    org_caption_dataset,
    all_human_captions,
    folder_path,
    timestamp,
    num_workers=32,
)

100%|██████████| 4/4 [00:34<00:00,  8.66s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk21.json


100%|██████████| 4/4 [00:42<00:00, 10.63s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk29.json


100%|██████████| 4/4 [00:43<00:00, 10.86s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk19.json


100%|██████████| 4/4 [00:43<00:00, 10.99s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk18.json


100%|██████████| 4/4 [00:45<00:00, 11.31s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk1.json


100%|██████████| 4/4 [00:45<00:00, 11.38s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk24.json


100%|██████████| 4/4 [00:45<00:00, 11.47s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk10.json

100%|██████████| 4/4 [00:45<00:00, 11.91s/it]




  0%|          | 0/4 [00:00<?, ?it/s].47s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk15.json


100%|██████████| 4/4 [00:46<00:00, 11.56s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk31.json


100%|██████████| 4/4 [00:46<00:00, 11.66s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk20.json


100%|██████████| 4/4 [00:46<00:00, 11.69s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk14.json


100%|██████████| 4/4 [00:46<00:00, 11.73s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk7.json


100%|██████████| 4/4 [00:47<00:00, 11.76s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk3.json


100%|██████████| 4/4 [00:47<00:00, 11.82s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk12.json


100%|██████████| 4/4 [00:47<00:00, 11.89s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk5.json


100%|██████████| 4/4 [00:47<00:00, 11.96s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk17.json


100%|██████████| 4/4 [00:48<00:00, 12.14s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk28.json


100%|██████████| 4/4 [00:48<00:00, 12.24s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk25.json


100%|██████████| 4/4 [00:49<00:00, 12.41s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk8.json


100%|██████████| 4/4 [00:49<00:00, 12.49s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk0.json


100%|██████████| 4/4 [00:50<00:00, 12.52s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk23.json


  0%|          | 0/4 [00:00<?, ?it/s]

Error: Recall mismatch for model [gpt-4o-2024-08-06]
length 8 vs 9
T atomics:
['There is a man.', 'The man is white.', 'The man has a short beard.', 'The man is wearing a shirt.', 'The shirt is black.', 'The man has a hairy neck.', 'There is a collar.', 'The collar is part of a shirt.']
Recall TPs:
['There is a shirt.', 'The shirt is black.']
Recall FNs:
['There is a man.', 'The man is white.', 'The man has a short beard.', 'The man is wearing a shirt.', 'The man has a hairy neck.', 'There is a collar.', 'The collar is part of a shirt.']


100%|██████████| 4/4 [00:51<00:00, 12.89s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk11.json


100%|██████████| 4/4 [00:52<00:00, 13.17s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk4.json


100%|██████████| 4/4 [00:52<00:00, 13.19s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk26.json


100%|██████████| 4/4 [00:53<00:00, 13.40s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk2.json


100%|██████████| 4/4 [00:53<00:00, 13.41s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk22.json


100%|██████████| 4/4 [00:54<00:00, 13.64s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk6.json


100%|██████████| 4/4 [00:55<00:00, 13.75s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk9.json


100%|██████████| 4/4 [00:57<00:00, 14.33s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk30.json


100%|██████████| 4/4 [00:58<00:00, 14.69s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk16.json


100%|██████████| 4/4 [00:58<00:00, 14.72s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk27.json


100%|██████████| 4/4 [01:00<00:00, 15.19s/it]


Saved JSON to: results/2025-04-03_22-00/parsed_caption_2025-04-03_22-00_chunk13.json


 25%|██▌       | 1/4 [00:13<00:40, 13.55s/it]

Error: Recall mismatch for model [gpt-4o-2024-08-06]
length 14 vs 13
T atomics:
['There is a carton of eggs.', 'The carton of eggs is on a table.', 'The table is dark brown.', 'There is a newspaper.', 'The newspaper is above the carton of eggs.', 'There is a box of unsalted butter.', 'There are some receipts.', 'The box of unsalted butter is on a counter top.', 'The receipts are next to the box of unsalted butter.', 'There is a package.', 'The package is blue.', 'The package contains unsalted butter.', 'There is a picture on the package.', 'The picture is of a cow.']
Recall TPs:
['There is a box of unsalted butter.', 'The package is blue.', 'The package contains unsalted butter.', 'There is a picture on the package.', 'The picture is of a cow.']
Recall FNs:
['There is a carton of eggs.', 'The carton of eggs is on a table.', 'The table is dark brown.', 'There is a newspaper.', 'The newspaper is above the carton of eggs.', 'There are some receipts.', 'The box of unsalted butter is on a c

 50%|█████     | 2/4 [00:22<00:22, 11.16s/it]

Error: Recall mismatch for model [gpt-4o-2024-08-06]
length 14 vs 13
T atomics:
['There is a bottle.', 'The bottle contains tomato sauce.', 'There is a tablecloth.', 'The tablecloth is red.', 'The tablecloth has a checkered pattern.', 'The bottle is on the tablecloth.', 'There is a jar of pasta sauce.', 'The jar is lying on a tablecloth.', 'There is a bottle.', 'The bottle has a front label.', 'The sauce is red.', 'There is a can.', 'The can contains tomatoes.', 'The tomatoes are labeled as Cisco Favorite.']
Recall TPs:
['There is a tablecloth.', 'The tablecloth has a checkered pattern.', 'There is a bottle.', 'The bottle has a front label.']
Recall FNs:
['The bottle contains tomato sauce.', 'The tablecloth is red.', 'The bottle is on the tablecloth.', 'There is a jar of pasta sauce.', 'The jar is lying on a tablecloth.', 'The sauce is red.', 'There is a can.', 'The can contains tomatoes.', 'The tomatoes are labeled as Cisco Favorite.']


 50%|█████     | 2/4 [00:24<00:23, 11.99s/it]

Error: Recall mismatch for model [Llama-3.2-11B-Vision-Instruct]
length 14 vs 15
T atomics:
['There is a bottle.', 'The bottle contains tomato sauce.', 'There is a tablecloth.', 'The tablecloth is red.', 'The tablecloth has a checkered pattern.', 'The bottle is on the tablecloth.', 'There is a jar of pasta sauce.', 'The jar is lying on a tablecloth.', 'There is a bottle.', 'The bottle has a front label.', 'The sauce is red.', 'There is a can.', 'The can contains tomatoes.', 'The tomatoes are labeled as Cisco Favorite.']
Recall TPs:
['There is a jar of pasta sauce.', 'There is a red and white checkered cloth.']
Recall FNs:
['There is a bottle.', 'The bottle contains tomato sauce.', 'There is a tablecloth.', 'The tablecloth is red.', 'The tablecloth has a checkered pattern.', 'The bottle is on the tablecloth.', 'The jar is lying on a tablecloth.', 'There is a bottle.', 'The bottle has a front label.', 'The sauce is red.', 'There is a can.', 'The can contains tomatoes.', 'The tomatoes are

100%|██████████| 4/4 [00:39<00:00,  9.95s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk21.json


100%|██████████| 4/4 [00:00<00:00, 37617.08it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk21.json


 50%|█████     | 2/4 [00:28<00:27, 13.54s/it]

Error: Recall mismatch for model [Llama-3.2-11B-Vision-Instruct]
length 13 vs 13
T atomics:
['There is a bottle.', 'The bottle is small.', 'The bottle is green.', 'The bottle contains hair product.', 'The bottle is on top of a table.', 'The table is made of wood.', 'The bottle is of the Fructis brand.', 'The bottle contains flat iron perfector.', 'The bottle has a spray type cap.', 'The bottle is of Fructis style flat iron perfector.', 'The bottle has an orange top.', 'The bottle is lying on material.', 'The material is light brown.']
Recall TPs:
['There is a bottle.', 'The bottle is green.', 'The bottle is of Fructis brand.', 'The bottle contains flat iron perfector.', 'The bottle is of Fructis style flat iron perfector.', 'The bottle has an orange top.']
Recall FNs:
['The bottle is small.', 'The bottle contains hair product.', 'The bottle is on top of a table.', 'The table is made of wood.', 'The bottle has a spray type cap.', 'The bottle is lying on material.', 'The material is ligh

100%|██████████| 4/4 [00:40<00:00, 10.03s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk1.json


100%|██████████| 4/4 [00:00<00:00, 40136.88it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk1.json


 75%|███████▌  | 3/4 [00:32<00:09,  9.97s/it]

Error: Recall mismatch for model [gpt-4o-2024-08-06]
length 18 vs 18
T atomics:
['There is a book by Willard A Palmer.', 'The book is called Adult All in One Course.', 'The book has a cover photo of a piano.', 'There are two other books next to it.', 'The two other books are out of vision.', 'One book shows a piano.', 'One book is green.', "The green book is titled 'Database Systems'.", 'One book shows people fighting in a jungle.', 'There are books in the image.', 'There is a pad in the image.', 'The pad is on the table.', 'There is a piece of office equipment on a table.', 'The books are next to the piece of office equipment.', 'The manuals are next to the piece of office equipment.', 'There is a table.', 'There is an office machine on the table.', 'There is a mug on the table.']
Recall TPs:
['There is a book by Willard A Palmer.', 'The book is called Adult All in One Course.', 'The book has a cover photo of a piano.', "There is a book titled 'Database Systems'."]
Recall FNs:
['There

100%|██████████| 4/4 [00:39<00:00,  9.95s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk20.json


100%|██████████| 4/4 [00:00<00:00, 33825.03it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk20.json


100%|██████████| 4/4 [00:40<00:00, 10.19s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk10.json


100%|██████████| 4/4 [00:00<00:00, 39475.80it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk10.json


100%|██████████| 4/4 [00:43<00:00, 10.88s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk19.json


100%|██████████| 4/4 [00:00<00:00, 40041.09it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk19.json


 75%|███████▌  | 3/4 [00:34<00:11, 11.48s/it]

Error: Precision mismatch for model [Molmo-7B-O-0924]
length 7 vs 7
G atomics:
['There is a box of tea.', 'The box is on a table.', 'The table is red.', 'The box is yellow.', 'The box has a red label.', "The text 'Tea' is visible on the box.", "The text 'Tea' is at the bottom of the box."]
Precision TPs:
['The box is on a table']
Precision FPs:
['There is a box of tea', 'The table is red', 'The box is yellow', 'The box has a red label', "The text 'Tea' is visible on the box", "The text 'Tea' is at the bottom of the box"]


100%|██████████| 4/4 [00:40<00:00, 10.01s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk17.json


100%|██████████| 4/4 [00:00<00:00, 40136.88it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk17.json


100%|██████████| 4/4 [00:38<00:00,  9.71s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk23.json


100%|██████████| 4/4 [00:00<00:00, 37365.74it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk23.json


100%|██████████| 4/4 [00:42<00:00, 10.66s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk14.json


100%|██████████| 4/4 [00:00<00:00, 39945.75it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk14.json


100%|██████████| 4/4 [00:40<00:00, 10.06s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk8.json


100%|██████████| 4/4 [00:00<00:00, 37200.04it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk8.json


100%|██████████| 4/4 [00:41<00:00, 10.50s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk28.json


100%|██████████| 4/4 [00:00<00:00, 41221.66it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk28.json


100%|██████████| 4/4 [00:44<00:00, 11.08s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk31.json


100%|██████████| 4/4 [00:00<00:00, 85163.53it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk31.json


100%|██████████| 4/4 [00:44<00:00, 11.03s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk3.json


100%|██████████| 4/4 [00:00<00:00, 40920.04it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk3.json


100%|██████████| 4/4 [00:44<00:00, 11.07s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk7.json


100%|██████████| 4/4 [00:00<00:00, 52265.47it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk7.json


100%|██████████| 4/4 [00:42<00:00, 10.67s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk25.json


100%|██████████| 4/4 [00:00<00:00, 37449.14it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk25.json


100%|██████████| 4/4 [00:49<00:00, 12.32s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk29.json


100%|██████████| 4/4 [00:00<00:00, 39850.87it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk29.json


100%|██████████| 4/4 [00:46<00:00, 11.61s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk24.json


100%|██████████| 4/4 [00:00<00:00, 40427.03it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk24.json


100%|██████████| 4/4 [00:47<00:00, 11.97s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk15.json


100%|██████████| 4/4 [00:00<00:00, 37117.73it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk15.json


 75%|███████▌  | 3/4 [00:36<00:12, 12.62s/it]

Error: Precision mismatch for model [Molmo-7B-O-0924]
length 6 vs 6
G atomics:
['There is a soda can.', 'The soda can has a black top.', 'The soda can has a green label.', 'The soda can has a red label.', "The word 'Mountain' is visible on the can.", "The word 'Mountain' is in green text."]
Precision TPs:
['There is a soda can', "The word 'Mountain' is visible on the can"]
Precision FPs:
['The soda can has a black top', 'The soda can has a green label', 'The soda can has a red label', "The word 'Mountain' is in green text"]


100%|██████████| 4/4 [00:42<00:00, 10.67s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk4.json


100%|██████████| 4/4 [00:00<00:00, 41120.63it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk4.json


100%|██████████| 4/4 [00:45<00:00, 11.34s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk11.json


100%|██████████| 4/4 [00:00<00:00, 40622.80it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk11.json


100%|██████████| 4/4 [00:50<00:00, 12.54s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk5.json


100%|██████████| 4/4 [00:00<00:00, 40136.88it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk5.json


100%|██████████| 4/4 [00:45<00:00, 11.37s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk26.json


100%|██████████| 4/4 [00:00<00:00, 38568.31it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk26.json


100%|██████████| 4/4 [00:48<00:00, 12.25s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk0.json


100%|██████████| 4/4 [00:00<00:00, 40820.48it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk0.json


100%|██████████| 4/4 [00:55<00:00, 13.90s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk18.json


100%|██████████| 4/4 [00:00<00:00, 31184.42it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk18.json


100%|██████████| 4/4 [00:46<00:00, 11.75s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk6.json


100%|██████████| 4/4 [00:00<00:00, 41221.66it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk6.json


100%|██████████| 4/4 [00:43<00:00, 10.93s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk16.json


100%|██████████| 4/4 [00:00<00:00, 38926.26it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk16.json


100%|██████████| 4/4 [00:47<00:00, 11.97s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk9.json


100%|██████████| 4/4 [00:00<00:00, 84733.41it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk9.json


100%|██████████| 4/4 [00:46<00:00, 11.58s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk30.json


100%|██████████| 4/4 [00:00<00:00, 39756.44it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk30.json
Error: Precision mismatch for model [gpt-4o-2024-08-06]
length 7 vs 7
G atomics:
['There is a can.', 'The can is on a wooden surface.', 'The can has visible text.', "The text reads 'WT. 15 OZ. (425g)'.", "The text includes part of the word 'TIONAL'.", 'The label has a yellow color.', 'The label has a red color.']
Precision TPs:
['There is a can.', 'The can is on a wooden surface.', "The text reads 'WT. 15 OZ. (425g)'", "The text includes part of the word 'TIONAL'", 'The label has a yellow color.', 'The label has a red color.']
Precision FPs:
['The can has visible text.']


100%|██████████| 4/4 [00:58<00:00, 14.62s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk12.json


100%|██████████| 4/4 [00:00<00:00, 86037.01it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk12.json


100%|██████████| 4/4 [00:53<00:00, 13.34s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk2.json


100%|██████████| 4/4 [00:00<00:00, 40622.80it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk2.json


100%|██████████| 4/4 [00:51<00:00, 12.78s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk27.json


100%|██████████| 4/4 [00:00<00:00, 78033.56it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk27.json


100%|██████████| 4/4 [00:51<00:00, 13.00s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk13.json


100%|██████████| 4/4 [00:00<00:00, 40427.03it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk13.json
Error: Recall mismatch for model [Molmo-7B-O-0924]
length 17 vs 16
T atomics:
['The canned food has a yellow label.', 'The canned food has a green label.', 'The canned food has a red label.', 'The canned food is on a wooden surface.', 'The can contains spaghetti sauce.', 'The spaghetti sauce is traditional style.', 'The can weighs 15 ounces.', 'The can weighs 425 grams.', 'There is a small can.', 'The can has a picture on it.', 'The picture shows red pasta sauce.', 'The picture shows noodles.', 'The can is on a table.', 'The can is yellow.', 'The can is of national tomato paste.', 'The can is sitting on a counter.', 'The table is made of wood.']
Recall TPs:
['The canned food has a yellow label.', 'The canned food is on a wooden surface.', 'The can contains spaghetti sauce.', 'The picture shows red pasta sauce.', 'The picture shows noodles.', 'The can is yellow.']
Recall FNs:
['The canned food has a green label.

100%|██████████| 4/4 [00:59<00:00, 14.92s/it]


Saved JSON to: results/2025-04-03_22-00/recall_precision_2025-04-03_22-00_chunk22.json


100%|██████████| 4/4 [00:00<00:00, 40427.03it/s]


Saved JSON to: results/2025-04-03_22-00/final_2025-04-03_22-00_chunk22.json


In [7]:

def merge_json_chunks(output_file, file_pattern):
    """Merge every JSON file matching ``file_pattern`` into one JSON list.

    Files are visited in natural (numeric-aware) order, so ``chunk2`` comes
    before ``chunk10``. A plain lexicographic sort would put ``chunk10`` first
    and scramble the order in which the chunks were cut from the dataset.

    A file holding a JSON list contributes its elements; a file holding a JSON
    object contributes that object as one entry. Files that cannot be parsed,
    or that hold any other JSON value, are reported and skipped.

    Args:
        output_file: Path of the merged JSON file to write.
        file_pattern: Glob pattern selecting the files to merge.
    """
    def natural_key(path):
        # re.split with a capturing group alternates text, digits, text, ...
        # so odd positions are always digit runs and compare as integers.
        parts = re.split(r"([0-9]+)", path)
        return [int(part) if pos % 2 else part for pos, part in enumerate(parts)]

    output_abspath = os.path.abspath(output_file)
    merged_data = []

    for filename in sorted(glob.glob(file_pattern), key=natural_key):
        # Never merge a previous merge result back into itself.
        if os.path.abspath(filename) == output_abspath:
            continue

        with open(filename, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except Exception as e:
                # Best effort: report the broken file and keep merging the rest.
                print(f"Failed to read {filename}: {e}")
                continue

        if isinstance(data, list):
            merged_data.extend(data)
        elif isinstance(data, dict):
            merged_data.append(data)
        else:
            print(f"Skipped {filename}: expected a JSON list or object, got {type(data).__name__}")

    with open(output_file, "w", encoding="utf-8") as out_f:
        json.dump(merged_data, out_f, indent=2, ensure_ascii=False)

    print(f"Merged {len(merged_data)} entries into {output_file}")

# Combine this run's per-chunk "final" files into a single merged JSON file.
merged_json_path = f"{folder_path}/__final_{timestamp}_merged.json"
chunk_file_pattern = f"{folder_path}/final_{timestamp}_chunk*.json"
merge_json_chunks(output_file=merged_json_path, file_pattern=chunk_file_pattern)


Merged 128 entries into results/2025-04-03_22-00/__final_2025-04-03_22-00_merged.json


In [8]:
import json
import csv

# Input: the merged JSON of this run. Output: a CSV next to it with the same stem.
json_path = f"{folder_path}/__final_{timestamp}_merged.json"
csv_path = f"{folder_path}/__final_{timestamp}_merged.csv"

with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# CSV column prefix -> model name used as the key in the evaluation results.
model_keys = {
    "gpt": "gpt-4o-2024-08-06",
    "molmo": "Molmo-7B-O-0924",
    "llama": "Llama-3.2-11B-Vision-Instruct",
}

# Position of each model's entry inside item["model_captions"].
# NOTE(review): assumes model_captions always has this fixed order — confirm against the dataset.
caption_index = {"gpt": 0, "molmo": 2, "llama": 1}

# CSV column suffix -> (metadata section, list key) for the matched-statement columns.
match_columns = {
    "recall_TPs": ("recall", "TPs"),
    "recall_FNs": ("recall", "FNs"),
    "precision_TPs": ("precision", "TPs"),
    "precision_FPs": ("precision", "FPs"),
}

# CSV column suffix -> key inside the per-model scores dict.
score_columns = {"recall": "recall", "precision": "precision", "capf1": "cap_f1"}

# Column order: image info, then every model's text columns, then every model's scores.
fieldnames = ["image", "T_atomics"]
for short_name in model_keys:
    fieldnames.append(f"{short_name}_caption")
    fieldnames.append(f"{short_name}_g_atomics")
    fieldnames.extend(f"{short_name}_{suffix}" for suffix in match_columns)
for short_name in model_keys:
    fieldnames.extend(f"{short_name}_{suffix}" for suffix in score_columns)

with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for item in data:
        cap_f1 = item.get("evaluation", {}).get("cap_f1", {})
        scores = cap_f1.get("scores", {})
        metadata = cap_f1.get("metadata", {})

        row = {
            "image": item.get("file_name", ""),
            "T_atomics": "\n".join(cap_f1.get("T_atomics", [])),
        }

        for short_name, model_key in model_keys.items():
            model_scores = scores.get(model_key, {})
            model_metadata = metadata.get(model_key, {})

            row[f"{short_name}_caption"] = item["model_captions"][caption_index[short_name]]["caption"]
            row[f"{short_name}_g_atomics"] = "\n".join(
                cap_f1.get("g_atomics", {}).get(model_key, [])
            )

            # Statement lists are stored one statement per line inside a single cell.
            for suffix, (section, list_key) in match_columns.items():
                statements = model_metadata.get(section, {}).get(list_key, [])
                row[f"{short_name}_{suffix}"] = "\n".join(statements)

            # Missing scores stay None, which DictWriter writes as an empty cell.
            for suffix, score_key in score_columns.items():
                row[f"{short_name}_{suffix}"] = model_scores.get(score_key)

        writer.writerow(row)

print(f"CSV file saved to: {csv_path}")

CSV file saved to: results/2025-04-03_22-00/__final_2025-04-03_22-00_merged.csv
