### Include Library

In [33]:
# library for cap_f1
from cap_f1 import *
from datetime import datetime

# code for no need for restarting the kernel when python file is updated
%load_ext autoreload
%autoreload 2

# How many data points to process; passed as `limit=` to the cap_f1 helpers below
LIMIT = 10

# Minute-resolution timestamp embedded in every output filename of this run
now = datetime.now()
timestamp = f"{now:%Y-%m-%d_%H-%M}"


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load Data

In [34]:
print("Load caption file...")

# Only these fields are pulled out of the original dataset
keys = ["file_name", "human_captions", "model_captions"]
org_caption_dataset = read_json(
    "combined-caption-output_7304-images2025-03-29_21_40_00.json", keys
)

# One list of usable human captions per dataset entry, in dataset order.
# `item` and `human_captions` are deliberately left as plain loop variables:
# later cells read whatever value the final iteration leaves bound.
all_human_captions = []
for item in org_caption_dataset:
    human_captions = []
    for hc in item["human_captions"]:
        text = hc["caption"]
        if text == "Quality issues are too severe to recognize visual content.":
            # Placeholder entry rather than a description of the image; skip it
            continue
        human_captions.append(text)
    all_human_captions.append(human_captions)

Load caption file...


### Parse Caption into Atomic Statements

In [35]:
# Parse the captions of the loaded dataset into atomic statements.
# NOTE(review): the log message says gpt-4o is used, so this step presumably
# calls an external API and may be slow or costly — confirm in cap_f1.
print("Generating atomic statements using gpt-4o...")

# `limit=LIMIT` is forwarded to the helper; presumably it caps how many dataset
# entries are processed — TODO confirm.
# The two returned values are kept as T_atomics and g_atomics for the save and
# evaluation cells that follow.
T_atomics, g_atomics = generate_atomic_statement(org_caption_dataset, limit=LIMIT)

Generating atomic statements using gpt-4o...




In [36]:
# Save the parsing results to a timestamped JSON file so they can be reloaded
# later without regenerating the atomic statements.
# The original dataset and both atomic-statement results are handed to the
# helper together; how it combines them into the output file is defined in
# cap_f1 — presumably one record per dataset entry, TODO confirm.
save_results_json(
    output_path=f"parsed_caption_{timestamp}.json",
    org_dataset=org_caption_dataset,
    T_atomics=T_atomics,
    g_atomics=g_atomics,
    limit=LIMIT,
)



### Evaluation

In [37]:
# Match human-derived and model-derived statements using the in-memory results
# of the cells above; matching has to happen before the F1 score is calculated.
# NOTE(review): all_human_captions was built over the whole dataset, whereas
# T_atomics / g_atomics were generated with limit=LIMIT — presumably
# evaluate_matching aligns entries by position; confirm.
metadata = evaluate_matching(all_human_captions, T_atomics, g_atomics)

# Alternative: reuse the parsed captions saved by a previous run instead of
# re-running the atomic-statement generation. Uncomment the three lines below
# (and skip the evaluate_matching call above) to evaluate from that file.
# `timestamp` is regenerated whenever the first cell runs, so the filename has
# to be changed to point at the earlier run's file.
# keys = ["file_name", "human_captions", "model_captions", "evaluation"]
# parsed_dataset = read_json(f"parsed_caption_{timestamp}.json", keys)
# metadata = evaluate_matching_file(parsed_dataset, print_mode=True)

# Save the intermediate matching result. `update_existing` names the parsed
# caption file written earlier; presumably the helper merges `metadata` into
# that file's contents and writes the result to `output_path` — TODO confirm.
save_results_json(
    output_path=f"recall_precision_{timestamp}.json",
    update_existing=f"parsed_caption_{timestamp}.json",
    metadata=metadata,
    limit=LIMIT,
)








In [38]:
# Compute the Cap F1 evaluation from the matching metadata produced above.
evaluation = calculate_cap_f1(metadata)

# Write the final result file. `update_existing` names the recall/precision
# file saved by the previous cell; presumably the helper adds `evaluations` to
# that file's records and writes them to `output_path` — TODO confirm.
# This final_<timestamp>.json is the file the CSV export cell reads.
save_results_json(
    output_path=f"final_{timestamp}.json",
    update_existing=f"recall_precision_{timestamp}.json",
    evaluations=evaluation,
    limit=LIMIT,
)

100%|██████████| 10/10 [00:00<00:00, 147168.56it/s]

Saved JSON to: final_2025-04-03_04-07.json





In [41]:
# Ad-hoc inspection of `item`, i.e. whichever entry the most recently executed
# `for item in ...` loop left bound.
# NOTE(review): relies on a leaked loop variable and on cell execution order;
# consider removing this cell or indexing the dataset explicitly instead.
item

{'file_name': 'VizWiz_train_00000001.jpg',
 'human_captions': ['A can of Coca Cola on a counter is shown for when one can use a nice, cold drink.',
  'A black can of Coca Cola Zero calorie soda is on the counter near the coffee maker.',
  'A kitchen counter the various items on top including a can of Coca-Cola, metal containers, and a teapot.',
  'a black tin of Coca Cola placed on a black surface',
  'Black counter with canisters, kettle and can of soda.'],
 'model_captions': [{'model_name': 'gpt-4o-2024-08-06',
   'caption': 'A can of Coca-Cola Zero is on a kitchen countertop, next to a white mug and a black kettle. Three silver canisters are aligned against the wall, along with a visible electrical outlet above them.'},
  {'model_name': 'Llama-3.2-11B-Vision-Instruct',
   'caption': 'The image shows a black can with a yellow band and red writing, likely a beverage can, on a kitchen counter. The can has a white label with indistinct writing.'},
  {'model_name': 'Molmo-7B-O-0924',
   

In [43]:
import json
import csv

json_path = f"final_{timestamp}.json"
csv_path = f"final_{timestamp}.csv"

# CSV column prefix -> model name as it appears in the evaluation JSON.
# Insertion order fixes the per-model column order (gpt, molmo, llama).
MODEL_KEYS = {
    "gpt": "gpt-4o-2024-08-06",
    "molmo": "Molmo-7B-O-0924",
    "llama": "Llama-3.2-11B-Vision-Instruct",
}

# Per-model list-valued columns, in column order:
# (column suffix, section inside cap_f1["metadata"][model], key inside that section)
DETAIL_COLUMNS = [
    ("recall_TPs", "recall", "TPs"),
    ("recall_FNs", "recall", "FNs"),
    ("precision_TPs", "precision", "TPs"),
    ("precision_FPs", "precision", "FPs"),
]

# Per-model score columns: (column suffix, key inside cap_f1["scores"][model])
SCORE_COLUMNS = [
    ("recall", "recall"),
    ("precision", "precision"),
    ("capf1", "cap_f1"),
]


def _build_fieldnames():
    """Return the CSV header: image, T_atomics, per-model detail columns, then per-model scores."""
    names = ["image", "T_atomics"]
    for short_name in MODEL_KEYS:
        names.append(f"{short_name}_caption")
        names.append(f"{short_name}_g_atomics")
        names.extend(f"{short_name}_{suffix}" for suffix, _, _ in DETAIL_COLUMNS)
    for short_name in MODEL_KEYS:
        names.extend(f"{short_name}_{suffix}" for suffix, _ in SCORE_COLUMNS)
    return names


def _build_csv_row(entry):
    """Flatten one record of the final evaluation JSON into a CSV row dict.

    Args:
        entry: one element of the loaded JSON list; read keys are "file_name",
            "model_captions" and "evaluation" -> "cap_f1".

    Returns:
        dict mapping every header column to its value. Statement lists are
        joined with newlines; a missing score is None (written as an empty cell).
    """
    cap_f1_eval = entry.get("evaluation", {}).get("cap_f1", {})
    scores = cap_f1_eval.get("scores", {})
    match_details = cap_f1_eval.get("metadata", {})
    g_atomics_by_model = cap_f1_eval.get("g_atomics", {})

    # Look captions up by model name instead of list position, so a reordered
    # or incomplete "model_captions" list cannot put a caption in the wrong column.
    captions_by_model = {
        mc.get("model_name"): mc.get("caption", "")
        for mc in entry.get("model_captions", [])
    }

    row = {
        "image": entry.get("file_name", ""),
        "T_atomics": "\n".join(cap_f1_eval.get("T_atomics", [])),
    }
    for short_name, model_key in MODEL_KEYS.items():
        model_details = match_details.get(model_key, {})
        model_scores = scores.get(model_key, {})

        row[f"{short_name}_caption"] = captions_by_model.get(model_key, "")
        row[f"{short_name}_g_atomics"] = "\n".join(
            g_atomics_by_model.get(model_key, [])
        )
        for suffix, section, key in DETAIL_COLUMNS:
            row[f"{short_name}_{suffix}"] = "\n".join(
                model_details.get(section, {}).get(key, [])
            )
        for suffix, score_key in SCORE_COLUMNS:
            row[f"{short_name}_{suffix}"] = model_scores.get(score_key)
    return row


with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

fieldnames = _build_fieldnames()

with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    # Rows are built inside the helper so this cell no longer rebinds the
    # notebook-level `metadata` / `item` variables that other cells rely on.
    writer.writerows(_build_csv_row(entry) for entry in data)

print(f"CSV file saved to: {csv_path}")

CSV file saved to: final_2025-04-03_04-07.csv


## Other Evaluation 
### BLEU, METEOR, ROUGE

In [18]:
# Reload the caption dataset for the additional metrics in this section.
print("Load caption file...")

# features that we need to extract from the original dataset
keys = ["file_name", "human_captions", "model_captions"]
org_caption_dataset = read_json(
    "combined-caption-output_7304-images2025-03-29_21_40_00.json", keys
)

# NOTE(review): `human_captions` is rebound on every iteration and nothing is
# accumulated, so after the loop it holds only the filtered captions of the
# LAST dataset entry. If per-entry caption lists are needed downstream, collect
# them in a list the way the earlier load cell builds `all_human_captions` —
# confirm what get_others expects.
for item in org_caption_dataset:
    # Keep only real captions: drop the placeholder text that marks an
    # unrecognizable image
    human_captions = [
        hc["caption"]
        for hc in item["human_captions"]
        if hc["caption"] != "Quality issues are too severe to recognize visual content."
    ]

Load caption file...


In [16]:
# Compute the additional metrics for this section via get_others.
# NOTE(review): `human_captions` is a single filtered caption list (whatever the
# last executed loop iteration left bound), not one list per dataset entry —
# confirm this is the input get_others expects.
evall = get_others(org_caption_dataset, human_captions)
# Uncomment to pretty-print the result for inspection:
# print(json.dumps(evall, indent=4, ensure_ascii=False))

[nltk_data] Downloading package wordnet to /home/heoj4/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/heoj4/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/heoj4/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [17]:
# Save the additional-metric results. Unlike the other outputs of this
# notebook, the filename is fixed (no timestamp), so each run overwrites
# others_evaluation.json.
# `metric_name="others"` is passed through to the helper; presumably it labels
# where the evaluations are stored in the output — TODO confirm.
save_results_json(
    output_path="others_evaluation.json", metric_name="others", evaluations=evall
)

Saved JSON to: others_evaluation.json
