### Include Library

In [1]:
# library for cap_f1
from cap_f1 import *

# library for BLUE, METEOR, ROUGE
import evaluate

%load_ext autoreload
%autoreload 2

### Load Data

In [11]:
print("Load caption file...")
caption_file = "../../data/study-2-output/labeled-data/combined-caption-output/combined-caption-output_7304-images_2025-03-29_21:40:00.json"

# features that we need to extract from the original dataset
keys = ["file_name", "human_captions", "model_captions"]
org_caption_dataset = read_json(caption_file, keys)
print(f"Captioned dataset loaded: {len(org_caption_dataset)} images.")

for item in org_caption_dataset:
    # Filter out human captions
    human_captions = [
        hc["caption"]            
        for hc in item["human_captions"]
        if hc["caption"] != "Quality issues are too severe to recognize visual content."
    ]
    for mc in item["model_captions"]:
        model_name = mc["model_name"]
        model_caption = mc["caption"]

Load caption file...
Captioned dataset loaded: 7304 images.


### Parse Caption into Atomic Statements

In [3]:
print("Generating atomic statements using gpt-4o...")
T_atomics, g_atomics  = generate_atomic_statement(org_caption_dataset)

Generating atomic statements using gpt-4o...


In [4]:
# Save the parsing results
save_results_json(output_path="parsed_caption.json", org_dataset=org_caption_dataset, T_atomics=T_atomics, g_atomics=g_atomics)

# Read Atomic Caption Dataset If Needed
keys = ["file_name", "human_captions", "model_captions", "evaluation"]
parsed_dataset = read_json("parsed_caption.json", keys)

Saved JSON to: parsed_caption.json


### Evaluation

In [5]:
evaluation = evaluate_matching(human_captions, T_atomics, g_atomics)
# evaluation = evaluate_matching_file(parsed_dataset)

In [6]:
output = calculate_cap_f1(evaluation)
print(json.dumps(output, indent=4, ensure_ascii=False))

[
    [
        {
            "model_name": "gpt-4o",
            "recall": 0.3,
            "precision": 0.8571428571428571,
            "cap_f1": 0.4444444444444444
        },
        {
            "model_name": "llama",
            "recall": 0.3,
            "precision": 0.2857142857142857,
            "cap_f1": 0.2926829268292683
        },
        {
            "model_name": "molmo",
            "recall": 0.4444444444444444,
            "precision": 0.5714285714285714,
            "cap_f1": 0.5
        }
    ]
]


## Evaluation 
### BLUE, METEOR, ROUGE

In [14]:
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")
rouge = evaluate.load("rouge")

[nltk_data] Downloading package wordnet to /Users/kgarg/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/kgarg/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/kgarg/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [50]:
for item in org_caption_dataset[0:1]:
    # Filter out human captions that we'll use for all models 
    human_captions = [
        hc["caption"]            
        for hc in item["human_captions"]
        if hc["caption"] != "Quality issues are too severe to recognize visual content."
    ]

    print(f"Human captions: {human_captions}\n")
    # apply metric for each model separately
    for mc in item["model_captions"]:
        model_name = mc["model_name"]
        model_caption = mc["caption"]
        
        references = [human_captions]
        predictions = [model_caption]

        print(f"{model_name} caption: {predictions}\n")
        print("BLEU-1:", bleu.compute(predictions=predictions, references=[references], max_order=1))
        print("BLEU-2:", bleu.compute(predictions=predictions, references=[references], max_order=2))
        print("BLEU-3:", bleu.compute(predictions=predictions, references=[references], max_order=3))
        print("BLEU-4:", bleu.compute(predictions=predictions, references=[references], max_order=4))
        print("METEOR:", meteor.compute(predictions=predictions, references=references))
        print("ROUGE:", rouge.compute(predictions=predictions, references=references))
        print("\n")

Human captions: ['A can of Coca Cola on a counter is shown for when one can use a nice, cold drink.', 'A black can of Coca Cola Zero calorie soda is on the counter near the coffee maker.', 'A kitchen counter the various items on top including a can of Coca-Cola, metal containers, and a teapot.', 'a black tin of Coca Cola placed on a black surface', 'Black counter with canisters, kettle and can of soda.']

gpt-4o-2024-08-06 caption: ['A can of Coca-Cola Zero is on a kitchen countertop, next to a white mug and a black kettle. Three silver canisters are aligned against the wall, along with a visible electrical outlet above them.']

BLEU-1: {'bleu': 0.1383437751327924, 'precisions': [0.5384615384615384], 'brevity_penalty': 0.256924153818043, 'length_ratio': 0.42391304347826086, 'translation_length': 39, 'reference_length': 92}
BLEU-2: {'bleu': 0.07491453800514392, 'precisions': [0.5384615384615384, 0.15789473684210525], 'brevity_penalty': 0.256924153818043, 'length_ratio': 0.42391304347826

In [40]:
import nltk

hypothesis = ["hello there general kenobi"]
reference = ["hello there general kenobi"]
#there may be several references
BLEUscore = nltk.translate.bleu_score.sentence_bleu([reference], hypothesis)
print(BLEUscore)

1.821831989445342e-231


In [43]:
predictions = ["hello there general kenobi", "foo bar foobar"]
references = [
    ["hello there general kenobi", "hello there!"],
    ["foo bar foobar", "cat"]
]
bleu.compute(predictions=predictions, references=references)

{'bleu': 1.0,
 'precisions': [1.0, 1.0, 1.0, 1.0],
 'brevity_penalty': 1.0,
 'length_ratio': 1.75,
 'translation_length': 7,
 'reference_length': 4}

In [29]:
bleu.compute(predictions="hello there general kenobi", references="hello there general kenobi")

{'bleu': 0.0,
 'precisions': [1.0, 0.0, 0.0, 0.0],
 'brevity_penalty': 1.0,
 'length_ratio': 1.0,
 'translation_length': 23,
 'reference_length': 23}

In [20]:
org_caption_dataset[0]

{'file_name': 'VizWiz_train_00000001.jpg',
 'human_captions': [{'caption': 'A can of Coca Cola on a counter is shown for when one can use a nice, cold drink.',
   'is_precanned': False,
   'is_rejected': False},
  {'caption': 'A black can of Coca Cola Zero calorie soda is on the counter near the coffee maker.',
   'is_precanned': False,
   'is_rejected': False},
  {'caption': 'A kitchen counter the various items on top including a can of Coca-Cola, metal containers, and a teapot.',
   'is_precanned': False,
   'is_rejected': False},
  {'caption': 'a black tin of Coca Cola placed on a black surface',
   'is_precanned': False,
   'is_rejected': False},
  {'caption': 'Black counter with canisters, kettle and can of soda.',
   'is_precanned': False,
   'is_rejected': False}],
 'model_captions': [{'model_name': 'gpt-4o-2024-08-06',
   'caption': 'A can of Coca-Cola Zero is on a kitchen countertop, next to a white mug and a black kettle. Three silver canisters are aligned against the wall, a