### Include Library

In [1]:
# library for cap_f1
from cap_f1 import *

# library for BLUE, METEOR, ROUGE
import evaluate

%load_ext autoreload
%autoreload 2

### Load Data

In [2]:
print("Load caption file...")

# features that we need to extract from the original dataset
keys = ["file_name", "human_captions", "model_captions"]
org_caption_dataset = read_json("test_org.json", keys)

for item in org_caption_dataset:
    # Filter out human captions
    human_captions = [
        hc["caption"]            
        for hc in item["human_captions"]
        if hc["caption"] != "Quality issues are too severe to recognize visual content."
    ]
    for mc in item["model_captions"]:
        model_name = mc["model_name"]
        model_caption = mc["caption"]

Load caption file...


### Parse Caption into Atomic Statements

In [3]:
print("Generating atomic statements using gpt-4o...")
T_atomics, g_atomics  = generate_atomic_statement(org_caption_dataset)

Generating atomic statements using gpt-4o...


In [4]:
# Save the parsing results
save_results_json(output_path="parsed_caption.json", org_dataset=org_caption_dataset, T_atomics=T_atomics, g_atomics=g_atomics)

# Read Atomic Caption Dataset If Needed
keys = ["file_name", "human_captions", "model_captions", "evaluation"]
parsed_dataset = read_json("parsed_caption.json", keys)

Saved JSON to: parsed_caption.json


### Evaluation

In [5]:
evaluation = evaluate_matching(human_captions, T_atomics, g_atomics)
# evaluation = evaluate_matching_file(parsed_dataset)

In [6]:
output = calculate_cap_f1(evaluation)
print(json.dumps(output, indent=4, ensure_ascii=False))

[
    [
        {
            "model_name": "gpt-4o",
            "recall": 0.3,
            "precision": 0.8571428571428571,
            "cap_f1": 0.4444444444444444
        },
        {
            "model_name": "llama",
            "recall": 0.3,
            "precision": 0.2857142857142857,
            "cap_f1": 0.2926829268292683
        },
        {
            "model_name": "molmo",
            "recall": 0.4444444444444444,
            "precision": 0.5714285714285714,
            "cap_f1": 0.5
        }
    ]
]


## Evaluation 
### BLUE, METEOR, ROUGE

In [7]:
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")
rouge = evaluate.load("rouge")

[nltk_data] Downloading package wordnet to /home/heoj4/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/heoj4/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/heoj4/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [8]:
for item in org_caption_dataset:
    # Filter out human captions
    human_captions = [
        hc["caption"]            
        for hc in item["human_captions"]
        if hc["caption"] != "Quality issues are too severe to recognize visual content."
    ]
    for mc in item["model_captions"]:
        model_name = mc["model_name"]
        model_caption = mc["caption"]

    references = [human_captions]
    predictions = [model_caption]

    print("BLEU:", bleu.compute(predictions=predictions, references=references))
    print("METEOR:", meteor.compute(predictions=predictions, references=references))
    print("ROUGE:", rouge.compute(predictions=predictions, references=references))

BLEU: {'bleu': 0.15268470848781107, 'precisions': [0.5, 0.2, 0.125, 0.043478260869565216], 'brevity_penalty': 1.0, 'length_ratio': 2.3636363636363638, 'translation_length': 26, 'reference_length': 11}
METEOR: {'meteor': 0.5555555555555556}
ROUGE: {'rouge1': 0.4444444444444444, 'rouge2': 0.29411764705882354, 'rougeL': 0.4444444444444444, 'rougeLsum': 0.4444444444444444}
