In [7]:
import json
import statistics as s
from bert_score import score
from tqdm.notebook import tqdm

%load_ext jupyter_black

The jupyter_black extension is already loaded. To reload it, use:
  %reload_ext jupyter_black


In [8]:
# load in combined caption data
# reset first so a failed load cannot leave a stale copy from an earlier run
captioned_data = None
filepath = "../data/study-2-output/labeled-data/combined-caption-output/combined-caption-output_7304-images_2025-03-29_21:40:00.json"
# decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can differ between machines
with open(filepath, "r", encoding="utf-8") as f:
    captioned_data = json.load(f)

# preview the first record
captioned_data[0:1]

[{'image_id': 1,
  'file_name': 'VizWiz_train_00000001.jpg',
  'vizwiz_url': 'https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/VizWiz_train_00000001.jpg',
  'text_detected': True,
  'unrecognizable': 0,
  'framing': 0,
  'blur': 5,
  'obstruction': 0,
  'rotation': 0,
  'too dark': 0,
  'too bright': 0,
  'other': 0,
  'no issue': 0,
  'human_captions': [{'caption': 'A can of Coca Cola on a counter is shown for when one can use a nice, cold drink.',
    'is_precanned': False,
    'is_rejected': False},
   {'caption': 'A black can of Coca Cola Zero calorie soda is on the counter near the coffee maker.',
    'is_precanned': False,
    'is_rejected': False},
   {'caption': 'A kitchen counter the various items on top including a can of Coca-Cola, metal containers, and a teapot.',
    'is_precanned': False,
    'is_rejected': False},
   {'caption': 'a black tin of Coca Cola placed on a black surface',
    'is_precanned': False,
    'is_rejected': False},
   {'caption': 'Black counter

In [21]:
def compute_bertscore(dataset, model_type="microsoft/deberta-xlarge-mnli", lang="en"):
    """
    Computes BERTScore for each model against human captions as reference.
    Saves precision, recall, and f1 scores by reference to original input.

    Captions are grouped by model so that `score` is called once per model
    instead of once per (image, model) pair.

    Args:
        dataset: iterable of image dicts. Each image provides "human_captions"
            (dicts with a "caption" key) and "model_captions" (dicts with
            "model_name" and "caption" keys).
        model_type: forwarded to `score` as `model_type`.
        lang: forwarded to `score` as `lang`.

    Returns:
        None. Every image gets image["evaluation"]["bertscore"], a dict mapping
        model name to {"scores": {"precision": ..., "recall": ..., "f1": ...}}.
        That dict is empty for an image whose human captions are all the
        pre-canned quality-issue caption, because there is nothing to score
        the model captions against.
    """
    precanned_caption = "Quality issues are too severe to recognize visual content."

    # model name -> parallel lists: candidate captions, the reference captions
    # of each candidate's image, and the "scores" dict to fill in afterwards
    batches = {}
    for image in dataset:
        curr_references = [
            caption["caption"]
            for caption in image["human_captions"]
            if caption["caption"] != precanned_caption
        ]

        curr_output = {}
        # only queue candidates when there is at least one usable reference
        if curr_references:
            for model in image["model_captions"]:
                curr_model_name = model["model_name"]
                batch = batches.setdefault(
                    curr_model_name,
                    {"candidates": [], "references": [], "slots": []},
                )

                # reserve the slot now so the per-image key order follows the
                # order of image["model_captions"]
                curr_scores = {}
                curr_output[curr_model_name] = {"scores": curr_scores}

                batch["candidates"].append(model["caption"])
                batch["references"].append(curr_references)
                batch["slots"].append(curr_scores)

        # check if evaluation exists and save score
        if "evaluation" not in image:
            image["evaluation"] = {}
        image["evaluation"]["bertscore"] = curr_output

    # compute scores one model at a time and write them into the reserved slots
    for curr_model_name, batch in tqdm(batches.items()):
        P, R, F1 = score(
            batch["candidates"],
            batch["references"],
            model_type=model_type,
            lang=lang,
        )
        for curr_scores, precision, recall, f1 in zip(batch["slots"], P, R, F1):
            curr_scores["precision"] = float(precision)
            curr_scores["recall"] = float(recall)
            curr_scores["f1"] = float(f1)

In [22]:
def compute_average_metrics_bertscore(dataset):
    """
    Averages the per-image BERTScore precision, recall, and f1 of each model.

    Reads image["evaluation"]["bertscore"][model_name]["scores"] for every
    image in `dataset` and returns a dict mapping model name to
    "avg_precision", "avg_recall", and "avg_f1", plus "f1": the harmonic mean
    of the averaged precision and the averaged recall.
    """
    metric_names = ("precision", "recall", "f1")

    # model name -> running count and running total of each metric
    running = {}
    for image in tqdm(dataset):
        for model_name, entry in image["evaluation"]["bertscore"].items():
            metrics = entry["scores"]
            tally = running.get(model_name)
            if tally is None:
                # the first image seen for a model seeds its running totals
                tally = {"total_count": 1}
                for metric in metric_names:
                    tally[f"total_{metric}"] = metrics[metric]
                running[model_name] = tally
                continue

            tally["total_count"] = tally["total_count"] + 1
            for metric in metric_names:
                tally[f"total_{metric}"] = tally[f"total_{metric}"] + metrics[metric]

    # turn totals into averages, then derive f1 from the averaged P and R
    summary = {}
    for model_name, tally in running.items():
        count = float(tally["total_count"])
        averages = {
            f"avg_{metric}": tally[f"total_{metric}"] / count
            for metric in metric_names
        }
        averages["f1"] = s.harmonic_mean(
            [averages["avg_precision"], averages["avg_recall"]]
        )
        summary[model_name] = averages

    return summary

In [58]:
model_type = (
    "microsoft/deberta-xlarge-mnli"  # TODO: try different models and see performance
)
lang = "en"
limit = 10  # only the first `limit` images are scored in this cell

# TODO: try to use this optimization for all metrics
# Batch the BERTScore inputs so `score` runs once per model:
#   candidates[model_name]              -> [caption, ...]
#   candidate_image_indices[model_name] -> image index of each caption above
#   references                          -> [[human caption, ...], ...], one per image
PRECANNED_CAPTION = "Quality issues are too severe to recognize visual content."
images_to_score = captioned_data[0:limit]

candidates = {}
candidate_image_indices = {}
references = []
for index, image in enumerate(tqdm(images_to_score)):
    curr_references = [
        caption["caption"]
        for caption in image["human_captions"]
        if caption["caption"] != PRECANNED_CAPTION
    ]
    references.append(curr_references)

    # an image with no usable human caption has nothing to be scored against
    if not curr_references:
        continue

    for model in image["model_captions"]:
        curr_model_name = model["model_name"]
        candidates.setdefault(curr_model_name, []).append(model["caption"])
        # remember which image the caption came from, so models that are
        # missing for some images do not shift the candidate/reference pairing
        candidate_image_indices.setdefault(curr_model_name, []).append(index)

# compute scores with one call per model
bertscore_output_per_model = {}
for model_name, model_candidates in candidates.items():
    model_references = [
        references[index] for index in candidate_image_indices[model_name]
    ]
    P_lst, R_lst, F1_lst = score(
        model_candidates, model_references, model_type=model_type, lang=lang
    )
    bertscore_output_per_model[model_name] = {
        "precision": P_lst,
        "recall": R_lst,
        "f1": F1_lst,
    }

# convert lists into formatted output for captioned_data
bertscore_outputs = {}
for model_name, model_scores in bertscore_output_per_model.items():
    bertscore_outputs[model_name] = [
        {
            "scores": {
                "precision": float(P),
                "recall": float(R),
                "f1": float(F1),
            }
        }
        for P, R, F1 in zip(
            model_scores["precision"],
            model_scores["recall"],
            model_scores["f1"],
        )
    ]

# attach bertscore outputs to captioned_data
# (the slice holds the same dict objects, so this updates captioned_data in place)
for image in images_to_score:
    if "evaluation" not in image:
        image["evaluation"] = {}
    image["evaluation"]["bertscore"] = {}
for model_name, model_outputs in bertscore_outputs.items():
    for index, curr_output in zip(candidate_image_indices[model_name], model_outputs):
        images_to_score[index]["evaluation"]["bertscore"][model_name] = curr_output

  0%|          | 0/10 [00:00<?, ?it/s]

In [59]:
# pretty-print the first `limit` images, including any attached evaluation output
scored_preview = json.dumps(captioned_data[:limit], indent=4)
print(scored_preview)

[
    {
        "image_id": 1,
        "file_name": "VizWiz_train_00000001.jpg",
        "vizwiz_url": "https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/VizWiz_train_00000001.jpg",
        "text_detected": true,
        "unrecognizable": 0,
        "framing": 0,
        "blur": 5,
        "obstruction": 0,
        "rotation": 0,
        "too dark": 0,
        "too bright": 0,
        "other": 0,
        "no issue": 0,
        "human_captions": [
            {
                "caption": "A can of Coca Cola on a counter is shown for when one can use a nice, cold drink.",
                "is_precanned": false,
                "is_rejected": false
            },
            {
                "caption": "A black can of Coca Cola Zero calorie soda is on the counter near the coffee maker.",
                "is_precanned": false,
                "is_rejected": false
            },
            {
                "caption": "A kitchen counter the various items on top including a can of Coc