# Study 2 Analysis
Analyses for each research question in Study 2. These analyses are done with the newest setup of the project where the researchers annotate whether VLMs accurately identified the product in images.

# Load packages

In [2]:
import os
import copy
import json
import csv
from datetime import datetime

import pandas as pd
import statistics as s
from scipy import stats

# Notebook-wide pandas display settings: do not cap the number of columns shown
# and allow wide rows, since the frames displayed below have many columns.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 1000)

## Helper functions

In [3]:
def calculate_quality_metrics(df, reference_df=None, quality_columns=None):
    """
    Tally how often each rating value occurs in every quality-issue column.

    Args:
        df: DataFrame holding one column per quality issue.
        reference_df: Optional DataFrame to compare against. When given, the
            tallies of `df` are divided by the tallies of `reference_df`.
        quality_columns: Names of the quality-issue columns to tally. When None,
            the default set of eight issue columns is used.

    Returns:
        Without `reference_df`: a DataFrame of integer counts with one row per
        observed rating value (sorted) plus a final "total" row, and one column
        per quality issue.
        With `reference_df`: a DataFrame of the same layout holding
        `df` counts as a percentage of the `reference_df` counts, rounded to two
        decimals (the counts themselves are not returned in this case).
    """
    if quality_columns is None:
        quality_columns = [
            "unrecognizable",
            "blur",
            "framing",
            "obstruction",
            "rotation",
            "too dark",
            "too bright",
            "other",
        ]

    # One value_counts Series per issue, placed side by side and aligned on the
    # rating value.
    per_issue_tallies = []
    for issue in quality_columns:
        per_issue_tallies.append(df[issue].value_counts())
    tallies = pd.concat(per_issue_tallies, axis=1)
    tallies.columns = quality_columns

    # A rating that never occurs for an issue shows up as NaN after alignment;
    # treat it as a count of zero, then order the rows by rating.
    tallies = tallies.fillna(0).astype(int).sort_index()

    # Column sums go in a trailing "total" row.
    tallies.loc["total"] = tallies.sum()

    if reference_df is None:
        return tallies

    # Express the tallies as a share of the reference dataset's tallies.
    baseline = calculate_quality_metrics(reference_df, quality_columns=quality_columns)
    return (tallies.div(baseline, axis=0) * 100).round(2)


def combine_counts_and_percentages(counts_df, percentages_df=None):
    """
    Render counts and percentages together as "count (pct%)" strings.

    Args:
        counts_df: DataFrame of counts. Must contain a "total" row when
            `percentages_df` is not supplied.
        percentages_df: Optional DataFrame of percentages laid out like
            `counts_df` (cells are paired by position). When None, each count is
            expressed as a percentage of its column's "total" row.

    Returns:
        DataFrame of strings with the index and columns of `counts_df`.
    """
    if percentages_df is None:
        column_totals = counts_df.loc["total"]
        percentages_df = (counts_df.div(column_totals, axis=1) * 100).round(2)

    def _render(count, percentage):
        # Whole-number counts are printed without a decimal point.
        numeric = float(count)
        shown = str(int(numeric)) if numeric.is_integer() else str(numeric)
        return f"{shown} ({percentage:.2f}%)"

    rendered_rows = []
    for count_row, pct_row in zip(counts_df.values, percentages_df.values):
        rendered_rows.append(
            [_render(count, pct) for count, pct in zip(count_row, pct_row)]
        )

    return pd.DataFrame(
        rendered_rows,
        index=counts_df.index,
        columns=counts_df.columns,
    )

## Constants

In [4]:
# Maps each full model identifier to the short name used as the prefix of that
# model's annotation columns (e.g. "gpt4o" -> `gpt4o_code`, `gpt4o_caption`).
# The key order (gpt4o, llama, molmo) is relied on where the keys are used as
# column labels for per-model results.
MODEL_NAMES = {
    "gpt-4o-2024-08-06": "gpt4o",
    "Llama-3.2-11B-Vision-Instruct": "llama",
    "Molmo-7B-O-0924": "molmo",
}

# Load data
We load 3 pieces of data:
1. A `.json` file with all images that fit our study conditions, their model captions, and evaluation metrics.
2. A `.csv` with annotations from the research team noting which model captions accurately identify the product in each image.
3. A `.csv` with expert captions.

## (1) load images with model captions and evaluation metrics

In [5]:
# Load the evaluated captions (one record per image). The file is opened in a
# context manager so the handle is closed as soon as parsing finishes instead of
# being left open for the lifetime of the kernel.
with open(
    "../../data/study-2-output/final-evaluated-captions/low-quality_evaluation_5432-images_2025-04-11_03-31_merged.json",
    encoding="utf-8",
) as evaluated_captions_file:
    evaluated_captions_data = json.load(evaluated_captions_file)

print(f"Length of evaluated captions: {len(evaluated_captions_data)}")
print(json.dumps(evaluated_captions_data[0], indent=4))

Length of evaluated captions: 5432
{
    "image_id": 1,
    "file_name": "VizWiz_train_00000001.jpg",
    "vizwiz_url": "https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/VizWiz_train_00000001.jpg",
    "text_detected": true,
    "unrecognizable": 0,
    "framing": 0,
    "blur": 5,
    "obstruction": 0,
    "rotation": 0,
    "too dark": 0,
    "too bright": 0,
    "other": 0,
    "no issue": 0,
    "human_captions": [
        {
            "caption": "A can of Coca Cola on a counter is shown for when one can use a nice, cold drink.",
            "is_precanned": false,
            "is_rejected": false
        },
        {
            "caption": "A black can of Coca Cola Zero calorie soda is on the counter near the coffee maker.",
            "is_precanned": false,
            "is_rejected": false
        },
        {
            "caption": "A kitchen counter the various items on top including a can of Coca-Cola, metal containers, and a teapot.",
            "is_precanned": false

In [6]:
# Build the per-image frame used for the regression: keep only the identifying
# fields and the image-quality ratings of each record; every other field of the
# evaluated-captions records is dropped.
target_keys = [
    "image_id",
    "file_name",
    "vizwiz_url",
    "text_detected",
    "unrecognizable",
    "framing",
    "blur",
    "obstruction",
    "rotation",
    "too dark",
    "too bright",
    "other",
    "no issue",
]
evaluation_data_regression = [
    {key: image[key] for key in image if key in target_keys}
    for image in evaluated_captions_data
]
filtered_evaluation_data_df = pd.DataFrame.from_dict(evaluation_data_regression)
filtered_evaluation_data_df.head()

Unnamed: 0,image_id,file_name,vizwiz_url,text_detected,unrecognizable,framing,blur,obstruction,rotation,too dark,too bright,other,no issue
0,1,VizWiz_train_00000001.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,0,5,0,0,0,0,0,0
1,8,VizWiz_train_00000008.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,4,0,0,0,0,0,0,1
2,11,VizWiz_train_00000011.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,2,2,4,1,1,1,0,0
3,20,VizWiz_train_00000020.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,4,0,1,0,0,0,0,0
4,26,VizWiz_train_00000026.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,0,4,0,0,0,0,0,1


## (2) load annotations from research team

In [7]:
# Column dtypes for the annotation CSV files. Each column name appears exactly
# once: the original literal listed "image_preview" twice, and in a dict literal
# the later duplicate silently overrides the earlier one.
target_images_dtypes = {
    "image_id": int,
    "file_name": str,
    "vizwiz_url": str,
    "image_preview": str,
    "INCLUDE because product": str,
    "EXCLUDE because not verifable": str,
    "EXCLUDE because Book/DVD/CD/magazine?": str,
    "text_detected": bool,
    "unrecognizable": int,
    "framing": int,
    "blur": int,
    "obstruction": int,
    "rotation": int,
    "too dark": int,
    "too bright": int,
    "other": int,
    "no issue": int,
    "human_caption_0": str,
    "human_caption_1": str,
    "human_caption_2": str,
    "human_caption_3": str,
    "human_caption_4": str,
    "gpt-4o-2024-08-06_caption": str,
    "Llama-3.2-11B-Vision-Instruct_caption": str,
    "Molmo-7B-O-0924_caption": str,
    "general_notes": str,
    "gpt-4o-2024-08-06_notes": str,
    "Llama-3.2-11B-Vision-Instruct_notes": str,
    "Molmo-7B-O-0924_notes": str,
    "unable_to_verify": str,
    "gpt4o_code": str,
    "llama_code": str,
    "molmo_code": str,
    "notes": str,
    "double code notes": str,
    "double verified": str,
}

# keep_default_na=False keeps empty cells as "" instead of NaN; the
# `unable_to_verify == ""` filters used throughout this notebook depend on that.
annotations_df = pd.read_csv(
    "./annotated-data/final-annotated-images_1696-images_2025-04-14_03-41.csv",
    dtype=target_images_dtypes,
    keep_default_na=False,
)

print(f"Number of annotated images: {len(annotations_df)}")
print(
    f"Number of images that were verified: {len(annotations_df[annotations_df['unable_to_verify'] == ''])}"
)
annotations_df.head()

Number of annotated images: 1696
Number of images that were verified: 1220


Unnamed: 0,image_id,file_name,vizwiz_url,image_preview,human_captions,annotator,notes,unable_to_verify,double code notes,double verified,gpt4o_caption,gpt4o_code,llama_caption,llama_code,molmo_caption,molmo_code,text_detected,unrecognizable,framing,blur,obstruction,rotation,too dark,too bright,other,curved label,text panel,expert_caption
0,9886,VizWiz_train_00009886.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A box of frozen food sits on a table top that ...,Anne Marie,trade joes paneer tikka masala,,,x,A package of Trader Joe's Paneer Tikka Masala ...,yes,The image shows a frozen food package with a g...,yes,The image shows a food package for a Tikka Mas...,no,True,0,1,2,0,4,0,1,0,,,
1,12066,VizWiz_train_00012066.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A Kroger grocery store tag for a caramel iced ...,Anne Marie,kroger iced carmel cake,,,x,A caramel iced cake with a label featuring the...,yes,The image shows a white label with the blue an...,no,The image shows a plastic container of apple c...,no,True,0,4,2,0,1,0,0,0,,,
2,1908,VizWiz_train_00001908.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A description on a box of herbal tea.\nA print...,Anne Marie,caffeine free herb tea,,,x,"A label with the text ""A delicious CAFFEINE FR...",yes,"The image shows a white box with the words ""A ...",yes,The image shows a sideways view of a product l...,yes,True,0,2,4,0,3,0,0,0,,,
3,15399,VizWiz_train_00015399.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A green and white box of Lean Pockets frozen r...,Anne Marie,lean pocket garlic chicken white pizza,,,x,Box of Lean Pockets with garlic chicken white ...,yes,The image shows a green and white box of Lean ...,yes,The image shows a box of Lean Pockets on a whi...,no,True,0,4,0,0,3,0,0,0,,,
4,4380,VizWiz_train_00004380.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,garlic spinach hummus possibly in a green cont...,Anne Marie,garlic spinach hummus,,garlic spinach hummus,x,"A plastic container with a green lid, featurin...",no,The image shows a plastic container with a gre...,no,The image shows a plastic food container on a ...,no,True,0,5,0,0,3,0,0,0,,,


In [8]:
# Distribution of each quality rating among the verified images, shown as
# "count (percentage of verified images)".
combine_counts_and_percentages(
    calculate_quality_metrics(
        annotations_df.loc[annotations_df["unable_to_verify"].eq("")]
    )
)

Unnamed: 0,unrecognizable,blur,framing,obstruction,rotation,too dark,too bright,other
0,1003 (82.21%),346 (28.36%),190 (15.57%),1098 (90.00%),620 (50.82%),1051 (86.15%),1034 (84.75%),1162 (95.25%)
1,217 (17.79%),186 (15.25%),150 (12.30%),95 (7.79%),124 (10.16%),144 (11.80%),142 (11.64%),57 (4.67%)
2,0 (0.00%),107 (8.77%),142 (11.64%),8 (0.66%),93 (7.62%),18 (1.48%),27 (2.21%),1 (0.08%)
3,0 (0.00%),80 (6.56%),144 (11.80%),4 (0.33%),82 (6.72%),3 (0.25%),9 (0.74%),0 (0.00%)
4,0 (0.00%),300 (24.59%),416 (34.10%),11 (0.90%),237 (19.43%),3 (0.25%),5 (0.41%),0 (0.00%)
5,0 (0.00%),201 (16.48%),178 (14.59%),4 (0.33%),64 (5.25%),1 (0.08%),3 (0.25%),0 (0.00%)
total,1220 (100.00%),1220 (100.00%),1220 (100.00%),1220 (100.00%),1220 (100.00%),1220 (100.00%),1220 (100.00%),1220 (100.00%)


### Processing step for combining annotation files

In [9]:
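# # One-off processing step, kept commented out for reference. It appears to have produced the
# # combined annotation CSV that step (2) loads; re-enable it only to regenerate that file.
# # get the expert caption data and combine with the above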
# curr_annotations_df = pd.read_csv(
#     "./intermediate-data/annotated-images_04-14-25-00:00.csv",
#     dtype=target_images_dtypes,
#     keep_default_na=False,
# )
# expert_caption_annotated_df = pd.read_csv(
#     "./intermediate-data/expert-annotated-images_04-14-25-00:00.csv",
#     dtype=target_images_dtypes,
#     keep_default_na=False,
# )
# print(f"Length of curr annotations: {len(curr_annotations_df)}")
# print(f"Length of expert annotations: {len(expert_caption_annotated_df)}")

# # merge expert_caption_annotated with filtered_evaluation_data_df
# expert_caption_annotated_df = pd.merge(
#     filtered_evaluation_data_df,
#     expert_caption_annotated_df,
#     on=["image_id", "file_name", "vizwiz_url"],
#     how="right",
# )

# # concat to the annotation_df
# columns_to_include = [
#     "image_id",
#     "file_name",
#     "vizwiz_url",
#     "image_preview",
#     "human_captions",
#     "annotator",
#     "notes",
#     "unable_to_verify",
#     "double code notes",
#     "double verified",
#     "gpt4o_caption",
#     "gpt4o_code",
#     "llama_caption",
#     "llama_code",
#     "molmo_caption",
#     "molmo_code",
#     "text_detected",
#     "unrecognizable",
#     "framing",
#     "blur",
#     "obstruction",
#     "rotation",
#     "too dark",
#     "too bright",
#     "other",
#     "curved label",
#     "text panel",
# ]

# full_annotations_df = pd.concat(
#     [
#         curr_annotations_df[columns_to_include],
#         expert_caption_annotated_df[columns_to_include + ["expert_caption"]],
#     ]
# )
# full_annotations_df["expert_caption"].fillna("", inplace=True)

# # save
# full_annotations_df.to_csv(
#     f"./annotated-data/final-annotated-images_{len(full_annotations_df)}-images_{datetime.now().strftime('%Y-%m-%d_%H-%M')}.csv",
#     index=False,
# )

## (3) load expert captions

In [10]:
# Load the expert-captioned images (a list with one record per image). The file
# is opened in a context manager so the handle is closed right after parsing.
with open(
    "../../data/study-2-output/final-evaluated-captions/expert-captions_evaluation_600-images_2025-04-13_16-42.json",
    encoding="utf-8",
) as expert_captions_file:
    expert_captioned_data = json.load(expert_captions_file)
print(json.dumps(expert_captioned_data[0], indent=4))

{
    "image_id": 20,
    "file_name": "VizWiz_train_00000020.jpg",
    "vizwiz_url": "https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/VizWiz_train_00000020.jpg",
    "text_detected": true,
    "unrecognizable": 0,
    "framing": 4,
    "blur": 0,
    "obstruction": 1,
    "rotation": 0,
    "too dark": 0,
    "too bright": 0,
    "other": 0,
    "no issue": 0,
    "expert_captioner": "sm",
    "human_captions": [
        {
            "caption": "The upper portion of a PlayStation 2 game case for \"Grand Theft Auto: Vice City\" is shown. The colorful illustrated cover includes characters, vehicles, and a building, with a \"Rockstar Games\" logo in yellow. The case is partially off-frame on a white surface with a cord nearby.",
            "captioning_issue": ""
        }
    ],
    "model_captions": [
        {
            "model_name": "gpt-4o-2024-08-06",
            "caption": "A PlayStation 2 game case for \"Grand Theft Auto: Vice City\" with a colorful cover featuring ill

### Load raw expert captions

In [11]:
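# # NOTE: this raw loader appears to be superseded by step (3), which loads the expert captions with metrics already run.
# # TODO: replace this with the dataset that has metrics run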
# expert_captioned_data = pd.read_csv(
#     "./labeled-data/Dataset for Shawn _ 04-06-25 - dataset formatted.csv",
#     dtype={
#         "File Name": str,
#         "Image URL": str,
#         "Image Preview": str,
#         "Captioner Name": str,
#         "Describe all parts of the image that may be important to a person who is blind.": str,
#         "If you are unable to caption the image, describe the issue in this column.": str,
#     },
#     keep_default_na=False,
#     encoding="utf-8",
# )

# # rename columns so they're easier to program with
# expert_captioned_data.rename(
#     columns={
#         "File Name": "file_name",
#         "Image URL": "vizwiz_url",
#         "Captioner Name": "expert_captioner",
#         "Describe all parts of the image that may be important to a person who is blind.": "expert_caption",
#         "If you are unable to caption the image, describe the issue in this column.": "captioning_issue",
#     },
#     inplace=True,
# )
# expert_captioned_data.head()

# Analysis 1: how accurately do VLMs identify products?

In [12]:
# Count the annotation codes given to each model, restricted to the images the
# annotators were able to verify.
verified_annotations_df = annotations_df[annotations_df["unable_to_verify"] == ""]
accuracy_counts_df = pd.concat(
    [
        verified_annotations_df[f"{short_name}_code"].value_counts()
        for short_name in MODEL_NAMES.values()
    ],
    axis=1,
)
# Label each column with the full model identifier (same order as the values above).
accuracy_counts_df.columns = MODEL_NAMES.keys()
display(accuracy_counts_df)

Unnamed: 0,gpt-4o-2024-08-06,Llama-3.2-11B-Vision-Instruct,Molmo-7B-O-0924
yes,811,568,449
no,379,644,770
yes++,30,8,1


In [13]:
# Express the code counts as a percentage of the verified images. The
# denominator is computed from the annotations (instead of the hard-coded 1220)
# so it stays correct if the annotation file changes.
verified_image_count = (annotations_df["unable_to_verify"] == "").sum()
100 * accuracy_counts_df / verified_image_count

Unnamed: 0,gpt-4o-2024-08-06,Llama-3.2-11B-Vision-Instruct,Molmo-7B-O-0924
yes,66.47541,46.557377,36.803279
no,31.065574,52.786885,63.114754
yes++,2.459016,0.655738,0.081967


# Analysis 2: how does image quality affect a VLM's ability to accurately identify products?

## Combine annotation data with image quality data

In [14]:
# combine with annotation data
columns_to_include = [
    "image_id",
    "file_name",
    "vizwiz_url",
    "human_captions",
    "unable_to_verify",
    "double code notes",
    "double verified",
    "gpt4o_caption",
    "gpt4o_code",
    "llama_caption",
    "llama_code",
    "molmo_caption",
    "molmo_code",
]
regression_df = pd.merge(
    filtered_evaluation_data_df,
    annotations_df[columns_to_include],
    on=["image_id", "file_name", "vizwiz_url"],
    how="right",
)

# keep only verified images where every model has a definite code
# (neither "unsure" nor empty)
code_columns = ["gpt4o_code", "llama_code", "molmo_code"]
has_definite_codes = ~regression_df[code_columns].isin(["unsure", ""]).any(axis=1)
regression_df = regression_df[
    (regression_df["unable_to_verify"] == "") & has_definite_codes
].copy()

# Recode the annotation codes as 1/0, combining "yes" and "yes++".
# Only the code columns are touched: a frame-wide replace would also rewrite any
# caption cell that happens to equal "yes"/"no", and it triggers pandas'
# downcasting FutureWarning.
code_to_binary = {"yes": 1, "yes++": 1, "no": 0}
for code_column in code_columns:
    unexpected_codes = set(regression_df[code_column]) - set(code_to_binary)
    if unexpected_codes:
        # Fail loudly rather than silently turning unknown codes into NaN.
        raise ValueError(
            f"Unexpected values in {code_column}: {sorted(unexpected_codes)}"
        )
    regression_df[code_column] = (
        regression_df[code_column].map(code_to_binary).astype("int64")
    )

# cleanup: bookkeeping columns are no longer needed once the rows are filtered
regression_df = regression_df.drop(
    columns=["unable_to_verify", "double code notes", "double verified"]
)

print(f"Number of images in regression df: {len(regression_df)}")
regression_df.head()

Number of images in regression df: 1220


  regression_df.replace({"yes": 1, "no": 0}, inplace=True)


Unnamed: 0,image_id,file_name,vizwiz_url,text_detected,unrecognizable,framing,blur,obstruction,rotation,too dark,too bright,other,no issue,human_captions,gpt4o_caption,gpt4o_code,llama_caption,llama_code,molmo_caption,molmo_code
0,9886,VizWiz_train_00009886.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,1,2,0,4,0,1,0,1,A box of frozen food sits on a table top that ...,A package of Trader Joe's Paneer Tikka Masala ...,1,The image shows a frozen food package with a g...,1,The image shows a food package for a Tikka Mas...,0
1,12066,VizWiz_train_00012066.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,4,2,0,1,0,0,0,0,A Kroger grocery store tag for a caramel iced ...,A caramel iced cake with a label featuring the...,1,The image shows a white label with the blue an...,0,The image shows a plastic container of apple c...,0
2,1908,VizWiz_train_00001908.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,2,4,0,3,0,0,0,0,A description on a box of herbal tea.\nA print...,"A label with the text ""A delicious CAFFEINE FR...",1,"The image shows a white box with the words ""A ...",1,The image shows a sideways view of a product l...,1
3,15399,VizWiz_train_00015399.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,4,0,0,3,0,0,0,1,A green and white box of Lean Pockets frozen r...,Box of Lean Pockets with garlic chicken white ...,1,The image shows a green and white box of Lean ...,1,The image shows a box of Lean Pockets on a whi...,0
4,4380,VizWiz_train_00004380.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,5,0,0,3,0,0,0,0,garlic spinach hummus possibly in a green cont...,"A plastic container with a green lid, featurin...",0,The image shows a plastic container with a gre...,0,The image shows a plastic food container on a ...,0


In [15]:
# save regression df
# Writes the un-binned regression table (quality ratings kept as 0-5 values) to
# ./intermediate-data, creating the directory first if it does not exist.
# NOTE(review): "continuious" in the file name is a typo of "continuous". It is
# left unchanged here because anything that reads this CSV by name would break
# if it were renamed — confirm before fixing.
os.makedirs(
    "./intermediate-data",
    exist_ok=True,
)
regression_df.to_csv(
    f"./intermediate-data/regression-continuious-df_{len(regression_df)}-images.csv",
    index=False,
)

## Create bins for regression

In [16]:
# Quality-issue columns that get binned in the cells below. Note that "other"
# and "no issue" are not in this list, so those columns keep their raw values.
image_quality_columns = [
    "unrecognizable",
    "framing",
    "blur",
    "obstruction",
    "rotation",
    "too dark",
    "too bright",
]

In [17]:
# Collapse each image-quality rating into 2 bins:
# ratings 0-1 -> "no issue", ratings 2-5 -> "issue present".
two_bin_labels = {
    str(rating): ("no issue" if rating < 2 else "issue present")
    for rating in range(6)
}
regression_two_bins_df = regression_df.copy()
for iq in image_quality_columns:
    # Ratings are compared as strings, so cast before looking up the bin label.
    regression_two_bins_df[iq] = (
        regression_two_bins_df[iq].astype(str).map(two_bin_labels)
    )
display(regression_two_bins_df.head())
regression_two_bins_df.to_csv(
    f"./intermediate-data/regression-two-bins-df_{len(regression_two_bins_df)}-images.csv",
    index=False,
)

Unnamed: 0,image_id,file_name,vizwiz_url,text_detected,unrecognizable,framing,blur,obstruction,rotation,too dark,too bright,other,no issue,human_captions,gpt4o_caption,gpt4o_code,llama_caption,llama_code,molmo_caption,molmo_code
0,9886,VizWiz_train_00009886.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,no issue,no issue,issue present,no issue,issue present,no issue,no issue,0,1,A box of frozen food sits on a table top that ...,A package of Trader Joe's Paneer Tikka Masala ...,1,The image shows a frozen food package with a g...,1,The image shows a food package for a Tikka Mas...,0
1,12066,VizWiz_train_00012066.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,no issue,issue present,issue present,no issue,no issue,no issue,no issue,0,0,A Kroger grocery store tag for a caramel iced ...,A caramel iced cake with a label featuring the...,1,The image shows a white label with the blue an...,0,The image shows a plastic container of apple c...,0
2,1908,VizWiz_train_00001908.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,no issue,issue present,issue present,no issue,issue present,no issue,no issue,0,0,A description on a box of herbal tea.\nA print...,"A label with the text ""A delicious CAFFEINE FR...",1,"The image shows a white box with the words ""A ...",1,The image shows a sideways view of a product l...,1
3,15399,VizWiz_train_00015399.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,no issue,issue present,no issue,no issue,issue present,no issue,no issue,0,1,A green and white box of Lean Pockets frozen r...,Box of Lean Pockets with garlic chicken white ...,1,The image shows a green and white box of Lean ...,1,The image shows a box of Lean Pockets on a whi...,0
4,4380,VizWiz_train_00004380.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,no issue,issue present,no issue,no issue,issue present,no issue,no issue,0,0,garlic spinach hummus possibly in a green cont...,"A plastic container with a green lid, featurin...",0,The image shows a plastic container with a gre...,0,The image shows a plastic food container on a ...,0


In [18]:
# Collapse each image-quality rating into 3 bins:
# ratings 0-1 -> ".low", ratings 2-3 -> ".medium", ratings 4-5 -> ".high".
three_bin_labels = {
    str(rating): label
    for label, ratings in ((".low", (0, 1)), (".medium", (2, 3)), (".high", (4, 5)))
    for rating in ratings
}
regression_three_bins_df = regression_df.copy()
for iq in image_quality_columns:
    # Ratings are compared as strings, so cast before looking up the bin label.
    regression_three_bins_df[iq] = (
        regression_three_bins_df[iq].astype(str).map(three_bin_labels)
    )
display(regression_three_bins_df.head())
regression_three_bins_df.to_csv(
    f"./intermediate-data/regression-three-bins-df_{len(regression_three_bins_df)}-images.csv",
    index=False,
)

Unnamed: 0,image_id,file_name,vizwiz_url,text_detected,unrecognizable,framing,blur,obstruction,rotation,too dark,too bright,other,no issue,human_captions,gpt4o_caption,gpt4o_code,llama_caption,llama_code,molmo_caption,molmo_code
0,9886,VizWiz_train_00009886.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,.low,.low,.medium,.low,.high,.low,.low,0,1,A box of frozen food sits on a table top that ...,A package of Trader Joe's Paneer Tikka Masala ...,1,The image shows a frozen food package with a g...,1,The image shows a food package for a Tikka Mas...,0
1,12066,VizWiz_train_00012066.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,.low,.high,.medium,.low,.low,.low,.low,0,0,A Kroger grocery store tag for a caramel iced ...,A caramel iced cake with a label featuring the...,1,The image shows a white label with the blue an...,0,The image shows a plastic container of apple c...,0
2,1908,VizWiz_train_00001908.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,.low,.medium,.high,.low,.medium,.low,.low,0,0,A description on a box of herbal tea.\nA print...,"A label with the text ""A delicious CAFFEINE FR...",1,"The image shows a white box with the words ""A ...",1,The image shows a sideways view of a product l...,1
3,15399,VizWiz_train_00015399.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,.low,.high,.low,.low,.medium,.low,.low,0,1,A green and white box of Lean Pockets frozen r...,Box of Lean Pockets with garlic chicken white ...,1,The image shows a green and white box of Lean ...,1,The image shows a box of Lean Pockets on a whi...,0
4,4380,VizWiz_train_00004380.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,.low,.high,.low,.low,.medium,.low,.low,0,0,garlic spinach hummus possibly in a green cont...,"A plastic container with a green lid, featurin...",0,The image shows a plastic container with a gre...,0,The image shows a plastic food container on a ...,0


## Create contingency tables for each image quality issue pair

In [19]:
# Joint distribution of framing (rows) and blur (columns) ratings.
# normalize=True divides every cell by the total number of images in the table,
# so the caption states that denominator.
print(
    "Percentage of images with framing and blur issues, by vote from crowdworker (divided by total number of images)"
)
pd.crosstab(
    regression_df["framing"],
    regression_df["blur"],
    normalize=True,
).round(4) * 100

Percentage of images with framing and blur issues, by vote from crowdworker (divided by total number of images with framing or blur issues)


blur,0,1,2,3,4,5
framing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.2,1.23,0.98,0.82,5.49,3.85
1,1.64,1.07,0.98,0.57,4.92,3.11
2,1.39,1.15,0.57,0.41,4.75,3.36
3,2.21,0.74,0.66,0.25,5.25,2.7
4,14.02,7.79,4.18,2.79,2.87,2.46
5,5.9,3.28,1.39,1.72,1.31,0.98


In [20]:
# Joint distribution of framing (rows) and obstruction (columns) ratings.
# The caption now names the pair actually tabulated (it was copy-pasted from the
# framing/blur cell). normalize=True divides every cell by the total number of
# images in the table.
print(
    "Percentage of images with framing and obstruction issues, by vote from crowdworker (divided by total number of images)"
)
pd.crosstab(
    regression_df["framing"],
    regression_df["obstruction"],
    normalize=True,
).round(4) * 100

Percentage of images with framing and blur issues, by vote from crowdworker (divided by total number of images with framing or blur issues)


obstruction,0,1,2,3,4,5
framing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,14.75,0.41,0.08,0.0,0.25,0.08
1,11.23,0.66,0.0,0.0,0.25,0.16
2,10.33,0.9,0.08,0.08,0.16,0.08
3,10.16,1.39,0.0,0.0,0.25,0.0
4,30.49,3.28,0.25,0.08,0.0,0.0
5,13.03,1.15,0.25,0.16,0.0,0.0


In [21]:
# Joint distribution of framing (rows) and rotation (columns) ratings.
# The caption now names the pair actually tabulated (it was copy-pasted from the
# framing/blur cell). normalize=True divides every cell by the total number of
# images in the table.
print(
    "Percentage of images with framing and rotation issues, by vote from crowdworker (divided by total number of images)"
)
pd.crosstab(
    regression_df["framing"],
    regression_df["rotation"],
    normalize=True,
).round(4) * 100

Percentage of images with framing and blur issues, by vote from crowdworker (divided by total number of images with framing or blur issues)


rotation,0,1,2,3,4,5
framing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,6.89,1.15,0.82,0.41,4.92,1.39
1,5.49,1.39,0.66,0.57,3.61,0.57
2,4.43,1.56,0.82,0.57,3.28,0.98
3,5.33,0.66,0.41,1.23,3.44,0.74
4,19.92,3.77,3.69,2.87,2.62,1.23
5,8.77,1.64,1.23,1.07,1.56,0.33


In [22]:
# Joint distribution of blur (rows) and obstruction (columns) ratings.
# The caption now names the pair actually tabulated (it was copy-pasted from the
# framing/blur cell). normalize=True divides every cell by the total number of
# images in the table.
print(
    "Percentage of images with blur and obstruction issues, by vote from crowdworker (divided by total number of images)"
)
pd.crosstab(
    regression_df["blur"],
    regression_df["obstruction"],
    normalize=True,
).round(4) * 100

Percentage of images with framing and blur issues, by vote from crowdworker (divided by total number of images with framing or blur issues)


obstruction,0,1,2,3,4,5
blur,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,26.56,1.31,0.08,0.08,0.25,0.08
1,12.87,1.8,0.25,0.08,0.16,0.08
2,7.7,0.82,0.08,0.0,0.08,0.08
3,5.82,0.25,0.16,0.08,0.25,0.0
4,22.13,2.21,0.08,0.0,0.16,0.0
5,14.92,1.39,0.0,0.08,0.0,0.08


In [23]:
# Joint distribution of blur (rows) and rotation (columns) ratings.
# The caption now names the pair actually tabulated (it was copy-pasted from the
# framing/blur cell). normalize=True divides every cell by the total number of
# images in the table.
print(
    "Percentage of images with blur and rotation issues, by vote from crowdworker (divided by total number of images)"
)
pd.crosstab(
    regression_df["blur"],
    regression_df["rotation"],
    normalize=True,
).round(4) * 100

Percentage of images with framing and blur issues, by vote from crowdworker (divided by total number of images with framing or blur issues)


rotation,0,1,2,3,4,5
blur,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,12.13,2.13,2.38,1.39,7.62,2.7
1,7.46,1.07,0.66,1.23,4.26,0.57
2,3.61,0.66,0.57,0.25,2.95,0.74
3,3.11,0.49,0.49,0.25,1.72,0.49
4,15.98,3.36,1.64,1.72,1.23,0.66
5,8.52,2.46,1.89,1.89,1.64,0.08


# Analysis 3: expert captions and sensitivity of scores

In [24]:
def compute_average_metrics(dataset, metric):
    """
    Average a caption metric's precision, recall and F1 per model over a dataset.

    Args:
        dataset: Iterable of image records. Per-model scores are read from
            `image["evaluation"]["bertscore"]` when `metric` is "bertscore", and
            from `image["evaluation"]["cap_f1"]["scores"]` when `metric` is
            "cap_f1". Each per-model entry must provide "precision", "recall"
            and the metric's F1 value ("f1" for bertscore, "cap_f1" for cap_f1).
        metric: Either "bertscore" or "cap_f1".

    Returns:
        Dict keyed by model name. Each value holds:
            "avg_precision", "avg_recall", "avg_f1": means over the images in
                which the model appears;
            "f1": harmonic mean of "avg_precision" and "avg_recall".
        An empty dataset yields an empty dict.

    Raises:
        ValueError: if `metric` is not one of the supported names. (Previously
            an unsupported name surfaced as an UnboundLocalError inside the loop.)
    """
    # Name of the per-image F1 field for each supported metric.
    f1_field_by_metric = {"bertscore": "f1", "cap_f1": "cap_f1"}
    if metric not in f1_field_by_metric:
        raise ValueError(
            f"Unsupported metric {metric!r}; expected one of {sorted(f1_field_by_metric)}"
        )
    f1_name = f1_field_by_metric[metric]

    total_scores = {}
    for image in dataset:
        curr_evaluation = image["evaluation"][metric]
        if metric == "cap_f1":
            # cap_f1 nests its per-model scores one level deeper.
            curr_evaluation = curr_evaluation["scores"]

        for model_name, scores in curr_evaluation.items():
            totals = total_scores.setdefault(
                model_name,
                {
                    "total_count": 0,
                    "total_precision": 0,
                    "total_recall": 0,
                    "total_f1": 0,
                },
            )
            totals["total_count"] += 1
            totals["total_precision"] += scores["precision"]
            totals["total_recall"] += scores["recall"]
            totals["total_f1"] += scores[f1_name]

    # compute averages and f1
    output = {}
    for model_name, values in total_scores.items():
        count = float(values["total_count"])
        avg_precision = values["total_precision"] / count
        avg_recall = values["total_recall"] / count
        output[model_name] = {
            "avg_precision": avg_precision,
            "avg_recall": avg_recall,
            "avg_f1": values["total_f1"] / count,
            # F1 recomputed from the averaged precision and recall, as opposed
            # to "avg_f1", which averages the per-image F1 values.
            "f1": s.harmonic_mean([avg_precision, avg_recall]),
        }

    return output

In [25]:
# get only the data we need for the expert file
# NOTE(review): this cell currently fails (see the recorded TypeError below).
# `expert_captioned_data` is now the list returned by json.load in step (3), but
# pd.merge needs a DataFrame. The "Image Preview" column deleted at the end
# matches the raw expert CSV loader that is commented out above, so this cell
# looks like it predates the switch to the evaluated JSON file — confirm whether
# it should be retired or rewritten for the new input.
# `target_keys` repeats the list defined when building the regression frame.
target_keys = [
    "image_id",
    "file_name",
    "vizwiz_url",
    "text_detected",
    "unrecognizable",
    "framing",
    "blur",
    "obstruction",
    "rotation",
    "too dark",
    "too bright",
    "other",
    "no issue",
]
# Keep only the identifying fields and quality ratings of each evaluated image.
evaluation_data_expert = [
    {x: y for x, y in image.items() if x in target_keys}
    for image in evaluated_captions_data
]

# create an expert file to evaluate
expert_data_to_eval_df = pd.DataFrame.from_dict(evaluation_data_expert)
# how="right" keeps one row per record on the expert side of the merge.
expert_data_to_eval_df = pd.merge(
    expert_data_to_eval_df,
    expert_captioned_data,
    on=["file_name", "vizwiz_url"],
    how="right",
)
del expert_data_to_eval_df["Image Preview"]
expert_data_to_eval_df.head()

TypeError: Can only merge Series or DataFrame objects, a <class 'list'> was passed

In [26]:
# Turn the expert frame into a list of per-image records ordered by image_id,
# and put the evaluated captions in the same order.
expert_data_to_eval_dict = sorted(
    expert_data_to_eval_df.to_dict(orient="records"),
    key=lambda record: record["image_id"],
)
evaluated_captions_data = sorted(
    evaluated_captions_data, key=lambda record: record["image_id"]
)


def find_matching_image(image_id, data):
    """Return the first record in ``data`` whose "image_id" equals ``image_id``.

    Args:
        image_id: Identifier to search for.
        data: Iterable of mappings that each have an "image_id" key.

    Returns:
        The first matching record, or None when no record matches.
    """
    return next(
        (candidate for candidate in data if candidate["image_id"] == image_id),
        None,
    )


# Index the evaluated captions by image_id once so every expert record is
# matched with a dict lookup instead of a linear scan per record.
# setdefault keeps the first record seen for an id, i.e. the same record a
# front-to-back search over evaluated_captions_data would return.
evaluated_by_image_id = {}
for evaluated_image in evaluated_captions_data:
    evaluated_by_image_id.setdefault(evaluated_image["image_id"], evaluated_image)

# add extra columns to the dict
for image in expert_data_to_eval_dict:
    matching_image = evaluated_by_image_id.get(image["image_id"])
    if matching_image is None:
        # Fail with the offending id instead of an opaque
        # "'NoneType' object is not subscriptable" further down.
        raise ValueError(
            f"No evaluated captions found for image_id {image['image_id']!r}"
        )

    expert_caption = image["expert_caption"]
    captioning_challenge = image["captioning_issue"]
    model_captions = matching_image["model_captions"]

    # Store the expert caption as a one-element "human_captions" list and
    # attach the model captions of the matching evaluated image.
    image["human_captions"] = [
        {
            "caption": expert_caption.strip(),
            "captioning_issue": captioning_challenge,
        }
    ]
    image["model_captions"] = model_captions

    # The flat columns are now redundant with "human_captions".
    del image["expert_caption"]
    del image["captioning_issue"]

# save the dict
# The file name records the number of images and the time of the export.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
output_path = f"./intermediate-data/expert_data_{len(expert_data_to_eval_dict)}-images_{timestamp}.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(expert_data_to_eval_dict, f, indent=4)

KeyError: 'expert_caption'

## Look at subgroups for expert captions

In [27]:
# Ids of the images present in the expert-captioned data.
expert_image_set = {record["image_id"] for record in expert_captioned_data}

# Keep annotation rows for those images where "unable_to_verify" is empty.
has_expert_caption = annotations_df["image_id"].isin(expert_image_set)
is_verifiable = annotations_df["unable_to_verify"] == ""
expert_annotations_df = annotations_df[has_expert_caption & is_verifiable]

print(len(expert_annotations_df))
expert_annotations_df.head()

268


Unnamed: 0,image_id,file_name,vizwiz_url,image_preview,human_captions,annotator,notes,unable_to_verify,double code notes,double verified,gpt4o_caption,gpt4o_code,llama_caption,llama_code,molmo_caption,molmo_code,text_detected,unrecognizable,framing,blur,obstruction,rotation,too dark,too bright,other,curved label,text panel,expert_caption
1,12066,VizWiz_train_00012066.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A Kroger grocery store tag for a caramel iced ...,Anne Marie,kroger iced carmel cake,,,x,A caramel iced cake with a label featuring the...,yes,The image shows a white label with the blue an...,no,The image shows a plastic container of apple c...,no,True,0,4,2,0,1,0,0,0,,,
6,12884,VizWiz_train_00012884.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,The backside of a green plastic bottle with wo...,Anne Marie,nourishing shampoo,,,x,"Green tube labeled ""Nourishing Shampoo"" with i...",yes,"The image shows a green tube with white text, ...",yes,The image shows a green tube lying on its side...,no,True,0,1,0,0,4,0,0,0,x,,
9,8670,VizWiz_train_00008670.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A package of Kellogg's brand granola cereal wi...,Anne Marie,kellogg's low fat granola with raisens,,,x,A box of Kellogg’s Low Fat Granola with Raisin...,yes,The image shows a box of Kellogg's Low Fat Gra...,yes,The image shows a box of Kellogg's low-fat gra...,yes,True,0,0,5,0,0,0,0,0,,,
13,13799,VizWiz_train_00013799.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A package for Macaroni and cheese on a kitchen...,Anne Marie,macaroni and cheese,,,x,A food package with a label that includes part...,no,The image appears to be a close-up of a person...,no,The image shows a box of Lipton iced tea. The ...,no,True,0,4,5,1,1,0,1,0,,,
17,16729,VizWiz_train_00016729.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,the back of a Unilever brand product listing i...,Anne Marie,(Unilever) vasaline lotion,,,x,A bottle with a barcode and text showing ingre...,no,The image shows a white plastic bottle with a ...,no,The image shows a bottle of Vaseline lotion be...,yes,True,0,4,0,0,0,0,0,0,,x,


In [28]:
# expert annotations
# For each metric and model, split the expert-captioned images by the
# "<short_name>_code" annotation and report average precision/recall for the
# two groups separately.
for metric in ["bertscore", "cap_f1"]:
    print(f"--- Performance for {metric} ---")
    for model, model_short_name in MODEL_NAMES.items():
        # Compute the mask once; every code other than "no" counts as correct.
        is_incorrect = expert_annotations_df[f"{model_short_name}_code"] == "no"

        correct_predictions = expert_annotations_df[~is_incorrect]
        ids_for_correct = list(correct_predictions["image_id"])

        incorrect_predictions = expert_annotations_df[is_incorrect]
        ids_for_incorrect = list(incorrect_predictions["image_id"])
        print(
            f"{model_short_name} -- correct count = {len(ids_for_correct)}; incorrect count = {len(ids_for_incorrect)}"
        )
        print(ids_for_incorrect)

        # filter expert captioned data
        # Membership is tested against sets so the filter is linear in the
        # number of records instead of records x ids.
        correct_id_set = set(ids_for_correct)
        incorrect_id_set = set(ids_for_incorrect)
        filted_expert_data_correct = [
            x for x in expert_captioned_data if x["image_id"] in correct_id_set
        ]
        filted_expert_data_incorrect = [
            x for x in expert_captioned_data if x["image_id"] in incorrect_id_set
        ]

        # compute metrics; the result is keyed by the full model name
        performance_correct = compute_average_metrics(
            filted_expert_data_correct, metric
        )[model]
        performance_incorrect = compute_average_metrics(
            filted_expert_data_incorrect, metric
        )[model]

        # print
        print(
            f"Performance Correct -- Avg Precision: {performance_correct['avg_precision']:.4f}, Avg Recall: {performance_correct['avg_recall']:.4f}"
        )
        print(
            f"Performance incorrect -- Avg Precision: {performance_incorrect['avg_precision']:.4f}, Avg Recall: {performance_incorrect['avg_recall']:.4f}\n"
        )

--- Performance for bertscore ---
gpt4o -- correct count = 185; incorrect count = 83
[13799, 16729, 6341, 1646, 763, 5711, 4629, 5368, 14683, 15302, 18234, 20318, 13687, 11856, 2030, 2002, 15338, 1471, 16393, 16060, 944, 1950, 10733, 7498, 19322, 23160, 10504, 19212, 11989, 12265, 7632, 5209, 18833, 981, 7902, 6048, 12113, 20178, 6719, 8055, 5656, 14146, 16379, 3462, 21997, 10689, 13355, 15949, 9159, 13702, 2390, 14578, 11510, 9584, 18225, 4356, 20540, 143, 4474, 7866, 16052, 19303, 14975, 16007, 9718, 13286, 16479, 15240, 17913, 2015, 9831, 2294, 860, 10709, 21209, 3380, 3552, 3737, 6693, 3493, 2235, 17557, 5029]
Performance Correct -- Avg Precision: 0.7629, Avg Recall: 0.6906
Performance incorrect -- Avg Precision: 0.7449, Avg Recall: 0.6653

llama -- correct count = 117; incorrect count = 151
[12066, 13799, 16729, 6341, 1646, 763, 5711, 4629, 22396, 5368, 14683, 14321, 15302, 18234, 20318, 14833, 13687, 11856, 2030, 4191, 327, 5824, 7367, 15338, 1471, 1386, 16393, 15008, 9431, 16830

In [29]:
# on the same dataset, look at crowdworker annotations
# Rows are restricted to images in expert_image_set whose "unable_to_verify"
# field is empty.
in_expert_set = annotations_df["image_id"].isin(expert_image_set)
verifiable = annotations_df["unable_to_verify"] == ""
crowdworker_annotations_df = annotations_df[in_expert_set & verifiable]

print(len(crowdworker_annotations_df))
crowdworker_annotations_df.head()

268


Unnamed: 0,image_id,file_name,vizwiz_url,image_preview,human_captions,annotator,notes,unable_to_verify,double code notes,double verified,gpt4o_caption,gpt4o_code,llama_caption,llama_code,molmo_caption,molmo_code,text_detected,unrecognizable,framing,blur,obstruction,rotation,too dark,too bright,other,curved label,text panel,expert_caption
1,12066,VizWiz_train_00012066.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A Kroger grocery store tag for a caramel iced ...,Anne Marie,kroger iced carmel cake,,,x,A caramel iced cake with a label featuring the...,yes,The image shows a white label with the blue an...,no,The image shows a plastic container of apple c...,no,True,0,4,2,0,1,0,0,0,,,
6,12884,VizWiz_train_00012884.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,The backside of a green plastic bottle with wo...,Anne Marie,nourishing shampoo,,,x,"Green tube labeled ""Nourishing Shampoo"" with i...",yes,"The image shows a green tube with white text, ...",yes,The image shows a green tube lying on its side...,no,True,0,1,0,0,4,0,0,0,x,,
9,8670,VizWiz_train_00008670.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A package of Kellogg's brand granola cereal wi...,Anne Marie,kellogg's low fat granola with raisens,,,x,A box of Kellogg’s Low Fat Granola with Raisin...,yes,The image shows a box of Kellogg's Low Fat Gra...,yes,The image shows a box of Kellogg's low-fat gra...,yes,True,0,0,5,0,0,0,0,0,,,
13,13799,VizWiz_train_00013799.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A package for Macaroni and cheese on a kitchen...,Anne Marie,macaroni and cheese,,,x,A food package with a label that includes part...,no,The image appears to be a close-up of a person...,no,The image shows a box of Lipton iced tea. The ...,no,True,0,4,5,1,1,0,1,0,,,
17,16729,VizWiz_train_00016729.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,the back of a Unilever brand product listing i...,Anne Marie,(Unilever) vasaline lotion,,,x,A bottle with a barcode and text showing ingre...,no,The image shows a white plastic bottle with a ...,no,The image shows a bottle of Vaseline lotion be...,yes,True,0,4,0,0,0,0,0,0,,x,


In [30]:
# For each metric and model, split the images by the "<short_name>_code"
# annotation and report average precision/recall of the captions in
# evaluated_captions_data for the two groups separately.
total_images = len(crowdworker_annotations_df)

for metric in ["bertscore", "cap_f1"]:
    print(f"--- Performance for {metric} ---")
    for model, model_short_name in MODEL_NAMES.items():
        # Compute the mask once; every code other than "no" counts as correct.
        is_incorrect = crowdworker_annotations_df[f"{model_short_name}_code"] == "no"

        correct_predictions = crowdworker_annotations_df[~is_incorrect]
        ids_for_correct = list(correct_predictions["image_id"])

        incorrect_predictions = crowdworker_annotations_df[is_incorrect]
        ids_for_incorrect = list(incorrect_predictions["image_id"])
        print(
            f"{model_short_name} -- correct count = {len(ids_for_correct)} ({100 * len(ids_for_correct) / total_images:.0f}%); incorrect count = {len(ids_for_incorrect)} ({100 * len(ids_for_incorrect) / total_images:.0f}%)"
        )

        # filter crowdworker captioned data
        # Membership is tested against sets so the filter is linear in the
        # number of records instead of records x ids.
        correct_id_set = set(ids_for_correct)
        incorrect_id_set = set(ids_for_incorrect)
        filted_crowdworker_data_correct = [
            x for x in evaluated_captions_data if x["image_id"] in correct_id_set
        ]
        filted_crowdworker_data_incorrect = [
            x for x in evaluated_captions_data if x["image_id"] in incorrect_id_set
        ]

        # compute metrics; the result is keyed by the full model name
        performance_correct = compute_average_metrics(
            filted_crowdworker_data_correct, metric
        )[model]
        performance_incorrect = compute_average_metrics(
            filted_crowdworker_data_incorrect, metric
        )[model]

        # print
        print(
            f"Performance Correct -- Avg Precision: {performance_correct['avg_precision']:.4f}, Avg Recall: {performance_correct['avg_recall']:.4f}"
        )
        print(
            f"Performance incorrect -- Avg Precision: {performance_incorrect['avg_precision']:.4f}, Avg Recall: {performance_incorrect['avg_recall']:.4f}\n"
        )

--- Performance for bertscore ---
gpt4o -- correct count = 185 (69%); incorrect count = 83 (31%)
Performance Correct -- Avg Precision: 0.6398, Avg Recall: 0.7958
Performance incorrect -- Avg Precision: 0.6377, Avg Recall: 0.7692

llama -- correct count = 117 (44%); incorrect count = 151 (56%)
Performance Correct -- Avg Precision: 0.5921, Avg Recall: 0.7603
Performance incorrect -- Avg Precision: 0.5946, Avg Recall: 0.7402

molmo -- correct count = 96 (36%); incorrect count = 172 (64%)
Performance Correct -- Avg Precision: 0.6638, Avg Recall: 0.8006
Performance incorrect -- Avg Precision: 0.6448, Avg Recall: 0.7676

--- Performance for cap_f1 ---
gpt4o -- correct count = 185 (69%); incorrect count = 83 (31%)
Performance Correct -- Avg Precision: 0.5186, Avg Recall: 0.4206
Performance incorrect -- Avg Precision: 0.5018, Avg Recall: 0.3407

llama -- correct count = 117 (44%); incorrect count = 151 (56%)
Performance Correct -- Avg Precision: 0.4062, Avg Recall: 0.4000
Performance incorrect

In [147]:
# Display MODEL_NAMES: keys are the full model names used to index the metric
# results, values are the short names used in the "<short_name>_code" columns.
MODEL_NAMES

{'gpt-4o-2024-08-06': 'gpt4o',
 'Llama-3.2-11B-Vision-Instruct': 'llama',
 'Molmo-7B-O-0924': 'molmo'}

In [162]:
# Summary statistics of per-image BERTScore precision and recall, one pair of
# DescribeResult lines per model (precision first, then recall).
for model in MODEL_NAMES:
    per_image_scores = [
        image["evaluation"]["bertscore"][model] for image in evaluated_captions_data
    ]
    bert_precisions = [scores["precision"] for scores in per_image_scores]
    bert_recalls = [scores["recall"] for scores in per_image_scores]

    for values in (bert_precisions, bert_recalls):
        print(stats.describe(values))
    print()

DescribeResult(nobs=5432, minmax=(np.float64(0.3801501393318176), np.float64(0.90423983335495)), mean=np.float64(0.6410437088092139), variance=np.float64(0.00351636410091269), skewness=np.float64(0.13506593282533155), kurtosis=np.float64(0.38863508604686503))
DescribeResult(nobs=5432, minmax=(np.float64(0.5880314707756042), np.float64(0.9440022110939026)), mean=np.float64(0.7789125641443065), variance=np.float64(0.002378888823964874), skewness=np.float64(-0.10485067445019737), kurtosis=np.float64(-0.09655775360738117))

DescribeResult(nobs=5432, minmax=(np.float64(0.18891480565071106), np.float64(0.8394327163696289)), mean=np.float64(0.6055959118296981), variance=np.float64(0.0028518755754962633), skewness=np.float64(-0.05755974845593788), kurtosis=np.float64(0.9191530939957211))
DescribeResult(nobs=5432, minmax=(np.float64(0.5791395902633667), np.float64(0.8977669477462769)), mean=np.float64(0.7485131648244318), variance=np.float64(0.002095130992444404), skewness=np.float64(-0.0047310

In [1]:
# Summary statistics of per-image CapF1 precision and recall, one pair of
# DescribeResult lines per model (precision first, then recall).
# NOTE(review): this cell needs MODEL_NAMES and evaluated_captions_data from
# earlier cells; the recorded NameError shows it was run before MODEL_NAMES
# was defined in the kernel.
for model in MODEL_NAMES:
    per_image_scores = [
        image["evaluation"]["cap_f1"]["scores"][model]
        for image in evaluated_captions_data
    ]
    cap_f1_precisions = [scores["precision"] for scores in per_image_scores]
    cap_f1_recalls = [scores["recall"] for scores in per_image_scores]

    for values in (cap_f1_precisions, cap_f1_recalls):
        print(stats.describe(values))
    print()

NameError: name 'MODEL_NAMES' is not defined