# Study 2 Analysis
Analyses for each research question in Study 2. These analyses are done with the newest setup of the project where the researchers annotate whether VLMs accurately identified the product in images.

# Load packages

In [1]:
import os
import copy
import json
import csv
from datetime import datetime

import pandas as pd

# Notebook-wide pandas display settings: no cap on the number of columns
# shown, and a 1000-character display width for printed frames.
pd.options.display.max_columns = None
pd.options.display.width = 1000

## Helper functions

# Load data
We load 3 pieces of data:
1. A `.json` file with all images that fit our study conditions, their model captions, and evaluation metrics.
2. A `.csv` that has annotations from the research team noting which model captions accurately identify products.
3. A `.csv` with expert captions.

In [12]:
# (1) load images with model captions and evaluation metrics
# Open the file in a context manager so the handle is closed as soon as the
# JSON is parsed (a bare `open(...)` passed to json.load is never closed
# explicitly). JSON is decoded as UTF-8 regardless of the platform default.
with open(
    "../../data/study-2-output/final-evaluated-captions/low-quality_evaluation_5432-images_2025-04-11_03-31_merged.json",
    encoding="utf-8",
) as evaluated_captions_file:
    evaluated_captions_data = json.load(evaluated_captions_file)

# Sanity check: number of records plus the structure of the first one.
print(f"Length of evaluated captions: {len(evaluated_captions_data)}")
print(json.dumps(evaluated_captions_data[0], indent=4))

Length of evaluated captions: 5432
{
    "image_id": 1,
    "file_name": "VizWiz_train_00000001.jpg",
    "vizwiz_url": "https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/VizWiz_train_00000001.jpg",
    "text_detected": true,
    "unrecognizable": 0,
    "framing": 0,
    "blur": 5,
    "obstruction": 0,
    "rotation": 0,
    "too dark": 0,
    "too bright": 0,
    "other": 0,
    "no issue": 0,
    "human_captions": [
        {
            "caption": "A can of Coca Cola on a counter is shown for when one can use a nice, cold drink.",
            "is_precanned": false,
            "is_rejected": false
        },
        {
            "caption": "A black can of Coca Cola Zero calorie soda is on the counter near the coffee maker.",
            "is_precanned": false,
            "is_rejected": false
        },
        {
            "caption": "A kitchen counter the various items on top including a can of Coca-Cola, metal containers, and a teapot.",
            "is_precanned": false

In [13]:
# (2) load annotations from research team
# Explicit dtypes for the annotation sheet. Each column is listed exactly once
# (a repeated dict key silently overrides the earlier entry).
# NOTE(review): several keys here (e.g. the INCLUDE/EXCLUDE columns) do not
# appear among the columns displayed below — presumably left over from an
# earlier sheet layout; confirm before pruning.
target_images_dtypes = {
    "image_id": int,
    "file_name": str,
    "vizwiz_url": str,
    "image_preview": str,
    "INCLUDE because product": str,
    "EXCLUDE because not verifable": str,
    "EXCLUDE because Book/DVD/CD/magazine?": str,
    "text_detected": bool,
    "unrecognizable": int,
    "framing": int,
    "blur": int,
    "obstruction": int,
    "rotation": int,
    "too dark": int,
    "too bright": int,
    "other": int,
    "no issue": int,
    "human_caption_0": str,
    "human_caption_1": str,
    "human_caption_2": str,
    "human_caption_3": str,
    "human_caption_4": str,
    "gpt-4o-2024-08-06_caption": str,
    "Llama-3.2-11B-Vision-Instruct_caption": str,
    "Molmo-7B-O-0924_caption": str,
    "general_notes": str,
    "gpt-4o-2024-08-06_notes": str,
    "Llama-3.2-11B-Vision-Instruct_notes": str,
    "Molmo-7B-O-0924_notes": str,
    "unable_to_verify": str,
    "gpt4o_code": str,
    "llama_code": str,
    "molmo_code": str,
    "notes": str,
    "double code notes": str,
    "double verified": str,
}

# keep_default_na=False keeps blank cells as empty strings instead of NaN;
# the filters in this notebook compare against "" to detect blank cells.
annotations_df = pd.read_csv(
    "./annotated-data/final-annotated-images_04-13-25-00:00.csv",
    dtype=target_images_dtypes,
    keep_default_na=False,
)

print(f"Number of annotated images: {len(annotations_df)}")
# An image counts as verified when its `unable_to_verify` cell was left blank.
print(
    f"Number of images that were verified: {len(annotations_df[annotations_df['unable_to_verify'] == ''])}"
)
annotations_df.head()

Number of annotated images: 1519
Number of images that were verified: 1070


Unnamed: 0,image_id,file_name,vizwiz_url,image_preview,human_captions,annotator,notes,unable_to_verify,double code notes,double verified,gpt4o_caption,gpt4o_code,llama_caption,llama_code,molmo_caption,molmo_code,text_detected,unrecognizable,framing,blur,obstruction,rotation,too dark,too bright,other
0,9886,VizWiz_train_00009886.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A box of frozen food sits on a table top that ...,Anne Marie,trade joes paneer tikka masala,,,x,A package of Trader Joe's Paneer Tikka Masala ...,yes,The image shows a frozen food package with a g...,yes,The image shows a food package for a Tikka Mas...,no,True,0,1,2,0,4,0,1,0
1,12066,VizWiz_train_00012066.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A Kroger grocery store tag for a caramel iced ...,Anne Marie,kroger iced carmel cake,,,x,A caramel iced cake with a label featuring the...,yes,The image shows a white label with the blue an...,no,The image shows a plastic container of apple c...,no,True,0,4,2,0,1,0,0,0
2,1908,VizWiz_train_00001908.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A description on a box of herbal tea.\nA print...,Anne Marie,caffeine free herb tea,,,x,"A label with the text ""A delicious CAFFEINE FR...",yes,"The image shows a white box with the words ""A ...",yes,The image shows a sideways view of a product l...,yes,True,0,2,4,0,3,0,0,0
3,15399,VizWiz_train_00015399.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A green and white box of Lean Pockets frozen r...,Anne Marie,lean pocket garlic chicken white pizza,,,x,Box of Lean Pockets with garlic chicken white ...,yes,The image shows a green and white box of Lean ...,yes,The image shows a box of Lean Pockets on a whi...,no,True,0,4,0,0,3,0,0,0
4,4380,VizWiz_train_00004380.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,garlic spinach hummus possibly in a green cont...,Anne Marie,,yes,garlic spinach hummus,x,"A plastic container with a green lid, featurin...",,The image shows a plastic container with a gre...,,The image shows a plastic food container on a ...,,True,0,5,0,0,3,0,0,0


In [14]:
# (3) load expert captions
# TODO: replace this with the dataset that has metrics run
# Every column of the expert sheet is read as a string; blank cells stay ""
# because keep_default_na=False.
expert_captioned_data = pd.read_csv(
    "./labeled-data/Dataset for Shawn _ 04-06-25 - dataset formatted.csv",
    dtype=dict.fromkeys(
        [
            "File Name",
            "Image URL",
            "Image Preview",
            "Captioner Name",
            "Describe all parts of the image that may be important to a person who is blind.",
            "If you are unable to caption the image, describe the issue in this column.",
        ],
        str,
    ),
    keep_default_na=False,
)

# rename columns so they're easier to program with
expert_captioned_data = expert_captioned_data.rename(
    columns={
        "File Name": "file_name",
        "Image URL": "vizwiz_url",
        "Captioner Name": "expert_captioner",
        "Describe all parts of the image that may be important to a person who is blind.": "expert_caption",
        "If you are unable to caption the image, describe the issue in this column.": "captioning_issue",
    }
)
expert_captioned_data.head()

Unnamed: 0,file_name,vizwiz_url,Image Preview,expert_captioner,expert_caption,captioning_issue
0,VizWiz_train_00019710.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,sm,Two bulldogs are lying side by side on a white...,
1,VizWiz_train_00008349.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,sm,A box of “Café Escapes” dark chocolate hot coc...,
2,VizWiz_train_00003852.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,sm,A clear plastic container of Kirkland Signatur...,
3,VizWiz_train_00014314.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,sm,A close-up photo of a black computer keyboard ...,
4,VizWiz_train_00016316.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,sm,A can of Progresso soup is being held up close...,


# Analysis 1: how accurately do VLMs identify products?

# Analysis 2: how does image quality affect a VLM's ability to accurately identify products?

## Combine annotation data with image quality data

In [5]:
# create a dataframe with only the data we need for the regression
target_keys = [
    "image_id",
    "file_name",
    "vizwiz_url",
    "text_detected",
    "unrecognizable",
    "framing",
    "blur",
    "obstruction",
    "rotation",
    "too dark",
    "too bright",
    "other",
    "no issue",
]
evaluation_data_regression = [
    {x: y for x, y in image.items() if x in target_keys}
    for image in evaluated_captions_data
]
filtered_evaluation_data_df = pd.DataFrame.from_dict(evaluation_data_regression)

# combine with annotation data
columns_to_include = [
    "image_id",
    "file_name",
    "vizwiz_url",
    "human_captions",
    "unable_to_verify",
    "double code notes",
    "double verified",
    "gpt4o_caption",
    "gpt4o_code",
    "llama_caption",
    "llama_code",
    "molmo_caption",
    "molmo_code",
]
# how="right" keeps one row per annotated image, with the image quality
# columns attached from the evaluation data.
regression_df = pd.merge(
    filtered_evaluation_data_df,
    annotations_df[columns_to_include],
    on=["image_id", "file_name", "vizwiz_url"],
    how="right",
)

# make sure each model has a yes or no: keep only verified images where no
# model's code is blank or "unsure". `.copy()` gives an independent frame so
# the column assignments below do not write into a slice of the merged frame.
code_columns = ["gpt4o_code", "llama_code", "molmo_code"]
has_usable_codes = ~regression_df[code_columns].isin(["unsure", ""]).any(axis=1)
regression_df = regression_df[
    (regression_df["unable_to_verify"] == "") & has_usable_codes
].copy()

# combine yes and yes++ and encode as 1/0.
# Only the three code columns are touched: a frame-wide replace would also
# rewrite any caption cell that happens to be exactly "yes" or "no", and it
# triggers pandas' downcasting FutureWarning.
code_to_binary = {"yes": 1, "yes++": 1, "no": 0}
for code_column in code_columns:
    encoded_codes = regression_df[code_column].map(code_to_binary)
    unmapped = encoded_codes.isna()
    if unmapped.any():
        # Fail loudly on labels outside the codebook instead of silently
        # carrying NaN into the regression.
        unexpected_labels = sorted(
            regression_df.loc[unmapped, code_column].astype(str).unique()
        )
        raise ValueError(
            f"Unexpected values in {code_column}: {unexpected_labels}"
        )
    regression_df[code_column] = encoded_codes.astype("int64")

# cleanup
regression_df = regression_df.drop(
    columns=["unable_to_verify", "double code notes", "double verified"]
)

print(f"Number of images in regression df: {len(regression_df)}")
regression_df.head()

Number of images in regression df: 1056


  regression_df.replace({"yes": 1, "no": 0}, inplace=True)


Unnamed: 0,image_id,file_name,vizwiz_url,text_detected,unrecognizable,framing,blur,obstruction,rotation,too dark,too bright,other,no issue,human_captions,gpt4o_caption,gpt4o_code,llama_caption,llama_code,molmo_caption,molmo_code
0,9886,VizWiz_train_00009886.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,1,2,0,4,0,1,0,1,A box of frozen food sits on a table top that ...,A package of Trader Joe's Paneer Tikka Masala ...,1,The image shows a frozen food package with a g...,1,The image shows a food package for a Tikka Mas...,0
1,12066,VizWiz_train_00012066.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,4,2,0,1,0,0,0,0,A Kroger grocery store tag for a caramel iced ...,A caramel iced cake with a label featuring the...,1,The image shows a white label with the blue an...,0,The image shows a plastic container of apple c...,0
2,1908,VizWiz_train_00001908.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,2,4,0,3,0,0,0,0,A description on a box of herbal tea.\nA print...,"A label with the text ""A delicious CAFFEINE FR...",1,"The image shows a white box with the words ""A ...",1,The image shows a sideways view of a product l...,1
3,15399,VizWiz_train_00015399.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,4,0,0,3,0,0,0,1,A green and white box of Lean Pockets frozen r...,Box of Lean Pockets with garlic chicken white ...,1,The image shows a green and white box of Lean ...,1,The image shows a box of Lean Pockets on a whi...,0
5,8554,VizWiz_train_00008554.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,5,0,0,0,0,2,0,0,Part of a package containing combat gauze for ...,"A green package labeled ""QuikClot Combat Gauze...",1,"The object is a green, rectangular, plastic po...",0,The image shows a green plastic packet on a wo...,0


In [6]:
# save regression df
# Create the output folder first (no error if it is already there), then
# write the frame without its row index.
intermediate_data_dir = "./intermediate-data"
os.makedirs(intermediate_data_dir, exist_ok=True)

regression_csv_path = "./intermediate-data/regression-df_04-13-25-00:00.csv"
regression_df.to_csv(regression_csv_path, index=False)

## Create contingency tables for each image quality issue pair

In [7]:
# Joint distribution of framing (rows) vs. blur (columns) values.
# normalize=True divides every cell by the grand total, so the denominator is
# every image in regression_df, including images with 0 for both issues.
print(
    "Percentage of images with framing and blur issues, by vote from crowdworker (divided by total number of images in the regression dataframe)"
)
# Scale to percent first and round last so cells show exactly two decimals.
(
    pd.crosstab(
        regression_df["framing"],
        regression_df["blur"],
        normalize=True,
    )
    * 100
).round(2)

Percentage of images with framing and blur issues, by vote from crowdworker (divided by total number of images with framing or blur issues)


blur,0,1,2,3,4,5
framing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.5,1.23,0.76,0.66,5.68,3.98
1,1.52,0.85,1.04,0.57,4.92,2.75
2,1.52,1.23,0.28,0.38,5.11,3.31
3,2.37,0.66,0.57,0.19,5.21,2.65
4,14.87,8.14,4.07,2.75,3.03,2.08
5,5.97,3.31,1.33,1.42,0.95,1.14


In [8]:
# Joint distribution of framing (rows) vs. obstruction (columns) values.
# normalize=True divides every cell by the grand total, so the denominator is
# every image in regression_df, including images with 0 for both issues.
print(
    "Percentage of images with framing and obstruction issues, by vote from crowdworker (divided by total number of images in the regression dataframe)"
)
# Scale to percent first and round last so cells show exactly two decimals.
(
    pd.crosstab(
        regression_df["framing"],
        regression_df["obstruction"],
        normalize=True,
    )
    * 100
).round(2)

Percentage of images with framing and blur issues, by vote from crowdworker (divided by total number of images with framing or blur issues)


obstruction,0,1,2,3,4,5
framing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,14.96,0.38,0.09,0.0,0.28,0.09
1,10.51,0.66,0.0,0.0,0.28,0.19
2,10.51,1.04,0.0,0.09,0.19,0.0
3,9.94,1.42,0.0,0.0,0.28,0.0
4,31.06,3.6,0.28,0.0,0.0,0.0
5,12.5,1.14,0.28,0.19,0.0,0.0


In [9]:
# Joint distribution of framing (rows) vs. rotation (columns) values.
# normalize=True divides every cell by the grand total, so the denominator is
# every image in regression_df, including images with 0 for both issues.
print(
    "Percentage of images with framing and rotation issues, by vote from crowdworker (divided by total number of images in the regression dataframe)"
)
# Scale to percent first and round last so cells show exactly two decimals.
(
    pd.crosstab(
        regression_df["framing"],
        regression_df["rotation"],
        normalize=True,
    )
    * 100
).round(2)

Percentage of images with framing and blur issues, by vote from crowdworker (divided by total number of images with framing or blur issues)


rotation,0,1,2,3,4,5
framing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,7.29,1.14,0.76,0.47,4.73,1.42
1,5.78,1.33,0.57,0.57,2.84,0.57
2,4.92,1.61,0.66,0.57,3.12,0.95
3,5.49,0.76,0.38,1.04,3.31,0.66
4,20.93,3.98,3.69,2.65,2.37,1.33
5,9.09,1.8,1.04,0.76,1.04,0.38


In [10]:
# Joint distribution of blur (rows) vs. obstruction (columns) values.
# normalize=True divides every cell by the grand total, so the denominator is
# every image in regression_df, including images with 0 for both issues.
print(
    "Percentage of images with blur and obstruction issues, by vote from crowdworker (divided by total number of images in the regression dataframe)"
)
# Scale to percent first and round last so cells show exactly two decimals.
(
    pd.crosstab(
        regression_df["blur"],
        regression_df["obstruction"],
        normalize=True,
    )
    * 100
).round(2)

Percentage of images with framing and blur issues, by vote from crowdworker (divided by total number of images with framing or blur issues)


obstruction,0,1,2,3,4,5
blur,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,27.94,1.33,0.09,0.0,0.28,0.09
1,13.07,1.8,0.19,0.09,0.19,0.09
2,7.1,0.76,0.09,0.0,0.09,0.0
3,5.02,0.38,0.19,0.09,0.28,0.0
4,21.97,2.65,0.09,0.0,0.19,0.0
5,14.39,1.33,0.0,0.09,0.0,0.09


In [11]:
# Joint distribution of blur (rows) vs. rotation (columns) values.
# normalize=True divides every cell by the grand total, so the denominator is
# every image in regression_df, including images with 0 for both issues.
print(
    "Percentage of images with blur and rotation issues, by vote from crowdworker (divided by total number of images in the regression dataframe)"
)
# Scale to percent first and round last so cells show exactly two decimals.
(
    pd.crosstab(
        regression_df["blur"],
        regression_df["rotation"],
        normalize=True,
    )
    * 100
).round(2)

Percentage of images with framing and blur issues, by vote from crowdworker (divided by total number of images with framing or blur issues)


rotation,0,1,2,3,4,5
blur,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,13.16,2.37,2.27,1.14,7.86,2.94
1,8.05,0.95,0.66,1.23,3.88,0.66
2,3.79,0.66,0.47,0.28,2.18,0.66
3,3.03,0.66,0.47,0.09,1.33,0.38
4,16.86,3.5,1.42,1.52,0.95,0.66
5,8.62,2.46,1.8,1.8,1.23,0.0


# Analysis 3: expert captions and sensitivity of scores

In [21]:
# get only the data we need for the expert file
# (image identifiers plus the image quality fields)
target_keys = [
    "image_id",
    "file_name",
    "vizwiz_url",
    "text_detected",
    "unrecognizable",
    "framing",
    "blur",
    "obstruction",
    "rotation",
    "too dark",
    "too bright",
    "other",
    "no issue",
]
evaluation_data_expert = [
    {key: image[key] for key in image if key in target_keys}
    for image in evaluated_captions_data
]

# create an expert file to evaluate: attach the quality fields to each expert
# caption row (how="right" keeps one row per expert caption), then drop the
# preview column, which is not needed downstream.
expert_data_to_eval_df = (
    pd.DataFrame.from_dict(evaluation_data_expert)
    .merge(
        expert_captioned_data,
        on=["file_name", "vizwiz_url"],
        how="right",
    )
    .drop(columns=["Image Preview"])
)
expert_data_to_eval_df.head()

Unnamed: 0,image_id,file_name,vizwiz_url,text_detected,unrecognizable,framing,blur,obstruction,rotation,too dark,too bright,other,no issue,expert_captioner,expert_caption,captioning_issue
0,19710,VizWiz_train_00019710.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,4,0,0,0,0,0,0,1,sm,Two bulldogs are lying side by side on a white...,
1,8349,VizWiz_train_00008349.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,1,3,4,0,0,0,0,0,1,sm,A box of “Café Escapes” dark chocolate hot coc...,
2,3852,VizWiz_train_00003852.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,2,4,0,2,1,0,0,0,sm,A clear plastic container of Kirkland Signatur...,
3,14314,VizWiz_train_00014314.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,4,4,1,0,0,0,0,0,sm,A close-up photo of a black computer keyboard ...,
4,16316,VizWiz_train_00016316.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,1,4,3,0,0,1,1,0,0,sm,A can of Progresso soup is being held up close...,


In [25]:
# convert to a dict
# One record per image. The expert caption and any captioning issue are moved
# from top-level fields into a single-entry `human_captions` list.
expert_data_to_eval_dict = expert_data_to_eval_df.to_dict(orient="records")
for image in expert_data_to_eval_dict:
    # pop() reads and removes the flat fields in one step.
    image["human_captions"] = [
        {
            "caption": image.pop("expert_caption"),
            "captioning_issue": image.pop("captioning_issue"),
        }
    ]

# save the dict
# Create the output folder here as well, so this cell does not depend on the
# regression-df save cell having been run first.
os.makedirs("./intermediate-data", exist_ok=True)
# The file name records the image count and the export time (minute precision).
with open(
    f"./intermediate-data/expert_data_{len(expert_data_to_eval_dict)}-images_{datetime.now().strftime('%Y-%m-%d_%H-%M')}.json",
    "w",
) as f:
    json.dump(expert_data_to_eval_dict, f)