# Abstention Testing
Because VLMs can vary in their responses, this notebook checks how consistently correct or incorrect VLMs are in identifying products

In [1]:
import sys
sys.path.append('../')

import os
import json

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import torch
import spacy  # make sure to run python -m spacy download en_core_web_sm

from openai import OpenAI
from transformers import MllamaForConditionalGeneration, AutoProcessor, GenerationConfig
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image
from selfcheckgpt.modeling_selfcheck import SelfCheckNLI, SelfCheckBERTScore
from tqdm.notebook import tqdm

In [2]:
from scripts.constants import get_prompt

In [3]:
# setup pytorch
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # for multi-GPU systems, force single GPU
if torch.cuda.is_available():
    device_map = "cuda:0"  # force single, first GPU
    device_type = "cuda"
    torch_dtype = "auto"
elif torch.backends.mps.is_available():
    device_map = "auto"
    device_type = "mps"
    torch_dtype = torch.bfloat16
else:
    device_map = "auto"
    device_type = "cpu"
    torch_dtype = torch.bfloat16
print(f"Using device: {device_type}")

Using device: cuda


In [4]:
# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")

## Constants

In [5]:
MODEL_NAMES = {
    "gpt-4o-2024-08-06": "gpt4o",
    "Llama-3.2-11B-Vision-Instruct": "llama",
    "Molmo-7B-O-0924": "molmo",
}

# initialize openai client
openai_client = OpenAI()
openai_client.api_key = os.getenv("OPENAI_API_KEY")

## Helper functions to generate samples from model

In [6]:
def gpt4o_sampler_wrapper(row):
    image_url = row['vizwiz_url']
    return generate_gpt4o_captions(image_url, openai_client, get_prompt(), count=5)
    
def generate_gpt4o_captions(image_url, openai_client, prompt, model_name="gpt-4o-2024-08-06", temperature=1.0, count=1):
    """
    Generates a caption for an image.

    Inputs:
    - image_url (str): url of image to caption.
    - prompt (str): prompt to use for captioning.
    - temperature (float; optional): temperature setting for model, greater than 0. Defaults to 1.0; lower values are more deterministic.
    - count (int; optional): how many samples should be generated

    Output:
    - (list of str): captions for image.
    """
    output = []
    for i in range(count):
        response = openai_client.responses.create(
            model=model_name,
            input=[
                {
                    "role": "system",
                    "content": [
                        {
                            "type": "input_text",
                            "text": prompt,
                        }
                    ],
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_image",
                            "image_url": image_url,
                        }
                    ],
                },
            ],
            text={"format": {"type": "text"}},
            reasoning={},
            tools=[],
            temperature=temperature,
            max_output_tokens=300,
            top_p=1,
            store=False,
        )

        if response.output_text is not None:
            output.append(response.output_text)
        else:
            output.append("")

    return output

In [7]:
def llama_sampler_wrapper(row):
    image_file = os.path.join("../data/caption-dataset/train", row["file_name"])
    image_object = Image.open(image_file)
    return generate_llama_captions(image_object, model, processor, get_prompt(), count=5)

def generate_llama_captions(
    image_object, model, processor, prompt, temperature=1.0, do_sample=False, count=1 
):
    """
    Generates a caption for an image.

    Inputs:
    - image_object (pil Image): image to caption.
    - model (torch model): loaded model to use for captioning.
    - processor (torch processor): loaded processor for pre-processing inputs.
    - temperature (float; optional): temperature setting for model, greater than 0. Defaults to 1.0; lower values are more deterministic.
    - do_sample (boolean; optional): whether model should sample probabilities. Defaults to False -- greedy decoding.

    Output:
    - (str): caption for image.
    """
    output = []
    for i in range(count):
        # process the image and text
        messages = [
            {"role": "system", "content": prompt},
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                ],
            },
        ]
    
        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(
            image_object, input_text, add_special_tokens=False, return_tensors="pt"
        ).to(model.device)
    
        # generate output; maximum 300 new tokens; stop generation when <|endoftext|> is generated
        model_output = model.generate(
            **inputs,
            generation_config=GenerationConfig(
                max_new_tokens=300,
                stop_strings="<|endoftext|>",
                use_cache=True,
                temperature=temperature,
                do_sample=do_sample,
            ),
            tokenizer=processor.tokenizer,
        )
    
        # only get generated tokens; decode them to text
        generated_tokens = model_output[0, inputs["input_ids"].size(1) :]
        generated_text = processor.tokenizer.decode(
            generated_tokens, skip_special_tokens=True
        )

        output.append(generated_text.strip())
    return output

In [8]:
def molmo_sampler_wrapper(row):
    image_file = os.path.join("../data/caption-dataset/train", row["file_name"])
    image_object = Image.open(image_file)
    return generate_molmo_captions(image_object, model, processor, get_prompt(), count=5)

def generate_molmo_captions(
    image_object, model, processor, prompt, temperature=1.0, do_sample=False, count=1
):
    """
    Generates a caption for an image.

    Inputs:
    - image_object (pil Image): image to caption.
    - model (torch model): loaded model to use for captioning.
    - processor (torch processor): loaded processor for pre-processing inputs.
    - temperature (float; optional): temperature setting for model, greater than 0. Defaults to 1.0; lower values are more deterministic.
    - do_sample (boolean; optional): whether model should sample probabilities. Defaults to False -- greedy decoding.

    Output:
    - (str): caption for image.
    """
    output = []
    for i in range(count):
        # process the image and text
        inputs = processor.process(
            images=[image_object],
            text=prompt,
        )
    
        # move inputs to the correct device and make a batch of size 1
        inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
    
        # generate output; maximum 300 new tokens; stop generation when <|endoftext|> is generated
        def generate_output():
            return model.generate_from_batch(
                inputs,
                GenerationConfig(max_new_tokens=300, stop_strings="<|endoftext|>"),
                tokenizer=processor.tokenizer,
                use_cache=True,
                temperature=temperature,
                do_sample=do_sample,
            )
    
        model_output = None
        if device_type == "cuda":
            model_output = generate_output()
        else:
            with torch.autocast(
                device_type=device_type, enabled=True, dtype=torch.bfloat16
            ):
                model_output = generate_output()
    
        # only get generated tokens; decode them to text
        generated_tokens = model_output[0, inputs["input_ids"].size(1) :]
        generated_text = processor.tokenizer.decode(
            generated_tokens, skip_special_tokens=True
        )
    
        output.append(generated_text.strip())
    return output

## Load data

In [9]:
evaluated_captions_data = json.load(
    open(
        "../data/study-2-output/final-evaluated-captions/low-quality_evaluation_5432-images_2025-04-11_03-31_merged.json"
    )
)

print(f"Length of evaluated captions: {len(evaluated_captions_data)}")
# print(json.dumps(evaluated_captions_data[0], indent=4))

Length of evaluated captions: 5432


In [10]:
target_images_dtypes = {
    "image_id": int,
    "file_name": str,
    "vizwiz_url": str,
    "image_preview": str,
    "INCLUDE because product": str,
    "EXCLUDE because not verifable": str,
    "EXCLUDE because Book/DVD/CD/magazine?": str,
    "text_detected": str,
    "unrecognizable": bool,
    "framing": bool,
    "blur": bool,
    "obbooluction": bool,
    "rotation": bool,
    "too dark": bool,
    "too bright": bool,
    "other": bool,
    "no issue": bool,
    "unrecognizable_orig": int,
    "framing_orig": int,
    "blur_orig": int,
    "obstruction_orig": int,
    "rotation_orig": int,
    "too dark_orig": int,
    "too bright_orig": int,
    "other": int,
    "no issue": int,
    "human_caption_0": str,
    "human_caption_1": str,
    "human_caption_2": str,
    "human_caption_3": str,
    "human_caption_4": str,
    "gpt-4o-2024-08-06_caption": str,
    "Llama-3.2-11B-Vision-Instruct_caption": str,
    "Molmo-7B-O-0924_caption": str,
    "general_notes": str,
    "gpt-4o-2024-08-06_notes": str,
    "Llama-3.2-11B-Vision-Instruct_notes": str,
    "Molmo-7B-O-0924_notes": str,
    "image_preview": str,
    "unable_to_verify": str,
    "gpt4o_code": str,
    "llama_code": str,
    "molmo_code": str,
    "notes": str,
    "double code notes": str,
    "double verified": str,
    "curved label": str,
    "text panel": str,
    "AMP_rotation": str,
    "XT_rotation": str,
}

annotations_df = pd.read_csv(
    "../study-analysis/study-2-product-analysis/annotated-data/final-annotated-images_1696-images_2025-04-15_15-00.csv",
    dtype=target_images_dtypes,
    keep_default_na=False,
)

print(f"Number of annotated images: {len(annotations_df)}")
print(
    f"Number of images that were verified: {len(annotations_df[annotations_df['unable_to_verify'] == ''])}"
)
print(annotations_df.columns)
annotations_df.tail()

Number of annotated images: 1696
Number of images that were verified: 1220
Index(['image_id', 'file_name', 'vizwiz_url', 'image_preview',
       'human_captions', 'annotator', 'notes', 'unable_to_verify',
       'double code notes', 'double verified', 'gpt4o_caption', 'gpt4o_code',
       'llama_caption', 'llama_code', 'molmo_caption', 'molmo_code',
       'text_detected', 'unrecognizable_orig', 'framing_orig', 'blur_orig',
       'obstruction_orig', 'rotation_orig', 'too_dark_orig', 'too_bright_orig',
       'other_orig', 'curved label', 'text panel', 'AMP_rotation',
       'XT_rotation', 'expert_caption', 'unrecognizable', 'framing', 'blur',
       'obstruction', 'rotation', 'too dark', 'too bright', 'other'],
      dtype='object')


Unnamed: 0,image_id,file_name,vizwiz_url,image_preview,human_captions,annotator,notes,unable_to_verify,double code notes,double verified,gpt4o_caption,gpt4o_code,llama_caption,llama_code,molmo_caption,molmo_code,text_detected,unrecognizable_orig,framing_orig,blur_orig,obstruction_orig,rotation_orig,too_dark_orig,too_bright_orig,other_orig,curved label,text panel,AMP_rotation,XT_rotation,expert_caption,unrecognizable,framing,blur,obstruction,rotation,too dark,too bright,other
1691,17557,VizWiz_train_00017557.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,Back of a box of hot tamales showing the flavo...,,hot tamales candy,,,x,A box with a red and yellow design features th...,no,The image shows the back of a box of Hot Tamal...,yes,The image shows a box of Hot Tamales candy. Th...,yes,True,0,2,4,0,3,0,0,0,,,x,x,Back of a box of hot tamales showing the flavo...,False,True,True,False,True,False,False,0
1692,11995,VizWiz_train_00011995.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,Green and white fabric clothing with a trophy ...,,,yes,,x,"A graphic of a trophy in black and white, with...",,"The image depicts a trophy with a round base, ...",,The image depicts a trophy with a round top an...,,True,0,4,2,0,0,0,0,0,,,,,Green and white fabric clothing with a trophy ...,False,True,True,False,False,False,False,0
1693,5029,VizWiz_train_00005029.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A Marie Callender's frozen entree rests on a w...,,marie callenders country fried pork chop,,,x,A packaged meal with an image of breaded chick...,no,"The image appears to be a frozen food package,...",no,The image shows a box of mashed potatoes. The ...,no,True,0,1,5,0,4,0,0,0,,,x,x,A Marie Callender's frozen entree rests on a w...,False,False,True,False,True,False,False,0
1694,9975,VizWiz_train_00009975.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A can or container of a food product called Ru...,,wray and nephew rum cream,,,x,"A bottle with the text ""WRAY & NEPHEW"" on a gr...",yes,The image shows a cylindrical container with a...,no,The image shows a bottle of rum cream. The bot...,yes,True,1,4,2,0,5,0,0,0,x,,x,x,A can or container of a food product called Ru...,False,True,True,False,True,False,False,0
1695,14006,VizWiz_train_00014006.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A package of Jimmy Dean fully cooked maple por...,,jimmy dean maple pork sausage patties,,,x,"A box labeled ""Jimmy Dean Fully Cooked Maple P...",yes,The image shows a red box of Jimmy Dean fully ...,yes,The image shows a box of frozen chicken tender...,no,True,0,1,0,0,4,0,1,0,,,x,x,A package of Jimmy Dean fully cooked maple por...,False,False,False,False,True,False,False,0


## Generate samples

In [11]:
all_correct_df = annotations_df[(annotations_df['unable_to_verify'] == "") & 
    (annotations_df['gpt4o_code'] == "no") & 
    (annotations_df['molmo_code'] == "no") & 
    (annotations_df['llama_code'] == "no")
    ]
all_incorrect_df = annotations_df[(annotations_df['unable_to_verify'] == "") & 
    (annotations_df['gpt4o_code'] == "yes") & 
    (annotations_df['molmo_code'] == "yes") & 
    (annotations_df['llama_code'] == "yes")
    ]

sample_size = 100
sampled_data = pd.concat([all_correct_df.sample(n=sample_size, random_state=42), 
                          all_incorrect_df.sample(n=sample_size, random_state=42)])
print(len(sampled_data))

200


In [12]:
# compute gpt4o samples
sampled_data[['gpt4o_sample_1', 'gpt4o_sample_2', 'gpt4o_sample_3', 'gpt4o_sample_4', 'gpt4o_sample_5']] = sampled_data.apply(gpt4o_sampler_wrapper, axis=1, result_type='expand')

In [13]:
# compute llama samples
model_name = "Llama-3.2-11B-Vision-Instruct"
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
)
processor = AutoProcessor.from_pretrained(model_id)

sampled_data[['llama_sample_1', 'llama_sample_2', 'llama_sample_3', 'llama_sample_4', 'llama_sample_5']] = sampled_data.apply(llama_sampler_wrapper, axis=1, result_type='expand')

# free memory after we're done
del model_name
del model_id
del model
del processor
torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [14]:
# compute molmo samples
model_name = "Molmo-7B-O-0924"
model_id = "allenai/Molmo-7B-O-0924"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch_dtype,
    device_map=device_map,
)
processor = AutoProcessor.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch_dtype,
    device_map=device_map,
)

sampled_data[['molmo_sample_1', 'molmo_sample_2', 'molmo_sample_3', 'molmo_sample_4', 'molmo_sample_5']] = sampled_data.apply(molmo_sampler_wrapper, axis=1, result_type='expand')

# free memory after we're done
del model_name
del model_id
del model
del processor
torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [15]:
# save data
sampled_data.to_csv(f'./sampled_data_{len(sampled_data)}-examples_{sample_size}-samples.csv', index=False)
sampled_data.head()

Unnamed: 0,image_id,file_name,vizwiz_url,image_preview,human_captions,annotator,notes,unable_to_verify,double code notes,double verified,gpt4o_caption,gpt4o_code,llama_caption,llama_code,molmo_caption,molmo_code,text_detected,unrecognizable_orig,framing_orig,blur_orig,obstruction_orig,rotation_orig,too_dark_orig,too_bright_orig,other_orig,curved label,text panel,AMP_rotation,XT_rotation,expert_caption,unrecognizable,framing,blur,obstruction,rotation,too dark,too bright,other,gpt4o_sample_1,gpt4o_sample_2,gpt4o_sample_3,gpt4o_sample_4,gpt4o_sample_5,llama_sample_1,llama_sample_2,llama_sample_3,llama_sample_4,llama_sample_5,molmo_sample_1,molmo_sample_2,molmo_sample_3,molmo_sample_4,molmo_sample_5
972,12307,VizWiz_train_00012307.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A very close up shot of a package that says Ve...,Anne Marie,venus razor,,,x,"Text ""Venus Breeze"" is visible against a light...",no,The image appears to be a close-up of a produc...,no,The image shows a close-up of a product packag...,no,True,0,4,2,0,2,0,0,0,,,x,x,,False,True,True,False,True,False,False,0,"The image shows part of a package with text ""B...","Text ""Venus Breeze"" is visible on a teal and p...","The image contains the text ""Venus Breeze"" on ...","Text showing ""Venus Breeze"" on a blue and purp...","The image displays a package with the words ""V...",The image appears to be a close-up of a produc...,The image appears to be a close-up of a produc...,The image appears to be a close-up of a produc...,The image appears to be a close-up of a produc...,The image appears to be a close-up of a produc...,The image shows a close-up of a product packag...,The image shows a close-up of a product packag...,The image shows a close-up of a product packag...,The image shows a close-up of a product packag...,The image shows a close-up of a product packag...
1571,3462,VizWiz_train_00003462.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A colorful bag of lollipops for small children...,,strawberry lollipops,,,x,A package with a polka dot pattern in various ...,no,The image shows a white bag with multicolored ...,no,The image shows a package of wipes. It has a w...,no,True,0,1,0,0,4,1,0,0,,x,x,x,A colorful bag of lollipops for small children,False,False,False,False,True,False,False,0,A package with a colorful polka dot design and...,A package with a colorful polka dot pattern fe...,A package with a pattern of colorful dots on a...,A package with a colorful polka dot design in ...,Colorful polka dot packaging with a nutrition ...,The image shows a white bag with multicolored ...,The image shows a white bag with multicolored ...,The image shows a white bag with multicolored ...,The image shows a white bag with multicolored ...,The image shows a white bag with multicolored ...,The image shows a package of wipes. It has a w...,The image shows a package of wipes. It has a w...,The image shows a package of wipes. It has a w...,The image shows a package of wipes. It has a w...,The image shows a package of wipes. It has a w...
679,13723,VizWiz_train_00013723.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,a white box of medication on a blue fabric sur...,Anne Marie,medication expectorant walgreens,,,x,"A white box with detailed text on its side, in...",no,The image shows a white rectangular box with a...,no,"The image shows a white rectangular box, likel...",no,True,0,2,0,0,4,0,3,1,,x,x,x,,False,True,False,False,True,False,True,0,"A white rectangular box with printed text, inc...",A rectangular white box with black text and a ...,A box with printed information and a barcode o...,A box with a barcode on the top and a section ...,A box on a blue surface with visible text incl...,The image shows a white rectangular box with a...,The image shows a white rectangular box with a...,The image shows a white rectangular box with a...,The image shows a white rectangular box with a...,The image shows a white rectangular box with a...,"The image shows a white rectangular box, likel...","The image shows a white rectangular box, likel...","The image shows a white rectangular box, likel...","The image shows a white rectangular box, likel...","The image shows a white rectangular box, likel..."
1316,3949,VizWiz_train_00003949.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,appears to be a picture of meatballs and mozza...,Anne Marie,meatball and mozzarella hot pocket / hot pie,,,x,"A red packaging box with white text reads ""MEA...",no,The image shows a red box of frozen meatballs ...,no,"The image shows a box of frozen food, specific...",no,True,0,3,4,0,3,1,0,0,,,x,x,,False,True,True,False,True,False,False,0,"A red package featuring the text ""12 MEATBALLS...",A product packaging with a red banner containi...,"A red package with white text reads ""MEATBALLS...",A red and yellow packaging for a product label...,"A red package with text reading ""Meatballs & M...",The image shows a red box of frozen meatballs ...,The image shows a red box of frozen meatballs ...,The image shows a red box of frozen meatballs ...,The image shows a red box of frozen meatballs ...,The image shows a red box of frozen meatballs ...,"The image shows a box of frozen food, specific...","The image shows a box of frozen food, specific...","The image shows a box of frozen food, specific...","The image shows a box of frozen food, specific...","The image shows a box of frozen food, specific..."
1228,17176,VizWiz_train_00017176.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A red and white box of Home Style Beef Goulash...,Anne Marie,home style beef goulash,,,x,"Label with text reading ""Home Style Beef Goula...",no,The image shows a close-up of a white and red ...,no,The image shows a partially visible food packa...,no,True,1,4,4,0,0,1,0,0,,,x,x,,False,True,True,False,True,False,False,0,"The image shows a food package with the text ""...","A product package with visible text reads ""HOM...",The image shows part of a package label. Visib...,"A package label with text stating ""HOME STYLE ...","Partial text on a food package reads ""HOME STY...",The image shows a close-up of a white and red ...,The image shows a close-up of a white and red ...,The image shows a close-up of a white and red ...,The image shows a close-up of a white and red ...,The image shows a close-up of a white and red ...,The image shows a partially visible food packa...,The image shows a partially visible food packa...,The image shows a partially visible food packa...,The image shows a partially visible food packa...,The image shows a partially visible food packa...


## Check for consistency
Below, we test consistency in responses using [SelfCheckGPT](https://github.com/potsawee/selfcheckgpt). The paper offers 5 approaches for checking consistency:
1. BERTScore: finds the average BERTScore of ths i-th sentence with the most similar sentence from each sample. If information in a sentnece appears in many samples, one can assume that the information is factual. 
2. Question Answering: generates multiple-choice questions over the main passage, which a separate question answering system can attempt to answer while conditioned on  the other sampled respones. If questions on consistent information are queried, the answering system should predict similar answers.
3. N-gram: idea is to use sampled responses to create a new langauge model that approximates the generating LLM. when N is large, the new model will converse to LLM resposnes. Practically, this is done by training a n-gram model using the samples and original passage. The final score is the average log-probability of each token in the original passage, as computed by the n-gram model.
4. Natural Language Inference (NLI): determins whether a hypothesis follows a premise, classified as entailment, neutral, or contradition. Score is defined as the average probability of contradition given the reference sentence across all sampled passages. The final score is the probability of contradiction for each sentence
5. LLM Prompting: uses an LLM to check if the i-th sentence in the passaage us supported by the sample for each sample. The final score is an average of how many sampeld sentences support the i-th sentence.

Note that for any that split a passage into separate sentences, you'll get an array of scores for each sentence.

The LLM prompting method was shown to be most effective, but NLI method is not significantly worse. Below, we use the NLI method since it's faster to compute and offers close enough accuracy. 

In [16]:
def compute_consistency_nli(row, model_name, model):
    # get passage and samples
    passage = row[f"{model_name}_caption"]
    samples = [
        row[f"{model_name}_sample_1"],
        row[f"{model_name}_sample_2"],
        row[f"{model_name}_sample_3"],
        row[f"{model_name}_sample_4"],
        row[f"{model_name}_sample_5"],
    ]
    
    # get sentences for original passage
    sentences = [
        sent.text.strip() for sent in nlp(passage).sents
    ]

    scores = model.predict(sentences=sentences, sampled_passages=samples)
    return sentences, scores

selfcheck_nli = SelfCheckNLI(
    device=device_type
)

SelfCheck-NLI initialized to device cuda


In [17]:
sampled_consistency_df = sampled_data.copy(deep=True)
for model in ["gpt4o", "llama", "molmo"]:
    # create placeholder columns
    for i in range(0, 5): 
        sampled_consistency_df[f'{model}_caption_sent-{i+1}'] = None
        sampled_consistency_df[f'{model}_nli_sent-{i+1}'] = np.nan
    sampled_consistency_df[f'{model}_nli_avg_contrad'] = np.nan

    # loop over each example and compute NLI P(contradiction)
    for index, row in tqdm(sampled_consistency_df.iterrows(), total=sampled_consistency_df.shape[0]):
        # compute the probability of contradition for all sentences in the original caption, using 
        sentences, scores = compute_consistency_nli(row, model, selfcheck_nli)
    
        # add sentences and contradiction score for the sentence
        for sent_index, sentence in enumerate(sentences):
            sampled_consistency_df.loc[index, f'{model}_caption_sent-{sent_index+1}'] = sentence
        for score_index, score in enumerate(scores):
            sampled_consistency_df.loc[index, f'{model}_nli_sent-{score_index+1}'] = float(score)
    
        # add average
        sampled_consistency_df.loc[index, f'{model}_nli_avg_contrad'] = np.mean(scores)
    
    # remove extra columns without any sentences or scores
    sampled_consistency_df.dropna(axis=1, how='all', inplace=True)

# clear memory
del selfcheck_nli
torch.cuda.empty_cache()

# save file and print
sampled_consistency_df.to_csv(f'./sampled-data-consistency_{len(sampled_data)}-examples_{sample_size}-samples.csv', index=False)
sampled_consistency_df.head()

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Unnamed: 0,image_id,file_name,vizwiz_url,image_preview,human_captions,annotator,notes,unable_to_verify,double code notes,double verified,gpt4o_caption,gpt4o_code,llama_caption,llama_code,molmo_caption,molmo_code,text_detected,unrecognizable_orig,framing_orig,blur_orig,obstruction_orig,rotation_orig,too_dark_orig,too_bright_orig,other_orig,curved label,text panel,AMP_rotation,XT_rotation,expert_caption,unrecognizable,framing,blur,obstruction,rotation,too dark,too bright,other,gpt4o_sample_1,gpt4o_sample_2,gpt4o_sample_3,gpt4o_sample_4,gpt4o_sample_5,llama_sample_1,llama_sample_2,llama_sample_3,llama_sample_4,llama_sample_5,molmo_sample_1,molmo_sample_2,molmo_sample_3,molmo_sample_4,molmo_sample_5,gpt4o_caption_sent-1,gpt4o_nli_sent-1,gpt4o_caption_sent-2,gpt4o_nli_sent-2,gpt4o_caption_sent-3,gpt4o_nli_sent-3,gpt4o_caption_sent-4,gpt4o_nli_sent-4,gpt4o_nli_avg_contrad,llama_caption_sent-1,llama_nli_sent-1,llama_caption_sent-2,llama_nli_sent-2,llama_caption_sent-3,llama_nli_sent-3,llama_caption_sent-4,llama_nli_sent-4,llama_caption_sent-5,llama_nli_sent-5,llama_nli_avg_contrad,molmo_caption_sent-1,molmo_nli_sent-1,molmo_caption_sent-2,molmo_nli_sent-2,molmo_caption_sent-3,molmo_nli_sent-3,molmo_nli_avg_contrad
972,12307,VizWiz_train_00012307.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A very close up shot of a package that says Ve...,Anne Marie,venus razor,,,x,"Text ""Venus Breeze"" is visible against a light...",no,The image appears to be a close-up of a produc...,no,The image shows a close-up of a product packag...,no,True,0,4,2,0,2,0,0,0,,,x,x,,False,True,True,False,True,False,False,0,"The image shows part of a package with text ""B...","Text ""Venus Breeze"" is visible on a teal and p...","The image contains the text ""Venus Breeze"" on ...","Text showing ""Venus Breeze"" on a blue and purp...","The image displays a package with the words ""V...",The image appears to be a close-up of a produc...,The image appears to be a close-up of a produc...,The image appears to be a close-up of a produc...,The image appears to be a close-up of a produc...,The image appears to be a close-up of a produc...,The image shows a close-up of a product packag...,The image shows a close-up of a product packag...,The image shows a close-up of a product packag...,The image shows a close-up of a product packag...,The image shows a close-up of a product packag...,"Text ""Venus Breeze"" is visible against a light...",0.068755,The image appears to be a close-up of packaging.,0.264089,,,,,0.166422,The image appears to be a close-up of a produc...,0.003647,The background is a blue-green color with a pi...,0.00171,,,,,,,0.002678,The image shows a close-up of a product packag...,0.001564,"The visible text reads ""3VENUE BREE"", which ap...",0.001573,,,0.001569
1571,3462,VizWiz_train_00003462.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A colorful bag of lollipops for small children...,,strawberry lollipops,,,x,A package with a polka dot pattern in various ...,no,The image shows a white bag with multicolored ...,no,The image shows a package of wipes. It has a w...,no,True,0,1,0,0,4,1,0,0,,x,x,x,A colorful bag of lollipops for small children,False,False,False,False,True,False,False,0,A package with a colorful polka dot design and...,A package with a colorful polka dot pattern fe...,A package with a pattern of colorful dots on a...,A package with a colorful polka dot design in ...,Colorful polka dot packaging with a nutrition ...,The image shows a white bag with multicolored ...,The image shows a white bag with multicolored ...,The image shows a white bag with multicolored ...,The image shows a white bag with multicolored ...,The image shows a white bag with multicolored ...,The image shows a package of wipes. It has a w...,The image shows a package of wipes. It has a w...,The image shows a package of wipes. It has a w...,The image shows a package of wipes. It has a w...,The image shows a package of wipes. It has a w...,A package with a polka dot pattern in various ...,0.248454,There is a white label with nutrition facts an...,0.240949,,,,,0.244702,The image shows a white bag with multicolored ...,0.004346,"The bag is made of a shiny material, possibly ...",0.436487,,,,,,,0.220417,The image shows a package of wipes.,0.00077,It has a white background with colorful polka ...,0.002447,,,0.001609
679,13723,VizWiz_train_00013723.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,a white box of medication on a blue fabric sur...,Anne Marie,medication expectorant walgreens,,,x,"A white box with detailed text on its side, in...",no,The image shows a white rectangular box with a...,no,"The image shows a white rectangular box, likel...",no,True,0,2,0,0,4,0,3,1,,x,x,x,,False,True,False,False,True,False,True,0,"A white rectangular box with printed text, inc...",A rectangular white box with black text and a ...,A box with printed information and a barcode o...,A box with a barcode on the top and a section ...,A box on a blue surface with visible text incl...,The image shows a white rectangular box with a...,The image shows a white rectangular box with a...,The image shows a white rectangular box with a...,The image shows a white rectangular box with a...,The image shows a white rectangular box with a...,"The image shows a white rectangular box, likel...","The image shows a white rectangular box, likel...","The image shows a white rectangular box, likel...","The image shows a white rectangular box, likel...","The image shows a white rectangular box, likel...","A white box with detailed text on its side, in...",0.364978,There is a barcode and some numbers printed on...,0.342193,,,,,0.353586,The image shows a white rectangular box with a...,0.027466,,,,,,,,,0.027466,"The image shows a white rectangular box, likel...",0.002297,The box has a barcode at the top and a large w...,0.0015,,,0.001898
1316,3949,VizWiz_train_00003949.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,appears to be a picture of meatballs and mozza...,Anne Marie,meatball and mozzarella hot pocket / hot pie,,,x,"A red packaging box with white text reads ""MEA...",no,The image shows a red box of frozen meatballs ...,no,"The image shows a box of frozen food, specific...",no,True,0,3,4,0,3,1,0,0,,,x,x,,False,True,True,False,True,False,False,0,"A red package featuring the text ""12 MEATBALLS...",A product packaging with a red banner containi...,"A red package with white text reads ""MEATBALLS...",A red and yellow packaging for a product label...,"A red package with text reading ""Meatballs & M...",The image shows a red box of frozen meatballs ...,The image shows a red box of frozen meatballs ...,The image shows a red box of frozen meatballs ...,The image shows a red box of frozen meatballs ...,The image shows a red box of frozen meatballs ...,"The image shows a box of frozen food, specific...","The image shows a box of frozen food, specific...","The image shows a box of frozen food, specific...","The image shows a box of frozen food, specific...","The image shows a box of frozen food, specific...","A red packaging box with white text reads ""MEA...",0.073244,"The packaging mentions ""12"" and includes an im...",0.122873,,,,,0.098058,The image shows a red box of frozen meatballs ...,0.973668,"The box features white text that reads ""MEATBA...",0.136951,,,,,,,0.55531,"The image shows a box of frozen food, specific...",0.005195,The box is red with white text and a yellow nu...,0.002797,,,0.003996
1228,17176,VizWiz_train_00017176.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,,A red and white box of Home Style Beef Goulash...,Anne Marie,home style beef goulash,,,x,"Label with text reading ""Home Style Beef Goula...",no,The image shows a close-up of a white and red ...,no,The image shows a partially visible food packa...,no,True,1,4,4,0,0,1,0,0,,,x,x,,False,True,True,False,True,False,False,0,"The image shows a food package with the text ""...","A product package with visible text reads ""HOM...",The image shows part of a package label. Visib...,"A package label with text stating ""HOME STYLE ...","Partial text on a food package reads ""HOME STY...",The image shows a close-up of a white and red ...,The image shows a close-up of a white and red ...,The image shows a close-up of a white and red ...,The image shows a close-up of a white and red ...,The image shows a close-up of a white and red ...,The image shows a partially visible food packa...,The image shows a partially visible food packa...,The image shows a partially visible food packa...,The image shows a partially visible food packa...,The image shows a partially visible food packa...,"Label with text reading ""Home Style Beef Goula...",0.054986,"Additional text mentions ""ground beef"", ""elbow...",0.001054,,,,,0.02802,The image shows a close-up of a white and red ...,0.01817,The box has a red stripe on the left side and ...,0.000682,,,,,,,0.009426,The image shows a partially visible food packa...,0.00124,The top left corner displays a circular logo w...,0.001743,,,0.001491


### Summary Statistics

In [18]:
sampled_consistency_df.groupby(["gpt4o_code"])["gpt4o_nli_avg_contrad"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
gpt4o_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
no,100.0,0.317633,0.196882,0.007176,0.151667,0.303487,0.437047,0.863079
yes,100.0,0.308758,0.202405,0.001224,0.142871,0.303035,0.441471,0.874776


In [19]:
sampled_consistency_df.groupby(["llama_code"])["llama_nli_avg_contrad"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
llama_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
no,100.0,0.271648,0.292457,0.000259,0.003177,0.200905,0.493467,0.984827
yes,100.0,0.151315,0.232015,0.000398,0.003737,0.008991,0.302364,0.97129


In [20]:
sampled_consistency_df.groupby(["molmo_code"])["molmo_nli_avg_contrad"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
molmo_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
no,100.0,0.002535,0.002588,0.000746,0.001368,0.001846,0.002641,0.020417
yes,100.0,0.003586,0.005119,0.000752,0.001481,0.001989,0.00337,0.031127
