# Prompt Exploration for Image Captioning
We explore different prompts and their impact on image captions. 

## Setup

### Load Libraries

In [2]:
%load_ext jupyter_black

# Libraries
import json
import csv
import pandas as pd
import os
import re
import copy

import requests
from PIL import Image

from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
import torch

from tqdm.notebook import tqdm

# for showing image
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rcParams

%matplotlib inline

The jupyter_black extension is already loaded. To reload it, use:
  %reload_ext jupyter_black


### Load Model
The same model is used for every prompt in this experiment.

In [3]:
# setup pytorch
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # for multi-GPU systems, force single GPU

# start from the CPU fallback, then upgrade to an accelerator when one is available
device_map, device_type = "auto", "cpu"
if torch.cuda.is_available():
    # force single, first GPU
    device_map, device_type = "cuda:0", "cuda"
elif torch.backends.mps.is_available():
    device_type = "mps"

print(f"Using device: {device_type}")

Using device: cuda


In [4]:
# load model
model_id = "allenai/Molmo-7B-O-0924"

# the processor and the model are loaded with exactly the same options
# NOTE: trust_remote_code=True executes the custom Python code shipped with the model repository
pretrained_kwargs = {
    "trust_remote_code": True,
    "torch_dtype": "auto",
    "device_map": device_map,
}
processor = AutoProcessor.from_pretrained(model_id, **pretrained_kwargs)
model = AutoModelForCausalLM.from_pretrained(model_id, **pretrained_kwargs)

# print model properties
print("Model ID: ", model_id)
print("Device: ", model.device)
print("Dtype: ", model.dtype)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Model ID:  allenai/Molmo-7B-O-0924
Device:  cuda:0
Dtype:  torch.float32


### Helper functions

In [5]:
# captioning function
def generate_caption(
    image_object, model, processor, prompt, temperature=1.0, do_sample=False
):
    """
    Generates a caption for an image.

    Inputs:
    - image_object (pil Image): image to caption.
    - model (torch model): loaded model to use for captioning.
    - processor (torch processor): loaded processor for pre-processing inputs.
    - prompt (str): text prompt given to the model together with the image.
    - temperature (float; optional): temperature setting for model, greater than 0. Defaults to 1.0; lower values are more deterministic. Only forwarded to the model when do_sample is True.
    - do_sample (boolean; optional): whether model should sample probabilities. Defaults to False -- greedy decoding.

    Output:
    - (str): caption for image, with surrounding whitespace stripped.
    """
    # process the image and text
    inputs = processor.process(
        images=[image_object],
        text=prompt,
    )

    # move inputs to the correct device and make a batch of size 1
    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

    # temperature is a sampling parameter: forward it only when sampling is enabled,
    # so greedy runs do not carry a setting that has no effect on them
    sampling_kwargs = {"do_sample": do_sample}
    if do_sample:
        sampling_kwargs["temperature"] = temperature

    # generate output; maximum 300 new tokens; stop generation when <|endoftext|> is generated
    generated_ids = model.generate_from_batch(
        inputs,
        GenerationConfig(max_new_tokens=300, stop_strings="<|endoftext|>"),
        tokenizer=processor.tokenizer,
        use_cache=False,
        **sampling_kwargs,
    )

    # only get generated tokens (everything after the prompt tokens); decode them to text
    prompt_length = inputs["input_ids"].size(1)
    generated_tokens = generated_ids[0, prompt_length:]
    generated_text = processor.tokenizer.decode(
        generated_tokens, skip_special_tokens=True
    )

    return generated_text.strip()


def _load_caption_annotations(caption_dataset_filename):
    """
    Loads the caption dataset and returns one row per image with its annotations grouped into lists.

    Inputs:
    - caption_dataset_filename (str): path to caption dataset (JSON with "images" and "annotations" entries).

    Output:
    - (pd.DataFrame): image columns plus "image_id" and list-valued "caption", "is_precanned", "is_rejected".
    """
    # read as UTF-8 explicitly so the result does not depend on the platform's default encoding
    with open(caption_dataset_filename, encoding="utf-8") as f:
        caption_dataset_json = json.load(f)

    images_df = pd.DataFrame.from_dict(caption_dataset_json["images"])
    annotations_df = pd.DataFrame.from_dict(caption_dataset_json["annotations"])

    # collect all annotations of an image into one list per column (annotation order is kept)
    grouped_annotations = (
        annotations_df.groupby("image_id")[["caption", "is_precanned", "is_rejected"]]
        .agg(list)
        .reset_index()
    )

    # combine image files and annotations; images without annotations are dropped (inner join)
    image_annotation_df = images_df.merge(
        grouped_annotations, left_on="id", right_on="image_id"
    )

    # vizwiz_url is broken, so fix with https://vizwiz.cs.colorado.edu/*
    image_annotation_df["vizwiz_url"] = image_annotation_df["vizwiz_url"].str.replace(
        "https://ivc.ischool.utexas.edu/",
        "https://vizwiz.cs.colorado.edu/",
        regex=False,
    )

    return image_annotation_df


def _load_image_quality(image_quality_dataset_filename):
    """
    Loads the image quality dataset and expands each record's "flaws" object into named columns.

    Inputs:
    - image_quality_dataset_filename (str): path to image quality dataset (JSON list of records with a "flaws" object).

    Output:
    - (pd.DataFrame): one row per record, with flaw codes renamed to readable column names.
    """
    with open(image_quality_dataset_filename, encoding="utf-8") as f:
        image_quality_df = pd.DataFrame.from_dict(json.load(f))

    # expand object of flaws into individual columns; both frames share the default 0..n-1 index
    flaws_df = pd.json_normalize(image_quality_df["flaws"].tolist())
    image_quality_df = pd.concat(
        [image_quality_df.drop(columns=["flaws"]), flaws_df], axis=1
    )

    return image_quality_df.rename(
        columns={
            "FRM": "framing",
            "BLR": "blur",
            "DRK": "too dark",
            "BRT": "too bright",
            "OBS": "obstruction",
            "OTH": "other",
            "NON": "no issue",
            "ROT": "rotation",
            "caption": "human_captions",
        }
    )


def _expand_annotation_lists(record):
    """
    Replaces the list-valued "caption", "is_precanned" and "is_rejected" entries of one record
    with numbered entries (human_caption_1, is_precanned_1, is_rejected_1, ...). Modifies record in place.
    """
    captions = record.pop("caption")
    precanned_flags = record.pop("is_precanned")
    rejected_flags = record.pop("is_rejected")

    numbered = enumerate(zip(captions, precanned_flags, rejected_flags), start=1)
    for number, (caption, is_precanned, is_rejected) in numbered:
        record[f"human_caption_{number}"] = caption
        record[f"is_precanned_{number}"] = is_precanned
        record[f"is_rejected_{number}"] = is_rejected


def generate_target_dataset(caption_dataset_filename, image_quality_dataset_filename):
    """
    Generates a target dataset for captioning based on VizWiz's image captioning dataset and image quality assessment dataset.

    Inputs:
    - caption_dataset_filename (str): path to caption dataset.
    - image_quality_dataset_filename (str): path to image quality dataset.

    Output:
    - (list of dicts): one record per image that appears in both datasets. Each record holds the image
      fields and quality columns, followed by human_caption_N / is_precanned_N / is_rejected_N entries
      (N starting at 1) for every annotation of that image.
    """
    image_annotation_df = _load_caption_annotations(caption_dataset_filename)
    image_quality_df = _load_image_quality(image_quality_dataset_filename)

    # combine image and quality datasets together (inner join on the image file name)
    merged_df = image_annotation_df.merge(
        image_quality_df, left_on="file_name", right_on="image"
    )

    # reorder columns; selecting them also leaves out the duplicate "id" and "image" join keys
    ordered_df = merged_df[
        [
            "image_id",
            "file_name",
            "vizwiz_url",
            "text_detected",
            "unrecognizable",
            "framing",
            "blur",
            "obstruction",
            "rotation",
            "too dark",
            "too bright",
            "other",
            "no issue",
            "caption",
            "is_precanned",
            "is_rejected",
        ]
    ]

    # convert to a list of dictionaries, then expand the annotation lists into individual keys
    image_captioning_input = ordered_df.to_dict(orient="records")
    for record in image_captioning_input:
        _expand_annotation_lists(record)

    return image_captioning_input


def generate_caption_output(
    image_captioning_input, prompts, image_folder, scratch_path="", use_scratch=False
):
    """
    Generates one caption per prompt for every image in the input.

    Uses the notebook-level `model` and `processor` variables for captioning, so the
    model loading cell must have been run first.

    Inputs:
    - image_captioning_input (list of dicts): one record per image; each record must contain "file_name".
    - prompts (list of tuples): prompts to try. each tuple includes (prompt_name, prompt).
    - image_folder (str): path to image folder.
    - scratch_path (str): path to scratch folder where intermediate files will be stored. An empty string means the current working directory.
    - use_scratch (bool): whether to save intermediate files

    Output:
    - (list): deep copy of the input records, each extended with a "caption_for_prompt_{prompt_name}" entry per prompt.
    """
    # deepclone input where labels will be, so the caller's records are not modified
    caption_output = copy.deepcopy(image_captioning_input)

    # create scratch path if it doesn't exist; os.makedirs("") raises, so skip it for the empty default
    if use_scratch and scratch_path:
        os.makedirs(scratch_path, exist_ok=True)

    for index, row in enumerate(tqdm(caption_output)):
        # get image for current annotation
        image_file = os.path.join(image_folder, row["file_name"])

        # the context manager closes the image file once every prompt has been run on it
        with Image.open(image_file) as image:
            # repeat per prompt
            for prompt_name, prompt in prompts:
                # generate caption and store for output
                row[f"caption_for_prompt_{prompt_name}"] = generate_caption(
                    image, model, processor, prompt
                )

        # save scratch file at index 0, 100, 200, ...; each file holds all records processed so far
        if use_scratch and index % 100 == 0:
            with open(
                os.path.join(scratch_path, f"caption_output_{index}.json"), "w"
            ) as f:
                json.dump(caption_output, f, indent=4, separators=(",", ": "))

    return caption_output

## Data Loading

In [6]:
# build one record per image from the training annotations, then view the records as a dataframe
dataset_to_caption = generate_target_dataset(
    caption_dataset_filename="./data/caption-dataset/annotations/train.json",
    image_quality_dataset_filename="./data/image-quality-assessment/annotations/train.json",
)
dataset_to_caption_df = pd.DataFrame(dataset_to_caption)
dataset_to_caption_df.head()

Unnamed: 0,image_id,file_name,vizwiz_url,text_detected,unrecognizable,framing,blur,obstruction,rotation,too dark,...,is_rejected_2,human_caption_3,is_precanned_3,is_rejected_3,human_caption_4,is_precanned_4,is_rejected_4,human_caption_5,is_precanned_5,is_rejected_5
0,0,VizWiz_train_00000000.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,1,3,1,0,0,0,...,False,Quality issues are too severe to recognize vis...,True,True,A bottle of spices in a plastic container layi...,False,False,some basil leaves in a container on a counter,False,False
1,1,VizWiz_train_00000001.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,0,5,0,0,0,...,False,A kitchen counter the various items on top inc...,False,False,a black tin of Coca Cola placed on a black sur...,False,False,"Black counter with canisters, kettle and can o...",False,False
2,2,VizWiz_train_00000002.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,0,0,0,0,0,...,False,a can of crushed tomatoes in puree from price ...,False,False,a Price Chopper branded can of crushed tomatoes,False,False,Image is a can of crushed tomatoes in view.,False,False
3,3,VizWiz_train_00000003.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,0,0,0,0,0,...,False,Screenshot from a smartphone with a case insen...,False,False,image shows a screenshot of a page required ca...,False,False,A screenshot of Spotify page on a cell phone s...,False,False
4,4,VizWiz_train_00000004.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,3,0,0,0,0,...,False,A garden book is sitting on a person's lap.,False,False,a box for a solar garden light laying on someo...,False,False,A blue and yellow box with lights for the gard...,False,False


### Select a subset of images to caption
There's no need to caption all the images with the set of prompts we want to try. Allow the user to select which images to try using `file_name`.

In [7]:
# image numbers of the selected VizWiz training images; each is expanded into a
# file name of the form "VizWiz_train_<number zero-padded to 8 digits>.jpg"
image_subset = [
    f"VizWiz_train_{image_number:08d}.jpg"
    for image_number in (
        2685, 4360, 5236, 18277, 8241, 15593,
        5191, 2033, 12474, 4571, 20191, 16717,
        7812, 428, 11077, 12338, 12570, 9241,
        2353, 575, 13633, 8078, 140, 8867,
        78, 192, 11842, 22167, 3433, 17296,
        10915, 145, 8162, 5952, 139, 12215,
        4920, 10826, 6570, 70, 9603, 23166,
        621, 10902, 131, 17219, 15043, 51,
        149, 12203, 1485, 142, 126, 155,
        558, 560, 637, 506, 193, 6773,
    )
]

In [8]:
# keep only the rows whose file_name is listed in image_subset (dataset order is preserved)
is_selected = dataset_to_caption_df["file_name"].isin(image_subset)
dataset_filtered_df = dataset_to_caption_df.loc[is_selected]
dataset_filtered = dataset_filtered_df.to_dict("records")

## Batch Processor
Below, we process all selected data across all prompts. Some constants across runs:
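1. `temperature = 1.0`, which should balance determinism and randomness. Note that temperature only takes effect when sampling is enabled, so with `do_sample = False` (item 3) it does not change the output.
2. `use_cache = False`, intended to make sure new responses are always generated. Note that this flag controls reuse of the key/value attention cache during decoding (a speed optimization); it does not cache responses between calls.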
3. `do_sample = False` to use greedy decoding to reduce sampling variance.

In [9]:
# lines of the active prompt; they are joined with newlines when the prompt list is built
single_object_specific_v4_lines = (
    "You are a helpful assistant who describes objects in images to a blind and low-vision person. Please describe the objects in the image and any essential details necessary for identifying them.",
    "",
    "Follow these guidelines:",
    "- Relevance and Specificity: Include visible text (including brand names), shapes, colors, textures, spatial relationships, or notable features only if they convey essential information about what is in the image.",
    "- Structure of Response: Provide a description of the object with essential details. Only include details about the surrounding environment if it helps to identify the object.",
    "- Clarity: Use simple, straightforward, objective language. Avoid unnecessary details.",
    '- Format: Describe the object with a concise, 1-2 sentence caption. DO NOT mention camera blur or if an object is partially visible. DO NOT use "it" to refer to the object. DO NOT include statements like "The image shows" or "The object is".',
    "",
    "Output only the final caption.",
)

# each entry is (prompt_name, prompt); prompt_name becomes the suffix of the output column.
# commented-out entries are earlier prompt variants kept for reference; they are not run.
prompts = [
    # ("plain", "What's in this image?"),
    # ("plain-succinct", "What is in this image? Please provide a succinct description."),
    # ("blv-baseline", "Describe the image for blind and low-vision users."),
    # (
    #     "original",
    #     """You are a program designed to help blind and low-vision users understand images. Generate an accessible image description that includes key visual and contextual details of the image for blind and low-vision people.
    #     Focus on the following principles:
    #     • Clarity and Conciseness: Use simple, straightforward language to describe the main subjects and their relationships.;
    #     • Relevance: Highlight only essential visual elements that contribute to understanding the image or its purpose.;
    #     • Context: Provide contextual information when necessary, such as emotional tone, setting, or action. Avoid assumptions or subjective interpretations.;
    #     • Specificity: Include important details like colors, shapes, textures, or text visible in the image, if relevant. Avoid overly general terms or unnecessary details.
    #     Once you generate your caption, shorten it to a succinct, single sentence. Output only the final sentence.
    #     """,
    # ),
    # (
    #     "object-specific",
    #     """You are a program designed to provide concise and accessible descriptions of objects in images for blind and low-vision people. Please describe the main objects in the image, their relationships to each other, and any essential details (e.g., colors, shapes, textures, or relevant text) necessary for understanding what the objects are.
    #     Adhere to these guidelines:
    #     • Clarity and Conciseness: Use simple, straightforward language, and omit unnecessary details.
    #     • Relevance: Include only significant visual elements that convey the essential information about objects in the image.
    #     • Context: Provide setting or emotional tone only when it’s clearly depicted, avoiding assumptions or subjective interpretations.
    #     • Specificity: Mention key colors, shapes, visible text, or other notable features only if they enhance understanding.
    #     After creating the full description, refine and compress it into a concise caption. Output only final caption.
    #     """,
    # ),
    # (
    #     "object-specific-structure",
    #     """You are a program designed to describe objects in images to a blind and low-vision person. Please describe the main objects in the image, their relationships to each other, and any essential details (e.g., relevant text, shapes, colors, or textures) necessary for identifying what the objects are.
    #     Adhere to these guidelines:
    #     • Relevance: Include only significant visual elements that convey the essential information about objects in the image.
    #     • Specificity: Mention visible text (including brand names), shapes, colors, textures, or notable features only if they enhance understanding.
    #     • Structure of Response: First, mention what the object is. Then, provide details helpful for identifying the object, including text or shapes and colors. Finally, add details about the surrounding environment.
    #     • Clarity and Conciseness: Use simple, straightforward language. Do not include unnecessary details.
    #     After creating the full description, refine and compress it into a concise caption. Output only final caption.
    #     """,
    # ),
    # (
    #     "single-object-specific",
    #     """You are a program designed to describe objects in images to a blind and low-vision person. Please describe the object in the image and any essential details (e.g., relevant text, shapes, colors, or textures) necessary for identifying the object.
    #     Adhere to these guidelines:
    #     • Relevance: Include only significant visual elements that convey the essential information about the main object in the image.
    #     • Specificity: Mention visible text (including brand names), shapes, colors, textures, or notable features only if they enhance understanding.
    #     • Structure of Response: First, mention what the object is. Then, provide details helpful for identifying the object, including text or shapes and colors. Finally, add details about the surrounding environment, if relevant.
    #     • Clarity and Conciseness: Use simple, straightforward language. Do not include unnecessary details.
    #     After creating the full description, refine and compress it into a concise 1-2 sentence caption. Output only the final caption.
    #     """,
    # ),
    # (
    #     "single-object-specific-v3",
    #     "\n".join(
    #         [
    #             "You are an assistant who describes objects in images to a blind and low-vision person. Please describe the object in the image and any essential details necessary for identifying the object.",
    #             "",
    #             "Follow these guidelines:",
    #             "- Relevance and Specificity: Include visible text (including brand names), shapes, colors, textures, spatial relationships, or notable features only if they convey essential information about what is in the image.",
    #             "- Structure of Response: Provide an overall description of the object, then include details to help identify it. Only include details about the surrounding environment if it helps to identify the object.",
    #             "- Clarity: Use simple, straightforward, objective language. Avoid unnecessary details.",
    #             '- Format: Describe the object with a concise, 1-2 sentence caption. DO NOT mention camera blur or if an object is partially visible. DO NOT use "it" to refer to the object. DO NOT include statements like "The image shows" or "The object is".',
    #             "",
    #             "Output only the final caption.",
    #         ]
    #     ),
    # ),
    ("single-object-specific-v4", "\n".join(single_object_specific_v4_lines)),
    # (
    #     "no-blv-ref",
    #     """You are a program designed to help users understand images. Generate an image description that includes key visual and contextual details of the image. Focus on the following principles:
    #     Clarity and Conciseness: Use simple, straightforward language to describe the main subjects and their relationships.;
    #     Relevance: Highlight only essential visual elements that contribute to understanding the image or its purpose.;
    #     Context: Provide contextual information when necessary, such as emotional tone, setting, or action. Avoid assumptions or subjective interpretations.;
    #     Specificity: Include important details like colors, shapes, textures, or text visible in the image, if relevant. Avoid overly general terms or unnecessary details.
    #     Once you generate your caption, shorten it to a succinct, single sentence. Output only the final sentence.
    #     """,
    # ),
    # (
    #     "abstain",
    #     "You are a helpful assistant for describing images for blind and low-vision individuals. Do not hallucinate with incorrect answers if the image is indescribable. An image is indescribable if the provided image is too blurry, too bright or dark, obstructed, or too ill-framed to recognize correctly. Abstain from providing descriptions if the question is unanswerable.",
    # ),
    # (
    #     "abstain-v2",
    #     """You are a program designed to help users understand images. Generate an image description that includes key visual and contextual details of the image. Focus on the following principles:
    #     Clarity and Conciseness: Use simple, straightforward language to describe the main subjects and their relationships.;
    #     Relevance: Highlight only essential visual elements that contribute to understanding the image or its purpose.;
    #     Context: Provide contextual information when necessary, such as emotional tone, setting, or action. Avoid assumptions or subjective interpretations.;
    #     Specificity: Include important details like colors, shapes, textures, or text visible in the image, if relevant. Avoid overly general terms or unnecessary details.
    #     Abstain: Do not hallucinate with incorrect answers if the image is too blurry, too bright or dark, obstructed, or too ill-framed to recognize correctly. Attempt to generate a caption before abstaining.
    #     Once you generate your caption, shorten it to a succinct, single sentence. Output only the final sentence.
    #     """,
    # ),
]

In [10]:
# preview the text of the first active prompt
first_prompt_name, first_prompt_text = prompts[0]
print(first_prompt_text)

You are a helpful assistant who describes objects in images to a blind and low-vision person. Please describe the objects in the image and any essential details necessary for identifying them.

Follow these guidelines:
- Relevance and Specificity: Include visible text (including brand names), shapes, colors, textures, spatial relationships, or notable features only if they convey essential information about what is in the image.
- Structure of Response: Provide a description of the object with essential details. Only include details about the surrounding environment if it helps to identify the object.
- Clarity: Use simple, straightforward, objective language. Avoid unnecessary details.
- Format: Describe the object with a concise, 1-2 sentence caption. DO NOT mention camera blur or if an object is partially visible. DO NOT use "it" to refer to the object. DO NOT include statements like "The image shows" or "The object is".

Output only the final caption.


In [11]:
# caption every selected image with every active prompt (no scratch files are written)
captions_per_prompt = generate_caption_output(
    image_captioning_input=dataset_filtered,
    prompts=prompts,
    image_folder="./data/caption-dataset/train",
)

pd.DataFrame.from_dict(captions_per_prompt)

  0%|          | 0/60 [00:00<?, ?it/s]

Unnamed: 0,image_id,file_name,vizwiz_url,text_detected,unrecognizable,framing,blur,obstruction,rotation,too dark,...,human_caption_3,is_precanned_3,is_rejected_3,human_caption_4,is_precanned_4,is_rejected_4,human_caption_5,is_precanned_5,is_rejected_5,caption_for_prompt_single-object-specific-v4
0,51,VizWiz_train_00000051.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,2,2,3,0,0,0,...,tube package of a substance that is orange blu...,False,False,A person holding up a food item with a blue an...,False,False,some sort of cream that someone is holding,False,False,A person's hand is holding a cylindrical plast...
1,70,VizWiz_train_00000070.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,2,3,5,1,4,1,...,A screen that is showing the frequency and aud...,False,False,A bright display saying something about FREDQU...,False,False,Quality issues are too severe to recognize vis...,True,False,"A blue sign with white text reading ""Frequency..."
2,78,VizWiz_train_00000078.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,4,5,0,0,0,...,A regular sized can of Mountain Dew missing it...,False,False,A can of Mountain Dew with the tab opener remo...,False,False,Image contain a partial part of a mountain dew...,False,False,A black aluminum soda can with a silver top an...
3,126,VizWiz_train_00000126.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,False,3,1,4,5,0,0,...,Finger covers camera lens so that the photo is...,False,False,Quality issues are too severe to recognize vis...,True,False,A orange on a gray cloth with the photographer...,False,False,A white cylindrical container with a black lid...
4,131,VizWiz_train_00000131.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,1,2,5,0,0,0,...,Quality issues are too severe to recognize vis...,True,False,A canister shaped object is near a wood wall.,False,False,some type of liquid that is in a container,False,False,"A white plastic bag with red text, likely a pr..."
5,139,VizWiz_train_00000139.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,3,3,1,2,0,...,A human hand holding a bottle of some sort of ...,False,False,A hand holding a container of seasoning with a...,False,False,A jar of Italian herbs in a someone's hand wit...,False,False,A person's hand is holding a glass jar with a ...
6,140,VizWiz_train_00000140.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,1,5,3,1,0,0,...,A box of fruit loops breakfast cereal with fru...,False,False,A red background with a black object on the le...,False,False,Front of a box of fruit loops breakfast cereal,False,False,"A bowl of colorful cereal, likely Froot Loops,..."
7,142,VizWiz_train_00000142.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,1,5,5,1,0,...,The medicinal uses of the product is described...,False,False,a natural supplement bottle of goldenseal turn...,False,False,A bottle is resting on a soft pillow-like obje...,False,False,A bottle of medication with a green cap and wh...
8,145,VizWiz_train_00000145.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,3,4,0,0,0,...,A white bottle of vitamins with Folic and 800...,False,False,A white bottle with an orange label with black...,False,False,A bottle of medicine laying on a wooden table,False,False,"A cylindrical container with a white cap, like..."
9,149,VizWiz_train_00000149.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,False,0,1,3,0,0,0,...,A battery is on top of the table.,False,False,AA battery that is placed inside a metal holder,False,False,Battery in a ratchet attachment on a surface.,False,False,A cylindrical metal container with a black pla...


### Output to CSV

In [14]:
# create a dataframe
output_df = pd.DataFrame.from_dict(captions_per_prompt)

# save file; makedirs also creates missing parent folders (os.mkdir would fail on them)
# and exist_ok=True makes a separate existence check unnecessary
outdir = "./data/labeled-data/molmo-model/prompt-testing"
os.makedirs(outdir, exist_ok=True)

output_df.to_csv(
    os.path.join(outdir, "prompt-testing-round7_32bit_03-25-25.csv"),
    index=False,
)

# print dataframe
output_df.head()

Unnamed: 0,image_id,file_name,vizwiz_url,text_detected,unrecognizable,framing,blur,obstruction,rotation,too dark,...,human_caption_3,is_precanned_3,is_rejected_3,human_caption_4,is_precanned_4,is_rejected_4,human_caption_5,is_precanned_5,is_rejected_5,caption_for_prompt_single-object-specific-v4
0,51,VizWiz_train_00000051.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,2,2,3,0,0,0,...,tube package of a substance that is orange blu...,False,False,A person holding up a food item with a blue an...,False,False,some sort of cream that someone is holding,False,False,A person's hand is holding a cylindrical plast...
1,70,VizWiz_train_00000070.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,2,3,5,1,4,1,...,A screen that is showing the frequency and aud...,False,False,A bright display saying something about FREDQU...,False,False,Quality issues are too severe to recognize vis...,True,False,"A blue sign with white text reading ""Frequency..."
2,78,VizWiz_train_00000078.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,4,5,0,0,0,...,A regular sized can of Mountain Dew missing it...,False,False,A can of Mountain Dew with the tab opener remo...,False,False,Image contain a partial part of a mountain dew...,False,False,A black aluminum soda can with a silver top an...
3,126,VizWiz_train_00000126.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,False,3,1,4,5,0,0,...,Finger covers camera lens so that the photo is...,False,False,Quality issues are too severe to recognize vis...,True,False,A orange on a gray cloth with the photographer...,False,False,A white cylindrical container with a black lid...
4,131,VizWiz_train_00000131.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,1,2,5,0,0,0,...,Quality issues are too severe to recognize vis...,True,False,A canister shaped object is near a wood wall.,False,False,some type of liquid that is in a container,False,False,"A white plastic bag with red text, likely a pr..."


## Cleanup

In [None]:
# drop the notebook-level references to the processor and model.
# pop() with a default (instead of `del`) keeps this cell safe to re-run and safe to
# run when the loading cell was never executed; `del` would raise NameError in both cases.
globals().pop("processor", None)
globals().pop("model", None)

# release unused cached GPU memory held by PyTorch
torch.cuda.empty_cache()