In [1]:
%load_ext jupyter_black

# Libraries
import json
import csv
import pandas as pd
import os
import re

import requests
from PIL import Image

from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
import torch

from tqdm.notebook import tqdm

# for showing image
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rcParams

%matplotlib inline

In [2]:
# load model
model_id = "allenai/Molmo-7B-O-0924"
processor = AutoProcessor.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# parameters to model
prompt = "You are a program designed to help blind and low-vision users understand images. When asked about the image, generate accessible image description that includes key visual and contextual details of the image for blind and low-vision people. Focus on the following principles: Clarity and Conciseness: Use simple, straightforward language to describe the main subjects and their relationships.; Relevance: Highlight only essential visual elements that contribute to understanding the image or its purpose.; Context: Provide contextual information when necessary, such as emotional tone, setting, or action. Avoid assumptions or subjective interpretations.; Specificity: Include important details like colors, shapes, textures, or text visible in the image, if relevant. Avoid overly general terms or unnecessary details. Once you generate your caption, shorten it to a succinct, single sentence. Output only the final sentence. Can you please tell me what is in this image?"


# captioning function
def generate_caption(
    image_object, model, processor, prompt, temperature=1.0, do_sample=False
):
    """
    Generates a caption for an image.

    Inputs:
    - image_object (pil Image): image to caption.
    - model (torch model): loaded model to use for captioning.
    - processor (torch processor): loaded processor for pre-processing inputs.
    - temperature (float; optional): temperature setting for model, greater than 0. Defaults to 1.0; lower values are more deterministic.
    - do_sample (boolean; optional): whether model should sample probabilities. Defaults to False -- greedy decoding.

    Output:
    - (str): caption for image.
    """
    # process the image and text
    inputs = processor.process(
        images=[image_object],
        text=prompt,
    )

    # move inputs to the correct device and make a batch of size 1
    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

    # generate output; maximum 200 new tokens; stop generation when <|endoftext|> is generated
    output = ""
    with torch.autocast(device_type="mps", enabled=True, dtype=torch.bfloat16):
        output = model.generate_from_batch(
            inputs,
            GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
            tokenizer=processor.tokenizer,
            use_cache=False,
            temperature=temperature,
            do_sample=do_sample,
        )

        # only get generated tokens; decode them to text
        generated_tokens = output[0, inputs["input_ids"].size(1) :]
        generated_text = processor.tokenizer.decode(
            generated_tokens, skip_special_tokens=True
        )
        output = generated_text.strip()
    return output

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [3]:
# data folder
data_folder = "../data/multi-generation-experiment"

# initialize dictionary for all experiments
input_dict = {}

for directory in next(os.walk(data_folder), ([], [], []))[1]:
    # create path to subdirectory
    subdir_path = os.path.join(data_folder, directory)

    # get all image files in subdirectory
    for image_file in next(os.walk(subdir_path), ([], [], []))[2]:
        # skip hidden files
        if image_file.startswith("."):
            continue

        # get name without extension
        name_without_extension, _ = os.path.splitext(image_file)

        # add to dictionary with full filepath
        input_dict[name_without_extension] = {
            "filepath": os.path.join(subdir_path, image_file),
            "experiment_type": directory,  # track which experiment this is from
            "notes (include initials, e.g. 'KG: ...')": "",
        }

dataset_to_caption = [{"name": key} | value for key, value in input_dict.items()]
pd.DataFrame.from_dict(dataset_to_caption)

Unnamed: 0,name,filepath,experiment_type,"notes (include initials, e.g. 'KG: ...')"
0,VizWiz_test_00000565,../data/multi-generation-experiment/cropped-ex...,cropped-experiment,
1,VizWiz_test_00000004,../data/multi-generation-experiment/cropped-ex...,cropped-experiment,
2,VizWiz_test_00000079,../data/multi-generation-experiment/cropped-ex...,cropped-experiment,
3,VizWiz_test_00000235,../data/multi-generation-experiment/cropped-ex...,cropped-experiment,
4,VizWiz_test_00000025,../data/multi-generation-experiment/cropped-ex...,cropped-experiment,
5,VizWiz_test_00000032,../data/multi-generation-experiment/cropped-ex...,cropped-experiment,
6,VizWiz_train_00000049,../data/multi-generation-experiment/blurring-e...,blurring-experiment,
7,VizWiz_train_00000000,../data/multi-generation-experiment/blurring-e...,blurring-experiment,
8,VizWiz_train_00023146,../data/multi-generation-experiment/blurring-e...,blurring-experiment,
9,VizWiz_train_00000043,../data/multi-generation-experiment/blurring-e...,blurring-experiment,


## Multi-Generation Experiment 1: same temperature, but add sampling
Sampling helps break the determistic generation the model has by default. Fix temperature to `1.0` so it balances determinism with randomness.


In [4]:
n_samples = 5

In [5]:
for index, row in enumerate(tqdm(dataset_to_caption)):
    # get image for current annotation
    image_file = dataset_to_caption[index]["filepath"]

    # show original image
    pillow_image = Image.open(image_file)

    # generate n_samples captions for this image
    for sample_idx in range(n_samples):
        key = f"caption_temperature-1.0_number-{sample_idx + 1}"
        # get caption from model
        dataset_to_caption[index][key] = generate_caption(
            pillow_image,
            model,
            processor,
            prompt,
            temperature=1.0,
            do_sample=True,
        )

  0%|          | 0/26 [00:00<?, ?it/s]

## Multi-Generation Experiment 2: varying temperature with sampling
Sampling helps break the determistic generation the model has by default. Temperature varies from `0.1` to `2.0`, with lower temperatures leading to more determinstic outputs and higher to more random.

In [6]:
temperature_settings = [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0]

In [7]:
for index, row in enumerate(tqdm(dataset_to_caption)):
    # get image for current annotation
    image_file = dataset_to_caption[index]["filepath"]

    # show original image
    pillow_image = Image.open(image_file)

    # generate n_samples captions for this image
    for temperature in temperature_settings:
        key = f"caption_temperature-{temperature}"

        # get caption from model
        dataset_to_caption[index][key] = generate_caption(
            pillow_image,
            model,
            processor,
            prompt,
            temperature=1.0,
            do_sample=True,
        )

  0%|          | 0/26 [00:00<?, ?it/s]

## Output to CSV

In [8]:
# create a dataframe
output_df = pd.DataFrame.from_dict(dataset_to_caption)

# sort on image type and name
output_df.sort_values(by=["experiment_type", "name"], inplace=True)

# save file
output_df.to_csv(
    "../data/labeled-data/molmo-model/multi-generation-experiment_03-03-25.csv",
    index=False,
)

# print dataframe
output_df.head()

Unnamed: 0,name,filepath,experiment_type,"notes (include initials, e.g. 'KG: ...')",caption_temperature-1.0_number-1,caption_temperature-1.0_number-2,caption_temperature-1.0_number-3,caption_temperature-1.0_number-4,caption_temperature-1.0_number-5,caption_temperature-0.1,caption_temperature-0.25,caption_temperature-0.5,caption_temperature-0.75,caption_temperature-1.0,caption_temperature-1.5,caption_temperature-2.0
7,VizWiz_train_00000000,../data/multi-generation-experiment/blurring-e...,blurring-experiment,,A clear bottle containing basil leaves rests o...,An empty bottle of basil leaves is visible on ...,The image shows a jar of dried basil leaves on...,A plastic container of ground basil leaves is ...,The image shows a small plastic container of d...,A clear plastic jar containing dried basil lea...,"A spice container labeled ""basil leaves"" sits ...",The image shows a clear plastic bottle contain...,A bottle of dried basil leaves is placed on a ...,The image shows a jar of basil leaves sitting ...,The image depicts a bottle of loose basil leav...,A small plastic bottle of basil leaves sits on...
9,VizWiz_train_00000043,../data/multi-generation-experiment/blurring-e...,blurring-experiment,,The image shows a container of spice mix in a ...,The image shows a spice packaging for apple pi...,The image shows a rectangular spice container ...,"The image shows a jar of natural spices, speci...","The image shows a container labeled ""Apple Pie...","The image shows a spice jar labeled ""Apple Pie...",The image shows a spice jar containing a blend...,An apple pie spice container is visible on a b...,The image shows a food container with a label ...,The image shows a sideways plastic spice bottl...,"The image shows a label on a spice bottle, lik...",The image shows a container of apple pie spice...
6,VizWiz_train_00000049,../data/multi-generation-experiment/blurring-e...,blurring-experiment,,A clear water bottle with a blue Kirkland logo...,A plastic bottle of water with a blue label ca...,"A water bottle sits on a desk, with a power st...",I can see a plastic bottle of water with a blu...,"A plastic bottle of water labeled ""Kirkland Si...",A plastic bottle of water and a white sheet of...,A plastic water bottle sits on a green desk wi...,The image shows a white desk with a clear plas...,A clear bottled water on a white surface with ...,A water bottle with a blue label sits on a des...,A table with a Kirkland label water bottle on ...,The image shows a desk with a clear plastic wa...
10,VizWiz_train_00000050,../data/multi-generation-experiment/blurring-e...,blurring-experiment,,A blurred close-up shows a hand holding an unr...,A hand holds up a blue yogurt pot in an indoor...,A hand holds a container of Dr. Oetker Yoken y...,A person is holding a yogurt container labeled...,"A person is holding a container of yogurt, lik...",I am a machine-generated program designed to a...,I understand you're asking me to describe the ...,A hand is holding a small container labeled 'D...,The image shows a hand holding a blue and whit...,A hand in the bottom left corner holds a canis...,The image shows a man's hand holding a cylindr...,The image shows a hand holding a blue food lab...
8,VizWiz_train_00023146,../data/multi-generation-experiment/blurring-e...,blurring-experiment,,I am designed to help blind and low-vision use...,There's a yellow highlighter pen on a dark gre...,A neon yellow highlighter lies on its side aga...,The image shows a bright yellow highlighter pe...,It shows a yellow highlighter on a gray surfac...,I have developed an AI model to generate descr...,"A highlighter pen is lying on a grey surface, ...",There's a yellow highlighter on a gray surface...,"The image shows a highlighter on a surface, wi...",A yellow highlighter on a textured surface wit...,The image shows a yellow highlighter on a text...,"A yellow highlighter with ""hi-liter"" written o..."
