In [18]:
%load_ext jupyter_black

# Libraries
import json
import csv
import pandas as pd
import os
import copy

import requests
from PIL import Image

import torch
from transformers import MllamaForConditionalGeneration, AutoProcessor, GenerationConfig

from tqdm.notebook import tqdm

The jupyter_black extension is already loaded. To reload it, use:
  %reload_ext jupyter_black


### Load Model
For this experiment, all the same model will be used.

In [2]:
# setup pytorch
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # for multi-GPU systems, force single GPU
if torch.cuda.is_available():
    device_map = "cuda:0"  # force single, first GPU
    device_type = "cuda"
elif torch.backends.mps.is_available():
    device_map = "auto"
    device_type = "mps"
else:
    device_map = "auto"
    device_type = "cpu"

print(f"Using device: {device_type}")

Using device: cuda


In [3]:
# load model
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
)
processor = AutoProcessor.from_pretrained(model_id)

# print model properties
print("Model ID: ", model_id)
print("Device: ", model.device)
print("Dtype: ", model.dtype)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Model ID:  meta-llama/Llama-3.2-11B-Vision-Instruct
Device:  cuda:0
Dtype:  torch.bfloat16


### Helper Functions

In [5]:
def generate_target_dataset(caption_dataset_filename, image_quality_dataset_filename):
    """
    Generates a target dataset for captioning based on VizWiz's image captioning dataset and image quality assessment dataset.

    Inputs:
    - caption_dataset_filename (str): path to caption dataset.
    - image_quality_dataset_filename (str): path to image quality dataset.

    Output:
    - (pd.DataFrame): dataframe containing image annotations and image quality.
    """
    # get images and annotations in one dataframe
    image_annotation_df = None
    with open(caption_dataset_filename) as f:
        # load caption dataset
        caption_dataset_json = json.load(f)

        # combine image files and annotations
        images_df = pd.DataFrame.from_dict(caption_dataset_json["images"])
        annotations_df = pd.DataFrame.from_dict(caption_dataset_json["annotations"])
        grouped_annotations = (
            annotations_df.groupby(["image_id"]).agg(tuple).map(list).reset_index()
        )
        image_annotation_df = images_df.merge(
            grouped_annotations[["image_id", "caption", "is_precanned", "is_rejected"]],
            left_on="id",
            right_on="image_id",
        )

        # vizwiz_url is broken, so fix with https://vizwiz.cs.colorado.edu/*
        image_annotation_df["vizwiz_url"] = image_annotation_df["vizwiz_url"].apply(
            lambda x: x.replace(
                "https://ivc.ischool.utexas.edu/", "https://vizwiz.cs.colorado.edu/"
            )
        )

    # get image quality
    with open(image_quality_dataset_filename) as f:
        # load image quality annotation dataset
        image_quality_dataset_json = json.load(f)
        image_quality_df = pd.DataFrame.from_dict(image_quality_dataset_json)

        # expand object of flaws into individual columns and rename
        image_quality_df = pd.concat(
            [
                image_quality_df.drop(["flaws"], axis=1),
                pd.json_normalize(image_quality_df["flaws"]),
            ],
            axis=1,
        )
        image_quality_df.rename(
            columns={
                "FRM": "framing",
                "BLR": "blur",
                "DRK": "too dark",
                "BRT": "too bright",
                "OBS": "obstruction",
                "OTH": "other",
                "NON": "no issue",
                "ROT": "rotation",
                "caption": "human_captions",
            },
            inplace=True,
        )

    # combine image and quality datasets together
    image_captioning_input = image_annotation_df.merge(
        image_quality_df, left_on="file_name", right_on="image"
    ).drop(["image"], axis=1)

    # remove duplicate id column
    image_captioning_input.drop(["id"], axis=1, inplace=True)

    # reorder columns
    image_captioning_input = image_captioning_input[
        [
            "image_id",
            "file_name",
            "vizwiz_url",
            "text_detected",
            "unrecognizable",
            "framing",
            "blur",
            "obstruction",
            "rotation",
            "too dark",
            "too bright",
            "other",
            "no issue",
            "caption",
            "is_precanned",
            "is_rejected",
        ]
    ]

    # convert image_captioning_input to a list of dictionaries
    image_captioning_input = image_captioning_input.to_dict(orient="records")

    # expand captions, is_precanned, and is_rejected into individual columns
    for index, row in enumerate(image_captioning_input):
        curr_captions = row["caption"]
        curr_precanned = row["is_precanned"]
        curr_rejected = row["is_rejected"]

        # expand captions
        for caption_index in range(0, len(curr_captions)):
            # expand caption
            image_captioning_input[index][f"human_caption_{caption_index + 1}"] = (
                curr_captions[caption_index]
            )

            # expand caption
            image_captioning_input[index][f"is_precanned_{caption_index + 1}"] = (
                curr_precanned[caption_index]
            )

            # expand caption
            image_captioning_input[index][f"is_rejected_{caption_index + 1}"] = (
                curr_rejected[caption_index]
            )

        # remove old rows
        del image_captioning_input[index]["caption"]
        del image_captioning_input[index]["is_precanned"]
        del image_captioning_input[index]["is_rejected"]

    return image_captioning_input

In [15]:
# captioning function
def generate_caption(
    image_object, model, processor, prompt, temperature=1.0, do_sample=False
):
    """
    Generates a caption for an image.

    Inputs:
    - image_object (pil Image): image to caption.
    - model (torch model): loaded model to use for captioning.
    - processor (torch processor): loaded processor for pre-processing inputs.
    - temperature (float; optional): temperature setting for model, greater than 0. Defaults to 1.0; lower values are more deterministic.
    - do_sample (boolean; optional): whether model should sample probabilities. Defaults to False -- greedy decoding.

    Output:
    - (str): caption for image.
    """
    # process the image and text
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                prompt,
            ],
        }
    ]

    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(
        image_object, input_text, add_special_tokens=False, return_tensors="pt"
    ).to(model.device)

    # generate output; maximum 300 new tokens; stop generation when <|endoftext|> is generated
    output = model.generate(
        **inputs,
        generation_config=GenerationConfig(
            max_new_tokens=300,
            stop_strings="<|endoftext|>",
            use_cache=False,
            temperature=temperature,
            do_sample=do_sample,
        ),
        tokenizer=processor.tokenizer,
    )

    # only get generated tokens; decode them to text
    generated_tokens = output[0, inputs["input_ids"].size(1) :]
    generated_text = processor.tokenizer.decode(
        generated_tokens, skip_special_tokens=True
    )

    return generated_text.strip()


def generate_caption_output(
    image_captioning_input, prompt, image_folder, scratch_path="", use_scratch=False
):
    """
    Generates a caption for an image.

    Inputs:
    - image_captioning_input (pd.DataFrame): dataframe containing image annotations and image quality.
    - prompt (str): prompt to provide model.
    - image_folder (str): path to image folder.
    - scratch_path (str): path to scratch folder where intermediate files will be stored.
    - use_scratch (bool): whether to save intermediate files

    Output:
    - (list): list of dictionaries containing image annotations and image quality.
    """
    # deepclone input where labels will be
    caption_output = copy.deepcopy(image_captioning_input)

    # create scratch path if it doesn't exist
    if use_scratch:
        os.makedirs(scratch_path, exist_ok=True)

    for index, row in enumerate(tqdm(image_captioning_input)):
        # get image for current annotation
        image_file = os.path.join(image_folder, caption_output[index]["file_name"])
        image = Image.open(image_file)

        # generate caption and store for output
        caption_output[index]["model_caption"] = generate_caption(
            image, model, processor, prompt
        )

        # save scratch file for every 100 images
        if use_scratch:
            if index % 100 == 0:
                with open(
                    os.path.join(scratch_path, f"caption_output_{index}.json"), "w"
                ) as f:
                    json.dump(caption_output, f, indent=4, separators=(",", ": "))

    return caption_output

## Data Loading

In [9]:
image_captioning_input = generate_target_dataset(
    "../data/caption-dataset/annotations/train.json",
    "../data/image-quality-assessment/annotations/train.json",
)
image_captioning_input_df = pd.DataFrame.from_dict(dataset_to_caption)
image_captioning_input_df.head()

Unnamed: 0,image_id,file_name,vizwiz_url,text_detected,unrecognizable,framing,blur,obstruction,rotation,too dark,...,is_rejected_2,human_caption_3,is_precanned_3,is_rejected_3,human_caption_4,is_precanned_4,is_rejected_4,human_caption_5,is_precanned_5,is_rejected_5
0,0,VizWiz_train_00000000.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,1,3,1,0,0,0,...,False,Quality issues are too severe to recognize vis...,True,True,A bottle of spices in a plastic container layi...,False,False,some basil leaves in a container on a counter,False,False
1,1,VizWiz_train_00000001.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,0,5,0,0,0,...,False,A kitchen counter the various items on top inc...,False,False,a black tin of Coca Cola placed on a black sur...,False,False,"Black counter with canisters, kettle and can o...",False,False
2,2,VizWiz_train_00000002.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,0,0,0,0,0,...,False,a can of crushed tomatoes in puree from price ...,False,False,a Price Chopper branded can of crushed tomatoes,False,False,Image is a can of crushed tomatoes in view.,False,False
3,3,VizWiz_train_00000003.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,0,0,0,0,0,...,False,Screenshot from a smartphone with a case insen...,False,False,image shows a screenshot of a page required ca...,False,False,A screenshot of Spotify page on a cell phone s...,False,False
4,4,VizWiz_train_00000004.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,3,0,0,0,0,...,False,A garden book is sitting on a person's lap.,False,False,a box for a solar garden light laying on someo...,False,False,A blue and yellow box with lights for the gard...,False,False


In [13]:
# filter input for only blurred images
filtered_images_df = image_captioning_input_df[image_captioning_input_df["blur"] >= 3]
filtered_images_df["model_caption"] = ""

dataset_to_caption = filtered_images_df.to_dict("records")
len(dataset_to_caption)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_images_df["model_caption"] = ""


6696

## Batch Processing of Images

In [16]:
# model prompt
prompt = {
    "type": "text",
    "text": "\n".join(
        [
            "You are a helpful assistant who describes objects in images to a blind and low-vision person. Please describe the objects in the image and any essential details necessary for identifying them.",
            "",
            "Follow these guidelines:",
            "- Relevance and Specificity: Include visible text (including brand names), shapes, colors, textures, spatial relationships, or notable features only if they convey essential information about what is in the image.",
            "- Structure of Response: Provide a description of the object with essential details. Only include details about the surrounding environment if it helps to identify the object.",
            "- Clarity: Use simple, straightforward, objective language. Avoid unnecessary details.",
            '- Format: Describe the object with a concise, 1-2 sentence caption. DO NOT mention camera blur or if an object is partially visible. DO NOT use "it" to refer to the object. DO NOT include statements like "The image shows" or "The object is".',
            "",
            "Output only the final caption.",
        ]
    ),
}

In [19]:
captioned_data = generate_caption_output(
    dataset_to_caption[0:5], prompt, "../data/caption-dataset/train"
)

pd.DataFrame.from_dict(captioned_data)

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,image_id,file_name,vizwiz_url,text_detected,unrecognizable,framing,blur,obstruction,rotation,too dark,...,human_caption_3,is_precanned_3,is_rejected_3,human_caption_4,is_precanned_4,is_rejected_4,human_caption_5,is_precanned_5,is_rejected_5,model_caption
0,1,VizWiz_train_00000001.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,0,5,0,0,0,...,A kitchen counter the various items on top inc...,False,False,a black tin of Coca Cola placed on a black sur...,False,False,"Black counter with canisters, kettle and can o...",False,False,A black countertop with a row of four silver c...
1,7,VizWiz_train_00000007.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,True,0,1,3,0,1,1,...,A package containing cleaning items to clean a...,False,False,A packaged set of cleaning supplies for a devi...,False,False,A cleaning kit is shown in plastic packaging o...,False,False,The image shows a person's legs and a monitor ...
2,16,VizWiz_train_00000016.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,False,0,0,3,0,1,0,...,A group of small coins are on top of the table.,False,False,"Two quarters, a nickel, and a dime sitting on ...",False,False,Three quarters and one nickel on a wooden table.,False,False,"The image features four silver coins, each wit..."
3,21,VizWiz_train_00000021.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,False,4,2,4,1,0,0,...,Quality issues are too severe to recognize vis...,True,False,Quality issues are too severe to recognize vis...,True,False,Quality issues are too severe to recognize vis...,True,False,"The image appears to be a blurry, out-of-focus..."
4,25,VizWiz_train_00000025.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,False,0,1,3,0,0,0,...,close up of a grey poodle with a chain collar.,False,False,grey short haired dog from behind with choker ...,False,False,A gray dog looking forward with a silver collar.,False,False,The image features a dog with a distinctive si...


In [None]:
dataset_to_caption_fixed = None
with('../data/labeled-data/labeled-data_2300.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        dataset_to_caption_fixed.append(myClass(row[0], row[1], row[2:]))

    # # separate captions into separate columns
    # for index, item in enumerate(dataset_to_caption[2:]):
    #     dataset_to_caption[index]["human_caption_1"] = item["caption"][0]
    #     dataset_to_caption[index]["human_caption_2"] = item["caption"][1]
    #     dataset_to_caption[index]["human_caption_3"] = item["caption"][2]
    #     dataset_to_caption[index]["human_caption_4"] = item["caption"][3]
    #     dataset_to_caption[index]["human_caption_5"] = item["caption"][4]
    #     del dataset_to_caption[index]["caption"]

In [None]:
dataset_to_caption

# Cleanup

In [32]:
if processor is not None:
    del processor
if model is not None:
    del model
torch.cuda.empty_cache()