In [1]:
%load_ext jupyter_black

# Libraries
import json
import csv
import pandas as pd
import os

import requests
from PIL import Image

import torch
from transformers import MllamaForConditionalGeneration, AutoProcessor, GenerationConfig

from tqdm.notebook import tqdm

In [4]:
# set target dataset and file locations
target_caption_dataset_filename = "../data/caption-dataset/annotations/train.json"
image_folder = "../data/caption-dataset/train/"

# get image quality annotations
target_image_quality_dataset_filename = (
    "../data/image-quality-assessment/annotations/train.json"
)

In [5]:
# get images and annotations in one dataframe
image_annotation_df = None
with open(target_caption_dataset_filename) as f:
    # load caption dataset
    caption_dataset_json = json.load(f)

    # combine image files and annotations
    images_df = pd.DataFrame.from_dict(caption_dataset_json["images"])
    annotations_df = pd.DataFrame.from_dict(caption_dataset_json["annotations"])
    grouped_annotations = (
        annotations_df.groupby(["image_id"]).agg(tuple).map(list).reset_index()
    )
    image_annotation_df = images_df.merge(
        grouped_annotations[["image_id", "caption", "is_precanned", "is_rejected"]],
        left_on="id",
        right_on="image_id",
    )

    # vizwiz_url is broken, so fix with https://vizwiz.cs.colorado.edu/*
    image_annotation_df["vizwiz_url"] = image_annotation_df["vizwiz_url"].apply(
        lambda x: x.replace(
            "https://ivc.ischool.utexas.edu/", "https://vizwiz.cs.colorado.edu/"
        )
    )


image_annotation_df

Unnamed: 0,file_name,vizwiz_url,id,text_detected,image_id,caption,is_precanned,is_rejected
0,VizWiz_train_00000000.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,0,True,0,[ITS IS A BASIL LEAVES CONTAINER ITS CONTAINS ...,"[False, False, True, False, False]","[False, False, True, False, False]"
1,VizWiz_train_00000001.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,1,True,1,[A can of Coca Cola on a counter is shown for ...,"[False, False, False, False, False]","[False, False, False, False, False]"
2,VizWiz_train_00000002.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,2,True,2,[A can of crushed tomatoes are on a brown surf...,"[False, False, False, False, False]","[False, False, False, False, False]"
3,VizWiz_train_00000003.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,3,True,3,[A white screen with a captcha that needs to b...,"[False, False, False, False, False]","[False, False, False, False, False]"
4,VizWiz_train_00000004.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,4,True,4,[A box for a garden light rests in someone's l...,"[False, False, False, False, False]","[False, False, False, False, False]"
...,...,...,...,...,...,...,...,...
23426,VizWiz_train_00023426.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,23426,True,23426,[A PlayStation 2 video game that is for the sp...,"[False, False, False, False, False]","[False, False, False, False, False]"
23427,VizWiz_train_00023427.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,23427,False,23427,[A folded paper currency with a depiction of a...,"[False, True, False, False, True]","[False, False, False, False, False]"
23428,VizWiz_train_00023428.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,23428,True,23428,[A black and white sheet of paper with font le...,"[False, False, False, False, False]","[False, False, False, False, False]"
23429,VizWiz_train_00023429.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,23429,True,23429,[A bottle of Healthy Hair Skin & Nails capsule...,"[False, False, False, False, False]","[False, False, False, False, False]"


In [31]:
# get image quality
image_quality_annotation_df = None
with open(target_image_quality_dataset_filename) as f:
    # load image quality annotation dataset
    image_quality_dataset_json = json.load(f)
    image_quality_df = pd.DataFrame.from_dict(image_quality_dataset_json)

    # expand object of flaws into individual columns and rename
    image_quality_df = pd.concat(
        [
            image_quality_df.drop(["flaws"], axis=1),
            pd.json_normalize(image_quality_df["flaws"]),
        ],
        axis=1,
    )
    image_quality_df.rename(
        columns={
            "FRM": "framing",
            "BLR": "blur",
            "DRK": "too dark",
            "BRT": "too bright",
            "OBS": "obstruction",
            "OTH": "other",
            "NON": "no issue",
            "ROT": "rotation",
        },
        inplace=True,
    )

image_quality_df

Unnamed: 0,image,unrecognizable,framing,blur,too dark,too bright,obstruction,other,no issue,rotation
0,VizWiz_train_00022585.jpg,4,1,5,0,3,0,0,0,0
1,VizWiz_train_00004143.jpg,1,4,3,0,0,0,0,1,0
2,VizWiz_train_00002092.jpg,5,0,3,2,0,1,2,0,0
3,VizWiz_train_00008936.jpg,1,4,4,0,0,1,0,0,0
4,VizWiz_train_00020927.jpg,0,0,2,0,0,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...
23426,VizWiz_train_00020740.jpg,2,3,3,1,1,1,0,1,0
23427,VizWiz_train_00014487.jpg,0,0,1,0,0,0,0,1,4
23428,VizWiz_train_00005846.jpg,1,4,0,2,0,0,0,1,0
23429,VizWiz_train_00017677.jpg,0,3,1,0,0,0,1,1,0


In [32]:
# combine image and quality datasets together
image_captioning_input = image_annotation_df.merge(
    image_quality_df, left_on="file_name", right_on="image"
).drop(["image"], axis=1)

image_captioning_input

Unnamed: 0,file_name,vizwiz_url,id,text_detected,image_id,caption,is_precanned,is_rejected,unrecognizable,framing,blur,too dark,too bright,obstruction,other,no issue,rotation
0,VizWiz_train_00000000.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,0,True,0,[ITS IS A BASIL LEAVES CONTAINER ITS CONTAINS ...,"[False, False, True, False, False]","[False, False, True, False, False]",1,3,1,0,0,0,1,1,0
1,VizWiz_train_00000001.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,1,True,1,[A can of Coca Cola on a counter is shown for ...,"[False, False, False, False, False]","[False, False, False, False, False]",0,0,5,0,0,0,0,0,0
2,VizWiz_train_00000002.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,2,True,2,[A can of crushed tomatoes are on a brown surf...,"[False, False, False, False, False]","[False, False, False, False, False]",0,0,0,0,0,0,0,5,0
3,VizWiz_train_00000003.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,3,True,3,[A white screen with a captcha that needs to b...,"[False, False, False, False, False]","[False, False, False, False, False]",0,0,0,0,0,0,0,5,0
4,VizWiz_train_00000004.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,4,True,4,[A box for a garden light rests in someone's l...,"[False, False, False, False, False]","[False, False, False, False, False]",0,3,0,0,0,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23426,VizWiz_train_00023426.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,23426,True,23426,[A PlayStation 2 video game that is for the sp...,"[False, False, False, False, False]","[False, False, False, False, False]",0,0,1,0,0,0,0,4,0
23427,VizWiz_train_00023427.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,23427,False,23427,[A folded paper currency with a depiction of a...,"[False, True, False, False, True]","[False, False, False, False, False]",2,4,5,0,2,0,0,0,0
23428,VizWiz_train_00023428.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,23428,True,23428,[A black and white sheet of paper with font le...,"[False, False, False, False, False]","[False, False, False, False, False]",0,4,0,1,0,0,0,0,1
23429,VizWiz_train_00023429.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,23429,True,23429,[A bottle of Healthy Hair Skin & Nails capsule...,"[False, False, False, False, False]","[False, False, False, False, False]",0,0,0,0,0,0,0,5,0


In [33]:
image_captioning_input[image_captioning_input["blur"] == 1][::-10]

Unnamed: 0,file_name,vizwiz_url,id,text_detected,image_id,caption,is_precanned,is_rejected,unrecognizable,framing,blur,too dark,too bright,obstruction,other,no issue,rotation
23426,VizWiz_train_00023426.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,23426,True,23426,[A PlayStation 2 video game that is for the sp...,"[False, False, False, False, False]","[False, False, False, False, False]",0,0,1,0,0,0,0,4,0
23381,VizWiz_train_00023381.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,23381,True,23381,[A box of Aunt Jennie Original Waffle Mix is o...,"[False, False, False, False, False]","[False, False, False, False, False]",0,4,1,0,0,0,0,1,0
23329,VizWiz_train_00023329.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,23329,False,23329,[A brown dresser with a long mirror sitting on...,"[False, False, False, False, False]","[False, False, False, False, False]",0,0,1,0,0,0,0,4,0
23297,VizWiz_train_00023297.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,23297,False,23297,[Quality issues are too severe to recognize vi...,"[True, True, True, True, True]","[False, False, False, False, False]",5,4,1,3,0,1,0,0,0
23231,VizWiz_train_00023231.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,23231,True,23231,[a seasoning in a Italian style was stored in ...,"[False, False, False, False, False]","[False, False, False, False, False]",0,1,1,0,0,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,VizWiz_train_00000237.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,237,True,237,[Quality issues are too severe to recognize vi...,"[True, False, True, False, False]","[False, False, False, False, False]",2,4,1,0,0,1,0,0,4
189,VizWiz_train_00000189.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,189,True,189,"[Appears to be a picture of a flyer, Package ...","[False, False, False, False, False]","[False, False, False, False, False]",0,4,1,0,0,0,0,1,3
128,VizWiz_train_00000128.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,128,True,128,[Close up of white and grey computer keyboard ...,"[False, False, False, False, False]","[False, False, False, False, False]",0,2,1,0,0,0,0,3,0
52,VizWiz_train_00000052.jpg,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...,52,True,52,"[A package of USDA Choice Beef Sirloin Steaks,...","[False, False, False, False, False]","[False, False, False, False, False]",0,2,1,0,0,0,0,2,0


In [34]:
# filter input for only blurred images
filtered_images_df = image_captioning_input[image_captioning_input["blur"] >= 3]
filtered_images_df["model_caption"] = ""

dataset_to_caption = filtered_images_df.to_dict("records")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_images_df["model_caption"] = ""


In [None]:
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)



Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]



In [36]:
# model prompt
system_prompt = {
    "role": "system",
    "content": "You are a program designed to help blind and low-vision users understand images. When asked about the image, generate accessible image description that includes key visual and contextual details of the image for blind and low-vision people. Focus on the following principles: Clarity and Conciseness: Use simple, straightforward language to describe the main subjects and their relationships.; Relevance: Highlight only essential visual elements that contribute to understanding the image or its purpose.; Context: Provide contextual information when necessary, such as emotional tone, setting, or action. Avoid assumptions or subjective interpretations.; Specificity: Include important details like colors, shapes, textures, or text visible in the image, if relevant. Avoid overly general terms or unnecessary details. Once you generate your caption, shorten it to a succinct, single sentence. Output only the final sentence.",
}

for index, row in enumerate(tqdm(dataset_to_caption[0:1])):
    # get image for current annotation
    image_file = os.path.join(image_folder, dataset_to_caption[index]["file_name"])
    image = Image.open(image_file)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                system_prompt,
                {
                    "type": "text",
                    "text": "Can you please tell me what is in this image?",
                },
            ],
        }
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(
        image, input_text, add_special_tokens=False, return_tensors="pt"
    ).to(model.device)

    output = model.generate(
        **inputs,
        generation_config=GenerationConfig(
            max_new_tokens=200,
            stop_strings="<|endoftext|>",
            use_cache=False,
            temperature=1.0,
            do_sample=True,
        ),
        tokenizer=processor.tokenizer,
    )

    # only get generated tokens; decode them to text
    generated_tokens = output[0, inputs["input_ids"].size(1) :]
    generated_text = processor.tokenizer.decode(
        generated_tokens, skip_special_tokens=True
    )
    dataset_to_caption[index]["model_caption"] = generated_text

    # write file if 50 rows have been processed
    if (index + 1) % 50 == 0 or index == len(dataset_to_caption) - 1:
        with open(
            "./data/labeled-data/labeled-data_{}.csv".format(index + 1),
            "w",
            encoding="utf8",
            newline="",
        ) as output_file:
            fc = csv.DictWriter(
                output_file,
                fieldnames=dataset_to_caption[0].keys(),
            )
            fc.writeheader()
            fc.writerows(dataset_to_caption)

  0%|          | 0/1 [00:00<?, ?it/s]

There is a black countertop, beige cabinets, a stainless-steel coffee pot, a cream mug, and several coffee tins. Also, the silver coffee pot lid has been removed to show the black coffee maker inside. Additionally, there is a


In [None]:
dataset_to_caption_fixed = None
with('../data/labeled-data/labeled-data_2300.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        dataset_to_caption_fixed.append(myClass(row[0], row[1], row[2:]))

    # # separate captions into separate columns
    # for index, item in enumerate(dataset_to_caption[2:]):
    #     dataset_to_caption[index]["human_caption_1"] = item["caption"][0]
    #     dataset_to_caption[index]["human_caption_2"] = item["caption"][1]
    #     dataset_to_caption[index]["human_caption_3"] = item["caption"][2]
    #     dataset_to_caption[index]["human_caption_4"] = item["caption"][3]
    #     dataset_to_caption[index]["human_caption_5"] = item["caption"][4]
    #     del dataset_to_caption[index]["caption"]

In [None]:
dataset_to_caption