In [1]:

from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
from llava.conversation import conv_templates, SeparatorStyle

from PIL import Image
import requests
import copy
import torch

import sys
import warnings

warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json

dataset_dir = '/mnt/disks/data/webqa/WebQA_train_val.json'
with open(dataset_dir, 'r') as f:
    data = json.load(f)

In [3]:
val_data = {}
for id,dict in data.items():
    if dict['Qcate'] == 'text':
        continue
    elif not dict['split'] == 'val':
        continue
    elif not len(dict['img_posFacts']) > 0:
        continue
    val_data[id] = dict

print(len(val_data))


2511


In [28]:
from io import BytesIO
import base64

lineidx_dir = '/mnt/disks/data/webqa/imgs.lineidx'
images_dir = '/mnt/disks/data/webqa/imgs.tsv'

with open(lineidx_dir, "r") as fp_lineidx:
    lineidx = [int(i.strip()) for i in fp_lineidx.readlines()]

def read_image(image_id):
    with open(images_dir, "r") as fp:
        fp.seek(lineidx[int(image_id)%10000000])
        imgid, img_base64 = fp.readline().strip().split('\t')
    assert int(image_id) == int(imgid), f'{image_id} {imgid}'
    im = Image.open(BytesIO(base64.b64decode(img_base64)))
    return im


In [26]:
def get_multihop_prompt(ques, titles, captions):
    question = f"{DEFAULT_IMAGE_TOKEN} This is the first image.\nTitle: {titles[0]}\nCaption: {captions[0]}\n\nNow, let's look at another image: {DEFAULT_IMAGE_TOKEN} Title: {titles[1]}\nCaption: {captions[1]}\nRespond with the correct answer to the following question in a complete sentence. Question: {ques}"
    return question

def get_si_prompt(ques, titles, captions):
    question = f"{DEFAULT_IMAGE_TOKEN} You have to answer a question about this image\nTitle: {titles[0]}\nCaption: {captions[0]}\nRespond with the correct answer to the following question in a complete sentence. Question: {ques}"
    return question

def get_answer(data, model, image_processor, tokenizer, max_len, device):
    ques = data['Q']
    images = []
    titles = []
    captions = []
    for img in data['img_posFacts']:
        images.append(read_image(img['image_id']))
        titles.append(img['title'])
        captions.append(img['caption'])
    if len(images) == 2:
        prompt = get_multihop_prompt(ques, titles, captions)
    else:
        prompt = get_si_prompt(ques, titles, captions)
    image_tensors = process_images(images, image_processor, model.config)
    image_tensors = [_image.to(dtype=torch.float16, device=device) for _image in image_tensors]
    
    conv_template = "qwen_1_5"
    conv = copy.deepcopy(conv_templates[conv_template])
    conv.append_message(conv.roles[0], prompt)
    conv.append_message(conv.roles[1], None)
    prompt_question = conv.get_prompt()

    input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
    image_sizes = [image.size for image in images]
    print(prompt_question)
    # Generate response
    cont = model.generate(
        input_ids,
        images=image_tensors,
        image_sizes=image_sizes,
        do_sample=False,
        temperature=0,
        max_new_tokens=4096,
    )
    text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)
    return text_outputs[0]

In [18]:
count = 0
for id, d in val_data.items():
    if len(d['img_posFacts']) > 1:
        count += 1
        print(d)
        if count > 2:
            break

{'Q': '"On which organ, are more round visible stop knobs installed; Western Cottage pump organ or Wurlitzer Model 44 Electrostatic Reed Organ?"', 'A': ['"The Western Cottage pump organ has more round visible stop knobs installed compared to the Wurlitzer Model 44 Electrostatic Reed Organ."'], 'topic': 'keyboard instruments', 'split': 'val', 'Qcate': 'choose', 'Guid': 'd5bbdefc0dba11ecb1e81171463288e9', 'img_posFacts': [{'image_id': 30086430, 'title': 'Western Cottage pump organ', 'caption': 'Western Cottage pump organ', 'url': 'https://commons.wikimedia.org/wiki/File:Western_Cottage_pump_organ.jpg', 'imgUrl': 'https://upload.wikimedia.org/wikipedia/commons/thumb/6/69/Western_Cottage_pump_organ.jpg/800px-Western_Cottage_pump_organ.jpg'}, {'image_id': 30077357, 'title': 'Wurlitzer Model 44 Electrostatic Reed Organ.png', 'caption': 'Wurlitzer Model 44 Electrostatic Reed Organ.png', 'url': 'https://commons.wikimedia.org/wiki/File:Wurlitzer_Model_44_Electrostatic_Reed_Organ.png', 'imgUrl':

In [8]:
## Load model
# pretrained = "lmms-lab/llava-onevision-qwen2-0.5b-ov"
pretrained = "lmms-lab/llava-onevision-qwen2-7b-ov"
model_name = "llava_qwen"
device = "cuda"
device_map = "auto"
llava_model_args = {
        "multimodal": True,
    }
overwrite_config = {}
overwrite_config["image_aspect_ratio"] = "pad"
llava_model_args["overwrite_config"] = overwrite_config
tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map, load_8bit=True, **llava_model_args)

model.eval()

Loaded LLaVA model: lmms-lab/llava-onevision-qwen2-7b-ov


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are using a model of type llava to instantiate a model of type llava_qwen. This is not supported for all configurations of models and can yield errors.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Overwriting config with {'image_aspect_ratio': 'pad'}


Downloading shards: 100%|██████████| 4/4 [01:14<00:00, 18.59s/it]


Loading vision tower: google/siglip-so400m-patch14-384


Loading checkpoint shards: 100%|██████████| 4/4 [01:08<00:00, 17.23s/it]


Model Class: LlavaQwenForCausalLM


LlavaQwenForCausalLM(
  (model): LlavaQwenModel(
    (embed_tokens): Embedding(151647, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2FlashAttention2(
          (q_proj): Linear8bitLt(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear8bitLt(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear8bitLt(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear8bitLt(in_features=3584, out_features=3584, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear8bitLt(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear8bitLt(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear8bitLt(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
 

In [27]:
count = 0
for id,d in val_data.items():
    count += 1
    ans = get_answer(d, model, image_processor, tokenizer, max_length, device)
    print(f"Ques: {d['Q']}\nAssistant: {ans}\nGT: {d['A'][0]}\n")
    if count == 10:
        break

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<image> You have to answer a question about this image
Title: 2020-05-08 15 17 05 Minnetonka Rhododendron flower along Tranquility Court in the Franklin Farm section of Oak Hill, Fairfax County, Virginia
Caption: 2020-05-08 15 17 05 Minnetonka Rhododendron flower along Tranquility Court in the Franklin Farm section of Oak Hill, Fairfax County, Virginia Minnetonka Rhododendron flower along Tranquility Court in the Franklin Farm section of Oak Hill, Fairfax County, Virginia
 Think deeply and respond with the correct answer to the following question in a complete sentence. Question: "Does a Minnetonka Rhododendron flower have petals in a cup shape?"<|im_end|>
<|im_start|>assistant

Ques: "Does a Minnetonka Rhododendron flower have petals in a cup shape?"
Assistant: Yes, a Minnetonka Rhododendron flower has petals in a cup shape.
GT: "No, a Minnetonka Rhododendron flower does not have petals in a cup shape."

<|im_s

In [19]:
prompt_question

"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<image> This is the first image. Title: Wurlitzer Model 44 Electrostatic Reed Organ\n\nNow, let's look at another image: <image> Title: Wurlitzer Model 44 Electrostatic Reed Organ.png\nQ: On which organ, are more round visible stop knobs installed; Western Cottage pump organ or Wurlitzer Model 44 Electrostatic Reed Organ? Respond with the correct answer in a complete sentence.<|im_end|>\n<|im_start|>assistant\n"