# Testing QWEN2.5 VL on DocVQA 

In [2]:
import codecs, json, re, os, base64
import pandas as pd

from io import BytesIO
from PIL import Image
from IPython.display import HTML, display
from typing import Any
from datasets import load_dataset, Dataset

ds = load_dataset("lmms-lab/DocVQA", "DocVQA")

In [3]:
# Get dataset 
train = ds['validation']

In [4]:
def convert_to_base64(pil_image):
    """
    Convert PIL images to Base64 encoded strings
    :param pil_image: PIL image
    :return: Re-sized Base64 string
    """

    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return img_str

def plt_img_base64(img_base64):
    """
    Display base64 encoded string as image
    :param img_base64: Base64 string
    """
    #create an HTML img tag w/ base64 string as the source
    image_html = f'<img src="data:image/jpeg;base64,{img_base64}"/>'
    #display the image by rendering HTML
    display(HTML(image_html))

In [5]:
import torch 
import torch.nn as nn 
device = torch.device("cuda")

import transformers
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

2025-04-28 08:29:22.531231: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-28 08:29:22.542578: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745850562.556172   17373 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745850562.559999   17373 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745850562.570405   17373 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [6]:
# Get model from pretrained checkpoint
checkpoint = "Qwen/Qwen2.5-VL-7B-Instruct"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", device_map="auto")
processor = AutoProcessor.from_pretrained(checkpoint)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [7]:
def inference(image, prompt, sys_prompt="You are a helpful assistant.", max_new_tokens=150, return_input=False):
    ''' Method will take a prompt and image as input for the model.
    '''
    try:
        messages = [
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": [
                    {"type": "text", "text": prompt},
                    {"image": image},
                ]
            },
        ]
    
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        print("text:", text)
        # image_inputs, video_inputs = process_vision_info([messages])
        inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
        inputs = inputs.to('cuda')

        output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
        generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
        output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        if return_input:
            return output_text[0], inputs
        else:
            return output_text[0]
    except: 
        return ""

In [None]:
train_answers = []
for i in train:
    try:
        prompt = f'''Use the image provided to answer the following question: {i["question"]}. Keep the answer concise and limited to only words found within the document.'''
        response = inference(
            image=i["image"],
            prompt=prompt
        )
        train_answers.append({"gt": i["answers"], "pred": response})
    except: 
        continue
    

1
text: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Use the image provided to answer the following question: What is the ‘actual’ value per 1000, during the year 1975?. Keep the answer concise and limited to only words found within the document.<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant

2
text: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Use the image provided to answer the following question: What is name of university?. Keep the answer concise and limited to only words found within the document.<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant

3
text: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Use the image provided to answer the following question: What is the name of the company?. Keep the answer concise and limited to only words found within the document.<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant

In [18]:
train_answers[1]

{'gt': ['university of california',
  'University of California',
  'university of california, san diego'],
 'pred': 'UNIVERSITY OF CALIFORNIA, SAN DIEGO'}

# Evaluation using F1 

In [None]:
def eval(answers):
    TP = 0
    FN = 0
    for i in answers: 
        gt = [item.lower() for item in i["gt"]]
        pred = i["pred"].lower()
        if pred in gt:
            TP += 1
        else: 
            FN += 1

    F1 = TP / (TP + (0.5 * FN))
    return F1 

In [24]:
print("Training Evaluation: ")
print(eval(train_answers))

Training Evaluation: 
0.9207021791767555


# Examples

In [9]:
train[2]

{'questionId': '57349',
 'question': 'What is the name of the company?',
 'question_types': ['layout'],
 'image': <PIL.PngImagePlugin.PngImageFile image mode=L size=1701x2386>,
 'docId': 4733,
 'ucsf_document_id': 'snbx0223',
 'ucsf_document_page_no': '22',
 'answers': ['itc limited', 'ITC Limited'],
 'data_split': 'val'}

In [10]:
b64_image = convert_to_base64(train[2]["image"])
plt_img_base64(b64_image)

In [12]:
import time 
start = time.time()
prompt = f'''Use the image provided to answer the following question: {train[2]}. Keep the answer concise and limited to only words found within the document.'''
response = inference(
        image=train[2]["image"],
        prompt=prompt
        )
print(response)
end = time.time()
print("Inference Runtime: ", end-start, " seconds")

text: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Use the image provided to answer the following question: {'questionId': '57349', 'question': 'What is the name of the company?', 'question_types': ['layout'], 'image': <PIL.PngImagePlugin.PngImageFile image mode=L size=1701x2386 at 0x7F3ADE482A40>, 'docId': 4733, 'ucsf_document_id': 'snbx0223', 'ucsf_document_page_no': '22', 'answers': ['itc limited', 'ITC Limited'], 'data_split': 'val'}. Keep the answer concise and limited to only words found within the document.<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant

ITC Limited
Inference Runtime:  1.6693525314331055  seconds


In [14]:
train[10]

{'questionId': '49172',
 'question': 'Who is ‘presiding’ TRRF GENERAL SESSION (PART 1)?',
 'question_types': ['table/list'],
 'image': <PIL.PngImagePlugin.PngImageFile image mode=L size=850x1644>,
 'docId': 14281,
 'ucsf_document_id': 'txpp0227',
 'ucsf_document_page_no': '10',
 'answers': ['TRRF Vice President', 'lee a. waller'],
 'data_split': 'val'}

In [16]:
b64_image = convert_to_base64(train[10]["image"])
plt_img_base64(b64_image)

In [26]:
import time 
start = time.time()
prompt = f'''Use the image provided to answer the following question: {train[10]}. Keep the answer concise and limited to only words found within the document.'''
response = inference(
        image=train[10]["image"],
        prompt=prompt
        )
print(response)
end = time.time()
print("Inference Runtime: ", end-start, " seconds")

text: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Use the image provided to answer the following question: {'questionId': '49172', 'question': 'Who is ‘presiding’ TRRF GENERAL SESSION (PART 1)?', 'question_types': ['table/list'], 'image': <PIL.PngImagePlugin.PngImageFile image mode=L size=850x1644 at 0x7F3ADE4817E0>, 'docId': 14281, 'ucsf_document_id': 'txpp0227', 'ucsf_document_page_no': '10', 'answers': ['TRRF Vice President', 'lee a. waller'], 'data_split': 'val'}. Keep the answer concise and limited to only words found within the document.<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant

Lee A. Waller
Inference Runtime:  0.3857994079589844  seconds


In [25]:
start = time.time()
prompt = f'''Use the image provided to answer the following question: {train[10]["question"]}. Keep the answer limited to only words found within the document. Output a list of all the possible answers. Here is an example of a possible answer: ["Barbara Walters", "barbara walters", "journalist"].'''
response = inference(
        image=train[10]["image"],
        prompt=prompt
        )
print(response)
end = time.time()
print("Inference Runtime: ", end-start, " seconds")

text: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Use the image provided to answer the following question: Who is ‘presiding’ TRRF GENERAL SESSION (PART 1)?. Keep the answer limited to only words found within the document. Output a list of all the possible answers. Here is an example of a possible answer: ["Barbara Walters", "barbara walters", "journalist"].<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant

["Lee A. Waller", "lee a. waller", "TRRF Vice President"]
Inference Runtime:  0.7587323188781738  seconds
