In [3]:
import sys
sys.path.append("..")

In [5]:
from artseek.utils.dirutils import get_data_dir, get_fonts_dir
from artseek.data.datasets.processing import FragmentCreator
from langchain_core.messages import SystemMessage, HumanMessage
from datasets import load_from_disk
import matplotlib.pyplot as plt
from artseek.method.generate import Qwen2_5_VLChatModel
import torch
from tqdm import tqdm
import io
from PIL import Image
from colpali_engine.models import ColQwen2, ColQwen2Processor

In [7]:
# Load the saved 10,000-example sample from the project data directory.
dataset_dir = get_data_dir() / "wikifragments_visual_arts_dataset_embeds_sample_10000_question_answer_embeds"
ds = load_from_disk(dataset_dir)

In [8]:
# Peek at the first record to see which fields the dataset provides.
ds[0]

{'id': 12526788,
 'title': 'Bornö Marine Research Station',
 'text': 'Bornö Marine Research Station, owned by the Bornö Institute for Ocean and Climate Studies, is located at <a href="Holma%2C%20Lysekil%20Municipality">Holma</a> on the island <a href="Stora%20Born%C3%B6">Stora Bornö</a> in <a href="Gullmarsfjorden">Gullmarsfjorden</a>, about north of <a href="Gothenburg">Gothenburg</a>, Sweden. It was built in 1902 by and , both pioneers of Swedish marine research. The island has been considered by many Swedes to be the birthplace of Swedish <a href="oceanography">oceanography</a>.',
 'url': 'https://en.wikipedia.org/wiki?curid=49043045',
 'wiki_id': 49043045,
 'paragraph_id': 0,
 'images': {'caption': [], 'image': [], 'type': [], 'url': []},
 'fragment': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1654x40>,
 'old_fragment': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1654x175>,
 'old_images': {'caption': ['Bornö Marine Research Station'],
  'image': [{'bytes': b'\xff

In [7]:
# Load the ColQwen2 model (bfloat16, automatic device placement) in eval mode,
# together with the processor from the same checkpoint.
colqwen_checkpoint = "vidore/colqwen2-v1.0"
model = ColQwen2.from_pretrained(
    colqwen_checkpoint,
    torch_dtype=torch.bfloat16,
    device_map="auto",
).eval()
processor = ColQwen2Processor.from_pretrained(colqwen_checkpoint)

Downloading shards: 100%|██████████| 2/2 [07:07<00:00, 213.88s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.10s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [9]:
# Helper used below (via `example_to_image`) to render a dataset example as an
# image; configured with Arial at size 24 for text and size 18 for captions.
fragment_creator = FragmentCreator(
    font_path=get_fonts_dir() / "arial.ttf",
    font_size_text=24,
    font_size_caption=18,
)

In [11]:
# Inspect the first record again (same lookup as the earlier preview cell).
ds[0]

{'id': 12526788,
 'title': 'Bornö Marine Research Station',
 'text': 'Bornö Marine Research Station, owned by the Bornö Institute for Ocean and Climate Studies, is located at <a href="Holma%2C%20Lysekil%20Municipality">Holma</a> on the island <a href="Stora%20Born%C3%B6">Stora Bornö</a> in <a href="Gullmarsfjorden">Gullmarsfjorden</a>, about north of <a href="Gothenburg">Gothenburg</a>, Sweden. It was built in 1902 by and , both pioneers of Swedish marine research. The island has been considered by many Swedes to be the birthplace of Swedish <a href="oceanography">oceanography</a>.',
 'url': 'https://en.wikipedia.org/wiki?curid=49043045',
 'wiki_id': 49043045,
 'paragraph_id': 0,
 'images': {'caption': [], 'image': [], 'type': [], 'url': []},
 'fragment': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1654x40>,
 'old_fragment': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1654x175>,
 'old_images': {'caption': ['Bornö Marine Research Station'],
  'image': [{'bytes': b'\xff

In [12]:
# Render and save one fragment for the first image-free example whose text
# mentions "Leonardo"; the file is written to images/<id>.jpg.
import os

# exist_ok=True creates the directory only when missing, without a separate
# existence check.
os.makedirs("images", exist_ok=True)

for x in tqdm(ds):
    # Run the cheap filters first so a fragment is rendered only for the one
    # example that is actually saved, not for every image-free example.
    if len(x["images"]["image"]) != 0 or "Leonardo" not in x["text"]:
        continue
    fragment: Image.Image = fragment_creator.example_to_image(x)
    # Convert to RGB before writing the JPEG file.
    image = fragment.convert("RGB")
    image.save(f"images/{x['id']}.jpg")
    break

  2%|▏         | 242/10000 [00:25<17:05,  9.51it/s]


In [None]:
def make_eval_fragment(example):
    """Re-render an example's fragment without its first image.

    The current ``fragment`` and ``images`` values are kept under
    ``old_fragment`` and ``old_images``. Every list in ``images`` then loses
    its first entry, and ``fragment`` is regenerated from the updated example
    with the notebook-level ``fragment_creator``.

    The example is modified in place and also returned.
    """
    original_images = example["images"]
    example["old_fragment"] = example["fragment"]
    example["old_images"] = original_images

    # Drop the first entry of every per-image field (caption, image, ...).
    trimmed_images = {}
    for field, values in original_images.items():
        trimmed_images[field] = values[1:]
    example["images"] = trimmed_images

    # Must run after `images` is trimmed so the new fragment reflects it.
    example["fragment"] = fragment_creator.example_to_image(example)
    return example

# Apply make_eval_fragment to every example using 8 worker processes.
# NOTE: `ds` is rebound to the result, so re-running this cell applies the
# transformation again to the already-transformed dataset.
ds = ds.map(function=make_eval_fragment, num_proc=8)

In [None]:
# Display the first example's current fragment image.
ds[0]["fragment"]

In [None]:
# Display the fragment that make_eval_fragment preserved under `old_fragment`.
ds[0]["old_fragment"]

In [None]:
# Load the AWQ-quantised Qwen2.5-VL 7B instruct chat model used for question
# generation. This rebinds `model`, which the ColQwen2 cell also assigns.
# NOTE(review): AWQ checkpoints are commonly run in float16 - confirm that
# bfloat16 is intended and supported by Qwen2_5_VLChatModel.
qwen_load_options = {
    "torch_dtype": torch.bfloat16,
    "attn_implementation": "flash_attention_2",
    "device_map": "sequential",
}
model = Qwen2_5_VLChatModel.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct-AWQ",
    **qwen_load_options,
)

In [None]:
# Index of the dataset example used by the prompt and display cells (`ds[z]`).
z = 0

In [None]:
# System prompt for question generation: the model receives a query image and a
# document image and must answer with a JSON object holding "question" and "answer".
system_prompt = """You are a helpful assistant.

# Task  
You are given a **query image** and a **document image**. Your goal is to generate a **question about the query image** that can be used to test whether a system can correctly retrieve the associated document image based on the query.

- The question **must include the phrase "this image"**.  
- **Do not mention or reference the document image** in the question.  
- The **answer to the question must be found within the document image**.

# Format  
Return your response as a JSON object in the following format:

{
  "question": "Your generated question here.",
  "answer": "The correct answer found in the document image."
}
"""

# Build the chat request for example `z` and ask the model for a question/answer pair.
# Fetch the record once: each `ds[z]` lookup materialises the whole record
# (including its images), so repeated lookups redo that work.
example = ds[z]
query_caption = example["old_images"]["caption"][0]
# The first original image is stored as raw bytes; decode it into a PIL image.
query_image = Image.open(io.BytesIO(example["old_images"]["image"][0]["bytes"]))

messages = [
    SystemMessage(
        content=[{"type": "text", "text": system_prompt}],
    ),
    HumanMessage(
        content=[
            {"type": "text", "text": "Query image:"},
            {"type": "image"},
            {"type": "text", "text": f"Query caption: {query_caption}"},
            {"type": "text", "text": "Document image:"},
            {"type": "image"},
        ]
    ),
]
# Image order matches the {"type": "image"} placeholders above: query image
# first, then the document fragment.
images = [query_image, example["fragment"]]
response = model.invoke(messages, images=images)

In [None]:
# Show the query image, the document fragment and the query caption for example `z`.
example = ds[z]  # look the record up once instead of once per displayed item
query_image = Image.open(io.BytesIO(example["old_images"]["image"][0]["bytes"]))
# One display() call with several arguments renders each of them in order.
display(query_image, example["fragment"])
print(example["old_images"]["caption"][0])

In [None]:
# Print the raw text of the model's reply (the prompt asks for a JSON object).
print(response.content)