### Libraries and device

In [1]:
import warnings
# Installed before the library imports below so that warnings raised while importing them are suppressed as well.
# NOTE(review): this ignores every warning category for the rest of the kernel session, deprecation notices included.
warnings.filterwarnings("ignore")

from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
import torch
import os
from qwen_vl_utils import process_vision_info

from util import preprocess_image

# Pick the compute device once: use CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"[INFO] Using device: {device}")

[INFO] Using device: cuda


### Model initialization

In [None]:
# Load Qwen model and processor
#
# The processor is read from a local folder, while the model weights are
# resolved from the model id below.
qwen_model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
qwen_processor_folder = os.path.join("..", "assets", "qwen_processor")

print("[INFO] Loading Qwen processor...")
# NOTE(review): trust_remote_code=True permits running Python code shipped with the
# processor files; confirm it is actually required for this locally stored processor.
qwen_processor = AutoProcessor.from_pretrained(qwen_processor_folder, trust_remote_code=True, use_fast=True, verbose=False)
print("[INFO] Qwen processor loaded.\n")

# Half precision is kept for the GPU path only. The notebook can fall back to the
# CPU, where float16 is slow and not every operator is guaranteed to be available,
# so full precision is used there instead.
qwen_model_dtype = torch.float16 if device == "cuda" else torch.float32

print("[INFO] Loading Qwen model...")
qwen_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    qwen_model_name,
    torch_dtype=qwen_model_dtype,
)
qwen_model.to(device)
print("[INFO] Qwen model loaded.")

[INFO] Loading Qwen processor...
[INFO] Qwen processor loaded.

[INFO] Loading Qwen model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Qwen model loaded.


### Inference

In [3]:
# Image setup
# The test picture path and image_size are handed to the project's preprocess_image helper.
image_size = 384
image_path = os.path.join("..", "test_images", "casteldelmonte.jpg")
image = preprocess_image(image_path, image_size)


# Prepare inputs
# A single user turn: the image entry first, then the text instruction.
user_content = [
    {"type": "image", "image": image},
    {"type": "text", "text": "Describe this image."},
]
messages = [{"role": "user", "content": user_content}]

# Render the conversation as a prompt string (tokenize=False) with the generation prompt appended.
text = qwen_processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
image_inputs, video_inputs = process_vision_info(messages)

# Batch of one prompt, returned as PyTorch tensors and then moved to the selected device.
inputs = qwen_processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to(device)


# Inference
print("[INFO] Generating...")
with torch.no_grad():  # no gradient tracking while generating
    outputs = qwen_model.generate(max_new_tokens=128, **inputs)

# For each returned sequence, drop the leading len(prompt) ids so that only the
# part after the prompt is decoded.
generated_ids_trimmed = []
for prompt_ids, sequence_ids in zip(inputs["input_ids"], outputs):
    generated_ids_trimmed.append(sequence_ids[len(prompt_ids):])

output_text = qwen_processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)

# Only one prompt was submitted, so the first decoded entry is the answer.
print("\n--- Output ---")
print(output_text[0])

[INFO] Generating...

--- Output ---
The image depicts a large, historic castle with a distinctive octagonal shape and multiple towers. The castle is constructed from stone and features a central entrance with a large, arched doorway. Surrounding the entrance are several smaller windows and turrets. The castle is situated in a lush, green landscape with rolling hills and dense forests in the background. The ground around the castle is paved, and there are a few people visible near the entrance, suggesting that the castle might be open to visitors or tourists. The overall scene conveys a sense of historical significance and natural beauty.
