In [4]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Single checkpoint id shared by the model weights and the processor.
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

# Load the weights (torch_dtype="auto" leaves the dtype choice to transformers),
# then move the model onto the GPU as a separate step.
model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")
model = model.to("cuda")

# Alternative load path: flash_attention_2 is recommended for better speed and
# lower memory use, especially with multiple images or video.
# NOTE: this variant references `torch`, which must be imported before enabling it.
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     MODEL_ID,
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# Processor with its default settings.
processor = AutoProcessor.from_pretrained(MODEL_ID)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.76it/s]


In [21]:
# Visual-token budget: by default the model uses between 4 and 16384 visual tokens
# per image. To trade quality for speed/memory, pass min_pixels / max_pixels to the
# processor (e.g. a 256-1280 token range):
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

# A single user turn: the demo image followed by the question-generation instruction.
image_part = {
    "type": "image",
    "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
}
instruction_part = {
    "type": "text",
    "text": """Generate 20 simple questions about the image for image inferencing or object detection that is clearly visible in the image for other visual language models to infer, give me only the questions""",
}
messages = [{"role": "user", "content": [image_part, instruction_part]}]

# Render the chat prompt as a string and collect the vision inputs the messages refer to.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)

# Build the model inputs and place them on the same device as the model.
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to("cuda")

# Generate, then drop the leading prompt tokens from every returned sequence so
# that only the newly generated continuation is decoded.
generated_ids = model.generate(**inputs, max_new_tokens=1000)
generated_ids_trimmed = []
for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids):
    generated_ids_trimmed.append(full_ids[len(prompt_ids) :])

output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(output_text)


["1. What is the woman doing?\n2. What is the dog doing?\n3. What is the dog wearing?\n4. What is the woman wearing?\n5. What is the weather like?\n6. What is the time of day?\n7. What is the dog's breed?\n8. What is the woman's hairstyle?\n9. What is the woman's shoe?\n10. What is the dog's leash?\n11. What is the dog's collar?\n12. What is the dog's harness?\n13. What is the dog's tail?\n14. What is the dog's front paw?\n15. What is the dog's back paw?\n16. What is the dog's body posture?\n17. What is the dog's facial expression?\n18. What is the dog's tail color?\n19. What is the dog's fur color?\n20. What is the dog's breed?"]


In [22]:
# Collect the text of every 'text' entry in the first message's content list and
# keep the last one (None when the message carries no text entry).
text_entries = [
    part['text'] for part in messages[0]['content'] if part['type'] == 'text'
]
text_prompt = text_entries[-1] if text_entries else None

print(text_prompt)

Generate 20 simple questions about the image for image inferencing or object detection that is clearly visible in the image for other visual language models to infer, give me only the questions


In [23]:
# Second pass: feed the questions generated by the previous inference cell back to
# the model, together with the same image, so that it answers them.
#
# Visual-token budget: by default the model uses between 4 and 16384 visual tokens
# per image. To trade quality for speed/memory, pass min_pixels / max_pixels to the
# processor (e.g. a 256-1280 token range):
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

# `output_text` is the list of decoded strings from the previous generation;
# flatten it into one prompt string.
output = " ".join(output_text)

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            },
            # Pass the prompt as a plain string. Writing `{output}` here would build
            # a one-element set rather than a string, so the message would not
            # carry the questions as ordinary text.
            {"type": "text", "text": output},
        ],
    }
]

# Preparation for inference: render the chat prompt as a string and collect the
# vision inputs referenced by the messages.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Move the inputs onto the same device the model was placed on.
inputs = inputs.to("cuda")

# Inference: generate, then strip the leading prompt tokens from each returned
# sequence so only the newly generated answer is decoded.
generated_ids = model.generate(**inputs, max_new_tokens=1000)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)


["1. The woman is sitting on the beach, smiling and playing with the dog.\n2. The dog is sitting on the beach, with its front paw raised in a high-five gesture.\n3. The dog is wearing a harness.\n4. The woman is wearing a plaid shirt and black pants.\n5. The weather appears to be sunny and warm, with a clear sky.\n6. It is likely early morning or late afternoon, as the sun is low in the sky.\n7. The dog's breed is not clearly identifiable from the image.\n8. The woman's hairstyle is not visible in the image.\n9. The woman is not wearing shoes.\n10. The dog's leash is red.\n11. The dog's collar is not visible in the image.\n12. The dog's harness is blue and red.\n13. The dog's tail is long and bushy.\n14. The dog's front paw is raised in the air.\n15. The dog's back paw is not visible in the image.\n16. The dog's body posture is relaxed and playful.\n17. The dog's facial expression is happy and playful.\n18. The dog's tail is a light brown color.\n19. The dog's fur color is light brown.