In [None]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Load the model with torch_dtype="auto" and move it to the GPU. The inference
# cells below send their inputs to "cuda", so the model must live there too.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto"
).to("cuda")

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# NOTE: this variant needs `import torch` (for torch.bfloat16) and replaces the load above.
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2-VL-2B-Instruct",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# Default processor: used below for the chat template, input preparation and decoding.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

In [52]:
# Step 1: ask the model to write 20 simple questions about the demo image.
#
# Optional tuning: by default the model uses between 4 and 16384 visual tokens
# per image. Setting min_pixels / max_pixels on the processor (for instance a
# 256-1280 token range) trades accuracy for speed and memory:
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"},
            {
                "type": "text",
                "text": (
                    "Generate 20 simple questions about the image for image "
                    "inferencing for other visual language models to infer"
                ),
            },
        ],
    }
]

# Render the conversation into the prompt string (not tokenized yet), with the
# generation prompt appended.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Collect the image / video inputs referenced by the messages.
image_inputs, video_inputs = process_vision_info(messages)

# Build the padded PyTorch tensors and move them to the GPU in one go.
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to("cuda")

# Generate, then drop the prompt tokens from the front of every sequence so that
# only the newly generated tokens are decoded.
generated_ids = model.generate(**inputs, max_new_tokens=1000)
generated_ids_trimmed = [
    full_ids[len(prompt_ids):]
    for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(output_text)


["1. What is the woman doing?\n2. What is the dog doing?\n3. What is the setting?\n4. What is the weather like?\n5. What is the time of day?\n6. What is the woman wearing?\n7. What is the dog wearing?\n8. What is the dog's leash?\n9. What is the dog's collar?\n10. What is the dog's breed?\n11. What is the dog's name?\n12. What is the woman's name?\n13. What is the woman's profession?\n14. What is the woman's occupation?\n15. What is the woman's hobby?\n16. What is the woman's profession?\n17. What is the woman's occupation?\n18. What is the woman's hobby?\n19. What is the woman's profession?\n20. What is the woman's occupation?"]


In [32]:
# Pull the text prompt out of the first message's 'content' list.
# If several text entries exist the last one wins; None when there is none.
text_entries = [part['text'] for part in messages[0]['content'] if part['type'] == 'text']
text_prompt = text_entries[-1] if text_entries else None

print(text_prompt)

Generate 10 different questions regarding this image, about object detection ,for other Visual Language Models to answer, 
            give me only the questions, for example: what objects are there in the image


In [53]:
# Step 2: feed the questions generated in the previous inference cell back to the
# model, together with the same image, and let it answer them.
#
# NOTE: this cell reads `output_text` (the generated questions) and then
# overwrites it with the answers, so it is not idempotent. Re-run the
# question-generation cell before running this one again.
output = " ".join(output_text)

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            },
            # Pass the string itself. `{output}` would be a one-element set, which
            # the chat template renders as "{'...'}" with escaped newlines.
            {"type": "text", "text": output},
        ],
    }
]

# Preparation for inference: render the prompt string, gather the vision inputs,
# then build the padded PyTorch tensors.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Inference: generate, then strip the prompt tokens from the front of each
# sequence so only the newly generated answer tokens are decoded.
generated_ids = model.generate(**inputs, max_new_tokens=1000)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)


["1. The woman is sitting on the beach, smiling and playing with a dog.\n2. The dog is sitting on the beach, with its front paws raised in a high-five gesture.\n3. The setting is a beach with the ocean in the background.\n4. The weather appears to be clear and sunny.\n5. The time of day is likely early morning or late afternoon, as the sun is low in the sky.\n6. The woman is wearing a plaid shirt and black pants.\n7. The dog is wearing a harness with a leash attached to it.\n8. The dog's collar is visible around its neck.\n9. The dog's breed is not clearly identifiable from the image.\n10. The dog's name is not visible in the image.\n11. The woman's name is not visible in the image.\n12. The woman's profession is not visible in the image.\n13. The woman's occupation is not visible in the image.\n14. The woman's hobby is not visible in the image.\n15. The woman's profession is not visible in the image.\n16. The woman's occupation is not visible in the image.\n17. The woman's hobby is no