In [None]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Checkpoint identifier shared by the model and its processor.
QWEN_CHECKPOINT = "Qwen/Qwen2-VL-2B-Instruct"

# Default setup: load the model with torch_dtype="auto", then move it to the GPU.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    QWEN_CHECKPOINT,
    torch_dtype="auto",
)
model = model.to("cuda")

# Optional alternative: enabling flash_attention_2 is recommended for better
# acceleration and memory saving, especially in multi-image and video scenarios.
# NOTE: this snippet references `torch`, which is not imported in this cell.
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2-VL-2B-Instruct",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# Default processor for the same checkpoint.
processor = AutoProcessor.from_pretrained(QWEN_CHECKPOINT)

In [4]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering

# Checkpoint identifier shared by the BLIP VQA processor and model.
BLIP_CHECKPOINT = "Salesforce/blip-vqa-base"

# Processor that prepares (image, question) pairs for the BLIP model.
processor_answer = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)

# Question-answering model, moved to the GPU after loading.
model_answer = BlipForQuestionAnswering.from_pretrained(BLIP_CHECKPOINT)
model_answer = model_answer.to("cuda")

In [None]:
# Input image: download it once and decode it into an in-memory RGB PIL image.
image_file = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(image_file, stream=True, timeout=30)
# Fail loudly on HTTP errors instead of handing an error page to PIL.
response.raise_for_status()
# Ask urllib3 to undo any gzip/deflate content encoding before PIL reads the bytes.
response.raw.decode_content = True

# convert("RGB") forces the pixel data to be read now (Image.open alone is lazy)
# and gives downstream code a consistent 3-channel image.
raw_image = Image.open(response.raw).convert("RGB")
raw_image

In [75]:
# Instruction sent to the vision-language model together with the image.
# Plain string literal: the text contains no placeholders, so no f-string is needed.
# The leading space and trailing newline are part of the prompt text.
prompt = """ Given the input image, generate 15 unique and diverse questions that can be answered based on the visual content. The questions should cover a wide range of topics such as the following:

The main subject or objects in the image.
Colors, patterns, and shapes.
The actions or movements taking place.
The spatial relationships between objects or people.
Environmental context (indoor, outdoor, nature, urban, etc.).
Emotions, expressions, or moods conveyed.
Any interactions between objects, people, or elements.
Specific details about the setting or background.
Objects or items in the foreground and background.
The condition or state of any objects or people (e.g., new, old, active, idle).
Make sure to vary the types of questions so they touch on different aspects of the image, and ensure that the questions are easily inferable from the visual content.
Ensure each question only has one question to it.
"""

In [76]:
# Per the upstream note, the model's default range of visual tokens per image is
# 4-16384. min_pixels / max_pixels can be passed to the processor to narrow that
# range (e.g. 256-1280 tokens) and trade quality for speed and memory:
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

# A single user turn holding two content parts: the image reference first,
# then the text instruction.
image_part = {"type": "image", "image": image_file}
text_part = {"type": "text", "text": prompt}

messages = [
    {
        "role": "user",
        "content": [image_part, text_part],
    }
]

# Preparation for inference: render the chat template to a prompt string
# (not tokenized yet, with the generation prompt appended).
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Extract the image/video inputs referenced by `messages`.
image_inputs, video_inputs = process_vision_info(messages)

# Build the padded PyTorch tensors for a batch containing this one prompt.
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)

# Follow the model's own device instead of hard-coding "cuda", so this cell
# keeps working if the model was loaded differently (e.g. device_map="auto").
inputs = inputs.to(model.device)

# Inference: generate at most 1000 new tokens for the prepared inputs.
generated_ids = model.generate(**inputs, max_new_tokens=1000)

# Drop the first len(prompt) tokens of every generated row so that only the
# continuation is decoded.
generated_ids_trimmed = []
for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids):
    generated_ids_trimmed.append(full_ids[len(prompt_ids):])

# Decode to text, skipping special tokens and leaving tokenization spacing untouched.
output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(output_text)


["1. What is the primary subject of the image?\n2. What colors are prominent in the image?\n3. What shapes are present in the image?\n4. What actions are taking place in the image?\n5. What is the spatial relationship between the woman and the dog?\n6. What is the environmental context of the image?\n7. What emotions or expressions are conveyed in the image?\n8. What interactions are taking place between the woman and the dog?\n9. What specific details about the setting or background are visible in the image?\n10. What objects or items are in the foreground and background of the image?\n11. What is the condition or state of any objects or people in the image?\n12. How does the image convey the condition or state of the woman and the dog?\n13. What is the condition or state of the dog in the image?\n14. How does the image convey the condition or state of the woman's mood or expression?\n15. What is the condition or state of the dog's fur or coat in the image?"]


In [77]:
def _strip_numbering(line):
    """Return `line` without a leading "<digits>. " prefix, trimmed of whitespace.

    A line that does not start with a numeric prefix is returned as-is (trimmed),
    so an unnumbered question is neither rejected nor cut at its first ". ".
    """
    prefix, separator, remainder = line.partition(". ")
    if separator and prefix.strip().isdigit():
        return remainder.strip()
    return line.strip()


# output_text is a list of decoded strings; the questions are in the first entry.
questions_string = output_text[0]

# The decoded text normally already contains real newlines (the "\n" seen when
# printing the list is just its repr). This only matters if the model emitted a
# literal backslash followed by "n"; those are turned into real newlines too.
questions_string = questions_string.replace("\\n", "\n")

# Split the string into individual lines.
questions = questions_string.split("\n")

# Remove the "1.", "2.", ... numbering and skip blank / whitespace-only lines.
cleaned_questions = [_strip_numbering(q) for q in questions if q.strip()]

# Print the result
print(cleaned_questions)



['What is the primary subject of the image?', 'What colors are prominent in the image?', 'What shapes are present in the image?', 'What actions are taking place in the image?', 'What is the spatial relationship between the woman and the dog?', 'What is the environmental context of the image?', 'What emotions or expressions are conveyed in the image?', 'What interactions are taking place between the woman and the dog?', 'What specific details about the setting or background are visible in the image?', 'What objects or items are in the foreground and background of the image?', 'What is the condition or state of any objects or people in the image?', 'How does the image convey the condition or state of the woman and the dog?', 'What is the condition or state of the dog in the image?', "How does the image convey the condition or state of the woman's mood or expression?", "What is the condition or state of the dog's fur or coat in the image?"]


In [78]:
# Save the questions to a text file, one per line.
# The encoding is pinned to UTF-8 so that non-ASCII characters in the generated
# text are written the same way on every platform instead of depending on the
# locale's default encoding.
with open('questions.txt', 'w', encoding='utf-8') as file:
    file.writelines(question + '\n' for question in cleaned_questions)

print("Questions have been saved to 'questions.txt'")

Questions have been saved to 'questions.txt'


In [79]:
# Gather the text of every "text" part in the first message's content list and
# keep the last one; text_prompt is None when there is no text part at all.
text_entries = [part['text'] for part in messages[0]['content'] if part['type'] == 'text']
text_prompt = text_entries[-1] if text_entries else None

print(text_prompt)

 Given the input image, generate 15 unique and diverse questions that can be answered based on the visual content. The questions should cover a wide range of topics such as the following:

The main subject or objects in the image.
Colors, patterns, and shapes.
The actions or movements taking place.
The spatial relationships between objects or people.
Environmental context (indoor, outdoor, nature, urban, etc.).
Emotions, expressions, or moods conveyed.
Any interactions between objects, people, or elements.
Specific details about the setting or background.
Objects or items in the foreground and background.
The condition or state of any objects or people (e.g., new, old, active, idle).
Make sure to vary the types of questions so they touch on different aspects of the image, and ensure that the questions are easily inferable from the visual content.
Ensure each question only has one question to it.



In [89]:
def answer_question(image, question):
    """Ask the BLIP VQA model one question about `image` and return the decoded answer.

    Working inside a function keeps the intermediate tensors local, so the
    `inputs` variable built for the Qwen model in an earlier cell is not
    overwritten by BLIP inputs.
    """
    # Place the inputs on whatever device the BLIP model lives on.
    blip_inputs = processor_answer(image, question, return_tensors="pt").to(model_answer.device)
    answer_ids = model_answer.generate(**blip_inputs)
    # generate() returns a batch; there is a single sequence here.
    return processor_answer.decode(answer_ids[0], skip_special_tokens=True)


# One answer per generated question, in the same order as cleaned_questions.
answers = [answer_question(raw_image, question) for question in cleaned_questions]
print(answers)

['dog', 'white and brown', 'dog', 'sitting', 'sitting', 'beach', 'happiness', 'friends', 'beach', 'woman and dog', 'people are in beach', 'happy', 'sad', 'happy', 'wet']


In [90]:
import json

# Build one {"question", "answer"} record per (question, answer) pair, in order.
qa_data = []
for q_text, a_text in zip(cleaned_questions, answers):
    qa_data.append({"question": q_text, "answer": a_text})

# Serialize the records to disk as indented JSON.
with open("questions_answers(Qwen+Blip).json", "w") as out_file:
    json.dump(qa_data, out_file, indent=4)

print("Questions and answers have been written to 'questions_answers(Qwen+Blip).json'")


Questions and answers have been written to 'questions_answers(Qwen+Blip).json'


In [91]:


# Read the saved question/answer records back from disk.
with open('questions_answers(Qwen+Blip).json', 'r') as qa_file:
    data = json.load(qa_file)

# Re-serialize with indentation so the printed output is easy to read.
pretty_json = json.dumps(data, indent=4)
print(pretty_json)


[
    {
        "question": "What is the primary subject of the image?",
        "answer": "dog"
    },
    {
        "question": "What colors are prominent in the image?",
        "answer": "white and brown"
    },
    {
        "question": "What shapes are present in the image?",
        "answer": "dog"
    },
    {
        "question": "What actions are taking place in the image?",
        "answer": "sitting"
    },
    {
        "question": "What is the spatial relationship between the woman and the dog?",
        "answer": "sitting"
    },
    {
        "question": "What is the environmental context of the image?",
        "answer": "beach"
    },
    {
        "question": "What emotions or expressions are conveyed in the image?",
        "answer": "happiness"
    },
    {
        "question": "What interactions are taking place between the woman and the dog?",
        "answer": "friends"
    },
    {
        "question": "What specific details about the setting or background are vis