In [1]:
#Import Required Libraries
from PIL import Image
import requests
import torch

In [2]:
import os

# Ask PyTorch's CUDA caching allocator to use expandable segments. This is meant
# to reduce fragmentation-related out-of-memory errors while the model is loaded.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})

In [3]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Load the Qwen2.5-VL 3B instruct checkpoint onto the CUDA device, letting the
# library choose the weight dtype ("auto") and using the eager attention path.
model_QWEN = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct",
    torch_dtype="auto",
    attn_implementation="eager",
    device_map="cuda",
)

# Upstream recommends flash_attention_2 for better acceleration and memory saving,
# especially in multi-image and video scenarios. Their example passes
# torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" and
# device_map="auto" (shown there with the 7B checkpoint).

# Processor for the same checkpoint; the later cells use it for the chat template,
# for building the model inputs and for decoding the generated ids.
processor_QWEN = AutoProcessor.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct",
    use_fast=True,
)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.84s/it]


In [17]:
# Input image: downloaded over HTTP and opened with PIL.
# NOTE(review): the visible inference cell passes `url` to the model rather than
# this `image` object - confirm whether `image` is still needed.
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"

# The timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status() reports HTTP errors here instead of as a PIL decode failure.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

In [18]:
# How many questions the model is asked to generate (interpolated into the prompt).
questions: int = 19

In [19]:
# Load the reference caption that gets embedded in the prompt.
file_path = 'caption.txt'  # Replace with your file's path

try:
    # Explicit UTF-8 so the text read does not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        caption = file.read()
except FileNotFoundError:
    print(f"Error: The file at {file_path} was not found.")
    # Re-raise so execution stops here: without a fresh `caption` the prompt cell
    # would hit a NameError or silently reuse a stale value left in the kernel.
    raise
except IOError:
    print("Error: An error occurred while reading the file.")
    raise
else:
    # Echo the caption so the notebook shows exactly what is fed to the model.
    print(caption)


The image shows a young woman sitting on a sandy beach with her golden retriever dog. The woman is wearing a plaid shirt and black pants and is holding the dog's leash. The dog is sitting on the sand and is looking up at the woman with a smile on its face. The ocean can be seen in the background with waves crashing onto the shore. The sky is orange and pink, indicating that it is either sunrise or sunset. The overall mood of the image is peaceful and serene.


In [20]:
# Text prompt sent to the model together with the image.
# Two values are interpolated into the f-string:
#   - `questions`: the number of questions requested (appears twice), and
#   - `caption`:   the caption text read from the caption file.
# The wording below is the model input itself, so edit it deliberately.
prompt = f"""
Task: Given the image input and the caption provided, generate {questions} simple, clear, and unique questions about the image. Each question should focus on one specific aspect of the scene and be easy to understand. The questions should be varied in type and explore different general aspects of the image, but each question should only contain one part.

Caption:
"{caption}"

Instructions:
- Generate {questions} distinct questions, each focusing on one unique detail or aspect of the scene.
- Ensure each question is simple and contains only one part (e.g., "What is the expression on the character's face?" or "What is the color of the sky?").
- Questions should explore different general aspects, such as:
    - The appearance or actions of any characters (people, animals, etc.)
    - The environment (natural elements like the sky, ocean, land, etc.)
    - Emotions or mood conveyed by the scene
    - Time of day or lighting (e.g., sunrise, sunset, bright, dark, etc.)
    - The relationship between characters (if applicable)
    - Objects or features in the scene (e.g., clothing, accessories, weather conditions)
- Avoid compound questions or combining more than one query in a single question.
- Each question should explore a different aspect of the scene in a clear and simple manner.
"""


In [21]:
# `torch` is already imported in the first cell; repeating the import keeps this cell
# runnable on its own.
import torch
# Ask PyTorch to drop its cached CUDA memory before the inference cell runs.
torch.cuda.empty_cache()

In [22]:
# Visual-token budget: by default the model uses between 4 and 16384 visual tokens
# per image. To balance speed and memory, min_pixels / max_pixels can be passed when
# the processor is created, e.g. for a 256-1280 token range:
#   min_pixels = 256*28*28
#   max_pixels = 1280*28*28
#   AutoProcessor.from_pretrained(<checkpoint>, min_pixels=min_pixels, max_pixels=max_pixels)

# Single-turn conversation: one user message carrying the image (by URL) and the prompt.
messages = [
    dict(
        role="user",
        content=[
            dict(type="image", image=url),
            dict(type="text", text=prompt),
        ],
    )
]

# Render the chat template to plain text (not token ids), ending with the
# generation prompt so the model continues with its answer.
text = processor_QWEN.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Collect the image / video inputs referenced by the messages.
image_inputs, video_inputs = process_vision_info(messages)

# Build the tensor batch (a batch of one) and move it to the GPU.
inputs = processor_QWEN(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to("cuda")

# Generate without tracking gradients; at most 1000 new tokens are produced.
with torch.no_grad():
    generated_ids = model_QWEN.generate(**inputs, max_new_tokens=1000)

# Each output row is sliced at the length of its input row, so only the tokens
# after the prompt are kept.
generated_ids_trimmed = [
    full_ids[len(prompt_ids):]
    for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids)
]

# Decode to a list of strings (one per batch item), skipping special tokens.
output_text = processor_QWEN.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(output_text)

["1. What is the color of the sky in the image?\n2. What is the dog's breed?\n3. How is the woman dressed?\n4. What is the dog doing?\n5. What is the woman holding?\n6. What time of day does the image appear to be taken?\n7. What is the ocean doing?\n8. What is the woman's expression?\n9. What is the dog's expression?\n10. What is the color of the sand?\n11. What is the woman wearing?\n12. What is the dog wearing?\n13. What is the woman's hair color?\n14. What is the dog's fur color?\n15. What is the woman's posture?\n16. What is the dog's posture?\n17. What is the woman's clothing style?\n18. What is the dog's activity?\n19. What is the woman's mood?"]


In [23]:
def _strip_numbering(line):
    """Return `line` without a leading "N. " counter.

    Lines that do not start with an integer followed by ". " are returned
    unchanged, so an unnumbered line never raises and is never truncated at
    an unrelated ". " inside the sentence.
    """
    number, separator, remainder = line.partition(". ")
    if separator and number.isdigit():
        return remainder.strip()
    return line


# Raw model answer: the single decoded string for the one-item batch.
questions_string = output_text[0]

# Defensive: turn any literal backslash-n sequences into real newlines.
questions_string = questions_string.replace("\\n", "\n")

# One entry per non-blank line. A dedicated name is used on purpose: rebinding
# `questions` here would overwrite the integer question count interpolated into
# the prompt, so re-running the prompt cell would embed a list instead of a number.
question_lines = [line.strip() for line in questions_string.splitlines() if line.strip()]

# Drop the "1. ", "2. ", ... prefixes where present.
cleaned_questions = [_strip_numbering(line) for line in question_lines]

# Print the result
print(cleaned_questions)



['What is the color of the sky in the image?', "What is the dog's breed?", 'How is the woman dressed?', 'What is the dog doing?', 'What is the woman holding?', 'What time of day does the image appear to be taken?', 'What is the ocean doing?', "What is the woman's expression?", "What is the dog's expression?", 'What is the color of the sand?', 'What is the woman wearing?', 'What is the dog wearing?', "What is the woman's hair color?", "What is the dog's fur color?", "What is the woman's posture?", "What is the dog's posture?", "What is the woman's clothing style?", "What is the dog's activity?", "What is the woman's mood?"]


In [24]:
# Persist the parsed questions to 'questions.txt', one question per line.
# UTF-8 is set explicitly so non-ASCII characters in the model output are written
# the same way regardless of the platform's default encoding.
with open('questions.txt', 'w', encoding='utf-8') as file:
    file.writelines(question + '\n' for question in cleaned_questions)

print("Questions have been saved to 'questions.txt'")

Questions have been saved to 'questions.txt'
