In [5]:
#Import Required Libraries
from PIL import Image
import requests

In [2]:
# Ensures that there is enough memory allocation for the model to load
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [1]:
from transformers import pipeline
import torch

pipe = pipeline(
    "image-text-to-text",
    model="google/gemma-3-4b-it",
    device="cuda",
    torch_dtype=torch.bfloat16
)


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.31it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cuda


In [20]:
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import json
import requests

processor_answer = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model_answer = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to("cuda")

In [None]:
# Input Image
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
image

In [7]:
# Number of questions to be generated 
questions = 19

In [8]:
# Specify the path to your .txt file
file_path = 'caption.txt'  # Replace with your file's path

try:
    # Open the file in read mode
    with open(file_path, 'r') as file:
        # Read the contents of the file
        caption = file.read()
        
        # Print the contents of the file
        print(caption)

except FileNotFoundError:
    print(f"Error: The file at {file_path} was not found.")
except IOError:
    print("Error: An error occurred while reading the file.")


The image shows a young woman sitting on a sandy beach with her golden retriever dog. The woman is wearing a plaid shirt and black pants and is holding the dog's leash. The dog is sitting on the sand and is looking up at the woman with a smile on its face. The ocean can be seen in the background with waves crashing onto the shore. The sky is orange and pink, indicating that it is either sunrise or sunset. The overall mood of the image is peaceful and serene.


In [14]:
# Input prompt 
prompt = f"""
Task: Given the image input and the caption provided, generate {questions} simple, clear, and unique questions about the image. Each question should focus on one specific aspect of the scene and be easy to understand. The questions should be varied in type and explore different general aspects of the image, but each question should only contain one part.

Caption:
"{caption}"

Instructions:
- Generate {questions} distinct questions, each focusing on one unique detail or aspect of the scene.
- Ensure each question is simple and contains only one part (e.g., "What is the expression on the character's face?" or "What is the color of the sky?").
- Questions should explore different general aspects, such as:
    - The appearance or actions of any characters (people, animals, etc.)
    - The environment (natural elements like the sky, ocean, land, etc.)
    - Emotions or mood conveyed by the scene
    - Time of day or lighting (e.g., sunrise, sunset, bright, dark, etc.)
    - The relationship between characters (if applicable)
    - Objects or features in the scene (e.g., clothing, accessories, weather conditions)
- Avoid compound questions or combining more than one query in a single question.
- Each question should explore a different aspect of the scene in a clear and simple manner.
- The output should only include the questions and nothing else.
"""


In [15]:
print(prompt)


Task: Given the image input and the caption provided, generate 19 simple, clear, and unique questions about the image. Each question should focus on one specific aspect of the scene and be easy to understand. The questions should be varied in type and explore different general aspects of the image, but each question should only contain one part.

Caption:
"The image shows a young woman sitting on a sandy beach with her golden retriever dog. The woman is wearing a plaid shirt and black pants and is holding the dog's leash. The dog is sitting on the sand and is looking up at the woman with a smile on its face. The ocean can be seen in the background with waves crashing onto the shore. The sky is orange and pink, indicating that it is either sunrise or sunset. The overall mood of the image is peaceful and serene."

Instructions:
- Generate 19 distinct questions, each focusing on one unique detail or aspect of the scene.
- Ensure each question is simple and contains only one part (e.g., "

In [16]:
import torch
torch.cuda.empty_cache()

In [17]:
messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "You are a helpful assistant."}]
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "url": f"{url}"},
            {"type": "text", "text": f"{prompt}"}
        ]
    }
]

output = pipe(text=messages, max_new_tokens=1000)
print(output[0]["generated_text"][-1]["content"])
output_text = output[0]["generated_text"][-1]["content"]
# Okay, let's take a look! 
# Based on the image, the animal on the candy is a **turtle**. 
# You can see the shell shape and the head and legs.


1.  What color is the dog’s fur?
2.  What is the woman wearing?
3.  What is the dog doing?
4.  What type of surface is the woman sitting on?
5.  What is the color of the ocean?
6.  What is visible in the background?
7.  What color is the sky?
8.  What is the woman holding?
9.  What is the dog looking at?
10. What is the woman’s facial expression?
11. Is the scene taking place during the day or night?
12. What kind of weather is it?
13. What is the texture of the sand?
14. What is the dog’s posture?
15. What is the leash pattern like?
16. What is the woman doing with her hand?
17. What is the overall mood of the image?
18. Are there any waves visible?
19. What is the lighting like in the image?


In [26]:
# Given string (from the list you provided)
questions_string = output_text

# Replace the escaped \\n with actual newline characters
questions_string = questions_string.replace("\\n", "\n")

# Split the string into individual questions by newline characters
questions = questions_string.split("\n")

# Optional: Remove the numbering and "1.", "2." part of each question
cleaned_questions = [q.split(". ", 1)[1] for q in questions if q]

# Print the result
print(cleaned_questions)



[' What color is the dog’s fur?', ' What is the woman wearing?', ' What is the dog doing?', ' What type of surface is the woman sitting on?', ' What is the color of the ocean?', ' What is visible in the background?', ' What color is the sky?', ' What is the woman holding?', ' What is the dog looking at?', 'What is the woman’s facial expression?', 'Is the scene taking place during the day or night?', 'What kind of weather is it?', 'What is the texture of the sand?', 'What is the dog’s posture?', 'What is the leash pattern like?', 'What is the woman doing with her hand?', 'What is the overall mood of the image?', 'Are there any waves visible?', 'What is the lighting like in the image?']


In [28]:
# Open a file in write mode
with open('questions.txt', 'w') as file:
    # Iterate through the list of questions
    for question in cleaned_questions:
        # Write each question on a new line
        file.write(question + '\n')

print("Questions have been saved to 'questions.txt'")

Questions have been saved to 'questions.txt'


In [30]:
# Specify the path to your .txt file
file_path = 'questions.txt'  # Replace with your file's path

try:
    # Open the file in read mode
    with open(file_path, 'r') as file:
        # Read the contents of the file
        question = file.read()
        cleaned_questions = question.strip().split('\n')
        print(cleaned_questions)
except FileNotFoundError:
    print(f"Error: The file at {file_path} was not found.")
except IOError:
    print("Error: An error occurred while reading the file.")

['What color is the dog’s fur?', ' What is the woman wearing?', ' What is the dog doing?', ' What type of surface is the woman sitting on?', ' What is the color of the ocean?', ' What is visible in the background?', ' What color is the sky?', ' What is the woman holding?', ' What is the dog looking at?', 'What is the woman’s facial expression?', 'Is the scene taking place during the day or night?', 'What kind of weather is it?', 'What is the texture of the sand?', 'What is the dog’s posture?', 'What is the leash pattern like?', 'What is the woman doing with her hand?', 'What is the overall mood of the image?', 'Are there any waves visible?', 'What is the lighting like in the image?']


In [31]:
answers = [] 
for question in cleaned_questions:
    inputs = processor_answer(image, question , return_tensors="pt").to("cuda")
    
    out = model_answer.generate(**inputs)
    output = processor_answer.decode(out[0], skip_special_tokens=True)
    answers.append(output)
print(answers)

['tan', 'plaid shirt', 'sitting', 'sand', 'blue', 'ocean', 'white', 'dog', 'woman', 'smiling', 'day', 'sunny', 'soft', 'sitting', 'plaid', 'petting dog', 'happy', 'yes', 'sunny']


In [32]:
import json

# Pair the questions with answers
qa_data = [{"question": question, "answer": answer} for question, answer in zip(cleaned_questions, answers)]

# Write the generated data to a JSON file
with open("questions_answers(Qwen+Blip).json", "w") as file:
    json.dump(qa_data, file, indent=4)

print("Questions and answers have been written to 'questions_answers(Qwen+Blip).json'")


Questions and answers have been written to 'questions_answers(Qwen+Blip).json'


In [34]:


# Open and load the JSON file
with open('questions_answers(Qwen+Blip).json', 'r') as file:
    data = json.load(file)

# Print the contents of the JSON file
print(json.dumps(data, indent=4,ensure_ascii=False))  # This will format the JSON for better readability


[
    {
        "question": "What color is the dog’s fur?",
        "answer": "tan"
    },
    {
        "question": " What is the woman wearing?",
        "answer": "plaid shirt"
    },
    {
        "question": " What is the dog doing?",
        "answer": "sitting"
    },
    {
        "question": " What type of surface is the woman sitting on?",
        "answer": "sand"
    },
    {
        "question": " What is the color of the ocean?",
        "answer": "blue"
    },
    {
        "question": " What is visible in the background?",
        "answer": "ocean"
    },
    {
        "question": " What color is the sky?",
        "answer": "white"
    },
    {
        "question": " What is the woman holding?",
        "answer": "dog"
    },
    {
        "question": " What is the dog looking at?",
        "answer": "woman"
    },
    {
        "question": "What is the woman’s facial expression?",
        "answer": "smiling"
    },
    {
        "question": "Is the scene taking place duri