In [1]:
#Import Required Libraries
from io import BytesIO

import requests
from PIL import Image

In [2]:
# Ask PyTorch's CUDA caching allocator to use expandable segments, which is
# meant to reduce fragmentation so the model has a better chance of fitting in
# GPU memory. This cell runs before `torch` is imported in the next cell.
import os

os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})

In [3]:
from transformers import pipeline
import torch

# Build the image-text-to-text pipeline around the `google/gemma-3-4b-it`
# checkpoint, placed on the GPU with weights loaded as bfloat16.
pipe = pipeline(
    task="image-text-to-text",
    model="google/gemma-3-4b-it",
    torch_dtype=torch.bfloat16,
    device="cuda",
)


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.22it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cuda


In [4]:
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import json
import requests

# Load the BLIP visual-question-answering processor and model from the same
# checkpoint, then move the model onto the GPU.
processor_answer = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")

model_answer = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
model_answer = model_answer.to("cuda")

In [None]:
# Input Image
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"

# Download with a timeout so a stalled connection cannot hang the notebook, and
# fail loudly on HTTP errors instead of handing an error page to PIL (which
# would otherwise surface as a confusing "cannot identify image file" error).
response = requests.get(url, timeout=30)
response.raise_for_status()

# Decode from the fully downloaded bytes rather than the raw socket stream, so
# the image stays readable after the HTTP connection is released.
image = Image.open(BytesIO(response.content))
image

In [6]:
# Number of questions to be generated; this count is interpolated into the prompt.
# NOTE(review): the parsing cell further down rebinds `questions` to a list of
# question strings, so re-run this cell before rebuilding the prompt.
questions = 20

In [7]:
# Path to the text file that holds the reference caption for the image.
file_path = 'caption.txt'  # Replace with your file's path

try:
    # Decode explicitly as UTF-8 so the caption reads the same on every platform
    # regardless of the locale's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        caption = file.read()
except FileNotFoundError:
    print(f"Error: The file at {file_path} was not found.")
    # Re-raise: the prompt cell needs `caption`, so continuing would either hit
    # a NameError later or silently reuse a stale value from an earlier run.
    raise
except IOError:
    print("Error: An error occurred while reading the file.")
    raise
else:
    # Show the caption that will be embedded in the prompt.
    print(caption)


The image shows a young woman sitting on a sandy beach with her golden retriever dog. The woman is wearing a plaid shirt and black pants and is holding the dog's leash. The dog is sitting on the sand and is looking up at the woman with a smile on its face. The ocean can be seen in the background with waves crashing onto the shore. The sky is orange and pink, indicating that it is either sunrise or sunset. The overall mood of the image is peaceful and serene.


In [8]:
# Input prompt sent to the model together with the image.
# Two values are interpolated: `questions` (the number of questions requested)
# and `caption` (the text read from the caption file).
# NOTE(review): the instruction list says the output should contain only the
# questions, while the closing line ("2. ...") asks for answers and has no
# matching "1." item. The text does not describe the output layout either, even
# though the parsing cell relies on a question line followed by an answer line;
# consider reconciling these before changing models.
prompt = f"""
Task: Given the image input and the caption provided, generate {questions} simple, clear, and unique questions about the image. Each question should focus on one specific aspect of the scene and be easy to understand. The questions should be varied in type and explore different general aspects of the image, but each question should only contain one part.

Caption:
"{caption}"

Instructions:
- Generate {questions} distinct questions, each focusing on one unique detail or aspect of the scene.
- Ensure each question is simple and contains only one part (e.g., "What is the expression on the character's face?" or "What is the color of the sky?").
- Questions should explore different general aspects, such as:
    - The appearance or actions of any characters (people, animals, etc.)
    - The environment (natural elements like the sky, ocean, land, etc.)
    - Emotions or mood conveyed by the scene
    - Time of day or lighting (e.g., sunrise, sunset, bright, dark, etc.)
    - The relationship between characters (if applicable)
    - Objects or features in the scene (e.g., clothing, accessories, weather conditions)
- Avoid compound questions or combining more than one query in a single question.
- Each question should explore a different aspect of the scene in a clear and simple manner.
- The output should only include the questions and nothing else.

2. Provide a short and simple answer to the questions generated.
"""


In [9]:
# Show the rendered prompt to check the interpolated question count and caption.
print(prompt)


Task: Given the image input and the caption provided, generate 20 simple, clear, and unique questions about the image. Each question should focus on one specific aspect of the scene and be easy to understand. The questions should be varied in type and explore different general aspects of the image, but each question should only contain one part.

Caption:
"The image shows a young woman sitting on a sandy beach with her golden retriever dog. The woman is wearing a plaid shirt and black pants and is holding the dog's leash. The dog is sitting on the sand and is looking up at the woman with a smile on its face. The ocean can be seen in the background with waves crashing onto the shore. The sky is orange and pink, indicating that it is either sunrise or sunset. The overall mood of the image is peaceful and serene."

Instructions:
- Generate 20 distinct questions, each focusing on one unique detail or aspect of the scene.
- Ensure each question is simple and contains only one part (e.g., "

In [10]:
# Ask PyTorch to release cached GPU memory it is not currently using before
# running generation. `torch` was already imported when the pipeline was built,
# so this import only re-binds the same module.
import torch
torch.cuda.empty_cache()

In [11]:
# Chat-style request: a system turn, then a single user turn that carries the
# image (referenced by URL) and the question-generation prompt.
system_turn = {
    "role": "system",
    "content": [{"type": "text", "text": "You are a helpful assistant."}],
}
user_turn = {
    "role": "user",
    "content": [
        {"type": "image", "url": url},
        {"type": "text", "text": prompt},
    ],
}
messages = [system_turn, user_turn]

# Generate up to 1000 new tokens for the numbered question/answer listing.
output = pipe(text=messages, max_new_tokens=1000)

# The reply text is read from the last entry of the first result's
# `generated_text`; keep it in `output_text` for the parsing cell.
output_text = output[0]["generated_text"][-1]["content"]
print(output_text)


1.  What color is the dog’s fur?
    *   Yellow

2.  What is the woman wearing?
    *   A plaid shirt

3.  What is the dog doing?
    *   Sitting

4.  What is the surface the woman is sitting on?
    *   Sand

5.  What is visible in the background?
    *   The ocean

6.  What color is the sky?
    *   Orange and pink

7.  What is the woman holding?
    *   The dog’s leash

8.  What is the dog looking at?
    *   The woman

9.  What is the dog’s facial expression?
    *   A smile

10. What type of weather is suggested by the waves?
    *   A calm sea

11. What is the woman doing with her hand?
    *   Holding it out

12. What is the texture of the sand?
    *   Grainy

13. What time of day is it likely?
    *   Sunrise or sunset

14. What is the woman’s clothing style?
    *   Casual

15. What is the dog’s breed?
    *   Golden Retriever

16. What is the color of the dog’s collar?
    *   Multi-colored

17. What is the woman’s posture?
    *   Sitting

18. What is the general mood of th

In [14]:
import json
import re


def _parse_qa_pairs(text):
    """Extract (question, answer) pairs from a numbered question/answer listing.

    The expected layout is a line starting with "<number>." holding the
    question, followed by a line holding the answer (optionally prefixed with a
    "*" bullet). Unlike fixed-stride indexing, this walks the text line by line,
    so it tolerates preamble text, missing or extra blank lines, and a final
    question that was cut off before its answer.

    Args:
        text: Raw text produced by the model.

    Returns:
        A list of (question, answer) string tuples in order of appearance.
        Questions that have no answer line are dropped.
    """
    pairs = []
    pending_question = None
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        numbered = re.match(r'^\d+\.\s*(.*)$', line)
        if numbered:
            # A numbered line starts a new question; any earlier question that
            # never received an answer is discarded.
            pending_question = numbered.group(1).strip() or None
        elif pending_question is not None:
            # First non-blank line after a question is its answer; drop the
            # leading "* " bullet characters.
            pairs.append((pending_question, line.lstrip('* ').strip()))
            pending_question = None
        # Lines before the first numbered question (preamble) are ignored.
    return pairs


qa_pairs = _parse_qa_pairs(output_text)
if not qa_pairs:
    # Fail with a clear message instead of writing an empty or misaligned file.
    raise ValueError("No question/answer pairs could be parsed from the model output.")

# NOTE: `questions` is rebound here from the requested count (an int) to the
# list of parsed question strings; re-run the count cell before rebuilding the
# prompt.
questions = [question for question, _ in qa_pairs]
answers = [answer for _, answer in qa_pairs]
qa_data = [{"question": question, "answer": answer} for question, answer in qa_pairs]

# Save the list of question/answer dictionaries as a JSON file. json.dump keeps
# its default ASCII escaping, so the file content is encoding-independent.
with open('questions_answers(Gemma3).json', 'w', encoding='utf-8') as json_file:
    json.dump(qa_data, json_file, indent=4)

print("The question-answer pairs have been saved to 'questions_answers(Gemma3).json'.")

The question-answer pairs have been saved to 'questions_answers(Gemma3).json'.


In [16]:


# Read the saved question/answer pairs back in to confirm the file round-trips.
with open('questions_answers(Gemma3).json', 'r') as file:
    data = json.loads(file.read())

# Pretty-print the reloaded pairs; ensure_ascii=False keeps characters such as
# the curly apostrophe readable instead of showing \u escapes.
formatted = json.dumps(data, ensure_ascii=False, indent=4)
print(formatted)


[
    {
        "question": "What color is the dog’s fur?",
        "answer": "Yellow"
    },
    {
        "question": "What is the woman wearing?",
        "answer": "A plaid shirt"
    },
    {
        "question": "What is the dog doing?",
        "answer": "Sitting"
    },
    {
        "question": "What is the surface the woman is sitting on?",
        "answer": "Sand"
    },
    {
        "question": "What is visible in the background?",
        "answer": "The ocean"
    },
    {
        "question": "What color is the sky?",
        "answer": "Orange and pink"
    },
    {
        "question": "What is the woman holding?",
        "answer": "The dog’s leash"
    },
    {
        "question": "What is the dog looking at?",
        "answer": "The woman"
    },
    {
        "question": "What is the dog’s facial expression?",
        "answer": "A smile"
    },
    {
        "question": "What type of weather is suggested by the waves?",
        "answer": "A calm sea"
    },
    {
     