In [None]:
# Load the LLaVA-OneVision model used to generate the questions
import requests 
from PIL import Image

import torch
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

# Hub checkpoint shared by the question-generation model and its processor.
model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"

# Load the weights in float16, then move the model to the CUDA device.
model = LlavaOnevisionForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16
)
model = model.to("cuda")

# The processor is loaded from the same checkpoint as the model.
processor = AutoProcessor.from_pretrained(model_id)

In [None]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering

# The answering processor and model are both loaded from the same BLIP VQA checkpoint.
blip_checkpoint = "Salesforce/blip-vqa-base"
processor_answer = BlipProcessor.from_pretrained(blip_checkpoint)
model_answer = BlipForQuestionAnswering.from_pretrained(blip_checkpoint)
model_answer = model_answer.to("cuda")

In [3]:
# Number of questions to request; this value is interpolated into the prompt text.
# NOTE(review): the name `questions` is rebound later in this notebook to the text
# read back from output.txt, so re-run this cell before rebuilding the prompt.
questions = 15

In [22]:
# Build the single-turn chat request that asks the model for questions
question_request = f"Generate different {questions} questions for the image without any answers that can be easily inferenced from the image "
user_turn = {
    "role": "user",
    # One text entry (the instruction) followed by one image entry
    "content": [
        {"type": "text", "text": question_request},
        {"type": "image"},
    ],
}
conversation = [user_turn]
# Render the conversation with the processor's chat template, including the generation prompt
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

In [None]:
# Input image: download it and open it with PIL
image_file = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() reports HTTP errors directly instead of letting PIL
# fail later while trying to decode an error response.
response = requests.get(image_file, stream=True, timeout=30)
response.raise_for_status()
raw_image = Image.open(response.raw)
# Last expression of the cell, so the notebook displays the image
raw_image

In [None]:
# Preprocess the image and text prompt into model inputs (device 0, float16)
inputs = processor(images=raw_image, text=prompt, return_tensors='pt').to(0, torch.float16)

# Deterministic decoding (do_sample=False), capped at 1000 new tokens
output = model.generate(**inputs, max_new_tokens=1000, do_sample=False)

# The returned sequence starts with the prompt tokens, so slice at the prompt
# length to keep only the newly generated tokens. This replaces the earlier
# fixed offsets (skip 2 tokens, drop line 1), which depended on the exact
# layout of the chat template and could leave template text in the result.
prompt_length = inputs["input_ids"].shape[1]
answer = processor.decode(output[0][prompt_length:], skip_special_tokens=True)
# Drop leading/trailing whitespace so the saved text starts at the first question
results = answer.strip()
print(results)

In [8]:
# Persist the generated questions to a text file
with open("output.txt", "w") as questions_file:
    questions_file.write(results)

# Confirmation message for the notebook output
print("Text has been written to 'output.txt'")

Text has been written to 'output.txt'


In [10]:
# Load the saved questions back from disk as a single string
with open("output.txt", "r") as saved_questions:
    questions = saved_questions.read()

In [11]:
import json
import re
 
# Split the saved text into one entry per line
lined = questions.strip().split("\n")
# Trim surrounding whitespace from each line. The result goes into a new name
# so `questions` keeps holding the raw text and this cell can be re-run
# without failing on `.strip()` of a list.
stripped_lines = [line.strip() for line in lined]
# Remove the "N. " numbering in front of each question
numbering_removed = [re.sub(r'^\d+\.\s*', '', line) for line in stripped_lines]
# Drop entries that are empty (blank lines or bare numbering) so no empty
# question is passed on to the answering model
cleaned_questions = [text for text in numbering_removed if text]
cleaned_questions

['What is the dog wearing?',
 'What is the woman wearing?',
 'What is the setting of the image?',
 'What is the time of day?',
 'What is the weather like?',
 'What is the color of the sand?',
 "What is the color of the dog's fur?",
 "What is the color of the woman's shirt?",
 "What is the color of the dog's collar?",
 "What is the color of the woman's hair?",
 'What is the color of the waves in the ocean?',
 'What is the color of the sky?',
 'What is the color of the sand?',
 "What is the color of the dog's eyes?",
 "What is the color of the woman's shoes?"]

In [14]:
def answer_question(question):
    """Return the decoded answer generated by the BLIP model for `question` about `raw_image`."""
    blip_inputs = processor_answer(raw_image, question , return_tensors="pt").to("cuda")
    generated = model_answer.generate(**blip_inputs)
    return processor_answer.decode(generated[0], skip_special_tokens=True)


# One answer per question, in the same order as cleaned_questions
answers = [answer_question(question) for question in cleaned_questions]
print(answers)

['harness', 'plaid shirt', 'beach', 'sunset', 'sunny', 'white', 'tan', 'red and white', 'blue', 'brown', 'white', 'white', 'white', 'black', 'black']


In [17]:


# Build one question/answer record per pair, in order
qa_data = [dict(question=q, answer=a) for q, a in zip(cleaned_questions, answers)]

# Serialize the records as indented JSON and save them to disk
with open("questions_answers(Blip).json", "w") as json_file:
    json_file.write(json.dumps(qa_data, indent=4))

print("Questions and answers have been written to 'questions_answers(Blip).json'")


Questions and answers have been written to 'questions_answers(Blip).json'
