In [None]:
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import torch
from PIL import Image
import requests

# Load the LLaVA-NeXT processor and model from the same checkpoint.
# The weights are loaded in float16 and the model is moved to the first CUDA device.
processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")

model = LlavaNextForConditionalGeneration.from_pretrained(
    "llava-hf/llava-v1.6-mistral-7b-hf",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
model.to("cuda:0")

# prepare image and text prompt, using the appropriate prompt template
# `url` may be an http(s) URL or a local file path. A bare filename such as
# "example_QA.jpg" must NOT go through `requests.get` (it raises MissingSchema),
# so local paths are opened directly with PIL.
url = "example_QA.jpg"
if url.startswith(("http://", "https://")):
    response = requests.get(url, stream=True, timeout=30)
    # Fail with a clear HTTP error instead of an opaque PIL decoding error.
    response.raise_for_status()
    image = Image.open(response.raw)
else:
    image = Image.open(url)

# Define a chat history and use `apply_chat_template` to get correctly formatted prompt
# Each value in "content" has to be a list of dicts with types ("text", "image")
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is shown in this image?"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

# Tokenize the prompt / preprocess the image and move the tensors to the model's GPU.
inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda:0")

# autoregressively complete prompt
output = model.generate(**inputs, max_new_tokens=100)

# The decoded sequence contains the prompt followed by the generated answer.
print(processor.decode(output[0], skip_special_tokens=True))




Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


[INST]  
What is shown in this image? [/INST] The image appears to be a radar chart, which is a type of multi-dimensional plot that displays values for multiple quantitative variables represented on axes starting from the same point. This particular radar chart is showing the performance of different models or systems across various metrics.

The axes represent different metrics or benchmarks, such as MM-Vet, MM-Vet, MM-Vet, MM-Vet, MM-Vet, MM-


In [None]:
import torch
from PIL import Image
import requests
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Specify the model ID; replace with the desired LLaVA model variant
model_id = "llava-hf/llava-1.5-7b-hf"

# Pick the device first so the dtype can match it: float16 is used on GPU,
# float32 on the CPU fallback (float16 inference on CPU is poorly supported).
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

# Load the processor and model
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=dtype)
model.to(device)

# Load an image from a URL (placeholder URL; replace with a real image location)
image_url = "https://example.com/image.jpg"
response = requests.get(image_url, stream=True, timeout=30)
# Surface HTTP failures (e.g. 404) directly instead of as a PIL decoding error.
response.raise_for_status()
image = Image.open(response.raw)

# Define the text prompt
text_prompt = "Describe the content of the image."

# LLaVA-1.5 expects the conversation template with an explicit `<image>`
# placeholder marking where the image features are inserted into the text.
prompt = f"USER: <image>\n{text_prompt} ASSISTANT:"

# Prepare the input
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

# Generate a response
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=50)

# Decode and print the response (the decoded text includes the prompt)
response_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
print(response_text)

In [None]:
import json
import random

import requests
import torch
from PIL import Image
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration

# Processor and model for the VQA run, both loaded from one LLaVA-NeXT checkpoint.
# NOTE(review): this is the same checkpoint the first cell loads; re-running both
# cells loads the weights twice.
_LLAVA_CHECKPOINT = "llava-hf/llava-v1.6-mistral-7b-hf"

processor = LlavaNextProcessor.from_pretrained(_LLAVA_CHECKPOINT)
model = LlavaNextForConditionalGeneration.from_pretrained(
    _LLAVA_CHECKPOINT,
    torch_dtype=torch.float16,   # half-precision weights
    low_cpu_mem_usage=True,
)
model.to("cuda:0")
# caption guided VQA: "The red square is diagonally up and right from the circle,"
# The question bank below is keyed by question id ("1".."10") and is written to
# questions.json so that answers and scores can be filled in afterwards.


def _question_entry(question, options, reference_answer, condition=None):
    """Build one question record; `condition` is only stored when provided."""
    entry = {
        "question": question,
        "options": list(options),
        "reference_answer": reference_answer,
        "model_answer": None,   # filled in after querying the model
        "score": None,          # filled in after comparing with the reference
    }
    if condition is not None:
        entry["condition"] = condition
    return entry


_NONZERO_SHAPES = "shapes question cannot be 0"

_question_specs = [
    # shape count
    ("How many shapes are present in the image?",
     ["0", "1", "2", "equal to or more than 3"],
     "2", None),
    ("How many blobs are present in the image?",
     ["0", "1", "2", "equal to or more than 3"],
     "2", None),
    ("Are there exactly two distinct shapes in the image?",
     ["Yes", "No"],
     "Yes", None),
    # shape types
    ("What is the shape that is red in color?",
     ["Circle", "Square", "Triangle", "Unidentifiable", "No red color shapes"],
     "Square", _NONZERO_SHAPES),
    ("What is the shape that is blue in color?",
     ["Circle", "Square", "Triangle", "Unidentifiable", "No blue color shapes"],
     "Circle", _NONZERO_SHAPES),
    # color types
    ("What color is the square in the image?",
     ["Red", "Blue", "Not red or blue", "No square shape"],
     "Red", _NONZERO_SHAPES),
    ("What color is the circle in the image?",
     ["Red", "Blue", "Not red or blue", "No circle shape"],
     "Blue", _NONZERO_SHAPES),
    ("What color is the triangle in the image?",
     ["Red", "Blue", "Not red or blue", "No triangle shape"],
     "No triangle shape", _NONZERO_SHAPES),
    # spatial relations
    ("What's the spatial relationship between 2 blobs?",
     ["Directly above vs. below", "Directly left vs. right", "Upper-left vs. Lower-right",
      "Upper-right vs. Lower-left", "Overlapping"],
     "Upper-right vs. Lower-left", "shapes question = 2"),
    ("Where is the red square relative to the circle?",
     ["Up and to the right", "Up and to the left", "Down and to the right",
      "Down and to the left", "Directly above", "Directly below", "Overlapping"],
     "Up and to the right", "correctly respond to above questions"),
]

# Ids are the 1-based position of each spec, stored as strings.
questions_data = {
    str(number): _question_entry(*spec)
    for number, spec in enumerate(_question_specs, start=1)
}

# Persist the untouched question bank as the starting point for evaluation.
with open("questions.json", "w") as out_file:
    json.dump(questions_data, out_file, indent=4)

# Load and prompt the model with questions, update answers and scores
def _normalize_answer(text):
    """Return `text` trimmed, without a trailing period, and case-folded for comparison."""
    return text.strip().rstrip(".").strip().casefold()


def _query_model(prompt_text, image):
    """Ask the module-level LLaVA `model`/`processor` one question about `image`.

    The question is wrapped in the processor's chat template (which also inserts
    the image placeholder), and only the newly generated tokens are decoded so
    the returned string does not contain the echoed prompt.
    """
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt_text},
                {"type": "image"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=50)
    # `generate` returns prompt tokens followed by the continuation; drop the prompt part.
    prompt_length = inputs["input_ids"].shape[1]
    return processor.decode(output[0][prompt_length:], skip_special_tokens=True)


def ask_model_and_update_json(file_path="questions.json", image=None, answer_fn=None):
    """Answer every question in a JSON file and write answers and scores back to it.

    Args:
        file_path: Path of the JSON file. It maps question ids to records with
            "question", "options" and "reference_answer" keys; "model_answer"
            and "score" are (over)written for each record.
        image: Image handed to the answering function. When omitted, the
            module-level variable `image` is used (the original behaviour).
        answer_fn: Optional callable `(prompt_text, image) -> str` producing the
            answer. Defaults to querying the module-level LLaVA model.

    Raises:
        ValueError: If the default model is used and no image is available.

    A record scores 1 when the answer equals the reference answer after
    trimming whitespace, dropping a trailing period and ignoring case; else 0.
    """
    if answer_fn is None:
        answer_fn = _query_model
        if image is None:
            # Backward compatibility: earlier callers set a global `image` before calling.
            image = globals().get("image")
            if image is None:
                raise ValueError(
                    "No image provided: pass `image=` or define a module-level `image`."
                )

    # Load questions from JSON
    with open(file_path, "r") as f:
        questions = json.load(f)

    # Iterate over each question and prompt the model
    for q_data in questions.values():
        # The explicit instruction keeps the reply short enough to be compared
        # against the reference option.
        prompt_text = (
            f"{q_data['question']} Options: {', '.join(q_data['options'])}. "
            "Answer with exactly one of the options."
        )

        model_answer = answer_fn(prompt_text, image)
        is_correct = _normalize_answer(model_answer) == _normalize_answer(q_data["reference_answer"])

        # Update the question data with model answer and score
        q_data["model_answer"] = model_answer
        q_data["score"] = 1 if is_correct else 0

    # Save updated questions with answers and scores back to JSON
    with open(file_path, "w") as f:
        json.dump(questions, f, indent=4)

# Example image loading, replace with your image loading process
# `url` may be an http(s) URL or a local file path. A bare filename such as
# "example_QA.jpg" cannot be fetched with `requests.get` (it raises
# MissingSchema), so local paths are opened directly with PIL.
url = "example_QA.jpg"
if url.startswith(("http://", "https://")):
    response = requests.get(url, stream=True, timeout=30)
    # Report HTTP failures directly rather than as a PIL decoding error.
    response.raise_for_status()
    image = Image.open(response.raw)
else:
    image = Image.open(url)

# Run the function to ask model and update JSON with results
ask_model_and_update_json()
