In [1]:
import re

import requests
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration


In [2]:
# Sanity-check the GPU setup before loading any model.
cuda_available = torch.cuda.is_available()
cuda_build_version = torch.version.cuda

print(cuda_available)       # expected: True when a CUDA device is usable
print(cuda_build_version)   # CUDA version string reported by torch, e.g. '12.4'


True
12.4


### Load model

In [3]:
# Checkpoint and device used by the processor, the model and the inputs below
LLAVA_NEXT_CHECKPOINT = "llava-hf/llava-v1.6-mistral-7b-hf"
LLAVA_NEXT_DEVICE = "cuda:0"

processor = LlavaNextProcessor.from_pretrained(LLAVA_NEXT_CHECKPOINT)

# Load the weights in float16 and move the model onto the GPU
model = LlavaNextForConditionalGeneration.from_pretrained(
    LLAVA_NEXT_CHECKPOINT,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
model.to(LLAVA_NEXT_DEVICE)

# Open the example image from the working directory
image_path = "example_QA.jpg"
image = Image.open(image_path)

# Single-turn chat history. Every "content" value is a list of dicts whose
# "type" is either "text" or "image"; `apply_chat_template` turns it into the
# prompt string.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is shown in this image?"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

inputs = processor(images=image, text=prompt, return_tensors="pt")
inputs = inputs.to(LLAVA_NEXT_DEVICE)

# Autoregressively complete the prompt, then print the decoded sequence
# (the decoded text includes the prompt as well as the completion)
output = model.generate(**inputs, max_new_tokens=100)
decoded_text = processor.decode(output[0], skip_special_tokens=True)
print(decoded_text)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.


[INST]  
What is shown in this image? [/INST] The image shows a simple geometric figure, which appears to be a red triangle. The background is a solid color, and there is a smaller blue rectangle to the right of the triangle. The image has a pixelated appearance, suggesting it might be a digital representation or a screenshot from a video game or a computer program. 


In [38]:
import json
import random

# Multiple-choice question bank for the example image.
# `processor` and `model` are expected to be loaded by an earlier cell.
# Caption-guided VQA reference: "The red triangle is diagonally up and left from the square,"
#
# Per-question fields:
#   question / options / instructions  - text that is joined into the prompt
#   reference_answer                   - letter of the correct option
#   model_answer / score               - filled in later by the evaluation cell
#   condition (optional)               - free-text note on when the question applies
questions_data = {
    # shape count
    "1": {
        "question": "How many shapes are present in the image?",
        "options": ["A. 0", "B. 1", "C. 2", "D. equal to or more than 3"],
        "instructions": "\n Answer only in A, B, C or D.",
        "reference_answer": "C",
        "model_answer": None,
        "score": None
    },
    "2": {
        "question": "How many blobs are present in the image?",
        "options": ["A. 0", "B. 1", "C. 2", "D. equal to or more than 3"],
        "instructions": "\n Answer only in A, B, C or D.",
        "reference_answer": "C",
        "model_answer": None,
        "score": None
    },
    "3": {
        "question": "Are there exactly two distinct shapes in the image?",
        "options": ["A. Yes", "B. No", "C. Not sure"],
        "instructions": "\n Answer only in A, B or C.",
        "reference_answer": "A",
        "model_answer": None,
        "score": None
    },
    # shape types
    "4": {
        "question": "What is the shape that is red in color?",
        "options": ["A. Circle", "B. Square", "C. Triangle", "D. Unidentifiable", "E. No red color shapes"],
        "instructions": "\n Answer only in A, B, C, D or E.",
        "reference_answer": "C",
        "model_answer": None,
        "score": None,
        "condition": "shapes question cannot be 0"
    },
    "5": {
        "question": "What is the shape that is blue in color?",
        "options": ["A. Circle", "B. Square", "C. Triangle", "D. Unidentifiable", "E. No blue color shapes"],
        "instructions": "\n Answer only in A, B, C, D or E.",
        "reference_answer": "B",
        "model_answer": None,
        "score": None,
        "condition": "shapes question cannot be 0"
    },
    # color types
    "6": {
        "question": "What color is the square in the image?",
        "options": ["A. Red", "B. Blue", "C. Not red or blue", "D. No square shape"],
        "instructions": "\n Answer only in A, B, C or D.",
        "reference_answer": "B",
        "model_answer": None,
        "score": None,
        "condition": "shapes question cannot be 0"
    },
    "7": {
        "question": "What color is the circle in the image?",
        "options": ["A. Red", "B. Blue", "C. Not red or blue", "D. No circle shape"],
        "instructions": "\n Answer only in A, B, C or D.",
        "reference_answer": "D",
        "model_answer": None,
        "score": None,
        "condition": "shapes question cannot be 0"
    },
    "8": {
        "question": "What color is the triangle in the image?",
        "options": ["A. Red", "B. Blue", "C. Not red or blue", "D. No triangle shape"],
        "instructions": "\n Answer only in A, B, C or D.",
        # The triangle is the red shape (questions 4 and 10 both say so), so the key is "A".
        "reference_answer": "A",
        "model_answer": None,
        "score": None,
        "condition": "shapes question cannot be 0"
    },
    # spatial relations
    "9": {
        "question": "What's the spatial relationship between 2 blobs?",
        "options": ["A. Directly above vs. below", "B. Directly left vs. right", "C. Upper-left vs. Lower-right", "D. Upper-right vs. Lower-left", "E. Overlapping"],
        "instructions": "\n Answer only in A, B, C, D or E.",
        "reference_answer": "C",
        "model_answer": None,
        "score": None,
        "condition": "shapes question = 2"
    },
    "10": {
        "question": "Where is the red triangle relative to the square?",
        "options": ["A. Up and to the right", "B. Up and to the left", "C. Down and to the right", "D. Down and to the left", "E. Directly above", "F. Directly below", "G. Overlapping"],
        "instructions": "\n Answer only in A, B, C, D, E, F or G.",
        "reference_answer": "B",
        "model_answer": None,
        "score": None,
        "condition": "correctly respond to above questions"
    },
}

# Persist the unanswered question bank; the evaluation cell reads it back,
# fills in "model_answer"/"score" and overwrites the same file.
with open("questions.json", "w", encoding="utf-8") as f:
    json.dump(questions_data, f, indent=4)


In [39]:
def _option_letters(options):
    """Return the one-character labels ("A", "B", ...) that prefix each option string."""
    labels = (option.split(".", 1)[0].strip() for option in options)
    return [label for label in labels if len(label) == 1]


def _extract_answer_letter(reply, valid_letters):
    """Return the first standalone option letter found in the model's reply.

    "C", "C. 2", "(C)" and "The answer is C." all yield "C". When no valid
    letter is present (or no letters are known) the stripped reply is returned
    unchanged so the raw text is still recorded and simply scores 0.
    """
    cleaned = reply.strip()
    if not valid_letters:
        return cleaned
    letter_class = "".join(re.escape(letter) for letter in valid_letters)
    # The lookarounds reject letters embedded in a word ("Answer" is not "A").
    # Matching is case-sensitive, so a lowercase article "a" is not an answer.
    match = re.search(rf"(?<![A-Za-z0-9])[{letter_class}](?![A-Za-z0-9])", cleaned)
    return match.group(0) if match else cleaned


def ask_model_and_update_json(processor, image, file_path="questions.json", model=None, max_new_tokens=50):
    """Ask the model every question in a JSON file and store its answers and scores.

    For each entry the prompt is "<question> Options: <options> <instructions>"
    together with `image`. The generated reply is reduced to an option letter,
    compared with the entry's "reference_answer", and the entry's
    "model_answer" and "score" (1 or 0) are updated. The file is rewritten in
    place once all questions have been processed.

    Args:
        processor: object providing `apply_chat_template`, `__call__` and `decode`.
        image: image passed to the processor for every question.
        file_path: JSON file to read the questions from and write the results to.
        model: object providing `generate` and `device`. Defaults to the
            notebook-level `model` variable, which keeps existing calls working.
        max_new_tokens: generation budget per question.

    Returns:
        None. Results are written to `file_path`.

    Raises:
        NameError: if `model` is not passed and no global `model` exists.
    """
    if model is None:
        try:
            model = globals()["model"]
        except KeyError:
            raise NameError(
                "no `model` argument was given and no global `model` is defined"
            ) from None

    with open(file_path, "r", encoding="utf-8") as f:
        questions = json.load(f)

    for q_id, q_data in questions.items():
        question_text = q_data["question"]
        options = q_data["options"]
        instructions = q_data["instructions"]
        reference_answer = q_data["reference_answer"]

        text_input = f"{question_text} Options: {', '.join(options)} {instructions}"
        print(text_input)

        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": text_input},
                    {"type": "image"},
                ],
            },
        ]
        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

        # Put the inputs on whatever device the model lives on instead of assuming cuda:0
        inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
        output = model.generate(**inputs, max_new_tokens=max_new_tokens)

        # The generated sequence starts with the prompt tokens. Decode only what
        # follows them, so text from the question itself (e.g. the trailing
        # "... or D.") can never be mistaken for the answer.
        prompt_length = inputs["input_ids"].shape[1]
        reply = processor.decode(output[0][prompt_length:], skip_special_tokens=True)

        model_answer = _extract_answer_letter(reply, _option_letters(options))
        score = 1 if model_answer == reference_answer else 0

        questions[q_id]["model_answer"] = model_answer
        questions[q_id]["score"] = score

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(questions, f, indent=4)

# Open the local example image that the questions refer to
# (point `image_path` at a different file to evaluate another image)
image_path = "example_QA.jpg"
image = Image.open(image_path)

# Query the model with every question; answers and scores are written back
# to the default questions.json file
ask_model_and_update_json(processor=processor, image=image)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


How many shapes are present in the image? Options: A. 0, B. 1, C. 2, D. equal to or more than 3 
 Answer only in A, B, C or D.
How many blobs are present in the image? Options: A. 0, B. 1, C. 2, D. equal to or more than 3 
 Answer only in A, B, C or D.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Are there exactly two distinct shapes in the image? Options: A. Yes, B. No, C. Not sure 
 Answer only in A, B or C.
What is the shape that is red in color? Options: A. Circle, B. Square, C. Triangle, D. Unidentifiable, E. No red color shapes 
 Answer only in A, B, C, D or E.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


What is the shape that is blue in color? Options: A. Circle, B. Square, C. Triangle, D. Unidentifiable, E. No blue color shapes 
 Answer only in A, B, C, D or E.
What color is the square in the image? Options: A. Red, B. Blue, C. Not red or blue, D. No square shape 
 Answer only in A, B, C or D.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


What color is the circle in the image? Options: A. Red, B. Blue, C. Not red or blue, D. No circle shape 
 Answer only in A, B, C or D.
What color is the triangle in the image? Options: A. Red, B. Blue, C. Not red or blue, D. No triangle shape 
 Answer only in A, B, C or D.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


What's the spatial relationship between 2 blobs? Options: A. Directly above vs. below, B. Directly left vs. right, C. Upper-left vs. Lower-right, D. Upper-right vs. Lower-left, E. Overlapping 
 Answer only in A, B, C, D or E.
Where is the red triangle relative to the square? Options: A. Up and to the right, B. Up and to the left, C. Down and to the right, D. Down and to the left, E. Directly above, F. Directly below, G. Overlapping 
 Answer only in A, B, C, D, E, F or G.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


### Other models

In [None]:
# Specify the model ID; replace with the desired LLaVA model variant
model_id = "llava-hf/llava-1.5-7b-hf"

# Use the GPU with half precision when available; fall back to float32 on CPU,
# where float16 inference is poorly supported
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

# Load the processor and model
# NOTE: this rebinds the notebook-level `processor` and `model`, so re-running
# earlier cells afterwards will use this checkpoint
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=dtype)
model.to(device)

# Load the image from a local path
image_path = "example_QA.jpg"
image = Image.open(image_path)

# Define the text prompt
text_prompt = "Describe the content of the image."

# Wrap the question in the chat format, as in the cells above. The {"type": "image"}
# entry makes the template emit the image placeholder that a bare text prompt lacks.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": text_prompt},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

# Prepare the input on the model's device, with floating-point tensors cast to
# the model's dtype
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, dtype)

# Generate a response
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=50)

# Decode and print the response
response = processor.batch_decode(outputs, skip_special_tokens=True)[0]
print(response)