In [None]:
pip install datasets

In [None]:
pip install flash-attn

# Imports & Constants

In [14]:
from PIL import Image
import requests
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText, Qwen2VLForConditionalGeneration
from datasets import load_dataset

# Lower and upper pixel bounds passed to the processor below,
# written as multiples of a 28 x 28 pixel tile.
min_pixels = 28 * 28 * 1
max_pixels = 28 * 28 * 768

# Load the Model and Processor

In [35]:
# Processor first: it is configured with the pixel bounds defined above,
# and its tokenizer is switched to left padding.
processor = AutoProcessor.from_pretrained(
    'llamaindex/vdr-2b-multi-v1',
    min_pixels=min_pixels,
    max_pixels=max_pixels,
)
processor.tokenizer.padding_side = "left"

# Model weights in bfloat16, placed on the first CUDA device.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    'llamaindex/vdr-2b-multi-v1',
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",
)
# eval() switches the module to evaluation mode in place and returns the same object.
model.eval()
model.padding_side = "left"

# Load the Dataset

In [16]:
# Fetch every split of the geometry3k dataset from the Hugging Face Hub.
ds = load_dataset(path="hiyouga/geometry3k")
# Start with the training split.
dataset = ds["train"]

In [27]:
# Rebind `dataset` to the "test" split; cells executed after this one that
# read `dataset` operate on the test split rather than the train split.
dataset = ds["test"]
# Bare expression so the notebook displays the split's summary.
dataset

Dataset({
    features: ['images', 'problem', 'answer', 'id', 'choices', 'ground_truth'],
    num_rows: 601
})

# Evaluation Function

In [38]:
def evaluate_model_on_dataset(model, processor, dataset, device="cuda:0", max_samples=None,
                              max_new_tokens=1):
    """
    Run the model on every sample of a dataset and collect its decoded outputs.

    Each sample should have:
      - "images": the image to process,
      - "problem": the problem statement (dynamic prompt),
      - "ground_truth" or "answer": the expected output.

    Args:
      model: The image-text model to evaluate; must expose ``generate``.
      processor: The processor associated with the model. It is called with
        ``text``/``images`` and must provide ``decode``.
      dataset: An iterable of dict-like samples (e.g. a Hugging Face dataset).
      device: Device the processed inputs are moved to (default "cuda:0").
      max_samples: Optional integer limit to the number of samples to evaluate.
      max_new_tokens: Number of tokens to generate per sample (default 1,
        which matches the previous hard-coded behavior).

    Returns:
      A list of dictionaries, one per evaluated sample, with keys:
        - 'problem': the sample's problem statement,
        - 'prediction': the full decoded output sequence (as returned by
          ``generate``, i.e. including the decoded prompt text),
        - 'completion': the decoded text of only the tokens located after
          the prompt's ``input_ids`` in the output sequence,
        - 'ground_truth': the expected answer.
    """
    results = []

    for idx, sample in enumerate(dataset):
        if max_samples is not None and idx >= max_samples:
            break

        # Build the dynamic prompt using the sample's problem statement.
        prompt = (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            "<|im_start|>user\n" + sample["problem"] + "\n" +
            "<|vision_start|><|image_pad|><|vision_end|>" +
            "Please answer the above problem using the given image.<|im_end|>\n"
            "<|endoftext|>"
        )

        # Retrieve the image from the sample.
        image = sample["images"]

        # Prefer 'ground_truth'; fall back to 'answer' when it is missing
        # or present but None.
        ground_truth = sample.get("ground_truth")
        if ground_truth is None:
            ground_truth = sample.get("answer", "")

        # Process the inputs (both text prompt and image).
        inputs = processor(
            text=prompt,
            images=image,
            return_tensors="pt",
            padding="longest"
        ).to(device)

        # Generation only; gradients are not needed.
        with torch.no_grad():
            output = model.generate(**inputs, max_new_tokens=max_new_tokens)

        # Full decoded sequence, kept for backward compatibility.
        prediction = processor.decode(output[0], skip_special_tokens=True)

        # Decode only what follows the prompt tokens, so callers do not have
        # to strip the prompt text by character counting.
        prompt_length = inputs["input_ids"].shape[1]
        completion = processor.decode(output[0][prompt_length:], skip_special_tokens=True)

        results.append({
            "problem": sample["problem"],
            "prediction": prediction,
            "completion": completion,
            "ground_truth": ground_truth
        })

        # Print progress every 10 samples.
        if idx % 10 == 0:
            print(f"Processed sample {idx}")

    return results


# Run the Evaluation Function

In [49]:
# Evaluate the model on the dataset.
results = evaluate_model_on_dataset(model, processor, dataset, device="cuda:0", max_samples=None)

# Final instruction sentence of the prompt built in evaluate_model_on_dataset.
# item["prediction"] is the decoded output sequence; the slicing this replaces
# assumed it starts with the decoded prompt. Whatever follows the last
# occurrence of this sentence is treated as the model's answer. Splitting on
# the sentence avoids counting characters of a hand-typed copy of the prompt.
prompt_tail = "answer the above problem using the given image."

correct_predictions = 0
total_samples = len(results)
# Display the predictions along with their associated problem statement and ground truth.
for idx, item in enumerate(results):
    print(f"Sample {idx}:")
    print("Problem:", item["problem"])

    # rpartition returns the whole string as the last element when the
    # sentence is absent, so a prediction without the prompt is kept intact.
    clean_prediction = item["prediction"].rpartition(prompt_tail)[2].strip()
    print("Prediction:", clean_prediction)

    # Compare ignoring surrounding whitespace on both sides.
    expected = item["ground_truth"]
    if isinstance(expected, str):
        expected = expected.strip()
    if clean_prediction == expected:
        correct_predictions += 1

    print("Ground Truth:", item["ground_truth"])
    print("-" * 50)

# Guard against an empty result list (e.g. max_samples=0 or an empty split).
accuracy = correct_predictions / total_samples if total_samples else 0.0
print("Accuracy:", accuracy)

Processed sample 0
Processed sample 10
Processed sample 20
Processed sample 30
Processed sample 40
Processed sample 50
Processed sample 60
Processed sample 70
Processed sample 80
Processed sample 90
Processed sample 100
Processed sample 110
Processed sample 120
Processed sample 130
Processed sample 140
Processed sample 150
Processed sample 160
Processed sample 170
Processed sample 180
Processed sample 190
Processed sample 200
Processed sample 210
Processed sample 220
Processed sample 230
Processed sample 240
Processed sample 250
Processed sample 260
Processed sample 270
Processed sample 280
Processed sample 290
Processed sample 300
Processed sample 310
Processed sample 320
Processed sample 330
Processed sample 340
Processed sample 350
Processed sample 360
Processed sample 370
Processed sample 380
Processed sample 390
Processed sample 400
Processed sample 410
Processed sample 420
Processed sample 430
Processed sample 440
Processed sample 450
Processed sample 460
Processed sample 470
Pro