In [3]:
pip install datasets



In [4]:
import torch
import torch.nn as nn
from transformers import AutoImageProcessor, AutoModel, T5ForConditionalGeneration, T5Tokenizer
from PIL import Image
import requests
from datasets import load_dataset

# Load the "hiyouga/geometry3k" dataset through Hugging Face `datasets`;
# its "test" split is indexed further down when running the model.
ds = load_dataset("hiyouga/geometry3k")

class MultiModalT5Model(nn.Module):
    """Vision encoder + T5 wrapper that generates text from an image and a prompt.

    The image is encoded by the vision model, its ``last_hidden_state`` is
    projected to T5's hidden size when the two differ, and the result is
    prepended to the embedded prompt. The combined sequence is passed to
    ``T5ForConditionalGeneration.generate`` as ``inputs_embeds``.

    Note: when the hidden sizes differ, the projection is a newly constructed
    ``nn.Linear``; its weights are not loaded from any checkpoint here.
    """

    def __init__(self, vision_model_name, t5_model_name, device):
        """Load the vision and T5 components and place them on ``device``.

        Args:
            vision_model_name: Name/path given to ``AutoImageProcessor`` and
                ``AutoModel.from_pretrained``.
            t5_model_name: Name/path given to ``T5Tokenizer`` and
                ``T5ForConditionalGeneration.from_pretrained``.
            device: Device the sub-models and input tensors are moved to.
        """
        super().__init__()
        self.device = device
        # Load the vision components.
        self.vision_processor = AutoImageProcessor.from_pretrained(vision_model_name)
        self.vision_encoder = AutoModel.from_pretrained(vision_model_name).to(device)

        # Load the T5 components.
        self.tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
        self.t5 = T5ForConditionalGeneration.from_pretrained(t5_model_name).to(device)

        # T5 uses d_model as its hidden size.
        vision_hidden_size = self.vision_encoder.config.hidden_size
        t5_hidden_size = self.t5.config.d_model

        # Add a projection layer if the sizes differ.
        if vision_hidden_size != t5_hidden_size:
            self.projection = nn.Linear(vision_hidden_size, t5_hidden_size).to(device)
        else:
            self.projection = nn.Identity()

    def forward(self, image, prompt, max_length=50):
        """Generate text for ``prompt`` conditioned on ``image``.

        The image and prompt embeddings are concatenated along the sequence
        dimension (dim=1), so both must produce the same batch size.

        Args:
            image: Image input handed to the vision processor.
            prompt: Prompt text handed to the T5 tokenizer.
            max_length: Forwarded to ``generate`` as ``max_length``.

        Returns:
            The decoded text of the first generated sequence, with special
            tokens skipped.
        """
        # Process the image and obtain image embeddings.
        vision_inputs = self.vision_processor(images=image, return_tensors="pt").to(self.device)
        vision_outputs = self.vision_encoder(**vision_inputs)
        image_embeds = vision_outputs.last_hidden_state  # (batch, img_seq_len, vision_hidden_size)
        projected_image_embeds = self.projection(image_embeds)  # (batch, img_seq_len, t5_hidden_size)

        # Process the prompt text.
        prompt_tokens = self.tokenizer(prompt, return_tensors="pt", truncation=True).to(self.device)
        # Use T5's shared embedding layer to get text embeddings.
        prompt_embeds = self.t5.shared(prompt_tokens.input_ids)  # (batch, text_seq_len, t5_hidden_size)

        # Concatenate image embeddings (as a prefix) with prompt embeddings.
        combined_embeds = torch.cat([projected_image_embeds, prompt_embeds], dim=1)

        # Build the matching attention mask explicitly: every image position is
        # attended to, and the text part reuses the tokenizer's own mask. With
        # inputs_embeds the model cannot derive a mask from token ids itself.
        text_mask = prompt_tokens.attention_mask
        image_mask = torch.ones(
            projected_image_embeds.shape[:2],
            dtype=text_mask.dtype,
            device=text_mask.device,
        )
        combined_mask = torch.cat([image_mask, text_mask], dim=1)

        # Generate output text using the T5 model.
        generated_ids = self.t5.generate(
            inputs_embeds=combined_embeds,
            attention_mask=combined_mask,
            max_length=max_length,
        )
        # Only the first sequence of the batch is decoded and returned.
        generated_text = self.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        return generated_text

if __name__ == "__main__":
    # Build the model once and run it on the first test sample as a smoke test.
    # The names assigned here (e.g. multimodal_model, dataset) are referenced
    # again by the evaluation call further down.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vision_model_name = "facebook/dinov2-base"
    t5_model_name = "t5-small"

    multimodal_model = MultiModalT5Model(vision_model_name, t5_model_name, device)
    multimodal_model.eval()

    dataset = ds['test']
    image = dataset[0]['images']
    prompt = dataset[0]['problem']

    # Use autocast for mixed precision (reduces memory without full quantization).
    # torch.autocast replaces the deprecated torch.cuda.amp.autocast(); it is only
    # enabled when the model actually lives on a CUDA device, so a CPU-only run
    # does not enter a CUDA autocast context.
    with torch.autocast(device_type=device.type, enabled=(device.type == "cuda")):
        with torch.no_grad():
            output_text = multimodal_model(image, prompt, max_length=500)
            print(output_text)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, 

s s


In [5]:
def evaluate_model_on_dataset(model, dataset, device="cuda:0", max_samples=None, max_length=50):
    """
    Run ``model`` over ``dataset`` and collect its predictions.

    Every sample is expected to provide:
      - "images": the image input handed to the model,
      - "problem": the problem statement embedded into the prompt,
      - "ground_truth" or "answer": the expected output ("ground_truth" wins
        when both are present; an empty string is used when neither is).

    Args:
      model: Callable invoked as model(image, prompt, max_length=...) whose
             return value is stored as the prediction.
      dataset: Iterable of dict-like samples.
      device: Accepted for interface compatibility; this function does not use it.
      max_samples: If not None, iteration stops once this many samples were processed.
      max_length: Passed through to the model call.

    Returns:
      A list of dicts with the keys "problem", "prediction" and "ground_truth",
      in dataset order.
    """
    collected = []

    for position, record in enumerate(dataset):
        limit_reached = max_samples is not None and position >= max_samples
        if limit_reached:
            break

        # Wrap the problem statement in the chat-style prompt template.
        prompt_text = "".join([
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n",
            "<|im_start|>user\n",
            record["problem"],
            "\n",
            "<|vision_start|><|image_pad|><|vision_end|>",
            "Please answer the above problem using the given image.<|im_end|>\n",
            "<|endoftext|>",
        ])

        picture = record["images"]

        # Prefer "ground_truth"; fall back to "answer", then to "".
        fallback_answer = record.get("answer", "")
        expected = record.get("ground_truth", fallback_answer)

        # Inference only: no gradients are tracked for the model call.
        with torch.no_grad():
            predicted = model(picture, prompt_text, max_length=max_length)

        entry = {
            "problem": record["problem"],
            "prediction": predicted,
            "ground_truth": expected,
        }
        collected.append(entry)

        # Emit a progress line for every tenth sample (starting with index 0).
        if not position % 10:
            print(f"Processed sample {position}")

    return collected


In [6]:
# Evaluate the first ten samples of the dataset with a generation max_length of 500.
results = evaluate_model_on_dataset(
    multimodal_model,
    dataset,
    device="cuda:0",
    max_samples=10,
    max_length=500,
)

# NOTE(review): correct_count is initialised here but never updated in this cell.
correct_count = 0

# Show every record, each followed by a separator line.
for res in results:
    print(f"Problem: {res['problem']!s}")
    print(f"Prediction: {res['prediction']!s}")
    print(f"Ground Truth: {res['ground_truth']!s}")
    print("=" * 40)


Processed sample 0
Problem: <image>In the figure, $\overline{A D}$ is perpendicular to $\overline{B C}$ and $\overline{A B}$ is perpendicular to $\overline{A C}$. What is $B C ?$
Prediction: 
Ground Truth: C
Problem: <image>In $\odot M$, $FL=24,HJ=48$, and $m \widehat {HP}=65$. Find $m \widehat {PJ}$.
Prediction: 
Ground Truth: B
Problem: <image>Find $G I$ if $G H=9, G K=6,$ and $K J=4$
Prediction: ........................................................
Ground Truth: D
Problem: <image>in $\triangle XYZ$, $P$ is the centroid, $KP=3$, and $XJ=8$. Find $YJ$.
Prediction: s s s s s s s s s s s s. s s s s. s s. s s. s s. s s. s s. s s. s s. s s. s s s s. s s s s s s s s s
Ground Truth: C
Problem: <image>A regular pentagon and a square share a mutual vertex $X$. The sides $\overline{X Y}$ and $\overline{X Z}$ are sides of a third regular polygon with a vertex at $X .$ How many sides does this polygon have?
Prediction: ........................................................
Ground Truth: B
P