In [1]:
import os
import textwrap
# Expose only GPU 0 to this process. This is set in the very first cell so it
# takes effect before any CUDA-using library is imported further down.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})


In [3]:
# Authenticate against the Hugging Face Hub using the interactive notebook
# widget (rendered as the cell output below).
# NOTE(review): presumably needed because the dataset/model loaded in later
# cells require an authenticated account — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from datasets import load_dataset
# Load a single parquet shard from the Hub repo and expose it under the
# "xm3600_captioning" split name; later cells read it as ds["xm3600_captioning"].
ds = load_dataset(
    path="hwaseem04/Aya-testing",
    data_files={
        "xm3600_captioning": "data/xm3600_captioning-00000-of-00001.parquet",
    },
)

In [5]:
from datasets import load_dataset
from tqdm import tqdm
from PIL import Image
import os

import torch
from transformers import AutoProcessor, Gemma3ForConditionalGeneration

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Languages to iterate over. Each code selects the `prompt_<lang>` and
# `captions_<lang>` fields of a sample.
languages = ["en", "bn", "de", "ko", "ru", "zh"]

# Directory for temporary image files; the chat template below is given a
# file path for each image.
TEMP_IMAGE_DIR = "temp_images_caption"
os.makedirs(TEMP_IMAGE_DIR, exist_ok=True)

# How many samples to run. 1 = quick smoke test (the previous behaviour, which
# used an unconditional `break`). Set to None to process the whole dataset.
MAX_SAMPLES = 1

# Upper bound on generated tokens per caption.
MAX_NEW_TOKENS = 100

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------

# Pick the dtype BEFORE loading so that the weights and the inputs agree:
# bfloat16 on GPU, float32 on CPU.
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

# Load Gemma model and processor.
# NOTE: recent transformers releases rename `torch_dtype` to `dtype`; the old
# keyword is still accepted there (with a deprecation warning).
model_id = "google/gemma-3-12b-it"
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch_dtype,
).eval()
processor = AutoProcessor.from_pretrained(model_id)

# `ds` is created by the dataset-loading cell above.
dataset = ds['xm3600_captioning']


def generate_caption(image_path, prompt, max_new_tokens=MAX_NEW_TOKENS):
    """Generate a caption for one image with the loaded Gemma model.

    Args:
        image_path: Path of the image file to caption.
        prompt: Instruction text sent together with the image.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The decoded model response, with the prompt tokens and special
        tokens removed.
    """
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a helpful assistant."}]
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": prompt}
            ]
        }
    ]

    # Cast inputs to the dtype the model was actually loaded in, so the two
    # can never drift apart.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device, dtype=model.dtype)

    # generate() returns prompt + continuation; remember where the prompt ends.
    input_len = inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        # Greedy decoding (do_sample=False).
        output = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

    return processor.decode(output[0][input_len:], skip_special_tokens=True)


# ---------------------------------------------------------------------------
# Evaluation loop
# ---------------------------------------------------------------------------

for sample_idx, sample in enumerate(tqdm(dataset, desc="Iterating samples")):
    try:
        image = sample["image"]
        sample_id = sample["sample_id"]

        #### This part will be replaced when attack implementation is ready #####

        image_path = f"{TEMP_IMAGE_DIR}/{sample_id}.jpg"
        if not os.path.exists(image_path):
            # JPEG cannot store alpha/palette modes (RGBA, P, LA, ...);
            # normalise to RGB so save() does not raise on such images.
            rgb_image = image if image.mode == "RGB" else image.convert("RGB")
            rgb_image.save(image_path)

        #########################################################################

        print(f"\n========== Sample ID: {sample_id} ==========")

        for lang in languages:
            prompt_col = f"prompt_{lang}"
            caption_col = f"captions_{lang}"

            # Skip languages for which this sample has no prompt or caption field.
            if prompt_col not in sample or caption_col not in sample:
                print(f"[{lang}] Missing data.")
                continue

            prompt = sample[prompt_col]
            gt_caption = sample[caption_col]

            pred_caption = generate_caption(image_path, prompt)

            # Display result
            print(f"\n[{lang.upper()}]")
            print(f"Prompt: {prompt}")
            print(f"GT: {gt_caption}")
            print(f"Pred: {textwrap.fill(pred_caption, width=80)}")

        print("=" * 100)

    except Exception as e:
        # Best-effort: report and move on. Use .get() so that a sample missing
        # "sample_id" cannot make the handler itself raise, and include the
        # exception type (str(KeyError) alone is just the key name).
        failed_id = sample.get("sample_id", "<unknown>")
        print(f"Error processing sample {failed_id}: {type(e).__name__}: {e}")

    # Stop once the configured number of samples has been processed.
    if MAX_SAMPLES is not None and sample_idx + 1 >= MAX_SAMPLES:
        break

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.




[EN]
Prompt: Caption the image, short answer.
GT: ['a rooster and hens surrounded by green leaves .', 'a rooster with two hens on a rocky slope with some bushes .']
Pred: Here are a few short captions for the image:  *   Chickens in the wild. *   Farm
birds exploring. *   Rooster and hen.

[BN]
Prompt: ছবির ক্যাপশন দিন, সংক্ষিপ্ত উত্তর।
GT: ['জঙ্গলের মদহে দুইটি বাদামী ও কালো রঙের মুরগি আছে']
Pred: এখানে দুটি মুরগি একটি সবুজ প্রকৃতির মধ্যে ঘুরে বেড়াচ্ছে।

[DE]
Prompt: Bildunterschrift für das Bild, kurze Antwort.
GT: ['eine henne und ein hanh im steinigem garten im gras .', 'leicht verschwommene aufnahme von huhn und hahn im freien tagsüber .']
Pred: Hier ist eine kurze Bildunterschrift für das Bild:  Hühner erkunden eine
natürliche Umgebung.

[KO]
Prompt: 이미지에 캡션을 달아주세요. 짧은 답변으로.
GT: ['낙엽 과 잡초 가 많은 산길 의 돌 위에 서 있는 닭 두 마리', '시골 마을 에 풀어 놓고 키우는 닭 암수']
Pred: 다음은 이미지에 대한 캡션입니다.  두 마리의 닭이 숲에서 걷고 있습니다.

[RU]
Prompt: Подпишите изображение, короткий ответ.
GT: ['Яркий петух и его коричневые ку

Iterating samples:   0%|          | 0/3600 [00:16<?, ?it/s]


[ZH]
Prompt: 为图片添加标题，简短回答。
GT: ['在 山里 中 站着 两只鸡 ， 一只 黄色 另一 只 黑 黄色 ， 它们 俩 站着 看 向 同 一个 方向', '在 野外 绿植 地上 的 公鸡 和 母鸡 近景']
Pred: Here are a few short title options for the image:  *   Chickens in the wild *
Free-range fowl *   Country birds



