In [1]:
import os
import textwrap
# Set CUDA_VISIBLE_DEVICES to "0" up front, before the GPU libraries used in
# later cells are imported (intended to restrict the notebook to one GPU).
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})


In [5]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub.
# The interactive widget blocks an unattended "Restart & Run All", so it is
# only shown when no token is supplied through the HF_TOKEN environment
# variable (huggingface_hub reads that variable itself -- see its
# environment-variable documentation).
if not os.environ.get("HF_TOKEN"):
    notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from datasets import load_dataset
# Fetch only the captioning parquet file from the Hub repository and register
# it under the "xm3600_captioning" split name used by the later cells.
caption_files = {"xm3600_captioning": "data/xm3600_captioning-00000-of-00001.parquet"}
ds = load_dataset("hwaseem04/Aya-testing", data_files=caption_files)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


xm3600_captioning-00000-of-00001.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

Generating xm3600_captioning split: 0 examples [00:00, ? examples/s]

In [6]:
# Peek at the first record to see which columns (image, per-language prompts
# and reference captions) a sample carries.
ds["xm3600_captioning"][0]

{'sample_id': 1,
 'image_id': '000411001ff7dd4f',
 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480>,
 'prompt_en': 'Caption the image, short answer.',
 'captions_en': ['a rooster and hens surrounded by green leaves .',
  'a rooster with two hens on a rocky slope with some bushes .'],
 'prompt_ar': 'علّق على الصورة، بإجابة قصيرة.',
 'captions_ar': ['ديك و فرخة علي الأرض', 'ديك وفرخة بحديقة'],
 'prompt_bn': 'ছবির ক্যাপশন দিন, সংক্ষিপ্ত উত্তর।',
 'captions_bn': ['জঙ্গলের মদহে দুইটি বাদামী ও কালো রঙের মুরগি আছে'],
 'prompt_cs': 'Popište obrázek krátkou odpovědí.',
 'captions_cs': ['kohout a slepice v trávě',
  'hnědá slepice a kohout jdoucí po trávě v lese'],
 'prompt_da': 'Giv billedet en billedtekst, kort svar.',
 'captions_da': ['en brun høne og en flerfarvet hane i skovbunden',
  'hane og høne i have'],
 'prompt_de': 'Bildunterschrift für das Bild, kurze Antwort.',
 'captions_de': ['eine henne und ein hanh im steinigem garten im gras .',
  'leicht verschwommene 

In [8]:
from datasets import load_dataset
from tqdm import tqdm
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

# ----------------------------- Configuration -----------------------------

# Language codes to iterate over. For each code the loop reads the
# `prompt_<code>` and `captions_<code>` columns of a sample.
languages = ["en", "bn", "de", "ko", "ru", "zh"]

# How many samples to process. 1 keeps this cell a quick smoke test;
# set to None to run the full dataset.
MAX_SAMPLES = 1

# Generation below samples tokens (do_sample=True). Seeding torch once before
# the loop makes every run start from the same RNG state, so the printed
# predictions are comparable between runs.
SEED = 0

# Directory to save temp images (needed for this model)
TEMP_IMAGE_DIR = "temp_images_caption"
os.makedirs(TEMP_IMAGE_DIR, exist_ok=True)

# ------------------------------ Model setup ------------------------------

# Load Aya Vision model and processor
model_id = "CohereLabs/aya-vision-8b"
model = AutoModelForImageTextToText.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16).eval()
processor = AutoProcessor.from_pretrained(model_id)

dataset = ds['xm3600_captioning']


def generate_caption(image_path, prompt):
    """Run the Aya Vision model on one image/prompt pair and return its reply.

    Args:
        image_path: Path of the image file handed to the chat template.
        prompt: Text instruction sent together with the image.

    Returns:
        The decoded text of the newly generated tokens only (the prompt
        tokens are sliced off before decoding), with special tokens removed.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": prompt}
            ]
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        padding=True,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device)

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            do_sample=True,
            temperature=0.3
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs.input_ids.shape[1]
    return processor.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)


# ---------------------------- Evaluation loop ----------------------------

torch.manual_seed(SEED)

for sample_idx, sample in enumerate(tqdm(dataset, desc="Iterating samples")):
    try:
        image = sample["image"]
        sample_id = sample["sample_id"]

        #### This part will be replaced when attack implementation is ready #####

        image_path = f"{TEMP_IMAGE_DIR}/{sample_id}.jpg"
        if not os.path.exists(image_path):
            # The JPEG writer rejects modes such as RGBA or P; converting first
            # avoids an exception that would otherwise skip the whole sample.
            if image.mode != "RGB":
                image = image.convert("RGB")
            image.save(image_path)

        #########################################################################

        print(f"\n========== Sample ID: {sample_id} ==========")

        for lang in languages:
            prompt_col = f"prompt_{lang}"
            caption_col = f"captions_{lang}"

            # Safety check if caption exists
            if prompt_col not in sample or caption_col not in sample:
                print(f"[{lang}] Missing data.")
                continue

            prompt = sample[prompt_col]
            gt_caption = sample[caption_col]

            # Run inference using Aya Vision model
            pred_caption = generate_caption(image_path, prompt)

            # Display result
            print(f"\n[{lang.upper()}]")
            print(f"Prompt: {prompt}")
            print(f"GT: {gt_caption}")
            print(f"Pred: {textwrap.fill(pred_caption, width=80)}")

        print("=" * 100)

    except Exception as e:
        # Best-effort: report the failure and move on to the next sample.
        print(f"Error processing sample {sample['sample_id']}: {e}")

    # Stop once the configured number of samples has been processed.
    if MAX_SAMPLES is not None and sample_idx + 1 >= MAX_SAMPLES:
        break

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Iterating samples:   0%|          | 0/3600 [00:00<?, ?it/s]



[EN]
Prompt: Caption the image, short answer.
GT: ['a rooster and hens surrounded by green leaves .', 'a rooster with two hens on a rocky slope with some bushes .']
Pred: Two chickens, one brown and one black, are walking on a dirt path surrounded by
greenery and fallen leaves.

[BN]
Prompt: ছবির ক্যাপশন দিন, সংক্ষিপ্ত উত্তর।
GT: ['জঙ্গলের মদহে দুইটি বাদামী ও কালো রঙের মুরগি আছে']
Pred: ছবির ক্যাপশন দিন: সোম অর্থায় বুদ্ধ পর্ব।

[DE]
Prompt: Bildunterschrift für das Bild, kurze Antwort.
GT: ['eine henne und ein hanh im steinigem garten im gras .', 'leicht verschwommene aufnahme von huhn und hahn im freien tagsüber .']
Pred: Drei Hühner: ein braunes Huhn und zwei rote Hähne, die in einem natürlichen
Umfeld mit Grün und Steinen posieren.

[KO]
Prompt: 이미지에 캡션을 달아주세요. 짧은 답변으로.
GT: ['낙엽 과 잡초 가 많은 산길 의 돌 위에 서 있는 닭 두 마리', '시골 마을 에 풀어 놓고 키우는 닭 암수']
Pred: "자연 속에서 자유롭게 돌아다니는 닭들의 모습"

[RU]
Prompt: Подпишите изображение, короткий ответ.
GT: ['Яркий петух и его коричневые куры в парке', 'Курица 

Iterating samples:   0%|          | 0/3600 [00:10<?, ?it/s]


[ZH]
Prompt: 为图片添加标题，简短回答。
GT: ['在 山里 中 站着 两只鸡 ， 一只 黄色 另一 只 黑 黄色 ， 它们 俩 站着 看 向 同 一个 方向', '在 野外 绿植 地上 的 公鸡 和 母鸡 近景']
Pred: “乡村中的鸡群：在自然环境中漫步”



