In [7]:
import os
import textwrap
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [8]:
from datasets import load_dataset
ds = load_dataset(
    "hwaseem04/Aya-testing",
    data_files={"xGQA_vqa": "data/xGQA_vqa-00000-of-00001.parquet"}
)


In [9]:
ds['xGQA_vqa'][0]

{'sample_id': 1,
 'image_id': 'n161313',
 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x280>,
 'question_en': 'Is it overcast?',
 'answer_en': 'No, it is clear.',
 'question_bn': 'এটা কি মেঘাচ্ছন্ন?',
 'answer_bn': 'না, এটা স্পষ্ট।',
 'question_de': 'Ist es bewölkt?',
 'answer_de': 'Nein, es ist klar.',
 'question_id': 'Apakah langit mendung ?',
 'answer_id': 'Tidak, itu jelas.',
 'question_ko': '날이 흐린가요?',
 'answer_ko': '아니, 분명해요.',
 'question_ru': 'Это пасмурная погода?',
 'answer_ru': 'Нет, это ясно.',
 'question_zh': '是阴天吗？',
 'answer_zh': '不，很清楚。'}

In [6]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
from datasets import load_dataset
from tqdm import tqdm
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

# Load Aya Vision model
model_id = "CohereLabs/aya-vision-8b"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.float16
)

# Languages to iterate over
languages = ["en", "bn", "de", "ko", "ru", "zh"]

# Directory to save temp images (needed for this model)
os.makedirs("temp_images_vqa", exist_ok=True)

dataset = ds['xGQA_vqa']

for sample in tqdm(dataset, desc="Iterating samples"):
    try:
        image = sample["image"]
        sample_id = sample["sample_id"]

        #### This part will be replaced when attack implementation is ready #####

        image_path = f"temp_images_vqa/{sample_id}.jpg"
        if not os.path.exists(image_path):
            image.save(image_path)

        #########################################################################

        print(f"\n========== Sample ID: {sample_id} ==========")

        for lang in languages:
            question_col = f"question_{lang}"
            answer_col = f"answer_{lang}"

            # Safety check if answer exists
            if question_col not in sample or answer_col not in sample:
                print(f"[{lang}] Missing data.")
                continue

            question = sample[question_col]
            gt_answer = sample[answer_col]

            # Aya Vision input formatting
            messages = [
                {"role": "user",
                 "content": [
                     {"type": "image", "url": image_path},
                     {"type": "text", "text": question},
                 ]},
            ]

            inputs = processor.apply_chat_template(
                messages,
                padding=True,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt"
            ).to(model.device)

            gen_tokens = model.generate(
                **inputs,
                max_new_tokens=300,
                do_sample=True,
                temperature=0.3,
            )

            pred_answer = processor.tokenizer.decode(
                gen_tokens[0][inputs.input_ids.shape[1]:],
                skip_special_tokens=True
            )

            # Display result
            print(f"\n[{lang.upper()}]")
            print(f"Question: {question}")
            print(f"GT: {gt_answer}")
            print(f"Pred: {textwrap.fill(pred_answer, width=80)}")

        print("=" * 100)

    except Exception as e:
        print(f"Error processing sample {sample['sample_id']}: {e}")

    break


processor_config.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/4.21k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/71.8k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.42G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/178 [00:00<?, ?B/s]

Iterating samples:   0%|          | 0/9666 [00:00<?, ?it/s]



[EN]
Question: Is it overcast?
GT: No, it is clear.
Pred: No, it is not overcast. The sky in the image is clear and blue, indicating a
sunny day. The sunlight is bright and casts shadows on the snow, which further
confirms that there are no clouds obscuring the sun.

[BN]
Question: এটা কি মেঘাচ্ছন্ন?
GT: না, এটা স্পষ্ট।
Pred: এই ছবিতে কোন মেঘ দেখা যায়। সাফ নীল আকাশ দেখা যায়, যা একটি স্বাচ্ছন্দ ও
স্বাচ্ছন্দ পরিবেশ প্রদর্শন করে। ছবি স্নোউবোর্ডিং পরিবেশের সুন্দরতা ও আকাশের
স্বাচ্ছন্দতা প্রকাশ করে, যেখানে মেঘের অভাব পরিবেশের স্বাচ্ছন্দ ও স্বাচ্ছন্দ
প্রকৃতিক সৌন্দর্য বৃদ্ধি করে।

[DE]
Question: Ist es bewölkt?
GT: Nein, es ist klar.
Pred: Nein, es ist nicht bewölkt. Der Himmel ist klar und blau, ohne jegliche Wolken,
was eine ideale Bedingung für Aktivitäten wie Snowboarden bietet. Die Sicht ist
ausgezeichnet, und die Sonne scheint hell, was das Erlebnis noch angenehmer
macht.

[KO]
Question: 날이 흐린가요?
GT: 아니, 분명해요.
Pred: 아니요, 날씨는 흐린 것이 아닙니다. 사진에서 하늘은 맑고 푸르며, 구름 한 점 없이 밝은 햇살이 비추고 있습니다. 이

Iterating samples:   0%|          | 0/9666 [00:31<?, ?it/s]


[ZH]
Question: 是阴天吗？
GT: 不，很清楚。
Pred: 根据图片，天空是晴朗的，没有云彩。这表明天气晴朗，没有阴天。



