### pip

In [1]:
!pip install datasets
!pip install fugashi
!pip install emoji
!pip install rouge_score
!pip install unidic-lite # for MeCab



### import

In [2]:
import os
import re
import emoji
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor
import torch
from typing import Callable
import abc
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datasets import Dataset, load_dataset
from concurrent.futures import ProcessPoolExecutor, Future
from fugashi import Tagger
from rouge_score import rouge_scorer, scoring
import unicodedata


## LLMjp package

In [3]:
@dataclass
class TaskConfig:
    max_dataset_len: int | None = None
    rotate_choices: bool = False


class Task(abc.ABC):
    def __init__(self, config: TaskConfig):
        self.config = config

        if self.config.max_dataset_len is not None:
            self.dataset = self._prepare_dataset().select(
                range(self.config.max_dataset_len)
            )
        else:
            self.dataset = self._prepare_dataset()

    @abc.abstractmethod
    def _prepare_dataset(self) -> Dataset:
        """Prepares the dataset."""
        pass

    @abc.abstractmethod
    def doc_to_text(self, doc) -> str:
        """Converts a document to text."""
        pass

    @abc.abstractmethod
    def doc_to_visual(self, doc) -> list[Image.Image]:
        """Converts a document to visual."""
        pass

    @abc.abstractmethod
    def doc_to_id(self, doc) -> str:
        """Converts a document to id."""
        pass

    @abc.abstractmethod
    def doc_to_answer(self, doc) -> str:
        """Converts a document to answer."""
        pass

In [4]:
class JaVLMBenchIntheWild(Task):
    default_metric = "rougel"

    @staticmethod
    def _prepare_dataset() -> Dataset:
        # データセットをロード
        ds = load_dataset("SakanaAI/JA-VLM-Bench-In-the-Wild", split="test")
        ds = ds.rename_column("question", "input_text")
        ds = ds.map(lambda example, idx: {"question_id": idx}, with_indices=True)
        return ds

    @staticmethod
    def doc_to_text(doc) -> str:
        return doc["input_text"]

    @staticmethod
    def doc_to_visual(doc) -> list[Image.Image]:
        return [doc["image"]]

    @staticmethod
    def doc_to_id(doc) -> str:
        return str(doc["question_id"])

    @staticmethod
    def doc_to_answer(doc) -> str:
        return doc["answer"]

In [5]:
class TaskRegistry:
    """Registry to map metrics to their corresponding scorer classes."""

    _tasks: dict[str, Callable[[TaskConfig], Task]] = {
        "ja-vlm-bench-in-the-wild": JaVLMBenchIntheWild,
    }

    @classmethod
    def get_task_list(cls):
        return list(cls._tasks.keys())

    @classmethod
    def load_task(cls, task_name: str, task_config: TaskConfig = TaskConfig()) -> Task:
        try:
            return cls._tasks[task_name](task_config)  # type: ignore
        except KeyError:
            raise ValueError(f"Task '{task_name}' is not supported.")

### scorer

In [6]:
@dataclass
class AggregateOutput:
    overall_score: float
    details: dict[str, float]


@dataclass
class ScorerConfig:
    docs: dict | None = None
    judge_model: str | None = None
    batch_size: int = 10
    random_choice: bool = False


class Scorer(ABC):
    def __init__(self, config: ScorerConfig):
        self.config = config

    @abstractmethod
    def score(self, refs: list[str], preds: list[str]) -> list:
        raise NotImplementedError

    @abstractmethod
    def aggregate(self, scores: list) -> AggregateOutput:
        raise NotImplementedError

In [7]:
class MecabTokenizer:
    def __init__(self) -> None:
        self.tagger = Tagger("-Owakati")

    def normalize_answer(self, text: str) -> str:
        """Lower case text, remove punctuation and extra whitespace, etc."""

        def white_space_fix(text: str) -> str:
            return " ".join(text.split())

        def remove_emoji(text: str) -> str:
            text = "".join(["" if emoji.is_emoji(c) else c for c in text])
            emoji_pattern = re.compile(
                "["
                "\U0001f600-\U0001f64f"  # emoticons
                "\U0001f300-\U0001f5ff"  # symbols & pictographs
                "\U0001f680-\U0001f6ff"  # transport & map symbols
                "\U0001f1e0-\U0001f1ff"  # flags (iOS)
                "\U00002702-\U000027b0"
                "]+",
                flags=re.UNICODE,
            )
            return emoji_pattern.sub(r"", text)

        text = remove_emoji(text)
        # see neologdn docs for details, but handles things like full/half width variation
        # text = neologdn.normalize(text) FIXME: fix c++12 error when installing neologdn
        text = unicodedata.normalize("NFKC", text)
        text = white_space_fix(text)
        return text

    def tokenize(self, text):
        return self.tagger.parse(self.normalize_answer(text)).split()

def rouge_ja(refs: list[str], preds: list[str]) -> dict:
    """Compute ROUGE-L scores for Japanese text.
    Args:
        refs: list of reference strings
        preds: list of predicted strings
    Returns:
        dict: dictionary with keys: { 'rouge1', 'rouge2', 'rougeL' }
        Each value is a float representing the ROUGE score (f-measure) * 100.
    """
    assert isinstance(refs, list) and isinstance(
        preds, list
    ), "refs and preds must be lists."
    tokenizer = MecabTokenizer()
    rouge_types = ["rouge1", "rouge2", "rougeL"]
    # mecab-based rouge
    scorer = rouge_scorer.RougeScorer(
        rouge_types,
        tokenizer=tokenizer,
    )

    # Accumulate confidence intervals.
    aggregator = scoring.BootstrapAggregator()
    for ref, pred in zip(refs, preds):
        aggregator.add_scores(scorer.score(ref, pred))
    result = aggregator.aggregate()
    return {type: result[type].mid.fmeasure * 100 for type in rouge_types}


class RougeLScorer(Scorer):
    @staticmethod
    def score(refs: list[str], preds: list[str]) -> list[float]:
        futures: list[Future[dict[str, float]]] = []
        with ProcessPoolExecutor() as executor:
            for ref, pred in zip(refs, preds):
                future = executor.submit(rouge_ja, [ref], [pred])
                futures.append(future)
        scores = [f.result()["rougeL"] for f in futures]
        return scores

    @staticmethod
    def aggregate(scores: list[float]) -> AggregateOutput:
        mean = sum(scores) / len(scores)
        return AggregateOutput(mean, {"rougel": mean})

In [8]:
class ScorerRegistry:
    """Registry to map metrics to their corresponding scorer classes."""

    _scorers: dict[str, Callable[[ScorerConfig], Scorer]] = {
        "rougel": RougeLScorer,
    }

    @classmethod
    def get_metric_list(cls) -> list[str]:
        """Get a list of supported metrics."""
        return list(cls._scorers.keys())

    @classmethod
    def load_scorer(
        cls, metric: str, scorer_config: ScorerConfig = ScorerConfig()
    ) -> Scorer:
        """Load a scorer instance from the scorer registry."""
        try:
            return cls._scorers[metric](scorer_config)  # type: ignore
        except KeyError:
            raise ValueError(f"Metric '{metric}' is not supported.")

## Gemma Wrapper

In [9]:
# 1. GemmaVLMWrapper クラス: マージ済みモデルをロードして generate() を実装
class GemmaVLMWrapper:
    def __init__(self, model_dir: str):
        self.model = AutoModelForImageTextToText.from_pretrained(
            model_dir, torch_dtype=torch.bfloat16, device_map="auto"
        )
        self.processor = AutoProcessor.from_pretrained(model_dir, use_fast=True)

    def generate(
        self, images: list[Image.Image], text: str, max_new_tokens: int = 50
    ) -> str:
        """
        images: PIL.Image のリスト。必ず1枚以上の画像が渡されることを前提とします。
        text: 入力テキスト（質問文など）。
        max_new_tokens: 生成する最大トークン数。
        """
        # チャット形式で入力プロンプトを作成します。
        # collate_fn の中と同様に、ユーザー発話で画像とテキストを組み合わせたメッセージリストを作ります。
        messages = [
            {
                "role": "user",
                "content": [{"type": "image", "image": img} for img in images]
                + [{"type": "text", "text": text}],
            }
        ]
        # ここでチャットテンプレートを適用し、画像トークンが含まれるプロンプト文字列を生成します。
        prompt = self.processor.apply_chat_template(
            messages, add_generation_prompt=False, tokenize=False
        )

        # processor に渡す際は、画像はもともとリストのまま（さらにリストでラップ）にする必要があります。
        inputs = self.processor(
            text=[prompt], images=[images], return_tensors="pt", padding=True
        )
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

        with torch.no_grad():
            output_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        return self.processor.tokenizer.decode(output_ids[0], skip_special_tokens=True)

## Evaluate Benchmark

In [10]:
task = TaskRegistry.load_task("ja-vlm-bench-in-the-wild")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
# Gemmaモデルのラッパーを初期化（保存済みマージ済みモデルのディレクトリ）
model_dir = "/content/drive/MyDrive/gemma3-jicvqa-finetuned"
model = GemmaVLMWrapper(model_dir)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [25]:
# 評価用に全サンプルから予測と正解をリストに収集する
references = []
predictions = []

# 2. タスクのすべてのサンプルをループ処理する
for example in task.dataset:
    # 各サンプルから入力テキスト、視覚入力、正解回答を取得
    input_text = task.doc_to_text(example)
    image = task.doc_to_visual(
        example
    )  # images は list[Image.Image] として返される前提
    reference = task.doc_to_answer(example)

    # モデルから予測を生成
    pred = model.generate(image, input_text)

    # 予測と正解をリストに追加
    predictions.append(pred)
    references.append(reference)

In [13]:
# 3. スコアラーで評価を行う
scorer = ScorerRegistry.load_scorer("rougel", ScorerConfig(docs=task.dataset))
scores = scorer.score(references, predictions)
result = scorer.aggregate(scores)
print("Evaluation result:", result)

Evaluation result: AggregateOutput(overall_score=np.float64(29.21825823674845), details={'rougel': np.float64(29.21825823674845)})
