## pip

In [10]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [27]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=54c8fb03068cd2c897b18c11f58dda73505c4201d944684b25c348b0035ea5e4
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [31]:
!pip install fugashi
!pip install emoji

Collecting fugashi
  Downloading fugashi-1.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Downloading fugashi-1.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (698 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/698.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m698.0/698.0 kB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fugashi
Successfully installed fugashi-1.4.0
Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [44]:
!pip install unidic-lite # for MeCab

Collecting unidic-lite
  Downloading unidic-lite-1.0.8.tar.gz (47.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: unidic-lite
  Building wheel for unidic-lite (setup.py) ... [?25l[?25hdone
  Created wheel for unidic-lite: filename=unidic_lite-1.0.8-py3-none-any.whl size=47658817 sha256=2ebdc67ddd145b4210a32382fba2bb152538af9444648070d1873c7f3fd7cb04
  Stored in directory: /root/.cache/pip/wheels/b7/fd/e9/ea4459b868e6d2902e8d80e82dbacb6203e05b3b3a58c64966
Successfully built unidic-lite
Installing collected packages: unidic-lite
Successfully installed unidic-lite-1.0.8


## importする

In [9]:
import os

import torch
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor
from typing import Callable

In [46]:
import abc
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datasets import Dataset, concatenate_datasets, load_dataset
from rouge_score import rouge_scorer, scoring

In [24]:
from concurrent.futures import ProcessPoolExecutor, Future

In [33]:
from fugashi import Tagger

In [34]:
import emoji

In [35]:
import unicodedata
import re

## LLMjpのコードを一部利用

### TaskConfig

In [12]:
@dataclass
class TaskConfig:
    max_dataset_len: int | None = None
    rotate_choices: bool = False

### Task

In [13]:
class Task(abc.ABC):
    def __init__(self, config: TaskConfig):
        self.config = config

        if self.config.max_dataset_len is not None:
            self.dataset = self._prepare_dataset().select(
                range(self.config.max_dataset_len)
            )
        else:
            self.dataset = self._prepare_dataset()

    @abc.abstractmethod
    def _prepare_dataset(self) -> Dataset:
        """Prepares the dataset."""
        pass

    @abc.abstractmethod
    def doc_to_text(self, doc) -> str:
        """Converts a document to text."""
        pass

    @abc.abstractmethod
    def doc_to_visual(self, doc) -> list[Image.Image]:
        """Converts a document to visual."""
        pass

    @abc.abstractmethod
    def doc_to_id(self, doc) -> str:
        """Converts a document to id."""
        pass

    @abc.abstractmethod
    def doc_to_answer(self, doc) -> str:
        """Converts a document to answer."""
        pass

### JaVGVQA500

In [16]:
class JaVGVQA500(Task):
    default_metric = "rougel"

    @staticmethod
    def _prepare_dataset() -> Dataset:
        ds = load_dataset("SakanaAI/JA-VG-VQA-500", split="test")

        def flatten_sample(sample):
            dataset = {
                "image_id": [sample["image_id"] for _ in sample["qas"]],
                "image": [sample["image"] for _ in sample["qas"]],
                "qa_id": [qa["qa_id"] for qa in sample["qas"]],
                "question": [qa["question"] for qa in sample["qas"]],
                "answer": [qa["answer"] for qa in sample["qas"]],
            }
            return Dataset.from_dict(dataset)

        fragments = []
        for i, sample in enumerate(ds):
            data_fragment = flatten_sample(sample)
            fragments.append(data_fragment)

        ds = concatenate_datasets(fragments)
        ds = ds.rename_column("question", "input_text")
        ds = ds.rename_column("qa_id", "question_id")

        return ds

    @staticmethod
    def doc_to_text(doc) -> str:
        return doc["input_text"]

    @staticmethod
    def doc_to_visual(doc) -> list[Image.Image]:
        return [doc["image"]]

    @staticmethod
    def doc_to_id(doc) -> str:
        return str(doc["question_id"])

    @staticmethod
    def doc_to_answer(doc) -> str:
        return doc["answer"]


### TaskRegistry

In [18]:
class TaskRegistry:
    """Registry to map metrics to their corresponding scorer classes."""

    _tasks: dict[str, Callable[[TaskConfig], Task]] = {
        "ja-vg-vqa-500": JaVGVQA500,
    }

    @classmethod
    def get_task_list(cls):
        return list(cls._tasks.keys())

    @classmethod
    def load_task(cls, task_name: str, task_config: TaskConfig = TaskConfig()) -> Task:
        try:
            return cls._tasks[task_name](task_config)  # type: ignore
        except KeyError:
            raise ValueError(f"Task '{task_name}' is not supported.")

### AggregateOutput

In [20]:
@dataclass
class AggregateOutput:
    overall_score: float
    details: dict[str, float]

### ScorerConfig

In [21]:
@dataclass
class ScorerConfig:
    docs: dict | None = None
    judge_model: str | None = None
    batch_size: int = 10
    random_choice: bool = False

### Scorer

In [23]:
class Scorer(ABC):
    def __init__(self, config: ScorerConfig):
        self.config = config

    @abstractmethod
    def score(self, refs: list[str], preds: list[str]) -> list:
        raise NotImplementedError

    @abstractmethod
    def aggregate(self, scores: list) -> AggregateOutput:
        raise NotImplementedError

### MecabTokenizer

In [36]:
class MecabTokenizer:
    def __init__(self) -> None:
        self.tagger = Tagger("-Owakati")

    def normalize_answer(self, text: str) -> str:
        """Lower case text, remove punctuation and extra whitespace, etc."""

        def white_space_fix(text: str) -> str:
            return " ".join(text.split())

        def remove_emoji(text: str) -> str:
            text = "".join(["" if emoji.is_emoji(c) else c for c in text])
            emoji_pattern = re.compile(
                "["
                "\U0001f600-\U0001f64f"  # emoticons
                "\U0001f300-\U0001f5ff"  # symbols & pictographs
                "\U0001f680-\U0001f6ff"  # transport & map symbols
                "\U0001f1e0-\U0001f1ff"  # flags (iOS)
                "\U00002702-\U000027b0"
                "]+",
                flags=re.UNICODE,
            )
            return emoji_pattern.sub(r"", text)

        text = remove_emoji(text)
        # see neologdn docs for details, but handles things like full/half width variation
        # text = neologdn.normalize(text) FIXME: fix c++12 error when installing neologdn
        text = unicodedata.normalize("NFKC", text)
        text = white_space_fix(text)
        return text

    def tokenize(self, text):
        return self.tagger.parse(self.normalize_answer(text)).split()

### rouge_ja

In [37]:
def rouge_ja(refs: list[str], preds: list[str]) -> dict:
    """Compute ROUGE-L scores for Japanese text.
    Args:
        refs: list of reference strings
        preds: list of predicted strings
    Returns:
        dict: dictionary with keys: { 'rouge1', 'rouge2', 'rougeL' }
        Each value is a float representing the ROUGE score (f-measure) * 100.
    """
    assert isinstance(refs, list) and isinstance(
        preds, list
    ), "refs and preds must be lists."
    tokenizer = MecabTokenizer()
    rouge_types = ["rouge1", "rouge2", "rougeL"]
    # mecab-based rouge
    scorer = rouge_scorer.RougeScorer(
        rouge_types,
        tokenizer=tokenizer,
    )

    # Accumulate confidence intervals.
    aggregator = scoring.BootstrapAggregator()
    for ref, pred in zip(refs, preds):
        aggregator.add_scores(scorer.score(ref, pred))
    result = aggregator.aggregate()
    return {type: result[type].mid.fmeasure * 100 for type in rouge_types}


### RougeLScorer

In [38]:
class RougeLScorer(Scorer):
    @staticmethod
    def score(refs: list[str], preds: list[str]) -> list[float]:
        futures: list[Future[dict[str, float]]] = []
        with ProcessPoolExecutor() as executor:
            for ref, pred in zip(refs, preds):
                future = executor.submit(rouge_ja, [ref], [pred])
                futures.append(future)
        scores = [f.result()["rougeL"] for f in futures]
        return scores

    @staticmethod
    def aggregate(scores: list[float]) -> AggregateOutput:
        mean = sum(scores) / len(scores)
        return AggregateOutput(mean, {"rougel": mean})

### ScorerRegistry

In [39]:
class ScorerRegistry:
    """Registry to map metrics to their corresponding scorer classes."""

    _scorers: dict[str, Callable[[ScorerConfig], Scorer]] = {
        "rougel": RougeLScorer,
    }

    @classmethod
    def get_metric_list(cls) -> list[str]:
        """Get a list of supported metrics."""
        return list(cls._scorers.keys())

    @classmethod
    def load_scorer(
        cls, metric: str, scorer_config: ScorerConfig = ScorerConfig()
    ) -> Scorer:
        """Load a scorer instance from the scorer registry."""
        try:
            return cls._scorers[metric](scorer_config)  # type: ignore
        except KeyError:
            raise ValueError(f"Metric '{metric}' is not supported.")

## Gemmaのwrapperクラス。LLMjp eval-mmを回すためにwrapperする

In [40]:
# 1. GemmaVLMWrapper クラス: マージ済みモデルをロードして generate() を実装
class GemmaVLMWrapper:
    def __init__(self, model_dir: str):
        self.model = AutoModelForImageTextToText.from_pretrained(
            model_dir, torch_dtype=torch.bfloat16, device_map="auto"
        )
        self.processor = AutoProcessor.from_pretrained(model_dir, use_fast=True)

    def generate(
        self, images: list[Image.Image], text: str, max_new_tokens: int = 50
    ) -> str:
        """
        images: PIL.Image のリスト。必ず1枚以上の画像が渡されることを前提とします。
        text: 入力テキスト（質問文など）。
        max_new_tokens: 生成する最大トークン数。
        """
        # チャット形式で入力プロンプトを作成します。
        # collate_fn の中と同様に、ユーザー発話で画像とテキストを組み合わせたメッセージリストを作ります。
        messages = [
            {
                "role": "user",
                "content": [{"type": "image", "image": img} for img in images]
                + [{"type": "text", "text": text}],
            }
        ]
        # ここでチャットテンプレートを適用し、画像トークンが含まれるプロンプト文字列を生成します。
        prompt = self.processor.apply_chat_template(
            messages, add_generation_prompt=False, tokenize=False
        )

        # processor に渡す際は、画像はもともとリストのまま（さらにリストでラップ）にする必要があります。
        inputs = self.processor(
            text=[prompt], images=[images], return_tensors="pt", padding=True
        )
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

        with torch.no_grad():
            output_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        return self.processor.tokenizer.decode(output_ids[0], skip_special_tokens=True)

## Benchmarkを回す

In [41]:
# 2. メイン処理: LLMjp のタスク "japanese-heron-bench" のサンプルをロードし評価実施

  # 1. タスクをロード
task = TaskRegistry.load_task("ja-vg-vqa-500")

In [29]:
# Gemmaモデルのラッパーを初期化（保存済みマージ済みモデルのディレクトリ）
model_dir = "/content/drive/MyDrive/gemma3-jicvqa-finetuned" # todo; fix model loading path
model = GemmaVLMWrapper(model_dir)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [42]:
# 2. タスクのすべてのサンプルをループ処理する
# 評価用に全サンプルから予測と正解をリストに収集する
references = []
predictions = []
for example in task.dataset:
    # 各サンプルから入力テキスト、視覚入力、正解回答を取得
    input_text = task.doc_to_text(example)
    images = task.doc_to_visual(
        example
    )  # images は list[Image.Image] として返される前提
    reference = task.doc_to_answer(example)

    # モデルから予測を生成
    pred = model.generate(images, input_text)

    # 予測と正解をリストに追加
    predictions.append(pred)
    references.append(reference)

In [47]:
# 3. スコアラーで評価を行う
scorer = ScorerRegistry.load_scorer("rougel", ScorerConfig(docs=task.dataset))
scores = scorer.score(references, predictions)
result = scorer.aggregate(scores)
print("Evaluation result:", result)

Evaluation result: AggregateOutput(overall_score=np.float64(8.497517725345373), details={'rougel': np.float64(8.497517725345373)})
