【Feature】【Part1】AISBench supports the VBench 1.0 Video Quality Evaluation Pipeline. #273
Conversation
There was a problem hiding this comment.
Code Review
This pull request introduces VBench 1.0 evaluation support, adding a new VBenchEvalTask for video quality metrics and a VBenchSummarizer for result aggregation. Feedback focused on fixing a potential RuntimeError from repeated distributed initialization, ensuring all dimension scores are exposed for the summarizer, and correcting weighted average calculations when dimensions are missing. Additionally, improvements were suggested for normalization logic, error handling, and code organization.
| for dataset_cfg in self.dataset_cfgs: | ||
| eval_cfg = dataset_cfg.get('eval_cfg') or {} | ||
| # videos_path: required, from path or videos_path | ||
| videos_path = dataset_cfg.get('videos_path') or dataset_cfg.get('path') | ||
| if not videos_path or not osp.isdir(videos_path): | ||
| raise ValueError( | ||
| f"VBench dataset must have 'path' or 'videos_path' pointing to a video directory, got: {videos_path}" | ||
| ) | ||
| # device: cuda | npu | None (auto-detect) | ||
| device_str = eval_cfg.get('device') | ||
| if device_str is not None and device_str not in ('cuda', 'npu'): | ||
| device_str = None | ||
| dist_init(device=device_str) | ||
| device_str = get_device() | ||
| # full_json_dir: VBench full info json | ||
| full_json_dir = dataset_cfg.get('full_json_dir') or eval_cfg.get('full_json_dir') | ||
| if not full_json_dir or not osp.isfile(full_json_dir): | ||
| # default under third_party/vbench | ||
| pkg_root = osp.abspath(osp.join(osp.dirname(__file__), '..', '..')) | ||
| default_full = osp.join(pkg_root, 'third_party', 'vbench', 'VBench_full_info.json') | ||
| if osp.isfile(default_full): | ||
| full_json_dir = default_full | ||
| else: | ||
| raise FileNotFoundError( | ||
| f"VBench full_info json not found. Set dataset full_json_dir or place VBench_full_info.json at {default_full}" | ||
| ) | ||
| # dimension_list | ||
| dimension_list = dataset_cfg.get('dimension_list') or eval_cfg.get('dimension_list') | ||
| if not dimension_list: | ||
| dimension_list = [ | ||
| 'subject_consistency', 'background_consistency', 'aesthetic_quality', | ||
| 'imaging_quality', 'object_class', 'multiple_objects', 'color', | ||
| 'spatial_relationship', 'scene', 'temporal_style', 'overall_consistency', | ||
| 'human_action', 'temporal_flickering', 'motion_smoothness', 'dynamic_degree', | ||
| 'appearance_style', | ||
| ] | ||
| # output dir: work_dir/results/<model_abbr>/ | ||
| model_abbr = model_abbr_from_cfg(self.model_cfg) | ||
| dataset_abbr = dataset_abbr_from_cfg(dataset_cfg) | ||
| output_dir = osp.join(self.work_dir, self.output_subdir, model_abbr) | ||
| os.makedirs(output_dir, exist_ok=True) | ||
|
|
||
| if get_rank() == 0: | ||
| self.logger.info( | ||
| f"VBench eval: videos_path={videos_path}, device={device_str}, " | ||
| f"dimensions={len(dimension_list)}, output_dir={output_dir}" | ||
| ) | ||
|
|
||
| import torch | ||
| device = torch.device(device_str) | ||
| vbench = VBench(device, full_json_dir, output_dir) | ||
|
|
||
| # 注册进度回调,将 VBench 内部的维度进度映射到 TaskStateManager | ||
| if task_state_manager is not None: | ||
| def _on_progress(dimension: str, finished: int, total: int, video_path: str | None = None, **_): | ||
| # 仅在 rank0 上上报,避免多卡重复 | ||
| if get_rank() != 0: | ||
| return | ||
| state = { | ||
| "status": "evaluating", | ||
| "total_count": total, | ||
| "finish_count": finished, | ||
| "other_kwargs": { | ||
| "dimension": dimension, | ||
| }, | ||
| } | ||
| task_state_manager.update_task_state(state) | ||
|
|
||
| set_progress_callback(_on_progress) | ||
|
|
||
| # Infer mode if not explicitly provided | ||
| mode = self._infer_mode(dataset_cfg, eval_cfg) | ||
|
|
||
| prompt_list = dataset_cfg.get('prompt_list') or eval_cfg.get('prompt_list') or [] | ||
| prompt_file = eval_cfg.get('prompt_file') | ||
| if prompt_file and osp.isfile(prompt_file): | ||
| with open(prompt_file, 'r') as f: | ||
| prompt_list = json.load(f) | ||
| assert isinstance(prompt_list, dict), "prompt_file must be JSON dict {video_path: prompt}" | ||
|
|
||
| kwargs = {} | ||
| if eval_cfg.get('category'): | ||
| kwargs['category'] = eval_cfg['category'] | ||
| if eval_cfg.get('imaging_quality_preprocessing_mode'): | ||
| kwargs['imaging_quality_preprocessing_mode'] = eval_cfg['imaging_quality_preprocessing_mode'] | ||
|
|
||
| try: | ||
| raw_results = vbench.evaluate( | ||
| videos_path=videos_path, | ||
| name=dataset_abbr, | ||
| prompt_list=prompt_list, | ||
| dimension_list=dimension_list, | ||
| local=eval_cfg.get('load_ckpt_from_local', False), | ||
| read_frame=eval_cfg.get('read_frame', False), | ||
| mode=mode, | ||
| **kwargs, | ||
| ) | ||
|
|
||
| if get_rank() == 0: | ||
| # Wrap raw VBench results to {accuracy, details} schema and save. | ||
| wrapped = self._wrap_results(raw_results) | ||
| final_out = get_infer_output_path( | ||
| self.model_cfg, | ||
| dataset_cfg, | ||
| osp.join(self.work_dir, self.output_subdir), | ||
| ) | ||
| os.makedirs(osp.dirname(final_out), exist_ok=True) | ||
| with open(final_out, 'w', encoding='utf-8') as f: | ||
| json.dump(wrapped, f, ensure_ascii=False, indent=4) | ||
| self.logger.info(f"VBench wrapped results saved to {final_out}") | ||
| finally: | ||
| dist_destroy() |
There was a problem hiding this comment.
每次初始化的device独立,不存在重复初始化
| def _wrap_results(self, raw_results: dict) -> dict: | ||
| """Convert raw VBench per-dimension results to {accuracy, details} schema.""" | ||
| details = {} | ||
| scores = [] | ||
| for dim, value in raw_results.items(): | ||
| dim_detail = {} | ||
| if isinstance(value, dict): | ||
| dim_detail = value | ||
| score = value.get('score') or value.get('mean_score') | ||
| elif isinstance(value, (list, tuple)) and len(value) == 2: | ||
| score, video_results = value | ||
| dim_detail = {'score': score, 'video_results': video_results} | ||
| else: | ||
| dim_detail = {'value': value} | ||
| score = None | ||
| if isinstance(score, (int, float)): | ||
| scores.append(score) | ||
| details[dim] = dim_detail | ||
| accuracy = statistics.mean(scores) if scores else 0.0 | ||
| return {'accuracy': accuracy * 100, 'details': details} |
There was a problem hiding this comment.
当前的 _wrap_results 实现仅返回聚合的 accuracy。由于 DefaultSummarizer 在加载结果时会过滤掉 details 字段,导致 VBenchSummarizer 无法获取各个维度的具体得分进行加权汇总。建议将各个维度的得分也平铺到返回字典的顶层,并统一加上 vbench_ 前缀,以便汇总器识别。
def _wrap_results(self, raw_results: dict) -> dict:
"""Convert raw VBench per-dimension results to {accuracy, details} schema."""
details = {}
scores = []
wrapped = {}
for dim, value in raw_results.items():
dim_detail = {}
if isinstance(value, dict):
dim_detail = value
score = value.get('score') or value.get('mean_score')
elif isinstance(value, (list, tuple)) and len(value) == 2:
score, video_results = value
dim_detail = {'score': score, 'video_results': video_results}
else:
dim_detail = {'value': value}
score = None
if isinstance(score, (int, float)):
scores.append(score)
# Flatten dimension score for Summarizer
wrapped[f'vbench_{dim}'] = score * 100
details[dim] = dim_detail
accuracy = statistics.mean(scores) if scores else 0.0
wrapped.update({'accuracy': accuracy * 100, 'details': details})
return wrapped
| for abbr, data in model_results.items(): | ||
| if not abbr.startswith('vbench_'): | ||
| continue | ||
| acc = data.get('accuracy') | ||
| if acc is None or not isinstance(acc, (int, float)): | ||
| continue | ||
| const_key = _abbr_to_const_key(abbr) | ||
| vbench_scores[const_key] = acc |
There was a problem hiding this comment.
汇总逻辑目前是按数据集(abbr)遍历并提取其 accuracy。然而,一个 VBench 数据集通常包含多个维度的指标。如果按照建议修改了 VBenchEvalTask 的输出结构,这里应该改为遍历数据集下的所有指标(metrics),提取以 vbench_ 开头的维度得分。
| for abbr, data in model_results.items(): | |
| if not abbr.startswith('vbench_'): | |
| continue | |
| acc = data.get('accuracy') | |
| if acc is None or not isinstance(acc, (int, float)): | |
| continue | |
| const_key = _abbr_to_const_key(abbr) | |
| vbench_scores[const_key] = acc | |
| for abbr, data in model_results.items(): | |
| for metric_name, score in data.items(): | |
| if not metric_name.startswith('vbench_'): | |
| continue | |
| if metric_name in ['vbench_quality', 'vbench_semantic', 'vbench_total']: | |
| continue | |
| const_key = _abbr_to_const_key(metric_name) | |
| vbench_scores[const_key] = score |
| """Normalize and apply DIM_WEIGHT per cal_final_score.py.""" | ||
| if const_key not in NORMALIZE_DIC or const_key not in DIM_WEIGHT: | ||
| return 0.0 | ||
| raw = raw_score / 100.0 if raw_score > 1 else raw_score |
| quality_num = sum(normalized.get(k, 0) for k in QUALITY_LIST) | ||
| quality_denom = sum(DIM_WEIGHT.get(k, 0) for k in QUALITY_LIST) | ||
| quality_score = ( | ||
| quality_num / quality_denom if quality_denom else 0.0 | ||
| ) | ||
|
|
||
| semantic_num = sum(normalized.get(k, 0) for k in SEMANTIC_LIST) | ||
| semantic_denom = sum(DIM_WEIGHT.get(k, 0) for k in SEMANTIC_LIST) | ||
| semantic_score = ( | ||
| semantic_num / semantic_denom if semantic_denom else 0.0 | ||
| ) |
There was a problem hiding this comment.
计算加权平均分时,分母使用了完整的维度列表权重和。如果用户只运行了部分维度的评测,这会导致总分被错误地拉低。建议分母仅累加当前结果中实际存在的维度的权重。
| quality_num = sum(normalized.get(k, 0) for k in QUALITY_LIST) | |
| quality_denom = sum(DIM_WEIGHT.get(k, 0) for k in QUALITY_LIST) | |
| quality_score = ( | |
| quality_num / quality_denom if quality_denom else 0.0 | |
| ) | |
| semantic_num = sum(normalized.get(k, 0) for k in SEMANTIC_LIST) | |
| semantic_denom = sum(DIM_WEIGHT.get(k, 0) for k in SEMANTIC_LIST) | |
| semantic_score = ( | |
| semantic_num / semantic_denom if semantic_denom else 0.0 | |
| ) | |
| quality_num = sum(normalized.get(k, 0) for k in QUALITY_LIST) | |
| quality_denom = sum(DIM_WEIGHT.get(k, 0) for k in QUALITY_LIST if k in normalized) | |
| quality_score = ( | |
| quality_num / quality_denom if quality_denom else 0.0 | |
| ) | |
| semantic_num = sum(normalized.get(k, 0) for k in SEMANTIC_LIST) | |
| semantic_denom = sum(DIM_WEIGHT.get(k, 0) for k in SEMANTIC_LIST if k in normalized) | |
| semantic_score = ( | |
| semantic_num / semantic_denom if semantic_denom else 0.0 | |
| ) |
| if prompt_file and osp.isfile(prompt_file): | ||
| with open(prompt_file, 'r') as f: | ||
| prompt_list = json.load(f) | ||
| assert isinstance(prompt_list, dict), "prompt_file must be JSON dict {video_path: prompt}" |
There was a problem hiding this comment.
避免在生产代码中使用 assert 语句进行运行时参数校验,因为在 Python 优化模式(python -O)下 assert 会被跳过。建议改为显式的类型检查并抛出异常。
| assert isinstance(prompt_list, dict), "prompt_file must be JSON dict {video_path: prompt}" | |
| if not isinstance(prompt_list, dict): | |
| raise ValueError("prompt_file must be JSON dict {video_path: prompt}") |
| f"dimensions={len(dimension_list)}, output_dir={output_dir}" | ||
| ) | ||
|
|
||
| import torch |
| dimension_list = [ | ||
| 'subject_consistency', 'background_consistency', 'aesthetic_quality', | ||
| 'imaging_quality', 'object_class', 'multiple_objects', 'color', | ||
| 'spatial_relationship', 'scene', 'temporal_style', 'overall_consistency', | ||
| 'human_action', 'temporal_flickering', 'motion_smoothness', 'dynamic_degree', | ||
| 'appearance_style', | ||
| ] |
There was a problem hiding this comment.
三方依赖中 ais_bench/third_party/vbench/__init__.py 的 build_full_dimension_list 函数会自动初始化这些评估维度,此处不需要重复做硬编码。
| def _ensure_vbench_in_path(self): | ||
| """Prepend third_party and third_party/detectron2 to sys.path so vbench and detectron2 resolve to ais_bench copy.""" | ||
| # __file__ = ais_bench/benchmark/tasks/vbench_eval.py -> pkg_root = ais_bench | ||
| pkg_root = osp.abspath(osp.join(osp.dirname(__file__), '..', '..')) |
There was a problem hiding this comment.
[review] issue: 这里把 pkg_root 解析成了 ais_bench 目录,但仓库里的 third_party 实际在仓库根目录下,不在 ais_bench 下,导致后续拼出的 third_party 和 detectron2 路径在当前仓库结构中是错的,运行时很可能找不到 vbench 依赖。
suggestion: 不要从当前文件只回退两级,改为统一解析仓库根目录后再拼 third_party;更稳妥的做法是抽成一个公共 helper,避免同类路径逻辑在文件里重复。
| device_str = eval_cfg.get('device') | ||
| if device_str is not None and device_str not in ('cuda', 'npu'): | ||
| device_str = None | ||
| dist_init(device=device_str) |
There was a problem hiding this comment.
[review] issue: dist_init 在这里执行,但真正的 try/finally 从 187 行才开始;如果中间任何一步抛异常,比如 full_json_dir 校验失败、torch 导入失败、VBench 构造失败或 build_full_dimension_list 失败,dist_destroy 不会被调用,分布式上下文和设备资源会泄漏。
suggestion: 把 try/finally 提前到 dist_init 之后立刻开始,确保只要初始化成功就一定能执行 dist_destroy;更好的是把分布式初始化封装成上下文管理器。
| full_json_dir = dataset_cfg.get('full_json_dir') or eval_cfg.get('full_json_dir') | ||
| if not full_json_dir or not osp.isfile(full_json_dir): | ||
| # default under third_party/vbench | ||
| pkg_root = osp.abspath(osp.join(osp.dirname(__file__), '..', '..')) |
There was a problem hiding this comment.
[review] issue: 这里对 VBench_full_info.json 的默认路径回退复用了和 47 行同样的错误根目录假设,会去找 ais_bench/third_party/vbench/VBench_full_info.json,而不是仓库根目录下的 third_party/vbench/VBench_full_info.json,默认配置下会直接报找不到文件。
suggestion: 和依赖路径解析共用同一个“仓库根目录定位”函数,避免路径规则分散且重复出错。
| dim_detail = {} | ||
| if isinstance(value, dict): | ||
| dim_detail = value | ||
| score = value.get('score') or value.get('mean_score') |
There was a problem hiding this comment.
[review] issue: score = value.get('score') or value.get('mean_score') 会把 0 或 0.0 当成 False 处理,导致合法的零分被错误替换成 mean_score 或 None,最终 accuracy 统计会失真。
suggestion: 显式判断键是否存在,例如优先判断 'score' in value,再取值;不要用 or 处理数值型字段。
| sys.path.insert(0, os.getcwd()) | ||
| script_path = __file__ | ||
| python = sys.executable | ||
| command = f'{python} {script_path} {cfg_path}' |
There was a problem hiding this comment.
[review] issue: 命令通过字符串直接拼接 python、script_path 和 cfg_path,没有做平台相关的参数转义;一旦路径中包含空格、括号或特殊字符,命令容易执行失败,在通过 shell 模板执行时也会扩大注入风险。
suggestion: 优先返回参数列表而不是命令字符串;如果必须返回字符串,至少使用平台兼容的安全转义方式生成命令。
| self.num_gpus = 1 | ||
|
|
||
| def get_command(self, cfg_path, template): | ||
| sys.path.insert(0, os.getcwd()) |
There was a problem hiding this comment.
[review] issue: get_command 的职责只是生成命令,但这里顺带修改了当前进程的 sys.path,会给调用方带来隐式全局副作用,也让命令生成逻辑和运行环境耦合在一起。
suggestion: 把环境准备放到实际执行路径里,get_command 保持纯函数;如果确实需要注入 PYTHONPATH,应该显式体现在命令模板或运行器配置中。
|
|
||
| prompt_list = dataset_cfg.get('prompt_list') or eval_cfg.get('prompt_list') or [] | ||
| prompt_file = eval_cfg.get('prompt_file') | ||
| if prompt_file and osp.isfile(prompt_file): |
There was a problem hiding this comment.
[review] issue: 配置了prompt_file但文件不存在时,这里会静默跳过并继续使用空prompt_list或其他回退值,最终可能以错误模式跑完评估,且排障成本很高。
suggestion: 只要配置了 prompt_file,就应先判断文件是否存在,不存在时直接抛出明确异常,而不是静默降级。
| try: | ||
| task = VBenchEvalTask(cfg) | ||
| task.run(task_state_manager) | ||
| except Exception as e: |
There was a problem hiding this comment.
[review] issue: except 中使用 raise e 会重置异常抛出位置,损失原始 traceback 的可读性,不利于定位真实故障点。
suggestion: 改为直接使用 raise,保留原始调用栈。
| if span <= 0: | ||
| norm = 1.0 if raw >= max_val else 0.0 | ||
| else: | ||
| norm = (raw - min_val) / span |
There was a problem hiding this comment.
[review] issue: 归一化后没有做边界限制,原始分数一旦超出预期范围,可能得到负值或大于1的结果,进一步导致最终总分出现负分或超过100分。
suggestion: 对归一化结果做[0, 1]截断;如果必须严格复刻上游逻辑,也至少在越界时记录warning,避免异常值静默进入最终汇总。
| from ais_bench.benchmark.summarizers.default import DefaultSummarizer | ||
|
|
||
| # VBench official constants from scripts/constant.py | ||
| NORMALIZE_DIC = { |
There was a problem hiding this comment.
[review] issue: NORMALIZE_DIC、DIM_WEIGHT、QUALITY_LIST、SEMANTIC_LIST 都硬编码在文件内,而注释又说明它们来自官方脚本,后续官方规则变更时这里容易与上游漂移。
suggestion: 将这些常量抽到独立配置或版本化资源文件,并标注来源版本或 commit;必要时增加一致性校验。
| model_results = parsed_results.get(model_abbr, {}) | ||
| vbench_scores = {} | ||
| for abbr, data in model_results.items(): | ||
| if not abbr.startswith('vbench_'): |
There was a problem hiding this comment.
[review] issue: 通过abbr.startswith('vbench_') 识别 VBench 结果,过度依赖命名约定;一旦配置自定义 abbr 或命名规则变化,相关维度会被静默跳过,最终不生成聚合指标。
suggestion: 改为基于明确元数据识别,例如数据集类型、任务标记或结果中的 benchmark 标识,而不是依赖字符串前缀。
| for k, v in vbench_scores.items() | ||
| } | ||
|
|
||
| quality_num = sum(normalized.get(k, 0) for k in QUALITY_LIST) |
There was a problem hiding this comment.
[review] issue: quality_denom固定按QUALITY_LIST全量权重求和,缺失维度会被隐式按0分处理,导致部分维度评测时总分系统性偏低,结果混淆了“模型表现差”和“输入不完整”。
suggestion: 分母只统计实际存在的维度;如果设计上要求必须全量维度,则在聚合前显式校验缺失项并报错。
| quality_num / quality_denom if quality_denom else 0.0 | ||
| ) | ||
|
|
||
| semantic_num = sum(normalized.get(k, 0) for k in SEMANTIC_LIST) |
There was a problem hiding this comment.
[review] issue: semantic_denom同样按全量语义维度计算,和质量分一样会在部分维度缺失时产生失真结果,而且调用方很难从结果中看出这是输入不完整造成的。
suggestion: 只基于实际参与聚合的维度计算分母,或者在缺项时返回明确错误/告警信息。
| ]: | ||
| raw_results[model_abbr].setdefault(name, {})['accuracy'] = score | ||
| parsed_results[model_abbr].setdefault(name, {})['accuracy'] = score | ||
| if name not in dataset_metrics: |
There was a problem hiding this comment.
[review] issue: 这里把vbench_quality、vbench_semantic、vbench_total直接写入dataset_metrics的全局key空间,派生汇总指标与真实数据集结果处于同一层级,存在命名冲突和语义混淆风险。
suggestion: 为聚合指标使用独立命名空间,或复用父类的group/summary机制,避免把派生指标伪装成普通数据集结果。
Thanks for your contribution; we appreciate it a lot. The following instructions will make your pull request healthier and help you get feedback more easily. If you do not understand some items, don't worry, just make the pull request and seek help from maintainers.
感谢您的贡献,我们非常重视。以下说明将使您的拉取请求更健康,更易于获得反馈。如果您不理解某些项目,请不要担心,只需提交拉取请求并从维护人员那里寻求帮助即可。
PR Type / PR类型
Related Issue | 关联 Issue
(暂无)
🔍 Motivation / 变更动机
在 ais_bench 中接入 VBench 1.0 类视频/图像质量评测:对给定视频目录跑官方多维指标(质量、语义、风格等),评测结果结构与现有 benchmark 任务输出对齐,并可通过配置控制缓存与运行模式。子进程通过仓库内 third_party 与 detectron2 路径解析依赖;VBENCH_CACHE_DIR / vbench_cache_dir 在分区阶段写入任务配置,供子进程在 import vbench 前生效。
📝 Modification / 修改内容
- VBenchEvalTask(ais_bench/benchmark/tasks/vbench_eval.py),解析 dataset / eval 配置,推断运行模式(vbench_standard/vbench_category/custom_input等),调用 VBench 并将各维度原始结果规整为{accuracy, details}。
- VBenchDataset(ais_bench/benchmark/datasets/vbench.py)及注册,提供最小占位load();实际评测由任务直接读取 dataset 配置完成。
- VBenchSummarizer(ais_bench/benchmark/summarizers/vbench.py),使用官方归一化区间与维度权重聚合。
- configs/summarizers/vbench.py;更新tasks/__init__.py、datasets/__init__.py、summarizers/__init__.py。
- BasePartitioner默认keep_keys增加VBENCH_CACHE_DIR、vbench_cache_dir,保证子进程任务 cfg 可复制上述键。
📐 Associated Test Results / 关联测试结果
否。本次为新增任务、数据集占位类、汇总器与可选顶层配置键传递,不改变既有任务默认行为。
未专门评估;VBench 本身依赖 GPU/NPU 与模型推理,开销由评测工作量决定。
🌟 Use cases (Optional) / 使用案例(可选)
custom_input场景(由eval_cfg/ dataset 配置共同决定)。
✅ Checklist / 检查列表
Before PR:
After PR:
👥 Collaboration Info / 协作信息
🌟 Useful CI Command / 实用的CI命令
/gemini review
/gemini summary
/gemini help
/readthedocs build