# IntelHealth — Inference VRAM Benchmark (Colab)

测量每个 Agent 模型在 GPU 上的推理显存占用和速度。

- 逐个加载 merged 模型（或回退到 base model）
- 记录：加载后显存、推理峰值显存、推理时间、输出 token 数
- 生成对比表格和柱状图

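**前置条件**: merged 模型已上传到 Google Drive 的 `MyDrive/Code_Project/IntelHealth/models/merged/` 目录（每个 Agent 一个以其名称命名的子目录）；若某个 Agent 的子目录不存在或其中没有权重文件，则回退到对应的 base model。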

In [None]:
# 0. Clone repo + install deps
# Notebook cell: the lines starting with `!` / `%` are IPython shell escapes
# and magics, so this cell only runs inside a notebook kernel.
import os
repo_dir = '/content/Intel_Health'
# Clone on the first run; on later runs just pull the latest commits.
if not os.path.exists(repo_dir):
    !git clone https://github.com/DemonRain7/Intel_Health.git {repo_dir}
else:
    !git -C {repo_dir} pull
%cd {repo_dir}

# Colab ships with torch preinstalled; do not reinstall it, otherwise a
# circular-import error is triggered.
!pip -q install "transformers>=4.46" sentencepiece accelerate

# HuggingFace login (for base model fallback)
# First try to read HF_TOKEN from Colab Secrets; on any failure fall back to
# the interactive `huggingface_hub.login()` prompt.
try:
    from google.colab import userdata
    os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")
    print("HF_TOKEN loaded from Colab Secrets")
# NOTE(review): ImportError is already a subclass of Exception, so the tuple
# is equivalent to a plain `except Exception`.
except (ImportError, Exception):
    from huggingface_hub import login
    login()

In [None]:
# 1. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

DRIVE_ROOT = '/content/drive/MyDrive/Code_Project/IntelHealth'
MERGED_MODELS_DIR = f'{DRIVE_ROOT}/models/merged'

# List the merged checkpoints found on Drive. A sub-directory counts as usable
# when it contains at least one *.safetensors or *.bin file.
if not os.path.isdir(MERGED_MODELS_DIR):
    print(f'WARNING: {MERGED_MODELS_DIR} not found, will use base models only')
else:
    print('Merged models found:')
    weight_suffixes = ('.safetensors', '.bin')
    for d in sorted(os.listdir(MERGED_MODELS_DIR)):
        model_dir = os.path.join(MERGED_MODELS_DIR, d)
        if not os.path.isdir(model_dir):
            # Plain files next to the model folders are ignored.
            continue
        has_weights = any(
            entry.endswith(weight_suffixes) for entry in os.listdir(model_dir)
        )
        print(f'  {d} {"✓" if has_weights else "(empty)"}')

In [None]:
# 2. Configuration
import gc
import json
import time
from pathlib import Path

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Agent definitions: agent_name -> (base_model_id, has_sft)
# base_model_id is the Hugging Face model id used when no merged checkpoint is
# found; has_sft is only used to label the row in the benchmark results.
AGENTS = {
    "symptom_normalizer":     ("Qwen/Qwen3-0.6B", True),
    "symptom_quality_grader": ("Qwen/Qwen3-0.6B", True),
    "rag_relevance_grader":   ("Qwen/Qwen3-0.6B", True),
    "drug_evidence_grader":   ("Qwen/Qwen3-0.6B", True),
    "diagnosis_generator":    ("Qwen/Qwen3-1.7B", True),
    "drug_recommender":       ("Qwen/Qwen3-0.6B", False),
    "diagnosis_reviewer":     ("Qwen/Qwen3-1.7B", False),
}

# Test prompt (simulates a real pipeline input). The same chat messages are
# sent to every agent, so the numbers are comparable across models.
TEST_MESSAGES = [
    {"role": "system", "content": "你是医学助理，请将用户口语症状整理为专业、结构化描述。只输出JSON。"},
    {"role": "user", "content": (
        "身体部位: 背部\n主要症状: 肩颈疼痛，脊椎僵硬\n"
        "其他症状: 洗碗久了腰就不舒服\n严重程度: 3\n持续时间: 超过4周\n\n"
        '{"optimized_symptoms": "...", "rag_keywords": ["..."]}'
    )},
]

# Upper bound on the number of tokens generated per benchmark run.
MAX_NEW_TOKENS = 200

# Report the GPU (device 0) the benchmark will run on; total memory is
# converted from bytes to GiB.
print(f'GPU: {torch.cuda.get_device_name(0)}')
print(f'GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB')
print(f'Agents to benchmark: {len(AGENTS)}')

In [None]:
# 3. Benchmark 函数

def get_gpu_mem_mb():
    """Return ``torch.cuda.memory_allocated()`` converted from bytes to MiB."""
    allocated_bytes = torch.cuda.memory_allocated()
    return allocated_bytes / 1024 / 1024

def get_gpu_peak_mb():
    """Return ``torch.cuda.max_memory_allocated()`` converted from bytes to MiB.

    The peak is measured since the last ``torch.cuda.reset_peak_memory_stats()``
    call (or since the start of the program if it was never reset).
    """
    peak_bytes = torch.cuda.max_memory_allocated()
    return peak_bytes / 1024 / 1024

def get_model_param_mb(model):
    """Return the total size of ``model.parameters()`` in MiB.

    Each parameter contributes ``numel() * element_size()`` bytes; anything not
    yielded by ``parameters()`` is not counted.
    """
    total_bytes = 0
    for param in model.parameters():
        total_bytes += param.numel() * param.element_size()
    return total_bytes / 1024 / 1024

def resolve_model_source(agent_name):
    """Pick the checkpoint to load for *agent_name*.

    The merged model under ``MERGED_MODELS_DIR/<agent_name>`` is preferred when
    that directory exists and holds at least one ``.safetensors`` or ``.bin``
    file; otherwise the agent's base model id from ``AGENTS`` is used.

    Returns:
        A ``(source, source_type)`` tuple where ``source_type`` is ``'merged'``
        (``source`` is a local directory) or ``'base'`` (``source`` is the base
        model id).
    """
    candidate_dir = os.path.join(MERGED_MODELS_DIR, agent_name)
    weight_suffixes = ('.safetensors', '.bin')
    if os.path.isdir(candidate_dir) and any(
        entry.endswith(weight_suffixes) for entry in os.listdir(candidate_dir)
    ):
        return candidate_dir, 'merged'

    # No usable merged checkpoint: fall back to the base model.
    fallback_id, _has_sft = AGENTS[agent_name]
    return fallback_id, 'base'


def benchmark_one(agent_name):
    """Benchmark model loading and one generation pass for a single agent.

    The checkpoint is chosen by ``resolve_model_source``, loaded in float16 on
    the CUDA device, and asked to generate up to ``MAX_NEW_TOKENS`` tokens for
    the shared ``TEST_MESSAGES`` prompt.

    CUDA work is queued asynchronously, so the device is synchronized before
    every timer read; intervals use the monotonic ``time.perf_counter()``.
    GPU memory is released in a ``finally`` block so a failed load/generate
    does not leave the cleanup to the next call.

    Args:
        agent_name: Key into ``AGENTS``.

    Returns:
        dict with keys ``agent``, ``base_model``, ``sft``, ``source_type``,
        ``params_mb``, ``load_time_s``, ``infer_time_s``, ``output_tokens``,
        ``tok_per_s``, ``mem_after_load_mb`` and ``mem_peak_infer_mb``.

    Raises:
        KeyError: If *agent_name* is not in ``AGENTS``.
        Exception: Anything raised while loading or generating propagates to
            the caller after the cleanup has run.
    """
    source, source_type = resolve_model_source(agent_name)
    base_model, has_sft = AGENTS[agent_name]

    # Start from a clean allocator state so the numbers belong to this agent.
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

    # Pre-bind so the finally block can drop references even after an early failure.
    model = tokenizer = inputs = output_ids = None
    try:
        # Load tokenizer + model.
        t0 = time.perf_counter()
        tokenizer = AutoTokenizer.from_pretrained(source, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            source,
            torch_dtype=torch.float16,
            device_map='cuda',
            trust_remote_code=True,
        )
        model.eval()
        # Wait for pending host-to-device copies before stopping the timer.
        torch.cuda.synchronize()
        load_time = time.perf_counter() - t0

        params_mb = get_model_param_mb(model)
        mem_after_load = get_gpu_mem_mb()

        # Build the prompt with the model's chat template and move it to the GPU.
        text = tokenizer.apply_chat_template(TEST_MESSAGES, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(text, return_tensors='pt')
        inputs = {k: v.to('cuda') for k, v in inputs.items()}
        input_len = inputs['input_ids'].shape[1]

        # Inference. The peak counter is reset here, so the reported peak is the
        # highest allocation seen from this point on (weights are still resident).
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()
        t1 = time.perf_counter()
        # NOTE: sampling is enabled, so the number of generated tokens (and
        # therefore the timing) can differ from run to run.
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=0.2,
                top_p=0.9,
                do_sample=True,
            )
        # Make sure every queued kernel has finished before reading the clock.
        torch.cuda.synchronize()
        infer_time = time.perf_counter() - t1
        # generate() returns prompt + continuation; count only the new tokens.
        output_tokens = output_ids.shape[1] - input_len
        mem_peak_infer = get_gpu_peak_mb()
    finally:
        # Best-effort cleanup: drop our references and return cached blocks.
        model = tokenizer = inputs = output_ids = None
        gc.collect()
        torch.cuda.empty_cache()

    return {
        'agent': agent_name,
        'base_model': base_model,
        'sft': 'Yes' if has_sft else 'No (base)',
        'source_type': source_type,
        'params_mb': round(params_mb, 1),
        'load_time_s': round(load_time, 1),
        'infer_time_s': round(infer_time, 1),
        'output_tokens': output_tokens,
        # Guard against division by ~0 for degenerate (near-instant) runs.
        'tok_per_s': round(output_tokens / max(infer_time, 0.01), 1),
        'mem_after_load_mb': round(mem_after_load, 1),
        'mem_peak_infer_mb': round(mem_peak_infer, 1),
    }

In [None]:
# 4. Run the benchmark (one agent at a time)
# Each agent is benchmarked in isolation; a failure is recorded as an
# {'agent', 'error'} entry so the remaining agents still run.
results = []

for agent_name in AGENTS:
    print(
        f'\n{"="*50}',
        f'Benchmarking: {agent_name}',
        f'{"="*50}',
        sep='\n',
    )
    try:
        r = benchmark_one(agent_name)
        results.append(r)
        summary_lines = [
            f'  Source:     {r["source_type"]}',
            f'  Params:     {r["params_mb"]} MB',
            f'  Load:       {r["load_time_s"]}s',
            f'  Inference:  {r["infer_time_s"]}s ({r["output_tokens"]} tokens, {r["tok_per_s"]} tok/s)',
            f'  VRAM load:  {r["mem_after_load_mb"]} MB',
            f'  VRAM peak:  {r["mem_peak_infer_mb"]} MB',
        ]
        print('\n'.join(summary_lines))
    except Exception as e:
        print(f'  ERROR: {e}')
        failure = {'agent': agent_name, 'error': str(e)}
        results.append(failure)

print(f'\n{"="*50}')
print(f'All benchmarks complete! ({len(results)} agents)')

In [None]:
# 5. Results table
import pandas as pd

# Only runs that completed without an error have the metric fields.
valid = [r for r in results if 'error' not in r]

display_cols = ['agent', 'base_model', 'sft', 'source_type', 'params_mb',
                'load_time_s', 'infer_time_s', 'output_tokens', 'tok_per_s',
                'mem_after_load_mb', 'mem_peak_infer_mb']
# Passing `columns=` selects and orders the fields in one step and, unlike
# indexing an empty frame with `df[display_cols]`, does not raise KeyError
# when every benchmark failed and `valid` is empty.
df = pd.DataFrame(valid, columns=display_cols)
df.columns = ['Agent', 'Base Model', 'SFT', 'Source', 'Params(MB)',
              'Load(s)', 'Infer(s)', 'Tokens', 'Tok/s',
              'VRAM Load(MB)', 'VRAM Peak(MB)']

print('\n=== Inference VRAM Benchmark ===')
print(f'GPU: {torch.cuda.get_device_name(0)}')
print(f'Total GPU: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB')
print(f'Precision: float16')
print(f'max_new_tokens: {MAX_NEW_TOKENS}')
print()
display(df)

# Sum of the per-agent generation times (model loading excluded).
total_infer = sum(r['infer_time_s'] for r in valid)
print(f'\nPipeline 总推理时间（不含加载）: {total_infer:.1f}s')

In [None]:
# 6. Visualization
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch

valid = [r for r in results if 'error' not in r]
# Put each word of the agent name on its own line so tick labels do not overlap.
agents = [r['agent'].replace('_', '\n') for r in valid]
x = np.arange(len(agents))

# Colour by model size: 0.6B = blue, everything else (1.7B) = orange.
colors = ['#2196F3' if '0.6B' in r['base_model'] else '#FF9800' for r in valid]

# One entry per subplot, in row-major order: (result key, y-axis label, title).
panels = [
    ('mem_after_load_mb', 'MB', 'VRAM After Model Load (MB)'),
    ('mem_peak_infer_mb', 'MB', 'VRAM Peak During Inference (MB)'),
    ('infer_time_s', 'Seconds', 'Inference Time (s)'),
    ('tok_per_s', 'Tokens/s', 'Throughput (tok/s)'),
]

fig, axes = plt.subplots(2, 2, figsize=(16, 10))

for ax, (metric, y_label, title) in zip(axes.flat, panels):
    ax.bar(x, [r[metric] for r in valid], color=colors)
    ax.set_xticks(x)
    ax.set_xticklabels(agents, fontsize=8)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(axis='y', alpha=0.3)

# Legend
legend_elements = [Patch(facecolor='#2196F3', label='Qwen3-0.6B'),
                   Patch(facecolor='#FF9800', label='Qwen3-1.7B')]
fig.legend(handles=legend_elements, loc='upper center', ncol=2, fontsize=11,
           bbox_to_anchor=(0.5, 1.02))

plt.suptitle(f'IntelHealth Inference Benchmark — {torch.cuda.get_device_name(0)} (fp16)',
             fontsize=13, y=1.05)
plt.tight_layout()
# bbox_inches='tight' keeps the legend/title placed above the axes inside the image.
plt.savefig('inference_benchmark.png', dpi=150, bbox_inches='tight')
plt.show()
print('Chart saved to inference_benchmark.png')

In [None]:
# 7. Save results to Google Drive
import json
import shutil

output_dir = f'{DRIVE_ROOT}/docs/benchmark'
os.makedirs(output_dir, exist_ok=True)

# JSON (includes failed runs as {'agent', 'error'} entries).
# ensure_ascii=False writes non-ASCII text verbatim, so the file encoding is
# pinned to UTF-8 instead of depending on the platform default.
json_path = f'{output_dir}/inference_benchmark.json'
with open(json_path, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
print(f'JSON saved to {json_path}')

# Markdown report
gpu_name = torch.cuda.get_device_name(0)
gpu_total = torch.cuda.get_device_properties(0).total_memory / 1024**3

lines = [
    '# IntelHealth 推理显存 Benchmark',
    '',
    f'- **GPU**: {gpu_name} ({gpu_total:.1f} GB)',
    f'- **精度**: float16',
    f'- **max_new_tokens**: {MAX_NEW_TOKENS}',
    '',
    '| Agent | Base Model | SFT | Params(MB) | Load(s) | Infer(s) | Tokens | Tok/s | VRAM Load(MB) | VRAM Peak(MB) |',
    '|-------|-----------|-----|-----------|--------|--------|--------|-------|--------------|--------------|',
]
for r in results:
    if 'error' in r:
        # Error messages (e.g. CUDA out-of-memory) can span several lines or
        # contain '|'; either would break the table row, so collapse all
        # whitespace to single spaces and escape the pipe character.
        error_text = ' '.join(str(r["error"]).split()).replace('|', '\\|')
        lines.append(f'| {r["agent"]} | ERROR | | | | | | | | {error_text} |')
    else:
        lines.append(
            f'| {r["agent"]} | {r["base_model"]} | {r["sft"]} | '
            f'{r["params_mb"]} | {r["load_time_s"]} | {r["infer_time_s"]} | '
            f'{r["output_tokens"]} | {r["tok_per_s"]} | '
            f'{r["mem_after_load_mb"]} | {r["mem_peak_infer_mb"]} |'
        )

md_path = f'{output_dir}/inference_benchmark.md'
# The report contains Chinese text, so write it explicitly as UTF-8.
with open(md_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(lines))
print(f'Markdown saved to {md_path}')

# Also copy the chart, if the visualization cell produced one.
if os.path.exists('inference_benchmark.png'):
    shutil.copy('inference_benchmark.png', f'{output_dir}/inference_benchmark.png')
    print(f'Chart copied to {output_dir}/inference_benchmark.png')