# MateConv mini Inference and Evaluate

## 1. 两种微调架构的模型导入--FFN和MOE

In [1]:
import itertools
import re
import json
import jsonlines
import psutil
import ujson
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
from datasets import load_dataset
import os
from tqdm import tqdm
import torch
from model.model import Transformer  # 确保路径正确
from model.LMConfig import LMConfig
from model.LMConfig_FFN import LMConfig_FFN   # 导入 LMConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 定义BOS和EOS标记
bos_token = "<s>"
eos_token = "</s>"

In [3]:
# 加载训练好的分词器路径
tokenizer = AutoTokenizer.from_pretrained('./model/mateconv_tokenizer', use_fast=False)
print(f'加载的tokenizer词表大小: {len(tokenizer)}')

加载的tokenizer词表大小: 6400


In [4]:
# 创建配置对象
lm_config_moe = LMConfig()
lm_config_ffn = LMConfig_FFN()

In [5]:
# 初始化 Transformer 模型
model_moe = Transformer(lm_config_moe)
model_ffn = Transformer(lm_config_ffn)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
device

device(type='cuda')

In [8]:
model_moe.to(device)

# 检查模型结构和参数
print(model_moe)

Transformer(
  (tok_embeddings): Embedding(6400, 512)
  (dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-7): 8 x TransformerBlock(
      (attention): Attention(
        (wq): Linear(in_features=512, out_features=512, bias=False)
        (wk): Linear(in_features=512, out_features=256, bias=False)
        (wv): Linear(in_features=512, out_features=256, bias=False)
        (wo): Linear(in_features=512, out_features=512, bias=False)
        (attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
      (attention_norm): RMSNorm()
      (ffn_norm): RMSNorm()
      (feed_forward): MOEFeedForward(
        (experts): ModuleList(
          (0-3): 4 x FeedForward(
            (w1): Linear(in_features=512, out_features=1408, bias=False)
            (w2): Linear(in_features=1408, out_features=512, bias=False)
            (w3): Linear(in_features=512, out_features=1408, bias=False)
            (dropout): Dropout(p=0.0, inpla

In [9]:
model_ffn.to(device)
print(model_ffn)

Transformer(
  (tok_embeddings): Embedding(6400, 512)
  (dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-7): 8 x TransformerBlock(
      (attention): Attention(
        (wq): Linear(in_features=512, out_features=512, bias=False)
        (wk): Linear(in_features=512, out_features=256, bias=False)
        (wv): Linear(in_features=512, out_features=256, bias=False)
        (wo): Linear(in_features=512, out_features=512, bias=False)
        (attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
      (attention_norm): RMSNorm()
      (ffn_norm): RMSNorm()
      (feed_forward): FeedForward(
        (w1): Linear(in_features=512, out_features=1408, bias=False)
        (w2): Linear(in_features=1408, out_features=512, bias=False)
        (w3): Linear(in_features=512, out_features=1408, bias=False)
        (dropout): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (norm): RMSNorm()
  (output): Linear(in_features=512, 

In [10]:
def find_state_dict(ckpt_obj):
    # 常见保存格式：直接为 state_dict，或包含 'model_state' / 'state_dict' 键的 dict
    if isinstance(ckpt_obj, dict):
        if 'model_state' in ckpt_obj:
            return ckpt_obj['model_state']
        if 'state_dict' in ckpt_obj:
            return ckpt_obj['state_dict']
        # 有时候保存时把 state_dict 放在第一个 value 中
        # 尝试从 values 中找到第一个看起来像 state_dict 的对象
        for v in ckpt_obj.values():
            if isinstance(v, dict) and all(isinstance(x, torch.Tensor) for x in v.values()):
                return v
        # 否则，可能本身就是 state_dict（键->tensor）
        if all(isinstance(x, torch.Tensor) for x in ckpt_obj.values()):
            return ckpt_obj
        # 不能识别
        return None
    else:
        return None

def strip_prefix(state_dict, prefix):
    new_state = {}
    for k, v in state_dict.items():
        if k.startswith(prefix):
            new_state[k[len(prefix):]] = v
        else:
            new_state[k] = v
    return new_state

In [11]:
# 加载ffn微调模型
# 加载模型权重（支持 raw state_dict 或训练时保存的 checkpoint dict）
ckpt_path = 'out/full_sft_512.pth'  # 改为你的 checkpoint 路径，或 'out/pretrain_512.pth'
ckpt = torch.load(ckpt_path, map_location=device)
state_dict = find_state_dict(ckpt)
if state_dict is None:
    raise RuntimeError(f"无法从 checkpoint 中识别出 state_dict，checkpoint keys: {list(ckpt.keys()) if isinstance(ckpt, dict) else type(ckpt)}")
# 移除常见的分布式/包装前缀
if any(k.startswith('module.') for k in state_dict.keys()):
    state_dict = strip_prefix(state_dict, 'module.')
if any(k.startswith('_orig_mod.') for k in state_dict.keys()):
    state_dict = strip_prefix(state_dict, '_orig_mod.')

# 加载到模型（使用 strict=False 以便更好定位缺失/多余键）
res = model_ffn.load_state_dict(state_dict, strict=False)
print('load_state_dict result:', res)
model_ffn.eval()  # 切换到评估模式

load_state_dict result: <All keys matched successfully>


Transformer(
  (tok_embeddings): Embedding(6400, 512)
  (dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-7): 8 x TransformerBlock(
      (attention): Attention(
        (wq): Linear(in_features=512, out_features=512, bias=False)
        (wk): Linear(in_features=512, out_features=256, bias=False)
        (wv): Linear(in_features=512, out_features=256, bias=False)
        (wo): Linear(in_features=512, out_features=512, bias=False)
        (attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
      (attention_norm): RMSNorm()
      (ffn_norm): RMSNorm()
      (feed_forward): FeedForward(
        (w1): Linear(in_features=512, out_features=1408, bias=False)
        (w2): Linear(in_features=1408, out_features=512, bias=False)
        (w3): Linear(in_features=512, out_features=1408, bias=False)
        (dropout): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (norm): RMSNorm()
  (output): Linear(in_features=512, 

In [12]:
# 加载moe模型
# 加载模型权重（支持 raw state_dict 或训练时保存的 checkpoint dict）
ckpt_path_moe = 'out/full_sft_512_moe.pth'  # 改为你的 checkpoint 路径，或 'out/pretrain_512.pth'
ckpt_moe = torch.load(ckpt_path_moe, map_location=device)
state_dict_moe = find_state_dict(ckpt_moe)
if state_dict_moe is None:
    raise RuntimeError(f"无法从 checkpoint 中识别出 state_dict，checkpoint keys: {list(ckpt.keys()) if isinstance(ckpt, dict) else type(ckpt)}")
# 移除常见的分布式/包装前缀
if any(k.startswith('module.') for k in state_dict_moe.keys()):
    state_dict_moe = strip_prefix(state_dict_moe, 'module.')
if any(k.startswith('_orig_mod.') for k in state_dict_moe.keys()):
    state_dict_moe = strip_prefix(state_dict_moe, '_orig_mod.')

# 加载到模型（使用 strict=False 以便更好定位缺失/多余键）
res = model_moe.load_state_dict(state_dict_moe, strict=False)
print('load_state_dict result:', res)
model_moe.eval()  # 切换到评估模式

load_state_dict result: <All keys matched successfully>


Transformer(
  (tok_embeddings): Embedding(6400, 512)
  (dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-7): 8 x TransformerBlock(
      (attention): Attention(
        (wq): Linear(in_features=512, out_features=512, bias=False)
        (wk): Linear(in_features=512, out_features=256, bias=False)
        (wv): Linear(in_features=512, out_features=256, bias=False)
        (wo): Linear(in_features=512, out_features=512, bias=False)
        (attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
      (attention_norm): RMSNorm()
      (ffn_norm): RMSNorm()
      (feed_forward): MOEFeedForward(
        (experts): ModuleList(
          (0-3): 4 x FeedForward(
            (w1): Linear(in_features=512, out_features=1408, bias=False)
            (w2): Linear(in_features=1408, out_features=512, bias=False)
            (w3): Linear(in_features=512, out_features=1408, bias=False)
            (dropout): Dropout(p=0.0, inpla

## 2. BenchMark评价模型（Token-F1 / ROUGE-L）

In [13]:
# Part 1. 环境变量与函数定义
import os, re, io, random, contextlib, ast
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import jieba

# 评测尽量可复现：固定随机种子
torch.manual_seed(2025)
random.seed(2025)
np.random.seed(2025)

# —— 上下文：临时关闭确定性（避免 cuBLAS 报错），结束后恢复原状态
from contextlib import contextmanager
@contextmanager
def temporarily_disable_determinism():
    prev = torch.are_deterministic_algorithms_enabled()
    if prev:
        torch.use_deterministic_algorithms(False)
    try:
        yield
    finally:
        if prev:
            torch.use_deterministic_algorithms(True)

# ===== 1) 数据导入 =====
DATA_PATH = "./dataset/sft_data_mixed_single.csv"
assert os.path.exists(DATA_PATH), f"找不到数据文件：{DATA_PATH}"
df1 = pd.read_csv(DATA_PATH)

# ===== 2) 解析 & 构造 prompt =====
def parse_history(raw):
    if raw is None:
        return []
    if isinstance(raw, list):
        return raw
    s = str(raw).strip()
    if not s or s == "[]" or s.lower() == "nan":
        return []
    try:
        return ast.literal_eval(s)
    except Exception:
        return []

def build_messages(q, history):
    msgs = []
    for h in history:
        if not isinstance(h, (list, tuple)) or len(h) < 2:
            continue
        msgs.append({"role": "user", "content": str(h[0])})
        msgs.append({"role": "assistant", "content": str(h[1])})
    msgs.append({"role": "user", "content": str(q)})
    return msgs

def safe_postprocess(text: str):
    return str(text).strip()

# ===== 3) 生成（评测默认“确定化”，预览可覆盖）=====
def pick_eos_id(tokenizer):
    for tok in ["<|eot_id|>", "<|end|>", "</s>", "<eos>"]:
        try:
            tid = tokenizer.convert_tokens_to_ids(tok)
            if isinstance(tid, int) and tid >= 0:
                return tid
        except Exception:
            pass
    return getattr(tokenizer, "eos_token_id", None)

@torch.no_grad()
def infer_one_with_model(
    q, history, tokenizer, model, device,
    max_new_tokens=256, temperature=0.0, top_k=0, rp=1.0, debug=False
):
    messages = build_messages(q, history)
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    enc = tokenizer(prompt, return_tensors="pt")
    input_ids = enc.input_ids.to(device)
    eos_id = pick_eos_id(tokenizer)

    out_ids = next(model.generate(
        idx=input_ids,
        eos=eos_id,
        max_new_tokens=max_new_tokens,
        temperature=temperature,   # 评测：0.0；预览可覆盖
        top_k=top_k,               # 评测：0；   预览可覆盖
        rp=rp,                     # 评测：1.0； 预览可覆盖
        stream=False
    ))

    out_ids = out_ids[0]
    gen_part = out_ids[input_ids.shape[1]:]
    text_raw_gen = tokenizer.decode(gen_part, skip_special_tokens=False)
    pred = safe_postprocess(text_raw_gen)

    if debug:
        print("\n[DEBUG] prompt_len:", int(input_ids.shape[1]),
              " total_len:", int(out_ids.shape[0]),
              " gen_len:", int(gen_part.shape[0]),
              " eos_id:", eos_id)
        print("[DEBUG] RAW_GEN(head):", repr(text_raw_gen[:200]))
        print("[DEBUG] PRED(head):   ", repr(pred[:200]))
    return pred

# ===== 4) 指标（中文 Token-F1 / ROUGE-L）=====
CN_PUNC = "，。、“”《》；：？！【】（）—…·‘’"
EN_PUNC = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
PUNC_TABLE = str.maketrans("", "", EN_PUNC + CN_PUNC)

def _normalize_text_for_zh(s: str):
    s = str(s).strip().lower()
    s = re.sub(r"\s+", "", s)
    s = s.translate(PUNC_TABLE)
    return s

def _to_word_tokens(s: str):
    s = _normalize_text_for_zh(s)
    return [w for w in jieba.lcut(s) if w]

def metric_token_f1(pred, ref):
    p_tokens = _to_word_tokens(pred)
    r_tokens = _to_word_tokens(ref)
    if not p_tokens and not r_tokens:
        return 1.0
    if not p_tokens or not r_tokens:
        return 0.0
    cp, cr = Counter(p_tokens), Counter(r_tokens)
    overlap = sum((cp & cr).values())
    prec = overlap / max(1, len(p_tokens))
    rec  = overlap / max(1, len(r_tokens))
    if prec == 0 and rec == 0:
        return 0.0
    return 2 * prec * rec / (prec + rec)

def _lcs_len(a_tokens, b_tokens):
    n, m = len(a_tokens), len(b_tokens)
    dp = [[0]*(m+1) for _ in range(n+1)]
    for i in range(n):
        ai = a_tokens[i]
        row = dp[i]
        row_next = dp[i+1]
        for j in range(m):
            if ai == b_tokens[j]:
                row_next[j+1] = row[j] + 1
            else:
                row_next[j+1] = max(row[j+1], row_next[j])
    return dp[n][m]

def metric_rougeL(pred, ref):
    p = _to_word_tokens(pred)
    r = _to_word_tokens(ref)
    if not p or not r:
        return 0.0
    lcs = _lcs_len(p, r)
    prec = lcs / len(p)
    rec  = lcs / len(r)
    if prec == 0 and rec == 0:
        return 0.0
    return (2 * prec * rec) / (prec + rec)

# ===== 5) 模型概览 =====
def n_params(m): 
    return sum(p.numel() for p in m.parameters())

def summarize_model(name, m):
    print(f"\n[{name}] 参数量: {n_params(m):,}")
    moe_like = [k for k,_ in m.named_modules() if ("moe" in k.lower() or "expert" in k.lower())]
    print(f"[{name}] 模块名含 'moe/experts' 的组件数: {len(moe_like)}")
    if len(moe_like) > 0:
        print("  示例：", moe_like[:5])

# ===== 6) 两模型并排基准（可打印每样本；批量时会被静默）=====
def benchmark_pair(df, tokenizer, model_moe, model_ffn, device,
                   n_samples=3, save_path=None):
    assert len(df) >= n_samples, f"数据不足 {n_samples} 条"
    data = df.head(n_samples).copy()

    summarize_model("MOE", model_moe)
    summarize_model("FFN", model_ffn)

    rows = []
    agg = {"moe_F1": [], "moe_RL": [], "ffn_F1": [], "ffn_RL": []}

    for i, (_, row) in enumerate(data.iterrows(), 1):
        q = str(row.get("q", "")).strip()
        ref = str(row.get("a", "")).strip()
        history = parse_history(row.get("history", "[]"))

        pred_moe = infer_one_with_model(q, history, tokenizer, model_moe, device,
                                        max_new_tokens=256, temperature=0.0, top_k=0, rp=1.0)
        pred_ffn = infer_one_with_model(q, history, tokenizer, model_ffn, device,
                                        max_new_tokens=256, temperature=0.0, top_k=0, rp=1.0)

        moe_f1 = metric_token_f1(pred_moe, ref);     ffn_f1 = metric_token_f1(pred_ffn, ref)
        moe_rl = metric_rougeL(pred_moe, ref);       ffn_rl = metric_rougeL(pred_ffn, ref)

        agg["moe_F1"].append(moe_f1); agg["moe_RL"].append(moe_rl)
        agg["ffn_F1"].append(ffn_f1); agg["ffn_RL"].append(ffn_rl)

        rows.append({
            "q": q, "ref": ref,
            "pred_moe": pred_moe, "pred_ffn": pred_ffn,
            "F1_moe": moe_f1, "ROUGE-L_moe": moe_rl,
            "F1_ffn": ffn_f1, "ROUGE-L_ffn": ffn_rl,
        })

        # 单样本打印（批量跑时会被外层静默掉）
        print(f"\n--- Sample #{i} ---")
        print("Q  :", q[:400])
        print("REF:", ref[:400])
        print("[MOE]:", pred_moe[:400])
        print("[FFN]:", pred_ffn[:400])
        print(f"F1/RL (MOE): {moe_f1:.3f}/{moe_rl:.3f}")
        print(f"F1/RL (FFN): {ffn_f1:.3f}/{ffn_rl:.3f}")

    out = pd.DataFrame(rows)

    def _avg(xs): 
        return float(sum(xs)/max(1,len(xs)))

    print("\n=== Benchmark 汇总(前{}条) ===".format(n_samples))
    print("MOE : F1={:.3f}  ROUGE-L={:.3f}".format(_avg(agg['moe_F1']), _avg(agg['moe_RL'])))
    print("FFN : F1={:.3f}  ROUGE-L={:.3f}".format(_avg(agg['ffn_F1']), _avg(agg['ffn_RL'])))

    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        out.to_csv(save_path, index=False, encoding="utf-8-sig")
        print(f"\nSaved to: {save_path}")

    return out

# ===== 7) 采样与 CI 工具 =====
def sample_df(df, n=500, seed=2025):
    if "task_type" in df.columns:
        rng = np.random.default_rng(seed)
        parts = []
        for t, grp in df.groupby("task_type"):
            k = max(1, int(round(len(grp) / len(df) * n)))
            parts.append(grp.sample(n=min(k, len(grp)), random_state=int(rng.integers(1, 1_000_000_000))))
        out = pd.concat(parts, ignore_index=True)
        if len(out) > n:
            out = out.sample(n=n, random_state=seed)
        return out.reset_index(drop=True)
    if len(df) <= n:
        return df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
    return df.sample(n=n, random_state=seed).reset_index(drop=True)

def mean_ci(values, B=1000, alpha=0.05, seed=123):
    rng = np.random.default_rng(seed)
    vals = np.asarray(values, dtype=float)
    if len(vals) == 0:
        return float("nan"), float("nan"), float("nan")
    boots = []
    for _ in range(B):
        s = rng.choice(vals, size=len(vals), replace=True)
        boots.append(s.mean())
    lo, hi = np.percentile(boots, [100*alpha/2, 100*(1-alpha/2)])
    return float(vals.mean()), float(lo), float(hi)

def paired_diff_ci(values_a, values_b, B=1000, alpha=0.05, seed=321):
    rng = np.random.default_rng(seed)
    a = np.asarray(values_a, dtype=float)
    b = np.asarray(values_b, dtype=float)
    assert len(a) == len(b) and len(a) > 0
    d = a - b
    boots = []
    for _ in range(B):
        idx = rng.integers(0, len(d), size=len(d))
        boots.append(d[idx].mean())
    lo, hi = np.percentile(boots, [100*alpha/2, 100*(1-alpha/2)])
    return float(d.mean()), float(lo), float(hi)

# ===== 8) 预览（带温度采样）=====
def preview_random_samples(
    df, tokenizer, model_moe, model_ffn, device,
    n=3, seed=2025, max_new_tokens=256,
    temperature=0.7, top_k=50, rp=1.15
):
    rng = np.random.default_rng(seed)
    sub = df if len(df) <= n else df.sample(n=n, random_state=int(rng.integers(1, 1_000_000_000)))

    # 预览：临时关闭确定性（避免 cuBLAS 限制报错）
    with temporarily_disable_determinism():
        print(f"\n[Preview] n={len(sub)}  (temp={temperature}, top_k={top_k}, rp={rp})")
        for i, (_, row) in enumerate(sub.iterrows(), 1):
            q = str(row.get("q", "")).strip()
            ref = str(row.get("a", "")).strip()
            history = parse_history(row.get("history", "[]"))

            moe_pred = infer_one_with_model(
                q, history, tokenizer, model_moe, device,
                max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, rp=rp
            )
            ffn_pred = infer_one_with_model(
                q, history, tokenizer, model_ffn, device,
                max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, rp=rp
            )

            print(f"\n--- Preview #{i} ---")
            print("Q  :", q[:400])
            if ref:
                print("REF:", ref[:400])
            print("[MOE]:", moe_pred[:500])
            print("[FFN]:", ffn_pred[:500])


  import pkg_resources


In [14]:
# Part 2. 浏览/预览（看 3 条，带温度）
preview_random_samples(
    df1, tokenizer, model_moe, model_ffn, device,
    n=3, seed=2025, temperature=0.7, top_k=50, rp=1.15
)


[Preview] n=3  (temp=0.7, top_k=50, rp=1.15)

--- Preview #1 ---
Q  : 上联：剪一缕春风裱画
下联：
REF: 拈几桢梅影酬朋
[MOE]: 曲歌三更，绣阁秋光万里香</s>
</s>
最爱桃花映水晶湖。</s>
烟笼柳絮飘零雨，明朝停杯话别离</s>
</s>
谁为倚栏愁怀客是此间，无人知否？</s>
绝胜杏花楼外笑语声，一笑泪湿相思衣冠</s>
悄然逝去，只愿今宵醉眼看山行。</s>
垂老大陆孤魂渺渺渺。</s>
酒已成尸魂黍离死不再回，谁道人生如梦中来急！诗成有意同溯远方，天涯海角犹未归。唐虞之时，亦自得其乐
[FFN]: 春光正好</s>
</s>
此来新年喜郎吝？</s>
看春色满庭院里花，迎接清明几时休。</s>
梅花易逢红娘来，花开不待艳阳天</s>
多少年前爱恨情仇难消受
红颜不负春事未卜君</s>
月色常在日暮余华</s>
风雨同归身俗者成病也，时有阴沉忧愁心事空
忙碌又遭遇命运多舛重，今宵无眠中分割，世事苟且悬匿
万物皆与物是人间非所趋
梦醒来何处可数莺声鸣，夜半明月我独影寂寥。</s>
小桃花

--- Preview #2 ---
Q  : 1. In the field of quantum mechanics, the Heisenberg uncertainty principle states that it is impossible to simultaneously determine both the position and momentum of a particle with precise accuracy.
REF: 1. What is the Heisenberg uncertainty principle?
[MOE]: ? If they were very easy to move forward after healing with our own strengths and weaknesses.</s>
are you will have a sense of understanding the process or honesty in order that works well as the resolution of your

随机抽样 500 条做评测，并计算 Token-F1 / ROUGE-L 的 95% 置信区间

In [15]:
# Part 3. 正式运行（500 样本，分批汇总 + 最终 95%CI）

BATCH_SIZE = 100
TOTAL_N    = 500
SEED       = 2025

# 抽样一次并打乱
pool = sample_df(df1, n=min(TOTAL_N, len(df1)), seed=SEED).reset_index(drop=True)

all_chunks = []
num_batches = (len(pool) + BATCH_SIZE - 1) // BATCH_SIZE

for b in range(num_batches):
    start = b * BATCH_SIZE
    end   = min((b + 1) * BATCH_SIZE, len(pool))
    if end <= start:
        break
    chunk = pool.iloc[start:end].reset_index(drop=True)

    # 静默运行 benchmark_pair + 临时关闭确定性（避免 cuBLAS 报错）
    with contextlib.redirect_stdout(io.StringIO()):
        with temporarily_disable_determinism():
            out_chunk = benchmark_pair(
                df=chunk,
                tokenizer=tokenizer,
                model_moe=model_moe,
                model_ffn=model_ffn,
                device=device,
                n_samples=len(chunk),
                save_path=None
            )

    # 批次均值
    moe_f1_mean = float(np.mean(out_chunk["F1_moe"]))
    ffn_f1_mean = float(np.mean(out_chunk["F1_ffn"]))
    moe_rl_mean = float(np.mean(out_chunk["ROUGE-L_moe"]))
    ffn_rl_mean = float(np.mean(out_chunk["ROUGE-L_ffn"]))

    print(f"\n=== Batch {b+1}/{num_batches} ({start}-{end-1}) ===")
    print(f"MOE  F1 : {moe_f1_mean:.3f}")
    print(f"FFN  F1 : {ffn_f1_mean:.3f}")
    print(f"MOE  RL : {moe_rl_mean:.3f}")
    print(f"FFN  RL : {ffn_rl_mean:.3f}")

    all_chunks.append(out_chunk)

# 汇总与 CI
_out_all = pd.concat(all_chunks, ignore_index=True) if all_chunks else pd.DataFrame(
    columns=["F1_moe","F1_ffn","ROUGE-L_moe","ROUGE-L_ffn"]
)

moe_f1_mean, moe_f1_lo, moe_f1_hi = mean_ci(_out_all["F1_moe"])
ffn_f1_mean, ffn_f1_lo, ffn_f1_hi = mean_ci(_out_all["F1_ffn"])
moe_rl_mean, moe_rl_lo, moe_rl_hi = mean_ci(_out_all["ROUGE-L_moe"])
ffn_rl_mean, ffn_rl_lo, ffn_rl_hi = mean_ci(_out_all["ROUGE-L_ffn"])

diff_f1_mean, diff_f1_lo, diff_f1_hi = paired_diff_ci(_out_all["F1_moe"], _out_all["F1_ffn"])
diff_rl_mean, diff_rl_lo, diff_rl_hi = paired_diff_ci(_out_all["ROUGE-L_moe"], _out_all["ROUGE-L_ffn"])

print("\n=== 95% CI (bootstrap, n={}) ===".format(len(_out_all)))
print(f"MOE  F1 : {moe_f1_mean:.3f} [{moe_f1_lo:.3f}, {moe_f1_hi:.3f}]")
print(f"FFN  F1 : {ffn_f1_mean:.3f} [{ffn_f1_lo:.3f}, {ffn_f1_hi:.3f}]")
print(f"MOE  RL : {moe_rl_mean:.3f} [{moe_rl_lo:.3f}, {moe_rl_hi:.3f}]")
print(f"FFN  RL : {ffn_rl_mean:.3f} [{ffn_rl_lo:.3f}, {ffn_rl_hi:.3f}]")

print("\n=== Paired diff (MOE − FFN) 95% CI ===")
print(f"ΔF1  : {diff_f1_mean:.3f} [{diff_f1_lo:.3f}, {diff_f1_hi:.3f}]")
print(f"ΔRL  : {diff_rl_mean:.3f} [{diff_rl_lo:.3f}, {diff_rl_hi:.3f}]")

if not (diff_f1_lo <= 0.0 <= diff_f1_hi):
    print("→ F1 差异在 95% CI 下显著。")
if not (diff_rl_lo <= 0.0 <= diff_rl_hi):
    print("→ ROUGE-L 差异在 95% CI 下显著。")

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.611 seconds.
Prefix dict has been built successfully.



=== Batch 1/5 (0-99) ===
MOE  F1 : 0.141
FFN  F1 : 0.143
MOE  RL : 0.110
FFN  RL : 0.110

=== Batch 2/5 (100-199) ===
MOE  F1 : 0.148
FFN  F1 : 0.133
MOE  RL : 0.121
FFN  RL : 0.106

=== Batch 3/5 (200-299) ===
MOE  F1 : 0.170
FFN  F1 : 0.158
MOE  RL : 0.136
FFN  RL : 0.125

=== Batch 4/5 (300-399) ===
MOE  F1 : 0.135
FFN  F1 : 0.122
MOE  RL : 0.104
FFN  RL : 0.095

=== Batch 5/5 (400-499) ===
MOE  F1 : 0.180
FFN  F1 : 0.173
MOE  RL : 0.137
FFN  RL : 0.137

=== 95% CI (bootstrap, n=500) ===
MOE  F1 : 0.155 [0.142, 0.168]
FFN  F1 : 0.146 [0.134, 0.157]
MOE  RL : 0.122 [0.112, 0.131]
FFN  RL : 0.115 [0.106, 0.123]

=== Paired diff (MOE − FFN) 95% CI ===
ΔF1  : 0.009 [0.001, 0.018]
ΔRL  : 0.007 [0.000, 0.014]
→ F1 差异在 95% CI 下显著。
→ ROUGE-L 差异在 95% CI 下显著。
