In [1]:

import os
import re
import json
import math
import argparse
from dataclasses import dataclass
from typing import List, Dict, Any, Tuple

import torch
import pandas as pd
from transformers import AutoTokenizer, AutoConfig
import matplotlib.pyplot as plt
from tqdm import tqdm
# Reuse the existing entropy-metric implementation
try:
    from entropy_metrics.metrics import token_entropy
except Exception:
    from metrics import token_entropy

# Integrate the existing inference / scoring wrappers
try:
    from entropy_metrics.infer_vllm import VLLMGenerator
    from entropy_metrics.infer_transformers import HFScorer
except Exception:
    from infer_vllm import VLLMGenerator
    from infer_transformers import HFScorer

# matplotlib: Chinese (CJK) text rendering setup

import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

# Keep the minus sign renderable when a CJK font is in use.
plt.rcParams['axes.unicode_minus'] = False  # avoid minus sign as tofu

# Pick one that exists on your system (both are in your fc-list output)
# This path is wrapped in FontProperties by the plotting cell further down.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
font_path = "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc"


INFO 10-27 15:05:25 [__init__.py:239] Automatically detected platform cuda.


In [2]:
# Load the AMC12 problem set into a DataFrame. The preview printed later in the
# notebook shows the columns: id, problem, answer, url.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
data_path = "/home/llama/test-rlif/datasets/amc12/data/train-00000-of-00001.parquet"
amc12 = pd.read_parquet(data_path)


In [3]:

# Sampling / engine settings shared by the evaluation cell below.
args = dict(
    seed=42,             # NOTE(review): not referenced by any cell visible in this notebook
    temperature=0.6,     # forwarded to generate_with_scores
    top_p=0.9,           # forwarded to generate_with_scores
    top_k=-1,            # forwarded to generate_with_scores; presumably "disabled" — TODO confirm
    max_model_len=4096,  # handed to VLLMGenerator when the engine is built
)



In [4]:

def generate_with_scores(bundle, prompt_text: str, max_new_tokens: int, temperature: float, top_p: float, top_k: int) -> Dict[str, Any]:
    """Generate one completion and score it token by token.

    The chat template is applied here (via ``apply_chat_template``), so
    ``prompt_text`` is expected to be the raw, un-templated user prompt.

    Args:
        bundle: indexable container; ``bundle[0]`` must expose ``generate(...)``,
            ``bundle[1]`` must expose ``score_next_token_logits(...)`` and
            ``bundle[2]`` is the tokenizer read by ``apply_chat_template``.
        prompt_text: user prompt.
        max_new_tokens, temperature, top_p, top_k: forwarded unchanged to
            ``bundle[0].generate``.

    Returns:
        Dict with ``response_text`` (generated text, ``""`` if the generator
        output has no ``"text"`` key), ``entropy_series`` (list of floats, empty
        when the scorer returns nothing) and ``avg_logprob`` (mean log-probability
        of the target tokens, or ``None`` when nothing was scored).
    """
    final_prompt = apply_chat_template(bundle, prompt_text)

    # Only a single prompt is submitted, so only the first output is of interest.
    sampling_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        top_k=top_k,
    )
    first_output = bundle[0].generate(prompts=[final_prompt], **sampling_kwargs)[0]
    response_text = first_output.get("text", "")

    # Defaults describe the "nothing scored" outcome; overwritten below otherwise.
    result = {
        "response_text": response_text,
        "entropy_series": [],
        "avg_logprob": None,
    }

    scored = bundle[1].score_next_token_logits(prompts=[final_prompt], generations=[response_text])
    if scored:
        first_scored = scored[0]
        # presumably logits_seq is (num_tokens, vocab) and target_ids is (num_tokens,) — TODO confirm
        logits_seq = first_scored["logits_seq"]
        target_ids = first_scored["target_ids"]

        result["entropy_series"] = [float(value) for value in token_entropy(logits_seq)]

        # Log-probability assigned to each target token.
        log_probs = torch.log_softmax(logits_seq, dim=-1)
        picked = log_probs.gather(dim=-1, index=target_ids.unsqueeze(-1)).squeeze(-1)
        if picked.numel() > 0:
            result["avg_logprob"] = float(picked.mean())

    return result

In [7]:
# Preview the first rows of the loaded problem set.
amc12.head()

Unnamed: 0,id,problem,answer,url
0,0,$\frac{m}{n}$ is the Irreducible fraction valu...,142.0,https://artofproblemsolving.com/wiki/index.php...
1,1,How many ways are there to split the integers ...,144.0,https://artofproblemsolving.com/wiki/index.php...
2,2,What is the product of all real numbers $x$ su...,81.0,https://artofproblemsolving.com/wiki/index.php...
3,3,Let $M$ be the midpoint of $\overline{AB}$ in ...,4.0,https://artofproblemsolving.com/wiki/index.php...
4,4,Let $\mathcal{R}$ be the region in the complex...,13.0,https://artofproblemsolving.com/wiki/index.php...


In [8]:
# Instruction prefix; the evaluation cell concatenates the problem statement
# directly after it. "\\boxed{}" in this non-raw string yields a literal \boxed{}.
prompt_pre = """solve the following problem step by step by using 1. 2. 3. ... 
 please give the answer at the end in \\boxed{}
 problem:"""

In [9]:
def evaluate_correctness(correct_answer: float, extracted: str) -> Tuple[bool, float]:
    """Compare an extracted answer string against the reference answer.

    Args:
        correct_answer: reference value; converted with ``float`` for the comparison.
        extracted: answer text pulled out of a model response.

    Returns:
        ``(is_correct, predicted_value)``. When ``extracted`` cannot be parsed by
        ``to_float_or_none`` the result is ``(False, nan)``. If the comparison
        itself raises, ``is_correct`` is ``False``.
    """
    parsed = to_float_or_none(extracted)
    if parsed is None:
        return False, float("nan")

    try:
        matches = float(parsed) == float(correct_answer)
    except Exception:
        # An unconvertible reference answer counts as a mismatch.
        matches = False

    return matches, float(parsed)


def apply_chat_template(bundle, prompt_text: str) -> str:
    """Wrap ``prompt_text`` as a single user turn using the bundle's tokenizer.

    ``bundle[2]`` is treated as the tokenizer. When it has no
    ``apply_chat_template`` attribute, or when calling it raises, the prompt is
    returned unchanged (best-effort behaviour).
    """
    tokenizer = bundle[2]
    if not hasattr(tokenizer, "apply_chat_template"):
        return prompt_text

    conversation = [{"role": "user", "content": prompt_text}]
    try:
        rendered = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True,
        )
    except Exception:
        # Deliberate fallback: use the raw prompt if templating fails.
        rendered = prompt_text
    return rendered
    
def extract_answer(response: str) -> str:
    """Extract the final answer string from a model response.

    Strategy:
      1. Prefer the LAST ``\\boxed{...}`` whose braces are balanced and whose
         content is non-empty. Braces are matched by depth so nested LaTeX such
         as ``\\boxed{\\frac{1}{2}}`` is returned whole.
      2. Otherwise fall back to the last integer/decimal found in the text.

    Args:
        response: generated text; ``None`` or ``""`` yields ``""``.

    Returns:
        The stripped answer text, or ``""`` when nothing usable is found.
    """
    if not response:
        return ""

    marker = "\\boxed{"
    search_end = len(response)
    # Walk the \boxed{ occurrences from the last one backwards.
    while True:
        start = response.rfind(marker, 0, search_end)
        if start == -1:
            break
        content_start = start + len(marker)
        depth = 1
        pos = content_start
        while pos < len(response) and depth:
            ch = response[pos]
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
            pos += 1
        if depth == 0:
            content = response[content_start:pos - 1].strip()
            if content:
                return content
        # Unbalanced (e.g. truncated output) or empty box: try an earlier one.
        search_end = start

    # Fallback: the last run of digits (optionally signed / with a decimal point).
    nums = re.findall(r"[-+]?[0-9]*\.?[0-9]+", response)
    return nums[-1].strip() if nums else ""


def to_float_or_none(s: str):
    """Parse an answer string into a float, returning ``None`` when impossible.

    Anything ``float()`` accepts is returned as before. In addition, these
    common answer spellings are understood:
      * surrounding ``$`` math delimiters (``"$42$"``),
      * thousands separators (``"1,234"``),
      * simple ratios ``a/b`` and LaTeX ``\\frac{a}{b}`` / ``\\dfrac`` / ``\\tfrac``
        with numeric parts (a zero denominator yields ``None``).

    Args:
        s: text to parse (non-string values are passed to ``float`` as-is).

    Returns:
        The parsed float, or ``None``.
    """
    try:
        return float(s)
    except (TypeError, ValueError, OverflowError):
        pass
    if not isinstance(s, str):
        return None

    text = s.strip().strip("$").strip()
    number = r"[-+]?\d+(?:\.\d+)?"

    # "1,234" / "12,345.6": commas only in proper thousands positions.
    if re.fullmatch(r"[-+]?\d{1,3}(?:,\d{3})+(?:\.\d+)?", text):
        return float(text.replace(",", ""))

    latex_ratio = (
        r"([-+]?)\s*\\[dt]?frac\s*\{\s*(" + number + r")\s*\}\s*\{\s*(" + number + r")\s*\}"
    )
    # Empty first group keeps the (sign, numerator, denominator) shape uniform.
    plain_ratio = r"()(" + number + r")\s*/\s*(" + number + r")"
    ratio = re.fullmatch(latex_ratio, text) or re.fullmatch(plain_ratio, text)
    if ratio:
        sign, numerator, denominator = ratio.groups()
        bottom = float(denominator)
        if bottom == 0:
            return None
        value = float(numerator) / bottom
        return -value if sign == "-" else value

    # Last chance: the text with the $ delimiters removed.
    try:
        return float(text)
    except ValueError:
        return None

In [None]:
import torch, gc

# Evaluate every checkpoint on every AMC12 problem and collect one record per
# (model, problem) pair; `res` is the resulting DataFrame used by the plotting cell.
temp = []
models_path = [
    "/home/llama/test-rlif/checkpoints/TTRL/0826-152919/global_step_240/actor_hf_model",
    "/home/llama/test-rlif/checkpoints/rent/global_step_150/actor_hf_model",
    "/home/llama/test-rlif/checkpoints/Intuitor/global_step_116/actor_hf_model",
    "/home/llama/test-rlif/checkpoints/EMPO/0824-052825/global_step_2499",
    "/home/llama/test-rlif/Qwen2.5-Math-1.5B"
]
models_name = [
        "ttrl", 
        "rent", 
        "intuitor", 
        "empo", 
        "origin"
    ]
# The two lists are indexed in parallel; fail fast if they drift apart.
assert len(models_name) == len(models_path), "models_name and models_path must have equal length"

# Generation budget: an optional "max_new_tokens" entry in `args` wins; without it
# the previous behaviour (reuse max_model_len) is kept.
max_new_tokens = args.get("max_new_tokens", args["max_model_len"])

for idx, path in tqdm(enumerate(models_path), total=len(models_path), desc="models"):
    ids = idx
    name = models_name[ids]
    vllm = scorer = tok = model = None
    try:
        vllm = VLLMGenerator(model_path=path, tensor_parallel_size=1, max_model_len=args['max_model_len'])
        scorer = HFScorer(model_path=path, torch_dtype="auto", device_map="auto")
        tok = AutoTokenizer.from_pretrained(path)
        model = (vllm, scorer, tok, ids)
        for _, row in tqdm(amc12.iterrows(), total=len(amc12), desc=name, leave=False):
            problem = row['problem']
            idxrow = row['id']
            sol = row['answer']
            # generate_with_scores applies the chat template itself, so pass the raw
            # text; templating here as well would wrap the prompt twice.
            prompt = prompt_pre + problem
            gen = generate_with_scores(model, prompt, max_new_tokens, args['temperature'], args['top_p'], args['top_k'])
            ex_ans = extract_answer(gen['response_text'])
            is_ok, pred_val = evaluate_correctness(sol, ex_ans)
            temp.append(
                {
                    "id": idxrow,  # problem id
                    "model_name": name,
                    "response": gen["response_text"],
                    "extracted_answer": ex_ans,
                    "predicted_value": pred_val,
                    "is_correct": is_ok,
                    "entropy_series": gen["entropy_series"],
                    "avg_logprob": gen["avg_logprob"],
                }
            )
    finally:
        # Drop every reference to this checkpoint BEFORE the next one is built, so
        # two models are never resident at once; then force collection and clear
        # the CUDA cache.
        # NOTE(review): assumes VLLMGenerator / HFScorer release their GPU memory
        # when garbage-collected — confirm, or add an explicit shutdown call.
        vllm = scorer = tok = model = None
        gc.collect()
        torch.cuda.empty_cache()
res = pd.DataFrame(temp)


83it [29:24, 21.26s/it]
1it [30:01, 1801.73s/it]

INFO 10-27 15:55:36 [config.py:717] This model supports multiple tasks: {'reward', 'generate', 'score', 'embed', 'classify'}. Defaulting to 'generate'.
INFO 10-27 15:55:36 [config.py:2003] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 10-27 15:55:40 [__init__.py:239] Automatically detected platform cuda.
INFO 10-27 15:55:42 [core.py:58] Initializing a V1 LLM engine (v0.8.5.post1) with config: model='/home/llama/test-rlif/Qwen2.5-Math-1.5B', speculative_config=None, tokenizer='/home/llama/test-rlif/Qwen2.5-Math-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_

2025-10-27 15:55:43,111 - INFO - flashinfer.jit: Prebuilt kernels not found, using JIT backend


INFO 10-27 15:55:43 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 10-27 15:55:43 [cuda.py:221] Using Flash Attention backend on V1 engine.
INFO 10-27 15:55:43 [topk_topp_sampler.py:44] Currently, FlashInfer top-p & top-k sampling sampler is disabled because FlashInfer>=v0.2.3 is not backward compatible. Falling back to the PyTorch-native implementation of top-p & top-k sampling.
INFO 10-27 15:55:43 [gpu_model_runner.py:1329] Starting to load model /home/llama/test-rlif/Qwen2.5-Math-1.5B...


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.56it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.56it/s]



INFO 10-27 15:55:44 [loader.py:458] Loading weights took 0.69 seconds
INFO 10-27 15:55:44 [gpu_model_runner.py:1347] Model loading took 2.8798 GiB and 0.852142 seconds
INFO 10-27 15:55:51 [backends.py:420] Using cache directory: /home/llama/.cache/vllm/torch_compile_cache/71c42d8c82/rank_0_0 for vLLM's torch.compile
INFO 10-27 15:55:51 [backends.py:430] Dynamo bytecode transform time: 6.55 s
INFO 10-27 15:55:56 [backends.py:118] Directly load the compiled graph(s) for shape None from the cache, took 4.379 s
INFO 10-27 15:55:57 [monitor.py:33] torch.compile takes 6.55 s in total
ERROR 10-27 15:55:57 [core.py:396] EngineCore failed to start.
ERROR 10-27 15:55:57 [core.py:396] Traceback (most recent call last):
ERROR 10-27 15:55:57 [core.py:396]   File "/home/llama/miniconda3/envs/empo/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 1544, in _dummy_sampler_run
ERROR 10-27 15:55:57 [core.py:396]     sampler_output = self.sampler(logits=logits,
ERROR 10-27 15:55:57 [c

Process EngineCore_0:
Traceback (most recent call last):
  File "/home/llama/miniconda3/envs/empo/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 1544, in _dummy_sampler_run
    sampler_output = self.sampler(logits=logits,
  File "/home/llama/miniconda3/envs/empo/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/llama/miniconda3/envs/empo/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/llama/miniconda3/envs/empo/lib/python3.10/site-packages/vllm/v1/sample/sampler.py", line 49, in forward
    sampled = self.sample(logits, sampling_metadata)
  File "/home/llama/miniconda3/envs/empo/lib/python3.10/site-packages/vllm/v1/sample/sampler.py", line 115, in sample
    random_sampled = self.topk_topp_sampler(
  File "/home/llama/miniconda3/envs/empo/lib/python3.10/site-packages/torch/nn/

RuntimeError: Engine core initialization failed. See root cause above.

: 

In [None]:
# NOTE(review): the alias `problem_title` is not referenced anywhere in the visible
# cells, and pyplot is already imported as `plt` in the first cell — this looks like
# an accidental rename; confirm before removing.
import matplotlib.pyplot as problem_title
from matplotlib.ticker import MaxNLocator
# Font handle built from font_path; passed to the title/label/legend calls below.
font_prop = FontProperties(fname=font_path)
def _plot_entropy_group(samples, title, out_path=None):
    """Draw one figure holding one entropy curve per row of ``samples``.

    The figure is written to ``out_path`` when given, otherwise it is shown.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    longest = 0
    try:
        for _, row in samples.iterrows():
            entropy_series = row['entropy_series']
            longest = max(longest, len(entropy_series))
            # x axis is the 1-based generation step; a plain range needs no numpy.
            steps = range(1, len(entropy_series) + 1)
            ax.plot(steps, entropy_series, label=f"问题ID: {row['id']}")

        ax.set_title(title, fontsize=14, fontproperties=font_prop)
        ax.set_xlabel("生成步骤", fontsize=12, fontproperties=font_prop)
        ax.set_ylabel("熵值", fontsize=12, fontproperties=font_prop)
        # Fit the x range to the longest series; skip degenerate ranges
        # (all series empty or of length one), which would collapse or invert the axis.
        if longest > 1:
            ax.set_xlim(1, longest)
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # integer step ticks only
        ax.grid(alpha=0.3)
        # Legend goes outside the axes on the right; its font is set through `prop`.
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', prop=font_prop)
        fig.tight_layout()

        if out_path:
            fig.savefig(out_path, dpi=300, bbox_inches='tight')
        else:
            plt.show()
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)


def plot_entropy_curves(df, save_dir=None):
    """Plot per-token entropy curves for each model, split by answer correctness.

    For every ``model_name`` group up to two figures are produced: one for rows
    where ``is_correct`` is True and one where it is False. A group with no rows
    is skipped.

    Args:
        df: DataFrame with the columns 'model_name', 'id', 'is_correct' and
            'entropy_series'.
        save_dir: directory for the PNG files
            (``{model_name}_correct_entropy.png`` /
            ``{model_name}_incorrect_entropy.png``). Created if missing. When
            ``None`` the figures are shown instead of saved.
    """
    if save_dir:
        # Created once up front so saving works even for a model with no correct rows.
        os.makedirs(save_dir, exist_ok=True)

    # (is_correct value, title suffix, file-name tag)
    variants = (
        (True, "答案正确的熵变曲线", "correct"),
        (False, "答案错误的熵变曲线", "incorrect"),
    )

    for model_name, model_group in df.groupby('model_name'):
        print(f"正在绘制模型 {model_name} 的熵变图...")

        for flag, title_suffix, file_tag in variants:
            samples = model_group[model_group['is_correct'].eq(flag)]
            if samples.empty:
                continue
            out_path = None
            if save_dir:
                out_path = os.path.join(save_dir, f"{model_name}_{file_tag}_entropy.png")
            _plot_entropy_group(samples, f"{model_name} - {title_suffix}", out_path)

# Run the plotting function (assumes the evaluation results are already stored in `res`).
# To save the figures pass save_dir, for example: save_dir="./entropy_plots"
plot_entropy_curves(res, save_dir="./outputs")