In [1]:
import warnings
warnings.filterwarnings("ignore")  # 忽略所有警告
import subprocess
from utils import load_jsonl, extract_code_block, extract_obj, change_variable_types
import numpy as np
from vllm import LLM, SamplingParams        
from transformers import AutoTokenizer                                      
from langchain.prompts import PromptTemplate
from rule_prompt_utils import system_prompt_temp
import os

def generate_with_model(model, prompt, sampling_params):
    """Generate completions and return the first candidate text per request.

    Args:
        model: Object exposing ``generate(prompt, sampling_params)`` that
            returns an iterable of request outputs, each with an ``outputs``
            sequence whose elements carry a ``text`` attribute.
        prompt: Prompt (or batch of prompts) forwarded unchanged to
            ``model.generate``.
        sampling_params: Decoding parameters forwarded unchanged to
            ``model.generate``.

    Returns:
        A list with one string per request output: the ``text`` of that
        request's first candidate (``outputs[0]``).
    """
    first_texts = []
    for request_output in model.generate(prompt, sampling_params):
        # Only the first candidate is kept, even if more were sampled.
        first_texts.append(request_output.outputs[0].text)
    return first_texts

def mp_worker(item):
    """Render one dataset item into a chat-templated prompt string.

    Both the system and the user templates are filled with the item's
    ``'en_question'`` field, stripped of surrounding whitespace, and passed
    through the module-level ``tokenizer``'s chat template.

    Args:
        item: Mapping that provides the key ``'en_question'``.

    Returns:
        The value returned by ``tokenizer.apply_chat_template`` with
        ``tokenize=False`` and ``add_generation_prompt=True``.
    """
    question = item['en_question']
    system_message = {
        "role": "system",
        "content": zeroshot_prompt_system.format(question=question).strip(),
    }
    user_message = {
        "role": "user",
        "content": zeroshot_prompt_user.format(question=question).strip(),
    }
    return tokenizer.apply_chat_template(
        [system_message, user_message],
        tokenize=False,
        add_generation_prompt=True,
    )

# Relative tolerance used when comparing objective values.
_REL_TOL = 1e-6
# Wall-clock limit, in seconds, for each execution of generated code.
_RUN_TIMEOUT_S = 100


def _relative_gap(value, reference):
    """Return ``|value - reference| / (|reference| + 1)``."""
    return np.abs(value - reference) / (np.abs(reference) + 1)


def check_result(result_str, item, solver_name='gurobi'):
    """Run the code contained in a model completion and grade its objective.

    The code block is extracted from ``result_str`` and executed with
    ``python3 -c``. If the output reports infeasibility or the objective does
    not match the reference, the completion is rewritten with
    ``change_variable_types`` and executed once more.

    Args:
        result_str: Model completion expected to contain a code block.
        item: Mapping providing ``'en_answer'``, the reference objective.
            The value ``"No Best Solution"`` or any value whose string form
            contains ``"-9999"`` is treated as "no reference optimum";
            anything else is converted with ``float``.
        solver_name: Forwarded to ``extract_code_block``.

    Returns:
        An integer status code:

        * ``1``  -- judged correct.
        * ``0``  -- judged wrong (mismatch, or timeout / infeasible output
          while a numeric reference exists).
        * ``2``  -- no code block could be extracted.
        * ``3``  -- the first execution exited with a non-zero return code.
        * ``40`` -- exactly one of the extracted objective and the reference
          is ``None``.

    Raises:
        ValueError: If ``item['en_answer']`` is not a recognised "no
            solution" marker and cannot be converted to ``float``.
    """
    sub_answer = item['en_answer']
    # Convert sub_answer to float, or None when there is no reference optimum.
    if sub_answer == "No Best Solution" or "-9999" in str(sub_answer):
        sub_answer = None
    else:
        sub_answer = float(sub_answer)

    # Extract code snippet
    code_snippet = extract_code_block(result_str, solver_name)
    if not code_snippet:
        return 2

    # Run code snippet
    try:
        result = subprocess.run(
            ['python3', '-c', code_snippet],
            capture_output=True, text=True, timeout=_RUN_TIMEOUT_S,
        )
    except subprocess.TimeoutExpired:
        return 1 if sub_answer is None else 0

    # Check if execution failed
    if result.returncode != 0:
        return 3

    # Extract solver result
    solver_result = extract_obj(result.stdout)
    both_numeric = solver_result is not None and sub_answer is not None

    # Handle infeasible case or numerical mismatch
    mismatch = both_numeric and _relative_gap(solver_result, sub_answer) > _REL_TOL
    if 'nfeasible' in result.stdout or mismatch:
        # Try re-running with modified variables
        result_str = change_variable_types(result_str)
        retry_snippet = extract_code_block(result_str, solver_name) if result_str else None
        # Guard the retry like the first extraction: without a snippet there
        # is nothing to execute, and passing None to subprocess.run raises
        # TypeError.
        if retry_snippet:
            try:
                # `result` is deliberately rebound so the infeasibility check
                # further down inspects the retry's output when a retry ran.
                result = subprocess.run(
                    ['python3', '-c', retry_snippet],
                    capture_output=True, text=True, timeout=_RUN_TIMEOUT_S,
                )
            except subprocess.TimeoutExpired:
                print("over_time")
            else:
                if result.returncode == 0 and 'nfeasible' not in result.stdout:
                    new_result = extract_obj(result.stdout)
                    if (new_result is not None and sub_answer is not None
                            and _relative_gap(new_result, sub_answer) < _REL_TOL):
                        return 1
                    # Covers the case where both are None.
                    if new_result == sub_answer:
                        return 1

    # Handle infeasible case after retry
    if 'nfeasible' in result.stdout:
        return 1 if sub_answer is None else 0

    # Final comparison, based on the objective from the first execution.
    if both_numeric:
        return 1 if _relative_gap(solver_result, sub_answer) < _REL_TOL else 0
    # NOTE(review): 40 is kept as in the original; it is unclear whether 0 or
    # 4 was intended -- confirm with the author before changing.
    return 1 if solver_result == sub_answer else 40

# Checkpoint directory, parallelism degree, and the solver name that is later
# handed to check_result.
model_path = '/DATA/disk2/checkpoints/system/partialkl/'
solver_name = 'gurobi'
tensor_parallel_size = 1

# Build the vLLM engine, then the tokenizer, from the same checkpoint path.
print("Loading model", model_path)
model = LLM(model=model_path, tensor_parallel_size=tensor_parallel_size, trust_remote_code=True)
print("Model initialized.")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Prompt templates, built from the 'system' and 'user' entries of
# system_prompt_temp.
zeroshot_prompt_system = PromptTemplate.from_template(system_prompt_temp['system'])
zeroshot_prompt_user = PromptTemplate.from_template(system_prompt_temp['user'])

# Decoding strategy. Note that `topk` is passed as SamplingParams' `n`
# (number of sampled candidates), not as a top-k filter.
topk = 1
max_tokens = 8192
repetition_penalty = 1.02
stop_tokens = ["</s>"]
sampling_params = SamplingParams(
    n=topk,
    temperature=0.5,
    top_p=0.9,
    max_tokens=max_tokens,
    stop=stop_tokens,
    repetition_penalty=repetition_penalty,
)

# Evaluate every dataset file found directly inside the test-data directory
# and print, per dataset, the fraction of items whose status code is 1.
datapath = 'test_data'
for filepath in os.listdir(datapath):
    dataset_path = os.path.join(datapath, filepath)
    if not os.path.isfile(dataset_path):
        # os.listdir also returns sub-directories; those are not datasets.
        continue
    print('Loading data', filepath)
    loaded_data = load_jsonl(dataset_path)
    if not loaded_data:
        # Nothing to evaluate; skipping avoids a meaningless 0/0 accuracy.
        print('No data loaded, skipping', filepath)
        continue
    test_data = loaded_data
    print('Finish Loading')
    prompt_list = [mp_worker(item) for item in test_data]
    result_strs = generate_with_model(model, prompt_list, sampling_params)
    snippet_package_cor = [
        check_result(result_str, item, solver_name)
        for result_str, item in zip(result_strs, test_data)
    ]
    data_name = filepath.split("/")[-1].split(".")[0]
    # minlength=2 guarantees index 1 exists even when no item scored 1.
    result = np.bincount(snippet_package_cor, minlength=2)
    print(data_name)
    print(result[1] / result.sum())

2025-05-20 15:57:18,625	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


Loading model /DATA/disk2/checkpoints/system/partialkl/
INFO 05-20 15:57:22 llm_engine.py:237] Initializing an LLM engine (vdev) with config: model='/DATA/disk2/checkpoints/system/partialkl/', speculative_config=None, tokenizer='/DATA/disk2/checkpoints/system/partialkl/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/DATA/disk2/checkpoints/system/parti

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:01<00:01,  1.72s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:02<00:00,  1.38s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:02<00:00,  1.43s/it]



INFO 05-20 15:57:28 model_runner.py:1071] Loading model weights took 14.2487 GB
INFO 05-20 15:57:29 gpu_executor.py:122] # GPU blocks: 61034, # CPU blocks: 4681
INFO 05-20 15:57:29 gpu_executor.py:126] Maximum concurrency for 32768 tokens per request: 29.80x
INFO 05-20 15:57:32 model_runner.py:1402] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 05-20 15:57:32 model_runner.py:1406] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 05-20 15:57:38 model_runner.py:1530] Graph capturing finished in 6 secs.
Model initialized.
Loading data IndustryOR_fixed.json
Finish Loading


Processed prompts: 100%|██████████| 100/100 [00:24<00:00,  4.09it/s, est. speed input: 1737.75 toks/s, output: 3677.23 toks/s]


IndustryOR_fixed
0.35
Loading data OPTMATH_BENCH.jsonl
Finish Loading


Processed prompts: 100%|██████████| 166/166 [01:46<00:00,  1.55it/s, est. speed input: 1843.13 toks/s, output: 2628.06 toks/s]


OPTMATH_BENCH
0.25301204819277107
Loading data MAMO_ComplexLP.json
Finish Loading


Processed prompts: 100%|██████████| 211/211 [00:52<00:00,  4.00it/s, est. speed input: 2745.86 toks/s, output: 5083.94 toks/s]


MAMO_ComplexLP
0.6113744075829384
Loading data MAMO_EasyLP.json
Finish Loading


Processed prompts: 100%|██████████| 652/652 [01:19<00:00,  8.16it/s, est. speed input: 3464.04 toks/s, output: 7003.36 toks/s]


MAMO_EasyLP
0.9003067484662577
Loading data NL4OPT.jsonl
Finish Loading


Processed prompts: 100%|██████████| 245/245 [00:25<00:00,  9.49it/s, est. speed input: 3096.86 toks/s, output: 7121.14 toks/s]


NL4OPT
0.9714285714285714
Loading data OptMATH_Bench.jsonl
Finish Loading


Processed prompts: 100%|██████████| 193/193 [02:01<00:00,  1.59it/s, est. speed input: 1905.13 toks/s, output: 2983.82 toks/s]


OptMATH_Bench
0.3005181347150259
