In [1]:
import warnings
warnings.filterwarnings("ignore") 
import subprocess
from utils import load_jsonl, extract_code_block, extract_obj, change_variable_types
import numpy as np
from vllm import LLM, SamplingParams        
from transformers import AutoTokenizer                                      
from langchain.prompts import PromptTemplate
from rule_prompt_utils import system_prompt_temp
import os




2025-06-01 16:17:37,357	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
# Load the checkpoint and its tokenizer.
#
# The checkpoint location defaults to the original local path. Set the
# SIRL_MODEL_PATH environment variable to point at another copy of the
# checkpoint without having to edit this cell.
model_path = os.environ.get(
    'SIRL_MODEL_PATH',
    '/DATA/disk1/cml/config/MScache/models/oneday88/SIRL-7B',
)
tensor_parallel_size = 1  # forwarded unchanged to vLLM's `tensor_parallel_size`
solver_name = 'gurobi'    # passed to check_result() / extract_code_block() in later cells
print("Loading model", model_path)
model = LLM(
    model=model_path,
    tensor_parallel_size=tensor_parallel_size,
    trust_remote_code=True
)
print("Model initialized.")
# The tokenizer is loaded from the same directory as the model weights.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)



Loading model /DATA/disk1/cml/config/MScache/models/oneday88/SIRL-7B
INFO 06-01 16:17:45 llm_engine.py:237] Initializing an LLM engine (vdev) with config: model='/DATA/disk1/cml/config/MScache/models/oneday88/SIRL-7B', speculative_config=None, tokenizer='/DATA/disk1/cml/config/MScache/models/oneday88/SIRL-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_na

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:01<00:01,  1.36s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:02<00:00,  1.10s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:02<00:00,  1.14s/it]



INFO 06-01 16:17:49 model_runner.py:1071] Loading model weights took 14.2487 GB
INFO 06-01 16:17:50 gpu_executor.py:122] # GPU blocks: 61034, # CPU blocks: 4681
INFO 06-01 16:17:50 gpu_executor.py:126] Maximum concurrency for 32768 tokens per request: 29.80x
INFO 06-01 16:17:52 model_runner.py:1402] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-01 16:17:52 model_runner.py:1406] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-01 16:18:00 model_runner.py:1530] Graph capturing finished in 8 secs.
Model initialized.


In [3]:
# Build the two prompt templates (system and user turn) from the raw template
# strings stored in `system_prompt_temp`; both are consumed by mp_worker().
zeroshot_prompt_system, zeroshot_prompt_user = (
    PromptTemplate.from_template(system_prompt_temp[role])
    for role in ('system', 'user')
)
def mp_worker(item):
    """Render one test item into a chat-formatted prompt.

    Args:
        item: mapping that provides the problem statement under the
            'en_question' key.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns for the
        system + user conversation when called with ``tokenize=False`` and
        ``add_generation_prompt=True``.
    """
    question = item['en_question']
    system_message = {
        "role": "system",
        "content": zeroshot_prompt_system.format(question=question).strip(),
    }
    user_message = {
        "role": "user",
        "content": zeroshot_prompt_user.format(question=question).strip(),
    }
    return tokenizer.apply_chat_template(
        [system_message, user_message],
        tokenize=False,
        add_generation_prompt=True,
    )

def generate_with_model(model, prompt, sampling_params):
    """Run generation and keep the first completion of every request.

    Args:
        model: object exposing ``generate(prompt, sampling_params)``.
        prompt: the prompt(s) handed to ``model.generate`` unchanged.
        sampling_params: decoding settings handed to ``model.generate`` unchanged.

    Returns:
        A list holding ``outputs[0].text`` for each element that
        ``model.generate`` returned, in the same order.
    """
    first_completions = []
    for request_output in model.generate(prompt, sampling_params):
        # Only the first candidate of each request is kept.
        first_completions.append(request_output.outputs[0].text)
    return first_completions

In [4]:
# Decoding strategy shared by the evaluation cells below.
topk = 1                   # forwarded as SamplingParams `n`
max_tokens = 8192
repetition_penalty = 1.02  # To avoid the occasional occurrence of repeated tokens
stop_tokens = ["</s>"]

# top-p strategy: collect the keyword arguments first, then build the object.
decode_kwargs = {
    "n": topk,
    "temperature": 0.5,
    "top_p": 0.9,
    "max_tokens": max_tokens,
    "stop": stop_tokens,
    "repetition_penalty": repetition_penalty,
}
sampling_params = SamplingParams(**decode_kwargs)


In [5]:
# Scoring of a single model response (used for both pass@1 and pass@k).
def _relative_gap(value, reference):
    """Return |value - reference| / (|reference| + 1), or None if either operand is None."""
    if value is None or reference is None:
        return None
    return np.abs(value - reference) / (np.abs(reference) + 1)


def check_result(result_str, item, solver_name='gurobi', timeout=100, tol=1e-6):
    """Execute the code contained in a model response and score it against the reference.

    Args:
        result_str: raw model response; the code to run is pulled out of it
            with ``extract_code_block``.
        item: mapping holding the reference answer under 'en_answer'. The value
            "No Best Solution", or any value whose string form contains
            "-9999", is treated as "no reference optimum" (``None``); anything
            else is converted with ``float``.
        solver_name: forwarded to ``extract_code_block``.
        timeout: seconds allowed for each ``python3 -c`` run.
        tol: relative tolerance applied to
            ``|result - answer| / (|answer| + 1)``.

    Returns:
        1 -- the response is counted as correct,
        0 -- the response is counted as wrong,
        2 -- no code block could be extracted from the response,
        3 -- the extracted code exited with a non-zero return code.
    """
    sub_answer = item['en_answer']
    # Convert sub_answer to float or None
    sub_answer = None if sub_answer == "No Best Solution" or "-9999" in str(sub_answer) else float(sub_answer)

    # Extract code snippet
    code_snippet = extract_code_block(result_str, solver_name)
    if not code_snippet:
        return 2

    # Run code snippet
    try:
        result = subprocess.run(['python3', '-c', code_snippet], capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        # A timeout only counts as correct when there is no reference optimum.
        return 1 if sub_answer is None else 0

    # Check if execution failed
    if result.returncode != 0:
        return 3

    # Extract solver result
    solver_result = extract_obj(result.stdout)
    gap = _relative_gap(solver_result, sub_answer)

    # check the first time
    if gap is not None and gap <= tol:
        return 1

    # Handle infeasible case or numerical mismatch since we ignore the variable types error.
    # 'nfeasible' matches both "infeasible" and "Infeasible".
    if 'nfeasible' in result.stdout or (gap is not None and gap > tol):
        # Try re-running with modified variables: we ignore the variable types error
        retry_str = change_variable_types(result_str)  # change the type of variables
        retry_snippet = extract_code_block(retry_str, solver_name) if retry_str else None
        # Skip the retry when no code survives the rewrite: handing None to
        # subprocess.run would raise TypeError and abort the whole evaluation.
        if retry_snippet:
            try:
                # `result` is deliberately rebound so the infeasibility check
                # below looks at the retry's output when the retry completed.
                result = subprocess.run(['python3', '-c', retry_snippet], capture_output=True, text=True, timeout=timeout)
            except subprocess.TimeoutExpired:
                # `result` still refers to the first run in this case.
                print("over_time")
            else:
                if result.returncode == 0:
                    new_result = extract_obj(result.stdout)
                    if 'nfeasible' not in result.stdout:  # infeasible and Infeasible
                        new_gap = _relative_gap(new_result, sub_answer)
                        if new_gap is not None and new_gap < tol:
                            return 1
                        # Also covers the case where both are None.
                        if new_result == sub_answer:
                            return 1

    # Handle infeasible case after retry
    if 'nfeasible' in result.stdout:
        return 1 if sub_answer is None else 0

    # Final comparison (still based on the first run's objective value)
    if gap is not None:
        return 1 if gap < tol else 0
    return 1 if solver_result == sub_answer else 0

In [6]:
# if you want to check pass@1 accuracy, please run this cell
# Test the checkpoint
datapath = 'test_data'
testdataset = ['NL4OPT.jsonl', 'MAMO_EasyLP.json', 'MAMO_ComplexLP.json', 'IndustryOR_fixed.json', 'OptMATH_Bench_193.jsonl', 'OptMATH_Bench_166.jsonl','OptiBench.jsonl']
for filepath in testdataset:

    # loading data
    print('Loading data', filepath)
    test_data = load_jsonl(os.path.join(datapath, filepath))
    print('Finish Loading')

    # generation: one prompt per test item, in dataset order
    prompt_list = [mp_worker(item) for item in test_data]
    result_strs = generate_with_model(model, prompt_list, sampling_params)

    # check the pass@1 accuracy: check_result() yields a status code per item
    # (1 means the response is counted as correct)
    snippet_package_cor = [
        check_result(result_str, item, solver_name)
        for result_str, item in zip(result_strs, test_data)
    ]

    # minlength=2 guarantees that result[1] exists even when no item passed;
    # without it the lookup below raises IndexError on such a dataset.
    # The explicit integer dtype keeps bincount happy for an empty list.
    result = np.bincount(np.asarray(snippet_package_cor, dtype=np.int64), minlength=2)
    total = int(result.sum())
    passed = result[1]
    accuracy = passed / total if total else float('nan')
    print(f'Numbers of test cases in dataset {filepath}: {total}')
    print(f'Numbers of pass@1 cases in dataset {filepath}: {passed}')
    print(f'pass@1 accuracy for dataset {filepath}: {passed}/{total} = {accuracy}')
    print('-------------------------------------------------------------------')
  

Loading data NL4OPT.jsonl
Finish Loading


Processed prompts: 100%|██████████| 245/245 [00:24<00:00,  9.80it/s, est. speed input: 3199.33 toks/s, output: 7358.92 toks/s]


Numbers of test cases in dataset NL4OPT.jsonl: 245
Numbers of pass@1 cases in dataset NL4OPT.jsonl: 235
pass@1 accuracy for dataset NL4OPT.jsonl: 235/245 = 0.9591836734693877
-------------------------------------------------------------------
Loading data MAMO_EasyLP.json
Finish Loading


Processed prompts: 100%|██████████| 652/652 [01:18<00:00,  8.28it/s, est. speed input: 3514.86 toks/s, output: 7094.86 toks/s]


Numbers of test cases in dataset MAMO_EasyLP.json: 652
Numbers of pass@1 cases in dataset MAMO_EasyLP.json: 587
pass@1 accuracy for dataset MAMO_EasyLP.json: 587/652 = 0.9003067484662577
-------------------------------------------------------------------
Loading data MAMO_ComplexLP.json
Finish Loading


Processed prompts: 100%|██████████| 211/211 [00:53<00:00,  3.96it/s, est. speed input: 2720.62 toks/s, output: 5038.06 toks/s]


Numbers of test cases in dataset MAMO_ComplexLP.json: 211
Numbers of pass@1 cases in dataset MAMO_ComplexLP.json: 131
pass@1 accuracy for dataset MAMO_ComplexLP.json: 131/211 = 0.6208530805687204
-------------------------------------------------------------------
Loading data IndustryOR_fixed.json
Finish Loading


Processed prompts: 100%|██████████| 100/100 [00:23<00:00,  4.17it/s, est. speed input: 1774.68 toks/s, output: 3796.97 toks/s]


Numbers of test cases in dataset IndustryOR_fixed.json: 100
Numbers of pass@1 cases in dataset IndustryOR_fixed.json: 33
pass@1 accuracy for dataset IndustryOR_fixed.json: 33/100 = 0.33
-------------------------------------------------------------------
Loading data OptMATH_Bench_193.jsonl
Finish Loading


Processed prompts: 100%|██████████| 193/193 [01:57<00:00,  1.64it/s, est. speed input: 1966.92 toks/s, output: 2941.46 toks/s]


Numbers of test cases in dataset OptMATH_Bench_193.jsonl: 193
Numbers of pass@1 cases in dataset OptMATH_Bench_193.jsonl: 56
pass@1 accuracy for dataset OptMATH_Bench_193.jsonl: 56/193 = 0.29015544041450775
-------------------------------------------------------------------
Loading data OptMATH_Bench_166.jsonl
Finish Loading


Processed prompts: 100%|██████████| 166/166 [01:49<00:00,  1.52it/s, est. speed input: 1804.35 toks/s, output: 2685.20 toks/s]


Numbers of test cases in dataset OptMATH_Bench_166.jsonl: 166
Numbers of pass@1 cases in dataset OptMATH_Bench_166.jsonl: 42
pass@1 accuracy for dataset OptMATH_Bench_166.jsonl: 42/166 = 0.25301204819277107
-------------------------------------------------------------------
Loading data OptiBench.jsonl
Finish Loading


Processed prompts: 100%|██████████| 605/605 [01:09<00:00,  8.68it/s, est. speed input: 3161.29 toks/s, output: 7034.89 toks/s]


Numbers of test cases in dataset OptiBench.jsonl: 605
Numbers of pass@1 cases in dataset OptiBench.jsonl: 351
pass@1 accuracy for dataset OptiBench.jsonl: 351/605 = 0.5801652892561984
-------------------------------------------------------------------


In [7]:
# if you want to check pass@8 accuracy, please run this cell
# Test the checkpoint
datapath = 'test_data'
testdataset = ['NL4OPT.jsonl', 'MAMO_EasyLP.json', 'MAMO_ComplexLP.json', 'IndustryOR_fixed.json', 'OptMATH_Bench_193.jsonl', 'OptMATH_Bench_166.jsonl','OptiBench.jsonl']
num_samples = 8  # k in pass@k: number of independent generations per problem
for filepath in testdataset:

    # loading data: every item is repeated num_samples times back to back, so
    # the generations for one problem occupy one contiguous slice of the output
    print('Loading data', filepath)
    test_data = [item for item in load_jsonl(os.path.join(datapath, filepath)) for _ in range(num_samples)]
    print('Finish Loading')

    # generation
    prompt_list = [mp_worker(item) for item in test_data]
    result_strs = generate_with_model(model, prompt_list, sampling_params)

    # check the pass@k accuracy: a problem passes if any of its samples scores 1
    snippet_package_cor = []
    for start in range(0, len(result_strs), num_samples):
        samples = result_strs[start:start + num_samples]
        item = test_data[start]  # all entries of this slice are the same problem
        # any() stops at the first passing sample, so the remaining solver
        # runs for that problem are skipped; the pass/fail outcome is the same.
        passed_any = any(check_result(sample, item, solver_name) == 1 for sample in samples)
        snippet_package_cor.append(1 if passed_any else 0)

    # minlength=2 guarantees that result[1] exists even when no problem passed;
    # without it the lookup below raises IndexError on such a dataset.
    # The explicit integer dtype keeps bincount happy for an empty list.
    result = np.bincount(np.asarray(snippet_package_cor, dtype=np.int64), minlength=2)
    total = int(result.sum())
    passed = result[1]
    accuracy = passed / total if total else float('nan')
    print(f'Numbers of test cases in dataset {filepath}: {total}')
    print(f'Numbers of pass@{num_samples} cases in dataset {filepath}: {passed}')
    print(f'pass@{num_samples} accuracy for dataset {filepath}: {passed}/{total} = {accuracy}')
    print('-------------------------------------------------------------------')
    


Loading data NL4OPT.jsonl
Finish Loading


Processed prompts: 100%|██████████| 1960/1960 [03:06<00:00, 10.50it/s, est. speed input: 3428.87 toks/s, output: 7892.34 toks/s]


Numbers of test cases in dataset NL4OPT.jsonl: 245
Numbers of pass@8 cases in dataset NL4OPT.jsonl: 238
pass@8 accuracy for dataset NL4OPT.jsonl: 238/245 = 0.9714285714285714
-------------------------------------------------------------------
Loading data MAMO_EasyLP.json
Finish Loading


Processed prompts: 100%|██████████| 5216/5216 [09:39<00:00,  9.01it/s, est. speed input: 3822.21 toks/s, output: 7719.13 toks/s]


Numbers of test cases in dataset MAMO_EasyLP.json: 652
Numbers of pass@8 cases in dataset MAMO_EasyLP.json: 589
pass@8 accuracy for dataset MAMO_EasyLP.json: 589/652 = 0.9033742331288344
-------------------------------------------------------------------
Loading data MAMO_ComplexLP.json
Finish Loading


Processed prompts: 100%|██████████| 1688/1688 [05:11<00:00,  5.42it/s, est. speed input: 3726.87 toks/s, output: 6914.64 toks/s]


Numbers of test cases in dataset MAMO_ComplexLP.json: 211
Numbers of pass@8 cases in dataset MAMO_ComplexLP.json: 133
pass@8 accuracy for dataset MAMO_ComplexLP.json: 133/211 = 0.6303317535545023
-------------------------------------------------------------------
Loading data IndustryOR_fixed.json
Finish Loading


Processed prompts: 100%|██████████| 800/800 [02:34<00:00,  5.17it/s, est. speed input: 2199.02 toks/s, output: 4772.10 toks/s]


Numbers of test cases in dataset IndustryOR_fixed.json: 100
Numbers of pass@8 cases in dataset IndustryOR_fixed.json: 39
pass@8 accuracy for dataset IndustryOR_fixed.json: 39/100 = 0.39
-------------------------------------------------------------------
Loading data OptMATH_Bench_193.jsonl
Finish Loading


Processed prompts: 100%|██████████| 1544/1544 [09:32<00:00,  2.70it/s, est. speed input: 3230.20 toks/s, output: 5030.66 toks/s]


Numbers of test cases in dataset OptMATH_Bench_193.jsonl: 193
Numbers of pass@8 cases in dataset OptMATH_Bench_193.jsonl: 68
pass@8 accuracy for dataset OptMATH_Bench_193.jsonl: 68/193 = 0.35233160621761656
-------------------------------------------------------------------
Loading data OptMATH_Bench_166.jsonl
Finish Loading


Processed prompts: 100%|██████████| 1328/1328 [07:53<00:00,  2.81it/s, est. speed input: 3330.11 toks/s, output: 4967.84 toks/s]


Numbers of test cases in dataset OptMATH_Bench_166.jsonl: 166
Numbers of pass@8 cases in dataset OptMATH_Bench_166.jsonl: 46
pass@8 accuracy for dataset OptMATH_Bench_166.jsonl: 46/166 = 0.27710843373493976
-------------------------------------------------------------------
Loading data OptiBench.jsonl
Finish Loading


Processed prompts: 100%|██████████| 4840/4840 [08:32<00:00,  9.44it/s, est. speed input: 3435.35 toks/s, output: 7643.26 toks/s]


Numbers of test cases in dataset OptiBench.jsonl: 605
Numbers of pass@8 cases in dataset OptiBench.jsonl: 379
pass@8 accuracy for dataset OptiBench.jsonl: 379/605 = 0.6264462809917355
-------------------------------------------------------------------
