In [1]:
import warnings
warnings.filterwarnings("ignore") 
import subprocess
from utils import load_jsonl, extract_code_block, extract_obj, change_variable_types
import numpy as np
from vllm import LLM, SamplingParams        
from transformers import AutoTokenizer                                      
from langchain.prompts import PromptTemplate
from rule_prompt_utils import system_prompt_temp
import os




In [2]:
# Checkpoint location and run-wide settings; these names are read by later cells.
model_path = '/DATA/disk2/checkpoints/system/partialkl/'
tensor_parallel_size, solver_name = 1, 'gurobi'

# Bring up the vLLM engine for the checkpoint (custom model code is trusted).
print("Loading model", model_path)
model = LLM(model=model_path, tensor_parallel_size=tensor_parallel_size, trust_remote_code=True)
print("Model initialized.")

# The tokenizer is read from the same directory as the model weights.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)



Loading model /DATA/disk2/checkpoints/system/partialkl/
INFO 05-20 18:24:35 llm_engine.py:237] Initializing an LLM engine (vdev) with config: model='/DATA/disk2/checkpoints/system/partialkl/', speculative_config=None, tokenizer='/DATA/disk2/checkpoints/system/partialkl/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/DATA/disk2/checkpoints/system/parti

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 05-20 18:24:41 model_runner.py:1071] Loading model weights took 14.2487 GB
INFO 05-20 18:24:42 gpu_executor.py:122] # GPU blocks: 61034, # CPU blocks: 4681
INFO 05-20 18:24:42 gpu_executor.py:126] Maximum concurrency for 32768 tokens per request: 29.80x
INFO 05-20 18:24:44 model_runner.py:1402] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 05-20 18:24:44 model_runner.py:1406] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 05-20 18:24:51 model_runner.py:1530] Graph capturing finished in 6 secs.
Model initialized.


In [3]:
# Build the system and user prompt templates from the matching entries of
# `system_prompt_temp`; both are consumed by `mp_worker` below.
zeroshot_prompt_system, zeroshot_prompt_user = (
    PromptTemplate.from_template(system_prompt_temp[role_key])
    for role_key in ('system', 'user')
)
def mp_worker(item):
    """Render one dataset record into a chat-formatted prompt.

    The record's ``'en_question'`` value is substituted into both the system
    and the user template. Each rendered message is stripped of surrounding
    whitespace, and the two-message conversation is handed to the module-level
    ``tokenizer`` chat template with the generation prompt appended.

    Args:
        item: Mapping that provides the problem text under ``'en_question'``.

    Returns:
        The value returned by ``tokenizer.apply_chat_template`` when called
        with ``tokenize=False`` and ``add_generation_prompt=True``.
    """
    system_message = {
        "role": "system",
        "content": zeroshot_prompt_system.format(question=item['en_question']).strip(),
    }
    user_message = {
        "role": "user",
        "content": zeroshot_prompt_user.format(question=item['en_question']).strip(),
    }
    conversation = [system_message, user_message]
    return tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)

def generate_with_model(model, prompt, sampling_params):
    """Generate with ``model`` and keep the first completion of every request.

    Args:
        model: Object exposing ``generate(prompt, sampling_params)``.
        prompt: Prompt(s) forwarded unchanged to ``model.generate``.
        sampling_params: Decoding settings forwarded unchanged.

    Returns:
        list: ``outputs[0].text`` of each item yielded by ``model.generate``,
        in the order they are returned.
    """
    first_completions = []
    for request_output in model.generate(prompt, sampling_params):
        first_completions.append(request_output.outputs[0].text)
    return first_completions

In [4]:
# Decoding settings
topk = 1  # forwarded as `n` to SamplingParams below
max_tokens = 8192
repetition_penalty = 1.02  # To avoid the occasional occurrence of repeated tokens
stop_tokens = ["</s>"]

# Temperature + top-p sampling configuration shared by every generation call.
sampling_params = SamplingParams(
    n=topk,
    max_tokens=max_tokens,
    stop=stop_tokens,
    repetition_penalty=repetition_penalty,
    temperature=0.5,
    top_p=0.9,
)


In [5]:
# check the pass@1 accuracy
# Relative tolerance used when comparing a solver objective with the reference answer.
_REL_TOL = 1e-6
# Wall-clock limit (seconds) for running one generated program.
_RUN_TIMEOUT = 100


def _relative_error(value, reference):
    """Return |value - reference| / (|reference| + 1)."""
    return np.abs(value - reference) / (np.abs(reference) + 1)


def _run_python(code_snippet):
    """Run ``code_snippet`` via ``python3 -c``; may raise ``subprocess.TimeoutExpired``."""
    return subprocess.run(
        ['python3', '-c', code_snippet],
        capture_output=True,
        text=True,
        timeout=_RUN_TIMEOUT,
    )


def check_result(result_str, item, solver_name='gurobi'):
    """Run the code contained in a model response and grade its objective value.

    Args:
        result_str: Raw model response that should contain a code block.
        item: Dataset record. ``item['en_answer']`` is the reference answer;
            the value ``"No Best Solution"`` or any value whose string form
            contains ``"-9999"`` is treated as "no numeric optimum expected".
            Any other value is converted with ``float``.
        solver_name: Forwarded to ``extract_code_block``.

    Returns:
        int: ``1`` if the response is counted as correct, ``0`` if it is
        counted as wrong, ``2`` if no code block could be extracted from the
        response, ``3`` if the extracted program exited with a non-zero status.

    Notes:
        When the first run reports infeasibility or a numeric mismatch, the
        response is rewritten with ``change_variable_types`` and executed once
        more. If that rewritten response yields no code block, the retry is
        skipped instead of passing an empty/``None`` program to ``subprocess``
        (which raises ``TypeError`` for ``None``).
    """
    raw_answer = item['en_answer']
    if raw_answer == "No Best Solution" or "-9999" in str(raw_answer):
        sub_answer = None
    else:
        sub_answer = float(raw_answer)

    # Extract code snippet
    code_snippet = extract_code_block(result_str, solver_name)
    if not code_snippet:
        return 2

    # First execution; a timeout only counts as correct when no optimum is expected.
    try:
        result = _run_python(code_snippet)
    except subprocess.TimeoutExpired:
        return 1 if sub_answer is None else 0

    if result.returncode != 0:
        return 3

    solver_result = extract_obj(result.stdout)
    comparable = solver_result is not None and sub_answer is not None
    gap = _relative_error(solver_result, sub_answer) if comparable else None

    # check the first time
    if comparable and gap <= _REL_TOL:
        return 1

    # Infeasible output or numerical mismatch: retry once with modified
    # variable types, since variable-type errors are deliberately ignored.
    if 'nfeasible' in result.stdout or (comparable and gap > _REL_TOL):
        retry_str = change_variable_types(result_str)
        retry_code = extract_code_block(retry_str, solver_name) if retry_str else None
        if retry_code:  # skip the retry when the rewritten response has no code
            try:
                # `result` is intentionally replaced: the infeasibility check
                # below looks at the retry's output whenever the retry ran.
                result = _run_python(retry_code)
            except subprocess.TimeoutExpired:
                print("over_time")
            else:
                if result.returncode == 0:
                    new_result = extract_obj(result.stdout)
                    if 'nfeasible' not in result.stdout:  # infeasible and Infeasible
                        if (
                            new_result is not None
                            and sub_answer is not None
                            and _relative_error(new_result, sub_answer) < _REL_TOL
                        ):
                            return 1
                        if new_result == sub_answer:
                            return 1

    # Handle infeasible case after retry
    if 'nfeasible' in result.stdout:
        return 1 if sub_answer is None else 0

    # Final comparison against the first run's objective value
    if comparable:
        return 1 if gap < _REL_TOL else 0
    return 1 if solver_result == sub_answer else 0

In [6]:
# Test the checkpoint: for every benchmark file, generate one response per
# problem and report how many responses check_result scores as 1 (pass@1).
datapath = 'test_data'
testdataset = ['NL4OPT.jsonl', 'MAMO_EasyLP.json', 'MAMO_ComplexLP.json', 'IndustryOR_fixed.json', 'OptMATH_Bench_193.jsonl', 'OptMATH_Bench_166.jsonl']
for filepath in testdataset:

    # loading data
    print('Loading data', filepath)
    test_data = load_jsonl(os.path.join(datapath, filepath))
    print('Finish Loading')

    # generation: build one chat prompt per record and submit them in one call
    prompt_list = [mp_worker(item) for item in test_data]
    result_strs = generate_with_model(model, prompt_list, sampling_params)

    # check the pass@1 accuracy
    snippet_package_cor = [
        check_result(result_str, item, solver_name)
        for result_str, item in zip(result_strs, test_data)
    ]
    # minlength=2 keeps index 1 valid even when no response scored >= 1
    # (all-zero scores or an empty dataset would otherwise raise IndexError).
    result = np.bincount(snippet_package_cor, minlength=2)
    total_cases = int(result.sum())
    passed_cases = int(result[1])
    # Avoid ZeroDivisionError on an empty dataset; report NaN instead.
    accuracy = passed_cases / total_cases if total_cases else float('nan')
    print(f'Numbers of test cases in dataset {filepath}: {total_cases}')
    print(f'Numbers of pass@1 cases in dataset {filepath}: {passed_cases}')
    print(f'pass@1 accuracy for dataset {filepath}: {passed_cases}/{total_cases} = {accuracy}')
    print('-------------------------------------------------------------------')

Loading data NL4OPT.jsonl
Finish Loading


Processed prompts: 100%|██████████| 245/245 [00:25<00:00,  9.73it/s, est. speed input: 3174.78 toks/s, output: 7302.45 toks/s]


Numbers of test cases in dataset NL4OPT.jsonl: 245
Numbers of pass@1 cases in dataset NL4OPT.jsonl: 235
pass@1 accuracy for dataset NL4OPT.jsonl: 235/245 = 0.9591836734693877
-------------------------------------------------------------------
Loading data MAMO_EasyLP.json
Finish Loading


Processed prompts: 100%|██████████| 652/652 [01:19<00:00,  8.17it/s, est. speed input: 3468.19 toks/s, output: 7000.65 toks/s]


Numbers of test cases in dataset MAMO_EasyLP.json: 652
Numbers of pass@1 cases in dataset MAMO_EasyLP.json: 587
pass@1 accuracy for dataset MAMO_EasyLP.json: 587/652 = 0.9003067484662577
-------------------------------------------------------------------
Loading data MAMO_ComplexLP.json
Finish Loading


Processed prompts: 100%|██████████| 211/211 [00:53<00:00,  3.93it/s, est. speed input: 2697.24 toks/s, output: 4994.77 toks/s]


Numbers of test cases in dataset MAMO_ComplexLP.json: 211
Numbers of pass@1 cases in dataset MAMO_ComplexLP.json: 131
pass@1 accuracy for dataset MAMO_ComplexLP.json: 131/211 = 0.6208530805687204
-------------------------------------------------------------------
Loading data IndustryOR_fixed.json
Finish Loading


Processed prompts: 100%|██████████| 100/100 [00:24<00:00,  4.16it/s, est. speed input: 1768.17 toks/s, output: 3783.02 toks/s]


Numbers of test cases in dataset IndustryOR_fixed.json: 100
Numbers of pass@1 cases in dataset IndustryOR_fixed.json: 33
pass@1 accuracy for dataset IndustryOR_fixed.json: 33/100 = 0.33
-------------------------------------------------------------------
Loading data OptMATH_Bench_193.jsonl
Finish Loading


Processed prompts: 100%|██████████| 193/193 [01:57<00:00,  1.64it/s, est. speed input: 1965.26 toks/s, output: 2938.97 toks/s]


Numbers of test cases in dataset OptMATH_Bench_193.jsonl: 193
Numbers of pass@1 cases in dataset OptMATH_Bench_193.jsonl: 56
pass@1 accuracy for dataset OptMATH_Bench_193.jsonl: 56/193 = 0.29015544041450775
-------------------------------------------------------------------
Loading data OptMATH_Bench_166.jsonl
Finish Loading


Processed prompts: 100%|██████████| 166/166 [01:50<00:00,  1.50it/s, est. speed input: 1785.33 toks/s, output: 2656.90 toks/s]


Numbers of test cases in dataset OptMATH_Bench_166.jsonl: 166
Numbers of pass@1 cases in dataset OptMATH_Bench_166.jsonl: 42
pass@1 accuracy for dataset OptMATH_Bench_166.jsonl: 42/166 = 0.25301204819277107
-------------------------------------------------------------------
