In [10]:
from vllm import LLM, SamplingParams
from typing import Callable, List, Tuple
from cs336_alignment.drgrpo_grader import r1_zero_reward_fn
import re
import json
import os
from collections import Counter
import argparse
# Alternative checkpoint location, kept disabled as in the original notebook:
# QWEN_MATH_BASE_PATH = "/home/ubuntu/model/Qwen2.5-Math-1.5B"

# Every path below lives under the same assignment checkout, so the shared
# prefix is spelled out once and the individual paths are derived from it.
PROJECT_ROOT = "/home/nova/cs336/assignment5-alignment"
QWEN_MATH_BASE_PATH = f"{PROJECT_ROOT}/models/Qwen2.5-Math-1.5B"
PROMPT_PATH = f"{PROJECT_ROOT}/cs336_alignment/prompts/r1_zero.prompt"
MATH_DATA_PATH = f"{PROJECT_ROOT}/data/gsm8k"

# Captures the run of digits, '-', '.' and ',' that follows a "####" marker
# (optionally separated from it by whitespace).
ANS_RE = re.compile(r"####\s*([\-0-9\.\,]+)")

In [11]:

def extract_reference_answer(answer: str) -> str:
    """Pull the final answer out of a reference solution string.

    The answer is whatever ``ANS_RE`` captures after the ``####`` marker,
    with surrounding whitespace and thousands separators (``,``) removed.

    Args:
        answer: Full reference solution text.

    Returns:
        The captured answer text, or the sentinel ``"[invalid]"`` when the
        pattern does not occur in ``answer``.
    """
    found = ANS_RE.search(answer)
    if found is None:
        return "[invalid]"
    captured = found.group(1)
    return captured.strip().replace(",", "")

In [12]:
def run_vllm(vllm_model, prompts, sampling_params) -> List[str]:
    """Generate one completion per prompt and return the stripped texts.

    Args:
        vllm_model: Object exposing ``generate(prompts, sampling_params)``.
        prompts: Prompts forwarded unchanged to ``generate``.
        sampling_params: Sampling configuration forwarded to ``generate``.

    Returns:
        For each item yielded by ``generate``, the text of its first output
        with leading and trailing whitespace removed, in the same order.
    """
    completions = []
    for request_output in vllm_model.generate(prompts, sampling_params):
        # Only the first candidate of each request is kept.
        first_candidate = request_output.outputs[0]
        completions.append(first_candidate.text.strip())
    return completions

In [13]:




def evaluate_vllm(
    vllm_model: LLM,
    reward_fn: Callable[[str, str], dict[str, float]],
    prompts: List[str],
    answers: List[str],
    eval_sampling_params: SamplingParams
):
    """Generate a response for every prompt and score it against its answer.

    Args:
        vllm_model: Model used to generate the responses (via ``run_vllm``).
        reward_fn: Called as ``reward_fn(response, extracted_answer)``; its
            result is treated as a mapping of score names to values.
        prompts: Prompts to generate from.
        answers: Raw reference answers, aligned index-by-index with
            ``prompts``. The final answer is pulled out of each one with
            ``extract_reference_answer`` before scoring.
        eval_sampling_params: Sampling configuration used for generation.

    Returns:
        A list with one dict per example: the entries returned by
        ``reward_fn`` plus the keys ``"response"``, ``"answer"``,
        ``"prompt"`` and ``"extracted_answer"``.

    Raises:
        ValueError: If ``prompts`` and ``answers`` differ in length.
    """
    # zip() would silently drop the unmatched tail; check up front so a
    # misaligned dataset fails before the (expensive) generation step.
    if len(prompts) != len(answers):
        raise ValueError(
            f"prompts and answers must have the same length, "
            f"got {len(prompts)} prompts and {len(answers)} answers"
        )

    responses = run_vllm(vllm_model, prompts, eval_sampling_params)
    allinfo_dict_list = []
    for response, answer, prompt in zip(responses, answers, prompts):
        extracted_answer = extract_reference_answer(answer)
        reward_dict = reward_fn(response, extracted_answer)
        # Build a fresh record instead of mutating the object owned by
        # reward_fn; key order matches the original (scores first).
        allinfo_dict_list.append({
            **reward_dict,
            "response": response,
            "answer": answer,
            "prompt": prompt,
            "extracted_answer": extracted_answer,
        })
    return allinfo_dict_list


In [14]:


def load_and_format_prompts(data_path: str, prompt_path: str):
    """Load a JSONL dataset and render each question into the prompt template.

    Args:
        data_path: Path to a JSON Lines file. Every non-blank line must be a
            JSON object with ``"question"`` and ``"answer"`` keys.
        prompt_path: Path to a text template that is rendered with
            ``str.format(question=...)``.

    Returns:
        A ``(prompts, answers)`` tuple of two equally long lists: the
        formatted prompts and the raw ``"answer"`` values, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"question"`` or ``"answer"``.
    """
    # Explicit UTF-8 so the result does not depend on the platform's
    # default text encoding.
    with open(prompt_path, "r", encoding="utf-8") as file:
        prompt = file.read()
    prompts = []
    answers = []
    with open(data_path, "r", encoding="utf-8") as file:
        for line in file:
            # Tolerate blank lines (e.g. an extra trailing newline), which
            # json.loads would otherwise reject.
            if not line.strip():
                continue
            data = json.loads(line)
            prompts.append(prompt.format(question=data["question"]))
            answers.append(data["answer"])
    return prompts, answers


In [15]:

def build_llm_and_params(model_path: str) -> Tuple[LLM, SamplingParams]:
    """Create the vLLM engine and the sampling configuration for evaluation.

    Args:
        model_path: Passed as the first argument to ``LLM``.

    Returns:
        ``(llm, sampling_params)``. Sampling uses temperature 1.0, top-p 1.0
        and at most 1024 new tokens; generation stops at ``"</answer>"`` and
        that stop string is requested to be kept in the output.
    """
    sampling_kwargs = {
        "temperature": 1.0,
        "top_p": 1.0,
        "max_tokens": 1024,
        "stop": ["</answer>"],
        "include_stop_str_in_output": True,
    }
    engine = LLM(model_path)
    return engine, SamplingParams(**sampling_kwargs)


In [None]:
# Baseline run: format the test split, generate with the base model, score
# every response, and dump the per-example records.
prompts, answers = load_and_format_prompts(
    data_path=MATH_DATA_PATH+"/test.jsonl",
    prompt_path=PROMPT_PATH,
)
llm, sampling_params = build_llm_and_params(QWEN_MATH_BASE_PATH)
allinfo_dict_list = evaluate_vllm(
    llm, r1_zero_reward_fn, prompts, answers, sampling_params
)

# JSON Lines output: one serialized record per line.
with open("baseline_result.jsonl", "w") as results_file:
    for record in allinfo_dict_list:
        results_file.write(json.dumps(record))
        results_file.write("\n")

INFO 11-19 20:23:47 config.py:542] This model supports multiple tasks: {'embed', 'generate', 'classify', 'reward', 'score'}. Defaulting to 'generate'.
INFO 11-19 20:23:47 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.2) with config: model='/home/nova/cs336/assignment5-alignment/models/Qwen2.5-Math-1.5B', speculative_config=None, tokenizer='/home/nova/cs336/assignment5-alignment/models/Qwen2.5-Math-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execu

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 11-19 20:23:53 model_runner.py:1115] Loading model weights took 2.8797 GB
INFO 11-19 20:23:55 worker.py:267] Memory profiling takes 1.65 seconds
INFO 11-19 20:23:55 worker.py:267] the current vLLM instance can use total_gpu_memory (8.00GiB) x gpu_memory_utilization (0.90) = 7.20GiB
INFO 11-19 20:23:55 worker.py:267] model weights take 2.88GiB; non_torch_memory takes 0.03GiB; PyTorch activation peak memory takes 1.40GiB; the rest of the memory reserved for KV Cache is 2.89GiB.
INFO 11-19 20:23:55 executor_base.py:110] # CUDA blocks: 6768, # CPU blocks: 9362
INFO 11-19 20:23:55 executor_base.py:115] Maximum concurrency for 4096 tokens per request: 26.44x
INFO 11-19 20:23:58 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utiliz

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:21<00:00,  1.62it/s]

INFO 11-19 20:24:20 model_runner.py:1562] Graph capturing finished in 22 secs, took 0.00 GiB
INFO 11-19 20:24:20 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 27.17 seconds



Processed prompts:  75%|███████▌  | 992/1319 [01:19<00:22, 14.75it/s, est. speed input: 1902.85 toks/s, output: 2140.74 toks/s]



Processed prompts: 100%|██████████| 1319/1319 [01:50<00:00, 11.90it/s, est. speed input: 1832.65 toks/s, output: 2778.49 toks/s]


: 

In [None]:

# Driver: pick what to run through --choice.
#   quick_inf          -> tiny free-form generation smoke test
#   load_prompt_answer -> print the first formatted prompt/answer pair
#   anything else      -> full baseline evaluation written to baseline_result.jsonl
parser = argparse.ArgumentParser()
parser.add_argument(
    "--choice",
    help="'quick_inf', 'load_prompt_answer', or omit for the full evaluation",
)
# parse_known_args() instead of parse_args(): when this runs inside a Jupyter
# kernel, sys.argv carries the kernel's own arguments, which parse_args()
# would reject and abort on. Unrecognized arguments are simply ignored.
args, _unrecognized_args = parser.parse_known_args()

if args.choice == "quick_inf":
    ## example for inference
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
        ]
    sampling_params = SamplingParams(
        temperature=1.0, top_p=1.0, max_tokens=1024, stop=["\n"]
    )

    llm = LLM(model=QWEN_MATH_BASE_PATH, trust_remote_code=True)

    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text.strip()
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    ## end of example

# `elif`, not a second `if`: otherwise the `else` branch below also fires
# after "quick_inf", launching the full evaluation and a second LLM instance.
elif args.choice == "load_prompt_answer":
    prompts, answers = load_and_format_prompts(data_path=MATH_DATA_PATH+"/test.jsonl", prompt_path=PROMPT_PATH)
    # Show only the first example as a sanity check of the formatting.
    first_pair = next(zip(prompts, answers), None)
    if first_pair is not None:
        first_prompt, first_answer = first_pair
        print (f"prompt:{first_prompt}, \n answer:{first_answer}")
else:
    prompts, answers = load_and_format_prompts(data_path=MATH_DATA_PATH+"/test.jsonl", prompt_path=PROMPT_PATH)
    llm, sampling_params = build_llm_and_params(QWEN_MATH_BASE_PATH)
    allinfo_dict_list = evaluate_vllm(llm, r1_zero_reward_fn, prompts, answers, sampling_params)
    # JSON Lines output: one serialized record per line.
    with open("baseline_result.jsonl", "w") as f:
        for record in allinfo_dict_list:
            json.dump(record, f)
            f.write("\n")