From b71cf5cce326a7881a022547bc9c46d4fb0f08e4 Mon Sep 17 00:00:00 2001 From: bzgoogle Date: Fri, 4 Apr 2025 18:51:02 +0000 Subject: [PATCH 1/2] Modify MMLU prompt template to improve accuracy on DeepSeek models; Enable running whole dataset with num_prompts=-1 --- benchmarks/benchmark_serving.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index cb1b4965..dd14727c 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -342,7 +342,7 @@ def gen_mmlu_qa(data: Any, mmlu_method: str = "") -> str: f"(D) {row['D']}\n" ) - output += "\nCorrect answer: " + output += "\nCorrect answer:" if mmlu_method == "HELM": output += f"({row['answer']})\n\n" @@ -1133,11 +1133,16 @@ def main(args: argparse.Namespace): # A given args.max_output_length value is the max generation step, # when the args.max_output_length is default to None, the sample's golden # output length will be used to decide the generation step. 
+ if args.num_prompts == -1: + num_requests = len(dataset) + else: + num_requests = args.num_prompts + input_requests = sample_requests( dataset=dataset, tokenizer=tokenizer, use_chat_template=use_chat_template, - num_requests=args.num_prompts, + num_requests=num_requests, dataset_type=args.dataset, max_output_length=args.max_output_length, min_input_length=args.min_input_length, From 1bff78fa0674e98e7e23bffa28321b7659fcd58e Mon Sep 17 00:00:00 2001 From: bzgoogle Date: Fri, 4 Apr 2025 20:54:53 +0000 Subject: [PATCH 2/2] change the default value of num_prompts to -1 --- benchmarks/benchmark_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index dd14727c..45459f4e 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -938,7 +938,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--num-prompts", type=int, - default=1000, + default=-1, help=( "Number of prompts to process. (number of sample requests we randomly" " collect from dataset)"