From b71cf5cce326a7881a022547bc9c46d4fb0f08e4 Mon Sep 17 00:00:00 2001 From: bzgoogle Date: Fri, 4 Apr 2025 18:51:02 +0000 Subject: [PATCH 1/2] Modify MMLU prompt template to improve accuracy on DeepSeek models; Enable running whole dataset with num_prompts=-1 --- benchmarks/benchmark_serving.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index cb1b4965..dd14727c 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -342,7 +342,7 @@ def gen_mmlu_qa(data: Any, mmlu_method: str = "") -> str: f"(D) {row['D']}\n" ) - output += "\nCorrect answer: " + output += "\nCorrect answer:" if mmlu_method == "HELM": output += f"({row['answer']})\n\n" @@ -1133,11 +1133,16 @@ def main(args: argparse.Namespace): # A given args.max_output_length value is the max generation step, # when the args.max_output_length is default to None, the sample's golden # output length will be used to decide the generation step. 
+ if args.num_prompts == -1: + num_requests = len(dataset) + else: + num_requests = args.num_prompts + input_requests = sample_requests( dataset=dataset, tokenizer=tokenizer, use_chat_template=use_chat_template, - num_requests=args.num_prompts, + num_requests=num_requests, dataset_type=args.dataset, max_output_length=args.max_output_length, min_input_length=args.min_input_length, From 1bff78fa0674e98e7e23bffa28321b7659fcd58e Mon Sep 17 00:00:00 2001 From: bzgoogle Date: Fri, 4 Apr 2025 20:54:53 +0000 Subject: [PATCH 2/2] change the default value of num_prompts to -1 --- benchmarks/benchmark_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index dd14727c..45459f4e 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -938,7 +938,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--num-prompts", type=int, - default=1000, + default=-1, help=( "Number of prompts to process. (number of sample requests we randomly" " collect from dataset)"