{
"schema_version": "1.0",
"suite_id": "suite_A",
"implementation_id": "nvidia_sglang_c43a8309",
"chip": {
"name": "NVIDIA A100-SXM4-40GB",
"vendor": "NVIDIA",
"count": 1,
"memory_gb": 40.0,
"interconnect_intra_node": null,
"interconnect_inter_node": null
},
"environment": {
"collected_at": "2026-05-06T11:15:11.081772+00:00",
"accelerators": [
{
"index": 0,
"name": "NVIDIA A100-SXM4-40GB",
"vendor": "NVIDIA",
"memory_gb": 40.0,
"driver_version": "565.57.01",
"firmware_version": null,
"compute_capability": "8.0",
"supports_bf16": true
}
],
"accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tPXB\tNODE\tSYS\t0-31,64-95\t0\t\tN/A\nNIC0\tPXB\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tPXB\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tNODE\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n",
"intra_node_interconnect": null,
"cpu": {
"model": "AMD EPYC 7532 32-Core Processor",
"physical_cores": 64,
"logical_cores": 128,
"numa_nodes": 2
},
"system_memory_gb": 1007.7,
"pcie_generation": "PCIe Gen 4",
"cpu_accelerator_bandwidth_gbs": null,
"network_interfaces": [
{
"name": "mlx5_0",
"type": "InfiniBand/RoCE",
"bandwidth_gbps": null
},
{
"name": "mlx5_1",
"type": "InfiniBand/RoCE",
"bandwidth_gbps": null
},
{
"name": "mlx5_2",
"type": "InfiniBand/RoCE",
"bandwidth_gbps": null
},
{
"name": "mlx5_3",
"type": "InfiniBand/RoCE",
"bandwidth_gbps": null
}
],
"os": "Ubuntu 22.04.4 LTS",
"python_version": "3.10.20",
"kernel_version": "5.15.0-60-generic",
"runtime_version": "CUDA 12.8",
"pytorch_version": "2.9.1+cu128"
},
"software": {
"framework": "SGLang",
"framework_version": "0.5.6",
"driver_version": "565.57.01",
"runtime_version": "CUDA 12.8",
"os": "Ubuntu 22.04.4 LTS",
"python_version": "3.10.20"
},
"model": {
"model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
"model_name": null,
"model_note": null,
"model_source": "local",
"architecture": "dense",
"parameter_count_b": 8.0,
"precision": "BF16",
"effective_dtype": "bfloat16",
"quantization_method": null,
"model_format": "HuggingFace original"
},
"task": {
"scenarios_run": [
"offline",
"online",
"interactive",
"sustained",
"speculative",
"burst"
],
"parallelism": {
"tensor_parallel_size": 1,
"pipeline_parallel_size": 1,
"expert_parallel_size": 1,
"data_parallel_size": 1
},
"num_runs": 3,
"extra_config": null
},
"metrics": {
"derived": {},
"offline": {
"results_by_concurrency": [
{
"client_concurrency": 8,
"throughput_tokens_per_sec": 3144.73,
"throughput_tokens_per_sec_per_chip": 3144.73,
"elapsed_seconds_median": 11.2,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
},
{
"client_concurrency": 32,
"throughput_tokens_per_sec": 3146.66,
"throughput_tokens_per_sec_per_chip": 3146.66,
"elapsed_seconds_median": 11.2,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
},
{
"client_concurrency": 128,
"throughput_tokens_per_sec": 3146.09,
"throughput_tokens_per_sec_per_chip": 3146.09,
"elapsed_seconds_median": 11.2,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
}
]
},
"online": {
"sla_ttft_ms": 500,
"max_valid_qps": 100,
"results_by_qps": [
{
"target_qps": 5,
"achieved_qps": 5.0,
"ttft_ms_p50": 43.91,
"ttft_ms_p90": 62.26,
"ttft_ms_p99": 972.47,
"tpot_ms_p50": 15.63,
"tpot_ms_p90": 17.36,
"tpot_ms_p99": 18.58,
"elapsed_seconds_median": 66.1,
"sla_met": false
},
{
"target_qps": 25,
"achieved_qps": 25.0,
"ttft_ms_p50": 52.85,
"ttft_ms_p90": 67.65,
"ttft_ms_p99": 80.71,
"tpot_ms_p50": 36.16,
"tpot_ms_p90": 41.45,
"tpot_ms_p99": 57.42,
"elapsed_seconds_median": 17.1,
"sla_met": true
},
{
"target_qps": 100,
"achieved_qps": 100.0,
"ttft_ms_p50": 50.85,
"ttft_ms_p90": 62.88,
"ttft_ms_p99": 245.1,
"tpot_ms_p50": 41.47,
"tpot_ms_p90": 53.07,
"tpot_ms_p99": 177.42,
"elapsed_seconds_median": 10.8,
"sla_met": true
}
]
},
"interactive": {
"ttft_ms_p50": 32.52,
"ttft_ms_p90": 44.71,
"ttft_ms_p99": 61.84,
"tpot_ms_p50": 12.93,
"tpot_ms_p90": 12.98,
"tpot_ms_p99": 13.03,
"peak_memory_gb": null,
"elapsed_seconds_median": 381.4
},
"sustained": {
"sustained_concurrency": 8,
"duration_minutes": 30,
"warmup_minutes": 2,
"sample_interval_seconds": 60,
"samples": [
{
"minute": 1.0,
"is_warmup": true,
"throughput_tokens_per_sec": 477.0,
"tokens_out": 28638,
"tokens_in": 0,
"requests_completed": 154,
"ttft_ms_p50": 49.6,
"ttft_ms_p99": 6750.3
},
{
"minute": 2.0,
"is_warmup": false,
"throughput_tokens_per_sec": 566.2,
"tokens_out": 33972,
"tokens_in": 0,
"requests_completed": 180,
"ttft_ms_p50": 44.6,
"ttft_ms_p99": 61.2
},
{
"minute": 3.0,
"is_warmup": false,
"throughput_tokens_per_sec": 560.8,
"tokens_out": 33639,
"tokens_in": 0,
"requests_completed": 177,
"ttft_ms_p50": 44.5,
"ttft_ms_p99": 61.3
},
{
"minute": 4.0,
"is_warmup": false,
"throughput_tokens_per_sec": 565.3,
"tokens_out": 33929,
"tokens_in": 0,
"requests_completed": 181,
"ttft_ms_p50": 44.3,
"ttft_ms_p99": 60.8
},
{
"minute": 5.0,
"is_warmup": false,
"throughput_tokens_per_sec": 561.4,
"tokens_out": 33685,
"tokens_in": 0,
"requests_completed": 179,
"ttft_ms_p50": 44.2,
"ttft_ms_p99": 61.6
},
{
"minute": 6.0,
"is_warmup": false,
"throughput_tokens_per_sec": 561.9,
"tokens_out": 33707,
"tokens_in": 0,
"requests_completed": 180,
"ttft_ms_p50": 44.3,
"ttft_ms_p99": 60.8
},
{
"minute": 7.0,
"is_warmup": false,
"throughput_tokens_per_sec": 570.0,
"tokens_out": 34190,
"tokens_in": 0,
"requests_completed": 179,
"ttft_ms_p50": 44.3,
"ttft_ms_p99": 61.4
},
{
"minute": 8.0,
"is_warmup": false,
"throughput_tokens_per_sec": 558.3,
"tokens_out": 33498,
"tokens_in": 0,
"requests_completed": 177,
"ttft_ms_p50": 44.6,
"ttft_ms_p99": 62.6
},
{
"minute": 9.0,
"is_warmup": false,
"throughput_tokens_per_sec": 563.3,
"tokens_out": 33801,
"tokens_in": 0,
"requests_completed": 180,
"ttft_ms_p50": 44.3,
"ttft_ms_p99": 61.8
},
{
"minute": 10.0,
"is_warmup": false,
"throughput_tokens_per_sec": 552.7,
"tokens_out": 33163,
"tokens_in": 0,
"requests_completed": 176,
"ttft_ms_p50": 44.5,
"ttft_ms_p99": 52.4
},
{
"minute": 11.0,
"is_warmup": false,
"throughput_tokens_per_sec": 569.3,
"tokens_out": 34157,
"tokens_in": 0,
"requests_completed": 181,
"ttft_ms_p50": 44.1,
"ttft_ms_p99": 60.4
},
{
"minute": 12.0,
"is_warmup": false,
"throughput_tokens_per_sec": 558.9,
"tokens_out": 33526,
"tokens_in": 0,
"requests_completed": 177,
"ttft_ms_p50": 44.2,
"ttft_ms_p99": 47.9
},
{
"minute": 13.0,
"is_warmup": false,
"throughput_tokens_per_sec": 568.4,
"tokens_out": 34113,
"tokens_in": 0,
"requests_completed": 180,
"ttft_ms_p50": 44.5,
"ttft_ms_p99": 139.2
},
{
"minute": 14.0,
"is_warmup": false,
"throughput_tokens_per_sec": 557.2,
"tokens_out": 33424,
"tokens_in": 0,
"requests_completed": 178,
"ttft_ms_p50": 44.5,
"ttft_ms_p99": 61.5
},
{
"minute": 15.0,
"is_warmup": false,
"throughput_tokens_per_sec": 565.5,
"tokens_out": 33942,
"tokens_in": 0,
"requests_completed": 181,
"ttft_ms_p50": 44.3,
"ttft_ms_p99": 61.7
},
{
"minute": 16.0,
"is_warmup": false,
"throughput_tokens_per_sec": 554.1,
"tokens_out": 33238,
"tokens_in": 0,
"requests_completed": 175,
"ttft_ms_p50": 44.3,
"ttft_ms_p99": 61.2
},
{
"minute": 17.0,
"is_warmup": false,
"throughput_tokens_per_sec": 563.7,
"tokens_out": 33832,
"tokens_in": 0,
"requests_completed": 178,
"ttft_ms_p50": 44.4,
"ttft_ms_p99": 61.7
},
{
"minute": 18.0,
"is_warmup": false,
"throughput_tokens_per_sec": 563.3,
"tokens_out": 33783,
"tokens_in": 0,
"requests_completed": 180,
"ttft_ms_p50": 44.5,
"ttft_ms_p99": 62.1
},
{
"minute": 19.0,
"is_warmup": false,
"throughput_tokens_per_sec": 565.0,
"tokens_out": 33912,
"tokens_in": 0,
"requests_completed": 180,
"ttft_ms_p50": 44.6,
"ttft_ms_p99": 62.2
},
{
"minute": 20.0,
"is_warmup": false,
"throughput_tokens_per_sec": 563.3,
"tokens_out": 33773,
"tokens_in": 0,
"requests_completed": 179,
"ttft_ms_p50": 44.6,
"ttft_ms_p99": 61.6
},
{
"minute": 21.0,
"is_warmup": false,
"throughput_tokens_per_sec": 564.7,
"tokens_out": 33889,
"tokens_in": 0,
"requests_completed": 178,
"ttft_ms_p50": 44.3,
"ttft_ms_p99": 61.3
},
{
"minute": 22.0,
"is_warmup": false,
"throughput_tokens_per_sec": 564.3,
"tokens_out": 33853,
"tokens_in": 0,
"requests_completed": 179,
"ttft_ms_p50": 44.6,
"ttft_ms_p99": 61.1
},
{
"minute": 23.0,
"is_warmup": false,
"throughput_tokens_per_sec": 562.3,
"tokens_out": 33744,
"tokens_in": 0,
"requests_completed": 180,
"ttft_ms_p50": 44.5,
"ttft_ms_p99": 61.2
},
{
"minute": 24.0,
"is_warmup": false,
"throughput_tokens_per_sec": 569.5,
"tokens_out": 34180,
"tokens_in": 0,
"requests_completed": 180,
"ttft_ms_p50": 44.3,
"ttft_ms_p99": 61.3
},
{
"minute": 25.0,
"is_warmup": false,
"throughput_tokens_per_sec": 550.8,
"tokens_out": 33047,
"tokens_in": 0,
"requests_completed": 176,
"ttft_ms_p50": 44.5,
"ttft_ms_p99": 61.5
},
{
"minute": 26.0,
"is_warmup": false,
"throughput_tokens_per_sec": 562.5,
"tokens_out": 33749,
"tokens_in": 0,
"requests_completed": 178,
"ttft_ms_p50": 44.5,
"ttft_ms_p99": 63.9
},
{
"minute": 27.0,
"is_warmup": false,
"throughput_tokens_per_sec": 561.7,
"tokens_out": 33689,
"tokens_in": 0,
"requests_completed": 179,
"ttft_ms_p50": 44.7,
"ttft_ms_p99": 61.6
},
{
"minute": 28.0,
"is_warmup": false,
"throughput_tokens_per_sec": 566.5,
"tokens_out": 34010,
"tokens_in": 0,
"requests_completed": 181,
"ttft_ms_p50": 44.7,
"ttft_ms_p99": 141.7
},
{
"minute": 29.0,
"is_warmup": false,
"throughput_tokens_per_sec": 558.2,
"tokens_out": 33464,
"tokens_in": 0,
"requests_completed": 178,
"ttft_ms_p50": 44.3,
"ttft_ms_p99": 60.9
}
],
"sustained_throughput_tokens_per_sec": 562.5,
"throttle_ratio": 0.966,
"throttle_onset_minute": null,
"ttft_p99_drift_ms": -0.3
},
"speculative": {
"results_by_concurrency": [
{
"client_concurrency": 8,
"throughput_tokens_per_sec": 705.16,
"throughput_tokens_per_sec_per_chip": 705.16,
"elapsed_seconds_median": 49.7,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
},
{
"client_concurrency": 32,
"throughput_tokens_per_sec": 703.58,
"throughput_tokens_per_sec_per_chip": 703.58,
"elapsed_seconds_median": 49.8,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
},
{
"client_concurrency": 128,
"throughput_tokens_per_sec": 704.29,
"throughput_tokens_per_sec_per_chip": 704.29,
"elapsed_seconds_median": 49.7,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
}
]
},
"burst": {
"sla_ttft_ms": 500,
"burst_steady_qps": 5,
"burst_peak_qps": 25,
"burst_duration_seconds": 30,
"burst_interval_seconds": 120,
"steady_requests_total": 1812,
"burst_requests_total": 2245,
"steady_ttft_p50_ms": 43.06,
"steady_ttft_p99_ms": 3985.36,
"burst_ttft_p50_ms": 57.82,
"burst_ttft_p99_ms": 99.11,
"sla_met_during_burst": true,
"burst_degradation_ratio": 0.025,
"results_by_cycle": [
{
"cycle": 1,
"steady_requests": 581,
"burst_requests": 760,
"steady_ttft_p99_ms": 5093.98,
"burst_ttft_p99_ms": 103.16
},
{
"cycle": 2,
"steady_requests": 595,
"burst_requests": 734,
"steady_ttft_p99_ms": 63.56,
"burst_ttft_p99_ms": 90.45
},
{
"cycle": 3,
"steady_requests": 636,
"burst_requests": 751,
"steady_ttft_p99_ms": 65.44,
"burst_ttft_p99_ms": 85.05
}
]
}
},
"accuracy": {
"subset_score": 0.61,
"baseline_delta": 0.01,
"valid": true,
"framework": "SGLang",
"precision": "BF16",
"notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark."
},
"meta": {
"submitted_by": "Gong-K",
"submission_type": "individual",
"date": "2026-05-06",
"time": "11:21:34",
"run_id": "958afbbd",
"run_name": "nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd",
"flagged": null,
"reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py",
"env_info_file": "../env_info.json",
"log_file": "run.log",
"samples_file": "samples.jsonl",
"notes": null,
"benchmark_start_time": "2026-05-06T11:19:15.947406+00:00",
"benchmark_end_time": "2026-05-06T11:21:34.758403+00:00",
"benchmark_elapsed_minutes": 74.2,
"model_load_seconds": 50.0,
"benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive', 'sustained', 'speculative', 'burst'] scenarios.",
"scenario_dirs": {
"offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/offline",
"online": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/online",
"interactive": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/interactive",
"sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/sustained",
"speculative": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/speculative",
"burst": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/burst"
}
}
}
{ "schema_version": "1.0", "suite_id": "suite_A", "implementation_id": "nvidia_sglang_c43a8309", "chip": { "name": "NVIDIA A100-SXM4-40GB", "vendor": "NVIDIA", "count": 1, "memory_gb": 40.0, "interconnect_intra_node": null, "interconnect_inter_node": null }, "environment": { "collected_at": "2026-05-06T11:15:11.081772+00:00", "accelerators": [ { "index": 0, "name": "NVIDIA A100-SXM4-40GB", "vendor": "NVIDIA", "memory_gb": 40.0, "driver_version": "565.57.01", "firmware_version": null, "compute_capability": "8.0", "supports_bf16": true } ], "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tPXB\tNODE\tSYS\t0-31,64-95\t0\t\tN/A\nNIC0\tPXB\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tPXB\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tNODE\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n\n", "intra_node_interconnect": null, "cpu": { "model": "AMD EPYC 7532 32-Core Processor", "physical_cores": 64, "logical_cores": 128, "numa_nodes": 2 }, "system_memory_gb": 1007.7, "pcie_generation": "PCIe Gen 4", "cpu_accelerator_bandwidth_gbs": null, "network_interfaces": [ { "name": "mlx5_0", "type": "InfiniBand/RoCE", "bandwidth_gbps": null }, { "name": "mlx5_1", "type": "InfiniBand/RoCE", "bandwidth_gbps": null }, { "name": "mlx5_2", "type": "InfiniBand/RoCE", "bandwidth_gbps": null }, { "name": "mlx5_3", "type": "InfiniBand/RoCE", "bandwidth_gbps": null } ], "os": "Ubuntu 22.04.4 LTS", "python_version": "3.10.20", "kernel_version": "5.15.0-60-generic", "runtime_version": "CUDA 12.8", "pytorch_version": "2.9.1+cu128" }, "software": { "framework": "SGLang", "framework_version": "0.5.6", "driver_version": "565.57.01", "runtime_version": "CUDA 12.8", "os": "Ubuntu 22.04.4 LTS", "python_version": "3.10.20" }, "model": { "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", "model_name": null, "model_note": null, "model_source": "local", "architecture": "dense", "parameter_count_b": 8.0, "precision": "BF16", "effective_dtype": "bfloat16", "quantization_method": null, "model_format": "HuggingFace original" }, "task": { "scenarios_run": [ "offline", "online", "interactive", "sustained", "speculative", "burst" ], "parallelism": { "tensor_parallel_size": 1, "pipeline_parallel_size": 1, "expert_parallel_size": 1, "data_parallel_size": 1 }, "num_runs": 3, "extra_config": null }, "metrics": { "derived": {}, "offline": { "results_by_concurrency": [ { "client_concurrency": 8, "throughput_tokens_per_sec": 3144.73, "throughput_tokens_per_sec_per_chip": 3144.73, "elapsed_seconds_median": 11.2, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, "oom": false, "_throughput_note": "output_only", "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." }, { "client_concurrency": 32, "throughput_tokens_per_sec": 3146.66, "throughput_tokens_per_sec_per_chip": 3146.66, "elapsed_seconds_median": 11.2, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, "oom": false, "_throughput_note": "output_only", "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." }, { "client_concurrency": 128, "throughput_tokens_per_sec": 3146.09, "throughput_tokens_per_sec_per_chip": 3146.09, "elapsed_seconds_median": 11.2, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, "oom": false, "_throughput_note": "output_only", "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." } ] }, "online": { "sla_ttft_ms": 500, "max_valid_qps": 100, "results_by_qps": [ { "target_qps": 5, "achieved_qps": 5.0, "ttft_ms_p50": 43.91, "ttft_ms_p90": 62.26, "ttft_ms_p99": 972.47, "tpot_ms_p50": 15.63, "tpot_ms_p90": 17.36, "tpot_ms_p99": 18.58, "elapsed_seconds_median": 66.1, "sla_met": false }, { "target_qps": 25, "achieved_qps": 25.0, "ttft_ms_p50": 52.85, "ttft_ms_p90": 67.65, "ttft_ms_p99": 80.71, "tpot_ms_p50": 36.16, "tpot_ms_p90": 41.45, "tpot_ms_p99": 57.42, "elapsed_seconds_median": 17.1, "sla_met": true }, { "target_qps": 100, "achieved_qps": 100.0, "ttft_ms_p50": 50.85, "ttft_ms_p90": 62.88, "ttft_ms_p99": 245.1, "tpot_ms_p50": 41.47, "tpot_ms_p90": 53.07, "tpot_ms_p99": 177.42, "elapsed_seconds_median": 10.8, "sla_met": true } ] }, "interactive": { "ttft_ms_p50": 32.52, "ttft_ms_p90": 44.71, "ttft_ms_p99": 61.84, "tpot_ms_p50": 12.93, "tpot_ms_p90": 12.98, "tpot_ms_p99": 13.03, "peak_memory_gb": null, "elapsed_seconds_median": 381.4 }, "sustained": { "sustained_concurrency": 8, "duration_minutes": 30, "warmup_minutes": 2, "sample_interval_seconds": 60, "samples": [ { "minute": 1.0, "is_warmup": true, "throughput_tokens_per_sec": 477.0, "tokens_out": 28638, "tokens_in": 0, "requests_completed": 154, "ttft_ms_p50": 49.6, "ttft_ms_p99": 6750.3 }, { "minute": 2.0, "is_warmup": false, "throughput_tokens_per_sec": 566.2, "tokens_out": 33972, "tokens_in": 0, "requests_completed": 180, "ttft_ms_p50": 44.6, "ttft_ms_p99": 61.2 }, { "minute": 3.0, "is_warmup": false, "throughput_tokens_per_sec": 560.8, "tokens_out": 33639, "tokens_in": 0, "requests_completed": 177, "ttft_ms_p50": 44.5, "ttft_ms_p99": 61.3 }, { "minute": 4.0, "is_warmup": false, "throughput_tokens_per_sec": 565.3, "tokens_out": 33929, "tokens_in": 0, "requests_completed": 181, "ttft_ms_p50": 44.3, "ttft_ms_p99": 60.8 }, { "minute": 5.0, "is_warmup": false, "throughput_tokens_per_sec": 561.4, "tokens_out": 33685, "tokens_in": 0, "requests_completed": 179, "ttft_ms_p50": 44.2, "ttft_ms_p99": 61.6 }, { "minute": 6.0, "is_warmup": false, "throughput_tokens_per_sec": 561.9, "tokens_out": 33707, "tokens_in": 0, "requests_completed": 180, "ttft_ms_p50": 44.3, "ttft_ms_p99": 60.8 }, { "minute": 7.0, "is_warmup": false, "throughput_tokens_per_sec": 570.0, "tokens_out": 34190, "tokens_in": 0, "requests_completed": 179, "ttft_ms_p50": 44.3, "ttft_ms_p99": 61.4 }, { "minute": 8.0, "is_warmup": false, "throughput_tokens_per_sec": 558.3, "tokens_out": 33498, "tokens_in": 0, "requests_completed": 177, "ttft_ms_p50": 44.6, "ttft_ms_p99": 62.6 }, { "minute": 9.0, "is_warmup": false, "throughput_tokens_per_sec": 563.3, "tokens_out": 33801, "tokens_in": 0, "requests_completed": 180, "ttft_ms_p50": 44.3, "ttft_ms_p99": 61.8 }, { "minute": 10.0, "is_warmup": false, "throughput_tokens_per_sec": 552.7, "tokens_out": 33163, "tokens_in": 0, "requests_completed": 176, "ttft_ms_p50": 44.5, "ttft_ms_p99": 52.4 }, { "minute": 11.0, "is_warmup": false, "throughput_tokens_per_sec": 569.3, "tokens_out": 34157, "tokens_in": 0, "requests_completed": 181, "ttft_ms_p50": 44.1, "ttft_ms_p99": 60.4 }, { "minute": 12.0, "is_warmup": false, "throughput_tokens_per_sec": 558.9, "tokens_out": 33526, "tokens_in": 0, "requests_completed": 177, "ttft_ms_p50": 44.2, "ttft_ms_p99": 47.9 }, { "minute": 13.0, "is_warmup": false, "throughput_tokens_per_sec": 568.4, "tokens_out": 34113, "tokens_in": 0, "requests_completed": 180, "ttft_ms_p50": 44.5, "ttft_ms_p99": 139.2 }, { "minute": 14.0, "is_warmup": false, "throughput_tokens_per_sec": 557.2, "tokens_out": 33424, "tokens_in": 0, "requests_completed": 178, "ttft_ms_p50": 44.5, "ttft_ms_p99": 61.5 }, { "minute": 15.0, "is_warmup": false, "throughput_tokens_per_sec": 565.5, "tokens_out": 33942, "tokens_in": 0, "requests_completed": 181, "ttft_ms_p50": 44.3, "ttft_ms_p99": 61.7 }, { "minute": 16.0, "is_warmup": false, "throughput_tokens_per_sec": 554.1, "tokens_out": 33238, "tokens_in": 0, "requests_completed": 175, "ttft_ms_p50": 44.3, "ttft_ms_p99": 61.2 }, { "minute": 17.0, "is_warmup": false, "throughput_tokens_per_sec": 563.7, "tokens_out": 33832, "tokens_in": 0, "requests_completed": 178, "ttft_ms_p50": 44.4, "ttft_ms_p99": 61.7 }, { "minute": 18.0, "is_warmup": false, "throughput_tokens_per_sec": 563.3, "tokens_out": 33783, "tokens_in": 0, "requests_completed": 180, "ttft_ms_p50": 44.5, "ttft_ms_p99": 62.1 }, { "minute": 19.0, "is_warmup": false, "throughput_tokens_per_sec": 565.0, "tokens_out": 33912, "tokens_in": 0, "requests_completed": 180, "ttft_ms_p50": 44.6, "ttft_ms_p99": 62.2 }, { "minute": 20.0, "is_warmup": false, "throughput_tokens_per_sec": 563.3, "tokens_out": 33773, "tokens_in": 0, "requests_completed": 179, "ttft_ms_p50": 44.6, "ttft_ms_p99": 61.6 }, { "minute": 21.0, "is_warmup": false, "throughput_tokens_per_sec": 564.7, "tokens_out": 33889, "tokens_in": 0, "requests_completed": 178, "ttft_ms_p50": 44.3, "ttft_ms_p99": 61.3 }, { "minute": 22.0, "is_warmup": false, "throughput_tokens_per_sec": 564.3, "tokens_out": 33853, "tokens_in": 0, "requests_completed": 179, "ttft_ms_p50": 44.6, "ttft_ms_p99": 61.1 }, { "minute": 23.0, "is_warmup": false, "throughput_tokens_per_sec": 562.3, "tokens_out": 33744, "tokens_in": 0, "requests_completed": 180, "ttft_ms_p50": 44.5, "ttft_ms_p99": 61.2 }, { "minute": 24.0, "is_warmup": false, "throughput_tokens_per_sec": 569.5, "tokens_out": 34180, "tokens_in": 0, "requests_completed": 180, "ttft_ms_p50": 44.3, "ttft_ms_p99": 61.3 }, { "minute": 25.0, "is_warmup": false, "throughput_tokens_per_sec": 550.8, "tokens_out": 33047, "tokens_in": 0, "requests_completed": 176, "ttft_ms_p50": 44.5, "ttft_ms_p99": 61.5 }, { "minute": 26.0, "is_warmup": false, "throughput_tokens_per_sec": 562.5, "tokens_out": 33749, "tokens_in": 0, "requests_completed": 178, "ttft_ms_p50": 44.5, "ttft_ms_p99": 63.9 }, { "minute": 27.0, "is_warmup": false, "throughput_tokens_per_sec": 561.7, "tokens_out": 33689, "tokens_in": 0, "requests_completed": 179, "ttft_ms_p50": 44.7, "ttft_ms_p99": 61.6 }, { "minute": 28.0, "is_warmup": false, "throughput_tokens_per_sec": 566.5, "tokens_out": 34010, "tokens_in": 0, "requests_completed": 181, "ttft_ms_p50": 44.7, "ttft_ms_p99": 141.7 }, { "minute": 29.0, "is_warmup": false, "throughput_tokens_per_sec": 558.2, "tokens_out": 33464, "tokens_in": 0, "requests_completed": 178, "ttft_ms_p50": 44.3, "ttft_ms_p99": 60.9 } ], "sustained_throughput_tokens_per_sec": 562.5, "throttle_ratio": 0.966, "throttle_onset_minute": null, "ttft_p99_drift_ms": -0.3 }, "speculative": { "results_by_concurrency": [ { "client_concurrency": 8, "throughput_tokens_per_sec": 705.16, "throughput_tokens_per_sec_per_chip": 705.16, "elapsed_seconds_median": 49.7, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, "oom": false, "_throughput_note": "output_only", "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." }, { "client_concurrency": 32, "throughput_tokens_per_sec": 703.58, "throughput_tokens_per_sec_per_chip": 703.58, "elapsed_seconds_median": 49.8, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, "oom": false, "_throughput_note": "output_only", "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." }, { "client_concurrency": 128, "throughput_tokens_per_sec": 704.29, "throughput_tokens_per_sec_per_chip": 704.29, "elapsed_seconds_median": 49.7, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, "oom": false, "_throughput_note": "output_only", "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." } ] }, "burst": { "sla_ttft_ms": 500, "burst_steady_qps": 5, "burst_peak_qps": 25, "burst_duration_seconds": 30, "burst_interval_seconds": 120, "steady_requests_total": 1812, "burst_requests_total": 2245, "steady_ttft_p50_ms": 43.06, "steady_ttft_p99_ms": 3985.36, "burst_ttft_p50_ms": 57.82, "burst_ttft_p99_ms": 99.11, "sla_met_during_burst": true, "burst_degradation_ratio": 0.025, "results_by_cycle": [ { "cycle": 1, "steady_requests": 581, "burst_requests": 760, "steady_ttft_p99_ms": 5093.98, "burst_ttft_p99_ms": 103.16 }, { "cycle": 2, "steady_requests": 595, "burst_requests": 734, "steady_ttft_p99_ms": 63.56, "burst_ttft_p99_ms": 90.45 }, { "cycle": 3, "steady_requests": 636, "burst_requests": 751, "steady_ttft_p99_ms": 65.44, "burst_ttft_p99_ms": 85.05 } ] } }, "accuracy": { "subset_score": 0.61, "baseline_delta": 0.01, "valid": true, "framework": "SGLang", "precision": "BF16", "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark." }, "meta": { "submitted_by": "Gong-K", "submission_type": "individual", "date": "2026-05-06", "time": "11:21:34", "run_id": "958afbbd", "run_name": "nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd", "flagged": null, "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", "env_info_file": "../env_info.json", "log_file": "run.log", "samples_file": "samples.jsonl", "notes": null, "benchmark_start_time": "2026-05-06T11:19:15.947406+00:00", "benchmark_end_time": "2026-05-06T11:21:34.758403+00:00", "benchmark_elapsed_minutes": 74.2, "model_load_seconds": 50.0, "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive', 'sustained', 'speculative', 'burst'] scenarios.", "scenario_dirs": { "offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/offline", "online": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/online", "interactive": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/interactive", "sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/sustained", "speculative": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/speculative", "burst": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/burst" } } }