Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t
|---|---|---|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
| NVIDIA GPU | `nvidia_sglang_c43a8309` | SGLang | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
| NVIDIA GPU | `nvidia_vllm_47f5d58e` | vLLM | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
| NVIDIA V100 (SM70) | `nvidia_onecat_vllm_12a253c2` | 1Cat-vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | ⋯ |
| AMD GPU | `amd_vllm_rocm_6c18cd8f` | vLLM-ROCm | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
| Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — |
| Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — |
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# AccelMark runner config — nvidia_onecat_vllm_12a253c2 (1Cat-vLLM on V100)
# Copy to runner_nvidia_onecat_vllm_12a253c2.yaml (gitignored). See runner README.

tensor_parallel_size: 1
enforce_eager: false
max_num_seqs: 512
gpu_memory_utilization: 0.90

engine_kwargs:
  enable_prefix_caching: false
  enable_chunked_prefill: false
  kv_cache_auto_trim_ratio: 0.0

suites:
  suite_D:
    max_num_seqs: 1
    gpu_memory_utilization: 0.85
  suite_C:
    max_num_seqs: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"subset_score": 0.61,
"baseline_delta": null,
"valid": true,
"framework": "1Cat-vLLM",
"precision": "FP16",
"notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"collected_at": "2026-05-18T09:38:50.346241+00:00",
"accelerators": [
{
"index": 0,
"name": "Tesla V100-PCIE-32GB",
"vendor": "NVIDIA",
"memory_gb": 32.0,
"driver_version": "580.82.07",
"firmware_version": null,
"compute_capability": "7.0",
"supports_bf16": false
}
],
"accelerator_platform": "nvidia",
"accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n",
"intra_node_interconnect": null,
"cpu": {
"model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
"physical_cores": 26,
"logical_cores": 52,
"numa_nodes": 2
},
"system_memory_gb": 214.5,
"pcie_generation": "PCIe Gen 3",
"cpu_accelerator_bandwidth_gbs": null,
"network_interfaces": null,
"os": "Ubuntu 22.04.5 LTS",
"python_version": "3.12.13",
"kernel_version": "5.4.0-149-generic",
"runtime_version": "CUDA 12.8",
"pytorch_version": "2.9.1+cu128"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
{
"schema_version": "1.0",
"suite_id": "suite_A",
"implementation_id": "nvidia_onecat_vllm_12a253c2",
"chip": {
"name": "Tesla V100-PCIE-32GB",
"vendor": "NVIDIA",
"count": 1,
"memory_gb": 32.0,
"interconnect_intra_node": null,
"interconnect_inter_node": null
},
"environment": {
"collected_at": "2026-05-18T09:38:50.346241+00:00",
"accelerators": [
{
"index": 0,
"name": "Tesla V100-PCIE-32GB",
"vendor": "NVIDIA",
"memory_gb": 32.0,
"driver_version": "580.82.07",
"firmware_version": null,
"compute_capability": "7.0",
"supports_bf16": false
}
],
"accelerator_platform": "nvidia",
"accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n",
"intra_node_interconnect": null,
"cpu": {
"model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
"physical_cores": 26,
"logical_cores": 52,
"numa_nodes": 2
},
"system_memory_gb": 214.5,
"pcie_generation": "PCIe Gen 3",
"cpu_accelerator_bandwidth_gbs": null,
"network_interfaces": null,
"os": "Ubuntu 22.04.5 LTS",
"python_version": "3.12.13",
"kernel_version": "5.4.0-149-generic",
"runtime_version": "CUDA 12.8",
"pytorch_version": "2.9.1+cu128"
},
"software": {
"framework": "1Cat-vLLM",
"framework_version": "1.0.0+flash_attn_v100-1.0.0",
"driver_version": "580.82.07",
"runtime_version": "CUDA 12.8",
"os": "Ubuntu 22.04.5 LTS",
"python_version": "3.12.13"
},
"model": {
"model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
"model_name": null,
"model_note": null,
"model_source": "local",
"architecture": "dense",
"parameter_count_b": 8.0,
"precision": "FP16",
"effective_dtype": "float16",
"quantization_method": null,
"model_format": "HuggingFace original"
},
"task": {
"scenario": "offline",
"num_runs": 3,
"warmup_runs": 1,
"parallelism": {
"tensor_parallel_size": 1,
"pipeline_parallel_size": 1,
"expert_parallel_size": 1,
"data_parallel_size": 1
},
"extra_config": {
"tensor_parallel_size": 1,
"enforce_eager": false,
"max_num_seqs": 512,
"gpu_memory_utilization": 0.9,
"engine_kwargs": {
"enable_prefix_caching": false,
"enable_chunked_prefill": false,
"kv_cache_auto_trim_ratio": 0.0
}
},
"runtime_metrics": null
},
"metrics": {
"offline": {
"results_by_concurrency": [
{
"client_concurrency": 8,
"throughput_tokens_per_sec": 671.32,
"throughput_tokens_per_sec_per_chip": 671.32,
"throughput_tokens_per_sec_total": 1168.67,
"elapsed_seconds_median": 51.6,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
},
{
"client_concurrency": 32,
"throughput_tokens_per_sec": 670.99,
"throughput_tokens_per_sec_per_chip": 670.99,
"throughput_tokens_per_sec_total": 1168.09,
"elapsed_seconds_median": 51.6,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
},
{
"client_concurrency": 128,
"throughput_tokens_per_sec": 671.43,
"throughput_tokens_per_sec_per_chip": 671.43,
"throughput_tokens_per_sec_total": 1168.44,
"elapsed_seconds_median": 51.6,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
}
]
}
},
"accuracy": {
"subset_score": null,
"baseline_delta": null,
"valid": false,
"notes": "Run --scenario accuracy to check model accuracy."
},
"meta": {
"submitted_by": "JuhaoLiang1997",
"submission_type": "individual",
"date": "2026-05-18",
"time": "18:03:39",
"run_id": "4e0e6eba",
"run_name": "tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba",
"flagged": null,
"reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py",
"env_info_file": "../env_info.json",
"log_file": "run.log",
"samples_file": "samples.jsonl",
"notes": null,
"benchmark_start_time": "2026-05-18T09:53:19.928949+00:00",
"benchmark_end_time": "2026-05-18T10:03:39.512440+00:00",
"benchmark_elapsed_minutes": 10.3,
"model_load_seconds": 47.8
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
{
"schema_version": "1.0",
"suite_id": "suite_A",
"implementation_id": "nvidia_onecat_vllm_12a253c2",
"chip": {
"name": "Tesla V100-PCIE-32GB",
"vendor": "NVIDIA",
"count": 1,
"memory_gb": 32.0,
"interconnect_intra_node": null,
"interconnect_inter_node": null
},
"environment": {
"collected_at": "2026-05-18T09:38:50.346241+00:00",
"accelerators": [
{
"index": 0,
"name": "Tesla V100-PCIE-32GB",
"vendor": "NVIDIA",
"memory_gb": 32.0,
"driver_version": "580.82.07",
"firmware_version": null,
"compute_capability": "7.0",
"supports_bf16": false
}
],
"accelerator_platform": "nvidia",
"accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n",
"intra_node_interconnect": null,
"cpu": {
"model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
"physical_cores": 26,
"logical_cores": 52,
"numa_nodes": 2
},
"system_memory_gb": 214.5,
"pcie_generation": "PCIe Gen 3",
"cpu_accelerator_bandwidth_gbs": null,
"network_interfaces": null,
"os": "Ubuntu 22.04.5 LTS",
"python_version": "3.12.13",
"kernel_version": "5.4.0-149-generic",
"runtime_version": "CUDA 12.8",
"pytorch_version": "2.9.1+cu128"
},
"software": {
"framework": "1Cat-vLLM",
"framework_version": "1.0.0+flash_attn_v100-1.0.0",
"driver_version": "580.82.07",
"runtime_version": "CUDA 12.8",
"os": "Ubuntu 22.04.5 LTS",
"python_version": "3.12.13"
},
"model": {
"model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
"model_name": null,
"model_note": null,
"model_source": "local",
"architecture": "dense",
"parameter_count_b": 8.0,
"precision": "FP16",
"effective_dtype": null,
"quantization_method": null,
"model_format": "HuggingFace original"
},
"task": {
"scenario": "online",
"num_runs": 3,
"warmup_runs": 1,
"parallelism": {
"tensor_parallel_size": 1,
"pipeline_parallel_size": 1,
"expert_parallel_size": 1,
"data_parallel_size": 1
},
"extra_config": {
"tensor_parallel_size": 1,
"enforce_eager": false,
"max_num_seqs": 512,
"gpu_memory_utilization": 0.9,
"engine_kwargs": {
"enable_prefix_caching": false,
"enable_chunked_prefill": false,
"kv_cache_auto_trim_ratio": 0.0
}
},
"runtime_metrics": null
},
"metrics": {
"online": {
"sla_ttft_ms": 500,
"max_valid_qps": 0.0,
"results_by_qps": [
{
"target_qps": 5,
"achieved_qps": 5.0,
"ttft_ms_p50": 113119.0,
"ttft_ms_p90": 832380.28,
"ttft_ms_p99": 872316.46,
"tpot_ms_p50": 1274.2,
"tpot_ms_p90": 1801.34,
"tpot_ms_p99": 4289.09,
"elapsed_seconds_median": 968.7,
"sla_met": false
},
{
"target_qps": 25,
"achieved_qps": 25.0,
"ttft_ms_p50": 130646.03,
"ttft_ms_p90": 865522.04,
"ttft_ms_p99": 901339.26,
"tpot_ms_p50": 1262.15,
"tpot_ms_p90": 1785.02,
"tpot_ms_p99": 4287.18,
"elapsed_seconds_median": 936.5,
"sla_met": false
},
{
"target_qps": 100,
"achieved_qps": 100.0,
"ttft_ms_p50": 132710.0,
"ttft_ms_p90": 863880.66,
"ttft_ms_p99": 888527.06,
"tpot_ms_p50": 1248.86,
"tpot_ms_p90": 1740.58,
"tpot_ms_p99": 4225.34,
"elapsed_seconds_median": 921.5,
"sla_met": false
}
]
}
},
"accuracy": {
"subset_score": null,
"baseline_delta": null,
"valid": false,
"notes": "Run --scenario accuracy to check model accuracy."
},
"meta": {
"submitted_by": "JuhaoLiang1997",
"submission_type": "individual",
"date": "2026-05-18",
"time": "20:25:39",
"run_id": "4e0e6eba",
"run_name": "tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba",
"flagged": null,
"reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py",
"env_info_file": "../env_info.json",
"log_file": "run.log",
"samples_file": "samples.jsonl",
"notes": null,
"benchmark_start_time": "2026-05-18T10:04:46.235502+00:00",
"benchmark_end_time": "2026-05-18T12:25:39.450279+00:00",
"benchmark_elapsed_minutes": 140.9,
"model_load_seconds": 45.2
}
}
Loading
Loading