diff --git a/README.md b/README.md index 3007966..922c479 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t |---|---|---|:-:|:-:|:-:|:-:|:-:|:-:|:-:| | NVIDIA GPU | `nvidia_sglang_c43a8309` | SGLang | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | NVIDIA GPU | `nvidia_vllm_47f5d58e` | vLLM | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +| NVIDIA V100 (SM70) | `nvidia_onecat_vllm_12a253c2` | 1Cat-vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | ⋯ | | AMD GPU | `amd_vllm_rocm_6c18cd8f` | vLLM-ROCm | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — | | Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — | diff --git a/configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml.example b/configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml.example new file mode 100644 index 0000000..6644d79 --- /dev/null +++ b/configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml.example @@ -0,0 +1,19 @@ +# AccelMark runner config — nvidia_onecat_vllm_12a253c2 (1Cat-vLLM on V100) +# Copy to runner_nvidia_onecat_vllm_12a253c2.yaml (gitignored). See runner README. 
+ +tensor_parallel_size: 1 +enforce_eager: false +max_num_seqs: 512 +gpu_memory_utilization: 0.90 + +engine_kwargs: + enable_prefix_caching: false + enable_chunked_prefill: false + kv_cache_auto_trim_ratio: 0.0 + +suites: + suite_D: + max_num_seqs: 1 + gpu_memory_utilization: 0.85 + suite_C: + max_num_seqs: 1 diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json new file mode 100644 index 0000000..304c3db --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.61, + "baseline_delta": null, + "valid": true, + "framework": "1Cat-vLLM", + "precision": "FP16", + "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark." +} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json new file mode 100644 index 0000000..52c2fdc --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json @@ -0,0 +1,33 @@ +{ + "collected_at": "2026-05-18T09:38:50.346241+00:00", + "accelerators": [ + { + "index": 0, + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "memory_gb": 32.0, + "driver_version": "580.82.07", + "firmware_version": null, + "compute_capability": "7.0", + "supports_bf16": false + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect 
between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", + "physical_cores": 26, + "logical_cores": 52, + "numa_nodes": 2 + }, + "system_memory_gb": 214.5, + "pcie_generation": "PCIe Gen 3", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "5.4.0-149-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline/result.json new file mode 100644 index 0000000..2e6fc7f --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline/result.json @@ -0,0 +1,159 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_onecat_vllm_12a253c2", + "chip": { + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 32.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T09:38:50.346241+00:00", + "accelerators": [ + { + "index": 0, + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "memory_gb": 32.0, + "driver_version": "580.82.07", + "firmware_version": null, + "compute_capability": "7.0", + "supports_bf16": false + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X 
\t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", + "physical_cores": 26, + "logical_cores": 52, + "numa_nodes": 2 + }, + "system_memory_gb": 214.5, + "pcie_generation": "PCIe Gen 3", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "5.4.0-149-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "1Cat-vLLM", + "framework_version": "1.0.0+flash_attn_v100-1.0.0", + "driver_version": "580.82.07", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "FP16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9, + 
"engine_kwargs": { + "enable_prefix_caching": false, + "enable_chunked_prefill": false, + "kv_cache_auto_trim_ratio": 0.0 + } + }, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 671.32, + "throughput_tokens_per_sec_per_chip": 671.32, + "throughput_tokens_per_sec_total": 1168.67, + "elapsed_seconds_median": 51.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 670.99, + "throughput_tokens_per_sec_per_chip": 670.99, + "throughput_tokens_per_sec_total": 1168.09, + "elapsed_seconds_median": 51.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 671.43, + "throughput_tokens_per_sec_per_chip": 671.43, + "throughput_tokens_per_sec_total": 1168.44, + "elapsed_seconds_median": 51.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "18:03:39", + "run_id": "4e0e6eba", + "run_name": "tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba", + "flagged": null, + "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T09:53:19.928949+00:00", + "benchmark_end_time": "2026-05-18T10:03:39.512440+00:00", + "benchmark_elapsed_minutes": 10.3, + "model_load_seconds": 47.8 + } +} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json new file mode 100644 index 0000000..66aeb48 --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json @@ -0,0 +1,158 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_onecat_vllm_12a253c2", + "chip": { + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 32.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T09:38:50.346241+00:00", + "accelerators": [ + { + "index": 0, + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "memory_gb": 32.0, + "driver_version": "580.82.07", + "firmware_version": null, + "compute_capability": "7.0", + "supports_bf16": false + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the 
interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", + "physical_cores": 26, + "logical_cores": 52, + "numa_nodes": 2 + }, + "system_memory_gb": 214.5, + "pcie_generation": "PCIe Gen 3", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "5.4.0-149-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "1Cat-vLLM", + "framework_version": "1.0.0+flash_attn_v100-1.0.0", + "driver_version": "580.82.07", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "FP16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9, + "engine_kwargs": { + "enable_prefix_caching": false, + "enable_chunked_prefill": false, + "kv_cache_auto_trim_ratio": 0.0 + } + }, + "runtime_metrics": null + }, + "metrics": { + "online": { + 
"sla_ttft_ms": 500, + "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 113119.0, + "ttft_ms_p90": 832380.28, + "ttft_ms_p99": 872316.46, + "tpot_ms_p50": 1274.2, + "tpot_ms_p90": 1801.34, + "tpot_ms_p99": 4289.09, + "elapsed_seconds_median": 968.7, + "sla_met": false + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 130646.03, + "ttft_ms_p90": 865522.04, + "ttft_ms_p99": 901339.26, + "tpot_ms_p50": 1262.15, + "tpot_ms_p90": 1785.02, + "tpot_ms_p99": 4287.18, + "elapsed_seconds_median": 936.5, + "sla_met": false + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 132710.0, + "ttft_ms_p90": 863880.66, + "ttft_ms_p99": 888527.06, + "tpot_ms_p50": 1248.86, + "tpot_ms_p90": 1740.58, + "tpot_ms_p99": 4225.34, + "elapsed_seconds_median": 921.5, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "20:25:39", + "run_id": "4e0e6eba", + "run_name": "tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba", + "flagged": null, + "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T10:04:46.235502+00:00", + "benchmark_end_time": "2026-05-18T12:25:39.450279+00:00", + "benchmark_elapsed_minutes": 140.9, + "model_load_seconds": 45.2 + } +} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/result.json new file mode 100644 index 0000000..07930da --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/result.json @@ -0,0 +1,210 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_onecat_vllm_12a253c2", + "chip": { + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 32.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T09:38:50.346241+00:00", + "accelerators": [ + { + "index": 0, + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "memory_gb": 32.0, + "driver_version": "580.82.07", + "firmware_version": null, + "compute_capability": "7.0", + "supports_bf16": false + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect 
between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", + "physical_cores": 26, + "logical_cores": 52, + "numa_nodes": 2 + }, + "system_memory_gb": 214.5, + "pcie_generation": "PCIe Gen 3", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "5.4.0-149-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "1Cat-vLLM", + "framework_version": "1.0.0+flash_attn_v100-1.0.0", + "driver_version": "580.82.07", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "FP16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9, + "engine_kwargs": { + "enable_prefix_caching": false, + "enable_chunked_prefill": false, + "kv_cache_auto_trim_ratio": 0.0 + } + } + }, + "metrics": { + "derived": {}, + "offline": { + 
"results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 671.32, + "throughput_tokens_per_sec_per_chip": 671.32, + "throughput_tokens_per_sec_total": 1168.67, + "elapsed_seconds_median": 51.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 670.99, + "throughput_tokens_per_sec_per_chip": 670.99, + "throughput_tokens_per_sec_total": 1168.09, + "elapsed_seconds_median": 51.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 671.43, + "throughput_tokens_per_sec_per_chip": 671.43, + "throughput_tokens_per_sec_total": 1168.44, + "elapsed_seconds_median": 51.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 113119.0, + "ttft_ms_p90": 832380.28, + "ttft_ms_p99": 872316.46, + "tpot_ms_p50": 1274.2, + "tpot_ms_p90": 1801.34, + "tpot_ms_p99": 4289.09, + "elapsed_seconds_median": 968.7, + "sla_met": false + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 130646.03, + "ttft_ms_p90": 865522.04, + "ttft_ms_p99": 901339.26, + "tpot_ms_p50": 1262.15, + "tpot_ms_p90": 1785.02, + "tpot_ms_p99": 4287.18, + "elapsed_seconds_median": 936.5, + "sla_met": false + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 132710.0, + "ttft_ms_p90": 863880.66, + "ttft_ms_p99": 888527.06, + "tpot_ms_p50": 1248.86, + "tpot_ms_p90": 1740.58, + "tpot_ms_p99": 4225.34, + "elapsed_seconds_median": 921.5, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": 0.61, + "baseline_delta": null, + "valid": true, + "framework": "1Cat-vLLM", + "precision": "FP16", + "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "18:03:39", + "run_id": "4e0e6eba", + "run_name": "tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba", + "flagged": null, + "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T09:53:19.928949+00:00", + "benchmark_end_time": "2026-05-18T10:03:39.512440+00:00", + "benchmark_elapsed_minutes": 151.2, + "model_load_seconds": 47.8, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online'] scenarios.", + "scenario_dirs": { + "offline": "results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline", + "online": "results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online" + } + } +} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/accuracy/accuracy.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/accuracy/accuracy.json new file mode 100644 index 0000000..94e5547 --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.37, + "baseline_delta": 0.0, + "valid": true, + "framework": "1Cat-vLLM", + "precision": "FP16", + "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark." 
+} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/env_info.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/env_info.json new file mode 100644 index 0000000..1f8b6bd --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/env_info.json @@ -0,0 +1,33 @@ +{ + "collected_at": "2026-05-18T12:26:03.593928+00:00", + "accelerators": [ + { + "index": 0, + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "memory_gb": 32.0, + "driver_version": "580.82.07", + "firmware_version": null, + "compute_capability": "7.0", + "supports_bf16": false + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", + "physical_cores": 26, + "logical_cores": 52, + "numa_nodes": 2 + }, + "system_memory_gb": 214.5, + "pcie_generation": "PCIe Gen 3", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "5.4.0-149-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git 
a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive/result.json new file mode 100644 index 0000000..f017bc2 --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive/result.json @@ -0,0 +1,126 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_onecat_vllm_12a253c2", + "chip": { + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 32.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T12:26:03.593928+00:00", + "accelerators": [ + { + "index": 0, + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "memory_gb": 32.0, + "driver_version": "580.82.07", + "firmware_version": null, + "compute_capability": "7.0", + "supports_bf16": false + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", + "physical_cores": 26, + "logical_cores": 52, + "numa_nodes": 2 + }, + "system_memory_gb": 214.5, + "pcie_generation": "PCIe Gen 3", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + 
"os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "5.4.0-149-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "1Cat-vLLM", + "framework_version": "1.0.0+flash_attn_v100-1.0.0", + "driver_version": "580.82.07", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "FP16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9, + "engine_kwargs": { + "enable_prefix_caching": false, + "enable_chunked_prefill": false, + "kv_cache_auto_trim_ratio": 0.0 + } + }, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 26.76, + "ttft_ms_p90": 29.57, + "ttft_ms_p99": 40.69, + "tpot_ms_p50": 3.51, + "tpot_ms_p90": 3.76, + "tpot_ms_p99": 3.81, + "peak_memory_gb": null, + "elapsed_seconds_median": 116.9 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "20:45:36", + "run_id": "419b138c", + "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c", + "flagged": null, + "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T12:39:46.224469+00:00", + "benchmark_end_time": "2026-05-18T12:45:36.498231+00:00", + "benchmark_elapsed_minutes": 5.8, + "model_load_seconds": 27.8 + } +} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline/result.json new file mode 100644 index 0000000..da8126b --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline/result.json @@ -0,0 +1,159 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_onecat_vllm_12a253c2", + "chip": { + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 32.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T12:26:03.593928+00:00", + "accelerators": [ + { + "index": 0, + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "memory_gb": 32.0, + "driver_version": "580.82.07", + "firmware_version": null, + "compute_capability": "7.0", + "supports_bf16": false + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as 
the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", + "physical_cores": 26, + "logical_cores": 52, + "numa_nodes": 2 + }, + "system_memory_gb": 214.5, + "pcie_generation": "PCIe Gen 3", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "5.4.0-149-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "1Cat-vLLM", + "framework_version": "1.0.0+flash_attn_v100-1.0.0", + "driver_version": "580.82.07", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "FP16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9, + "engine_kwargs": { + "enable_prefix_caching": false, + "enable_chunked_prefill": false, + "kv_cache_auto_trim_ratio": 0.0 + } + }, + "runtime_metrics": null + }, + "metrics": { + "offline": { + 
"results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 6234.82, + "throughput_tokens_per_sec_per_chip": 6234.82, + "throughput_tokens_per_sec_total": 9303.11, + "elapsed_seconds_median": 6.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 6292.79, + "throughput_tokens_per_sec_per_chip": 6292.79, + "throughput_tokens_per_sec_total": 9356.18, + "elapsed_seconds_median": 6.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 6243.51, + "throughput_tokens_per_sec_per_chip": 6243.51, + "throughput_tokens_per_sec_total": 9267.55, + "elapsed_seconds_median": 6.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "20:28:55", + "run_id": "419b138c", + "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c", + "flagged": null, + "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T12:27:34.502139+00:00", + "benchmark_end_time": "2026-05-18T12:28:55.745031+00:00", + "benchmark_elapsed_minutes": 1.4, + "model_load_seconds": 31.7 + } +} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online/result.json new file mode 100644 index 0000000..170f9d0 --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online/result.json @@ -0,0 +1,146 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_onecat_vllm_12a253c2", + "chip": { + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 32.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T12:26:03.593928+00:00", + "accelerators": [ + { + "index": 0, + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "memory_gb": 32.0, + "driver_version": "580.82.07", + "firmware_version": null, + "compute_capability": "7.0", + "supports_bf16": false + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the 
interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", + "physical_cores": 26, + "logical_cores": 52, + "numa_nodes": 2 + }, + "system_memory_gb": 214.5, + "pcie_generation": "PCIe Gen 3", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "5.4.0-149-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "1Cat-vLLM", + "framework_version": "1.0.0+flash_attn_v100-1.0.0", + "driver_version": "580.82.07", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "FP16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9, + "engine_kwargs": { + "enable_prefix_caching": false, + "enable_chunked_prefill": false, + "kv_cache_auto_trim_ratio": 0.0 + } + }, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, 
+ "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 6316.13, + "ttft_ms_p90": 53409.43, + "ttft_ms_p99": 67932.56, + "tpot_ms_p50": 206.23, + "tpot_ms_p90": 291.3, + "tpot_ms_p99": 636.32, + "elapsed_seconds_median": 103.3, + "sla_met": false + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 19238.78, + "ttft_ms_p90": 56898.27, + "ttft_ms_p99": 75398.9, + "tpot_ms_p50": 189.24, + "tpot_ms_p90": 300.17, + "tpot_ms_p99": 582.22, + "elapsed_seconds_median": 86.3, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "20:38:56", + "run_id": "419b138c", + "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c", + "flagged": null, + "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T12:29:46.673625+00:00", + "benchmark_end_time": "2026-05-18T12:38:56.798553+00:00", + "benchmark_elapsed_minutes": 9.2, + "model_load_seconds": 28.7 + } +} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/result.json new file mode 100644 index 0000000..12baab4 --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/result.json @@ -0,0 +1,210 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_onecat_vllm_12a253c2", + "chip": { + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 32.0, 
+ "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T12:26:03.593928+00:00", + "accelerators": [ + { + "index": 0, + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "memory_gb": 32.0, + "driver_version": "580.82.07", + "firmware_version": null, + "compute_capability": "7.0", + "supports_bf16": false + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", + "physical_cores": 26, + "logical_cores": 52, + "numa_nodes": 2 + }, + "system_memory_gb": 214.5, + "pcie_generation": "PCIe Gen 3", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "5.4.0-149-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "1Cat-vLLM", + "framework_version": "1.0.0+flash_attn_v100-1.0.0", + "driver_version": "580.82.07", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + 
"parameter_count_b": 0.5, + "precision": "FP16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "interactive" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9, + "engine_kwargs": { + "enable_prefix_caching": false, + "enable_chunked_prefill": false, + "kv_cache_auto_trim_ratio": 0.0 + } + } + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 6234.82, + "throughput_tokens_per_sec_per_chip": 6234.82, + "throughput_tokens_per_sec_total": 9303.11, + "elapsed_seconds_median": 6.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 6292.79, + "throughput_tokens_per_sec_per_chip": 6292.79, + "throughput_tokens_per_sec_total": 9356.18, + "elapsed_seconds_median": 6.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 6243.51, + "throughput_tokens_per_sec_per_chip": 6243.51, + "throughput_tokens_per_sec_total": 9267.55, + "elapsed_seconds_median": 6.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 6316.13, + "ttft_ms_p90": 53409.43, + "ttft_ms_p99": 67932.56, + "tpot_ms_p50": 206.23, + "tpot_ms_p90": 291.3, + "tpot_ms_p99": 636.32, + "elapsed_seconds_median": 103.3, + "sla_met": false + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 19238.78, + "ttft_ms_p90": 56898.27, + "ttft_ms_p99": 75398.9, + "tpot_ms_p50": 189.24, + "tpot_ms_p90": 300.17, + "tpot_ms_p99": 582.22, + "elapsed_seconds_median": 86.3, + "sla_met": false + } + ] + }, + "interactive": { + "ttft_ms_p50": 26.76, + "ttft_ms_p90": 29.57, + "ttft_ms_p99": 40.69, + "tpot_ms_p50": 3.51, + "tpot_ms_p90": 3.76, + "tpot_ms_p99": 3.81, + "peak_memory_gb": null, + "elapsed_seconds_median": 116.9 + } + }, + "accuracy": { + "subset_score": 0.37, + "baseline_delta": 0.0, + "valid": true, + "framework": "1Cat-vLLM", + "precision": "FP16", + "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "20:28:55", + "run_id": "419b138c", + "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c", + "flagged": null, + "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T12:27:34.502139+00:00", + "benchmark_end_time": "2026-05-18T12:28:55.745031+00:00", + "benchmark_elapsed_minutes": 16.4, + "model_load_seconds": 31.7, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive'] scenarios.", + "scenario_dirs": { + "offline": "results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline", + "online": "results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online", + "interactive": "results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive" + } + } +} \ No newline at end of file diff --git a/runners/nvidia_onecat_vllm_12a253c2/README.md b/runners/nvidia_onecat_vllm_12a253c2/README.md new file mode 100644 index 0000000..0556214 --- /dev/null +++ b/runners/nvidia_onecat_vllm_12a253c2/README.md @@ -0,0 +1,227 @@ +# nvidia_onecat_vllm_12a253c2 — 1Cat-vLLM Runner (Tesla V100 / SM70) + +AccelMark runner for **Tesla V100 / V100S only**, using +[1Cat-vLLM](https://github.com/1CatAI/1Cat-vLLM) (community vLLM fork for Volta). + +> **Hardware:** Use this runner only on V100 / V100S (SM70). On Ampere or newer, +> use upstream `nvidia_vllm_*`. + +> **Third-party software:** 1Cat-vLLM is maintained by [1CatAI](https://github.com/1CatAI/1Cat-vLLM) +> under its own license. AccelMark ships only the thin `runner.py` wrapper; install +> 1Cat-vLLM separately as described below. 
+ +## Why 1Cat-vLLM + +| Limitation on stock vLLM + V100 | 1Cat-vLLM | +|--------------------------------|-----------| +| AWQ kernels need SM75+ | SM70 AWQ via lmdeploy TurboMind | +| FlashAttention 2/3 need Ampere+ | `FLASH_ATTN_V100` backend | +| Qwen3.5 / Qwen3.6 on V100 | Fork model/runtime fixes | +| Long-context on Volta | SM70 paged-attention path | + +Release notes: [1Cat-vLLM v1.0.0](https://github.com/1CatAI/1Cat-vLLM/releases/tag/v1.0.0). + +## Runner defaults (code) + +| Setting | Default | +|---------|---------| +| `attention_backend` | `FLASH_ATTN_V100` (auto unless overridden) | +| `SUPPORTED_PRECISIONS` | `fp16`, `fp32` (no BF16 on V100) | +| `SUPPORTED_QUANTIZATION_BACKENDS` | `awq` only | +| `max_num_seqs` | `512` global default (same as upstream vLLM); use `1` for suite D / long-context | +| `gpu_memory_utilization` | `0.90` | + +## Supported suites + +| Suite | Notes | +|-------|-------| +| A | Runs on 1× V100; upstream `nvidia_vllm_*` + `--enforce-eager` is often enough | +| B | **Primary** — use `--tensor-parallel-size 4` on 4× V100 32GB | +| C | **Primary** — AWQ | +| D | **Primary** — long context + `FLASH_ATTN_V100` | +| E | Multi-chip scaling (same TP guidance as B) | +| F | Not recommended (edge model; use upstream runner) | +| G | **Primary** — MoE + AWQ (Qwen3.5/3.6 class models) | + +--- + +## Environment setup + +### Reference stack (1Cat-vLLM 1.0.0) + +| Component | Version | +|-----------|---------| +| GPU | Tesla V100 / V100S (SM70) | +| Python | **3.12** (`cp312` wheels only) | +| CUDA toolkit | **12.8** | +| Driver | 570.x recommended (CUDA 12.8) | +| PyTorch | **2.9.1+cu128** (from 1Cat wheels or build env) | + +### Path A — Prebuilt wheels (Ubuntu 24.04+, glibc ≥ 2.38) + +Official wheels require **glibc 2.38+** (e.g. Ubuntu 24.04). On Ubuntu 22.04, +`pip install` may succeed but `import vllm` fails with `GLIBC_2.38 not found` +— use Path B instead. 
+ +```bash +conda create -y -n onecat-vllm python=3.12 +conda activate onecat-vllm +python -m pip install --upgrade pip setuptools wheel + +# Install BOTH wheels together — never `pip install vllm` from PyPI +python -m pip install --prefer-binary --no-cache-dir \ + --extra-index-url https://download.pytorch.org/whl/cu128 \ + "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/flash_attn_v100-1.0.0-cp312-cp312-linux_x86_64.whl" \ + "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/vllm-1.0.0-cp312-cp312-linux_x86_64.whl" + +cd /path/to/AccelMark +pip install -r runners/nvidia_onecat_vllm_12a253c2/requirements.txt +``` + +### Path B — Build from source (Ubuntu 22.04 / glibc 2.35) + +Build on the **host glibc** so binaries link against 2.35. Typical AutoDL / +Ubuntu 22.04 V100 boxes use this path. + +**Prerequisites:** CUDA 12.8 toolkit (`nvcc` on PATH), conda Python 3.12, ~20GB +free disk for build tree + wheels. + +```bash +conda create -y -n onecat-vllm python=3.12 +conda activate onecat-vllm +export CUDA_HOME=/usr/local/cuda-12.8 +export PATH="$CUDA_HOME/bin:$PATH" +export TORCH_CUDA_ARCH_LIST="7.0" +export MAX_JOBS=6 +export PIP_CACHE_DIR=/path/to/fast/disk/pip-cache # optional + +git clone --depth 1 --branch v1.0.0 https://github.com/1CatAI/1Cat-vLLM.git +cd 1Cat-vLLM +pip install -r requirements/build.txt -r requirements/cuda.txt -r requirements/common.txt +pip install cmake build ninja + +DIST=/path/to/dist-cu128-sm70-v1.0.0 +mkdir -p "$DIST" + +# 1) flash_attn_v100 wheel +pushd flash-attention-v100 +python -m build --wheel --no-isolation --outdir "$DIST" +popd + +# 2) vllm wheel (30–90 min on V100 host) +export VLLM_TARGET_DEVICE=cuda +python -m build --wheel --no-isolation --outdir "$DIST" + +# 3) Install — run from /tmp so Python does not import the source tree +pip install "$DIST"/flash_attn_v100-*.whl +cd /tmp && pip install --no-deps --force-reinstall "$DIST"/vllm-*.whl + +cd /path/to/AccelMark +pip install -r 
runners/nvidia_onecat_vllm_12a253c2/requirements.txt +``` + +Do **not** run AccelMark from inside the cloned `1Cat-vLLM/` directory; Python +may import the local `vllm/` package instead of the installed wheel. + +### Smoke test + +Run from `/tmp` or the AccelMark repo root (not inside `1Cat-vLLM/`): + +```bash +python - <<'PY' +import torch, vllm +print("torch:", torch.__version__, "vllm:", vllm.__version__) +import flash_attn_v100_cuda +print("flash_attn_v100: ok") +from vllm import LLM +print("LLM import: ok") +PY +``` + +--- + +## AccelMark runner config (required on V100) + +Copy and edit: + +```bash +cp configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml.example \ + configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml +``` + +**Single V100 32GB** — recommended `engine_kwargs` (avoids SM70 +`Shared memory exceeds 96KB` in `prefill_paged_fwd`): + +```yaml +tensor_parallel_size: 1 +max_num_seqs: 512 +gpu_memory_utilization: 0.90 +engine_kwargs: + enable_prefix_caching: false + enable_chunked_prefill: false + kv_cache_auto_trim_ratio: 0.0 + +suites: + suite_D: + max_num_seqs: 1 + gpu_memory_utilization: 0.85 +``` + +If it still crashes, export before `python run.py`: + +```bash +export VLLM_FLASH_V100_DISABLE_PAGED_PREFILL=1 +``` + +That forces the slower paged-KV gather fallback instead of `prefill_paged_fwd`. + +**4× V100 32GB** — set `tensor_parallel_size: 4`; keep the same `engine_kwargs` +unless you are deliberately testing 1Cat's MTP / prefix-cache profile (see +example file comments). + +Other tuning: + +| Symptom | Try | +|---------|-----| +| `Shared memory exceeds 96KB` | `enable_chunked_prefill: false` + `enable_prefix_caching: false` (above); then `export VLLM_FLASH_V100_DISABLE_PAGED_PREFILL=1` | +| First request hangs (CUDA graph) | `enforce_eager: true` or `--enforce-eager` | +| OOM at engine init | Lower `gpu_memory_utilization` (e.g. 
`0.85`) | +| `GLIBC_2.38 not found` | Path B source build, or Ubuntu 24.04+ | + +--- + +## Basic usage + +```bash +cp configs/submitter.yaml.example configs/submitter.yaml # once +cp configs/models_local.yaml.example configs/models_local.yaml # map local model paths + +export PYTHONPATH=/path/to/AccelMark # if pip install -e . is unavailable + +# Suite A smoke (1× V100) +python run.py --runner nvidia_onecat_vllm_12a253c2 \ + --suite suite_A --scenario accuracy --tensor-parallel-size 1 + +# Suite B (4× V100) +python run.py --runner nvidia_onecat_vllm_12a253c2 \ + --suite suite_B --tensor-parallel-size 4 +``` + +--- + +## Known limitations + +- Prefix caching and **chunked prefill** (even with prefix caching off) can hit the + `prefill_paged_fwd` kernel (>96KB shared memory on SM70). Disable both in config; + use `VLLM_FLASH_V100_DISABLE_PAGED_PREFILL=1` if needed (see above). +- `max_num_seqs: 1` (the per-suite override used for suites C and D in the example config; + the global default stays `512`) limits batch throughput vs upstream vLLM defaults — intentional + for 1Cat's long-context V100 profile. +- Suite F is marked unsupported in `meta.json` (use upstream runner on V100 if needed). +- End-to-end validation on 4× V100 reference hardware is still community-pending in + `meta.json`; single-GPU smoke (Suite A accuracy) has been exercised on V100 32GB. + +## Requirements + +See `requirements.txt`. Install `torch`, `flash_attn_v100`, and the `vllm` fork +from 1Cat-vLLM **before** the AccelMark extras file. Do not install upstream +`vllm` from PyPI after the fork. 
diff --git a/runners/nvidia_onecat_vllm_12a253c2/meta.json b/runners/nvidia_onecat_vllm_12a253c2/meta.json new file mode 100644 index 0000000..394601f --- /dev/null +++ b/runners/nvidia_onecat_vllm_12a253c2/meta.json @@ -0,0 +1,21 @@ +{ + "id": "nvidia_onecat_vllm_12a253c2", + "platform": "nvidia", + "name": "1Cat-vLLM (V100 / SM70 fork) on NVIDIA", + "framework": "1Cat-vLLM", + "submitted_by": "JuhaoLiang1997", + "description": "AccelMark runner for Tesla V100 (SM70) using 1Cat-vLLM 1.0.0 — community vLLM fork with FLASH_ATTN_V100 and SM70 AWQ kernels. Use nvidia_vllm_* on Ampere or newer.", + "supersedes_chain": ["nvidia_onecat_vllm_4a9ca6c3", "nvidia_onecat_vllm_a43d1bcf"], + "notes": "Auto-injects attention_backend=FLASH_ATTN_V100 unless overridden. V100: disable prefix caching and chunked prefill in runner config (see README). External dependency: https://github.com/1CatAI/1Cat-vLLM", + "created": "2026-05-15", + "hardware_label": "NVIDIA V100 (SM70)", + "suite_support": { + "A": "pending", + "B": "pending", + "C": "pending", + "D": "pending", + "E": "pending", + "F": "unsupported", + "G": "pending" + } +} diff --git a/runners/nvidia_onecat_vllm_12a253c2/requirements.txt b/runners/nvidia_onecat_vllm_12a253c2/requirements.txt new file mode 100644 index 0000000..b6d4c62 --- /dev/null +++ b/runners/nvidia_onecat_vllm_12a253c2/requirements.txt @@ -0,0 +1,17 @@ +# AccelMark extras for nvidia_onecat_vllm_12a253c2. +# Install 1Cat-vLLM (flash_attn_v100 + vllm fork) first — see README.md. 
class OneCatVLLMRunner(BenchmarkRunner):
    """1Cat-vLLM on NVIDIA V100 / V100S (SM70). Use nvidia_vllm_* on newer GPUs.

    Drives the 1Cat-vLLM fork through either the synchronous ``LLM`` API or
    ``AsyncLLMEngine``; which one is built is decided by the ``use_async``
    entry of the parallelism dict passed to :meth:`load_model`.
    """

    # Capability flags. Only SUPPORTS_MULTI_CHIP is read inside this class
    # (in parse_args); the others are presumably consumed by BenchmarkRunner
    # or the harness -- verify there.
    SUPPORTS_STREAMING = True
    SUPPORTS_BATCHING = True
    SUPPORTS_ONLINE = True
    SUPPORTS_MULTI_CHIP = True

    # V100 has no BF16 support, so only FP16/FP32 are advertised.
    SUPPORTED_PRECISIONS = ["fp16", "fp32"]
    SUPPORTED_QUANTIZATION_BACKENDS = ["awq"]

    def __init__(self):
        """Create an empty runner; engine objects are built in load_model()."""
        self.llm: Optional[LLM] = None
        self.engine: Optional[AsyncLLMEngine] = None
        self.tokenizer: Optional[AutoTokenizer] = None
        self.sampling_params: Optional[SamplingParams] = None
        self._loop: Optional[asyncio.AbstractEventLoop] = None

    def _get_chip_count(self) -> int:
        """Return the CUDA device count, or 1 when it is zero or unavailable."""
        try:
            n = torch.cuda.device_count()
        except Exception:
            return 1
        return n if n > 0 else 1

    def _get_framework_name(self) -> str:
        """Return the framework label recorded for this runner."""
        return "1Cat-vLLM"

    def _get_framework_version(self) -> str:
        """Return ``<vllm version>[+flash_attn_v100-<version>]``.

        The vllm part is ``"unknown"`` when vllm cannot be imported. The
        flash_attn_v100 suffix is taken from package metadata; when metadata
        is missing but ``flash_attn_v100_cuda`` imports, ``"installed"`` is
        used; when neither works the suffix is omitted.
        """
        core = "unknown"
        try:
            import vllm
            core = vllm.__version__
        except Exception:
            pass

        fa_v100 = None
        try:
            from importlib.metadata import version as _pkg_version
            fa_v100 = _pkg_version("flash_attn_v100")
        except Exception:
            try:
                import flash_attn_v100_cuda  # type: ignore  # noqa: F401
                fa_v100 = "installed"
            except Exception:
                fa_v100 = None

        if fa_v100:
            return f"{core}+flash_attn_v100-{fa_v100}"
        return core

    def load_model(self, model_path: str, parallelism: dict) -> None:
        """Build the tokenizer, sampling params and the sync or async engine.

        Args:
            model_path: Model location handed to the tokenizer and to vLLM.
            parallelism: Dict that must contain ``tensor_parallel_size``,
                ``pipeline_parallel_size``, ``max_tokens``, ``max_model_len``
                and ``use_async``; ``expert_parallel_size`` is optional
                (default 1).

        Raises:
            AssertionError: If ``pipeline_parallel_size`` is greater than 1.
        """
        tp_size = parallelism["tensor_parallel_size"]
        pp_size = parallelism["pipeline_parallel_size"]
        ep_size = parallelism.get("expert_parallel_size", 1)
        assert pp_size <= 1, "Pipeline parallelism is not supported in OneCatVLLMRunner"

        max_tokens = parallelism["max_tokens"]
        max_model_len = parallelism["max_model_len"]
        use_async = parallelism["use_async"]
        enforce_eager = getattr(self, "_enforce_eager", False)

        cfg = getattr(self, "_runner_config", {})
        max_num_seqs = cfg.get("max_num_seqs", 512)
        gpu_memory_util = cfg.get("gpu_memory_utilization", 0.90)
        extra_kwargs = dict(cfg.get("engine_kwargs") or {})

        import os
        # Inject the V100 attention backend unless the user already chose one
        # through engine_kwargs or the VLLM_ATTENTION_BACKEND env variable.
        if (
            "attention_backend" not in extra_kwargs
            and "VLLM_ATTENTION_BACKEND" not in os.environ
        ):
            extra_kwargs["attention_backend"] = "FLASH_ATTN_V100"

        try:
            import dataclasses
            from vllm.engine.arg_utils import EngineArgs as _EngineArgs
            valid = {f.name for f in dataclasses.fields(_EngineArgs)}
            dropped = [k for k in extra_kwargs if k not in valid]
            if dropped:
                print(f" Warning: engine_kwargs keys not supported by this "
                      f"1Cat-vLLM version and will be ignored: {dropped}")
            extra_kwargs = {k: v for k, v in extra_kwargs.items() if k in valid}
        except Exception:
            # Best effort: if EngineArgs cannot be introspected, pass the
            # kwargs through unfiltered and let vLLM validate them.
            pass

        # Fall back to FP16 (not BF16): this runner targets SM70 only and
        # SUPPORTED_PRECISIONS excludes BF16. Upper-case so lower-case
        # spellings such as "fp16" still hit the dtype map below.
        effective_precision = str(
            getattr(self, "_effective_precision", None) or "FP16"
        ).upper()
        precision = str(
            getattr(self, "_precision", None) or effective_precision
        ).upper()

        dtype_override = getattr(self, "_precision_dtype_override", None)
        prec_eng_kwargs = dict(getattr(self, "_precision_engine_kwargs", None) or {})

        # "quantization" is passed as its own argument, not via extra kwargs.
        quantization = prec_eng_kwargs.pop("quantization", None)

        _NATIVE_DTYPE_MAP = {
            "BF16": "bfloat16",
            "FP16": "float16",
            "FP32": "float32",
        }
        dtype = _NATIVE_DTYPE_MAP.get(precision, "auto")
        self._quantization_method = quantization

        if dtype_override:
            dtype = dtype_override

        if prec_eng_kwargs:
            # Precision-derived kwargs form the base; runner-config
            # engine_kwargs win on conflict.
            prec_eng_kwargs.update(extra_kwargs)
            extra_kwargs = prec_eng_kwargs

        print(f"Loading model: precision={precision}, dtype={dtype}"
              + (f", quantization_method={self._quantization_method}"
                 if self._quantization_method else ""))

        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=False
        )

        self.sampling_params = SamplingParams(
            max_tokens=max_tokens,
            temperature=0.0,
        )

        # One kwargs dict for both engine flavours so offline and online runs
        # use the same max_num_seqs / quantization settings.
        engine_kwargs = dict(
            model=model_path,
            dtype=dtype,
            tensor_parallel_size=tp_size,
            trust_remote_code=False,
            enforce_eager=enforce_eager,
            max_num_seqs=max_num_seqs,
            gpu_memory_utilization=gpu_memory_util,
        )
        # update() instead of ** so a key present in both places overrides the
        # default rather than raising "multiple values for keyword argument".
        engine_kwargs.update(extra_kwargs)
        if ep_size > 1:
            engine_kwargs["enable_expert_parallel"] = True
        if quantization:
            engine_kwargs["quantization"] = quantization
        if max_model_len:
            engine_kwargs["max_model_len"] = max_model_len

        if not use_async:
            self.llm = LLM(**engine_kwargs)
        else:
            self._loop = asyncio.new_event_loop()
            asyncio.set_event_loop(self._loop)
            engine_args = AsyncEngineArgs(**engine_kwargs)
            self.engine = AsyncLLMEngine.from_engine_args(engine_args)

    def get_effective_dtype(self) -> Optional[str]:
        """Return the loaded engine's model dtype without the ``torch.`` prefix.

        Falls back to ``self._effective_dtype`` (or ``None``) when no engine is
        loaded or the engine's model config cannot be read.
        """
        try:
            if self.llm is not None:
                dtype = self.llm.llm_engine.model_config.dtype
                return str(dtype).replace("torch.", "")
            elif self.engine is not None:
                dtype = self.engine.engine.model_config.dtype
                return str(dtype).replace("torch.", "")
        except Exception:
            pass
        return getattr(self, "_effective_dtype", None)

    def inference_fn_offline(self, requests: list[InferenceRequest]) -> list[InferenceResult]:
        """Generate all requests in one ``LLM.generate`` call.

        Every result carries the wall time of the whole batch as
        ``total_time_ms`` and ``first_token_time_ms=None``. The generated
        texts are also stored on ``self._last_accuracy_outputs``.
        """
        formatted = [self._format_prompt(r.prompt) for r in requests]
        t_start = time.perf_counter()
        outputs = self.llm.generate(formatted, self.sampling_params)
        elapsed = time.perf_counter() - t_start

        self._last_accuracy_outputs = [o.outputs[0].text for o in outputs]

        batch_ms = elapsed * 1000
        return [
            InferenceResult(
                first_token_time_ms=None,
                total_time_ms=batch_ms,
                output_tokens=len(output.outputs[0].token_ids),
                input_tokens=len(output.prompt_token_ids),
                success=True,
                output_text=output.outputs[0].text,
            )
            for output in outputs
        ]

    async def inference_fn_streaming(self, request: InferenceRequest) -> InferenceResult:
        """Stream one request through the async engine and time it.

        ``first_token_time_ms`` is the delay until the first streamed output
        that contains at least one token; ``total_time_ms`` covers the whole
        stream.
        """
        from vllm.utils import random_uuid

        formatted = self._format_prompt(request.prompt)
        request_id = random_uuid()
        t_start = time.perf_counter()
        first_token_time_ms = None
        output_tokens = 0
        input_tokens = 0
        output_text = ""

        async for output in self.engine.generate(
            formatted, self.sampling_params, request_id
        ):
            completion = output.outputs[0]
            if first_token_time_ms is None and len(completion.token_ids) > 0:
                first_token_time_ms = (time.perf_counter() - t_start) * 1000
            output_tokens = len(completion.token_ids)
            output_text = completion.text
            # Report the real prompt length (as the offline path does)
            # whenever the streamed output exposes it.
            prompt_ids = getattr(output, "prompt_token_ids", None)
            if prompt_ids:
                input_tokens = len(prompt_ids)

        total_time_ms = (time.perf_counter() - t_start) * 1000
        return InferenceResult(
            first_token_time_ms=first_token_time_ms,
            total_time_ms=total_time_ms,
            output_tokens=output_tokens,
            input_tokens=input_tokens,
            success=True,
            output_text=output_text,
        )

    async def inference_fn_token_stream(self, request: InferenceRequest):
        """Yield newly generated text for one request, piece by piece.

        Each streamed output carries the cumulative text, so only the suffix
        beyond what was already yielded is emitted; empty deltas are skipped.
        """
        from vllm.utils import random_uuid

        formatted = self._format_prompt(request.prompt)
        request_id = random_uuid()
        prev_length = 0

        async for output in self.engine.generate(
            formatted, self.sampling_params, request_id
        ):
            current_text = output.outputs[0].text
            delta = current_text[prev_length:]
            if delta:
                yield delta
            prev_length = len(current_text)

    def get_peak_memory_gb(self) -> Optional[float]:
        """Return ``torch.cuda.max_memory_allocated()`` in GiB, or None on error."""
        try:
            return torch.cuda.max_memory_allocated() / (1024 ** 3)
        except Exception:
            return None

    def release_resources(self) -> None:
        """Drop engine references and tear down distributed state (best effort).

        Every step swallows its own errors so one failing cleanup does not
        prevent the following ones from running.
        """
        if self.llm is not None:
            self.llm = None

        if self.engine is not None:
            try:
                if self._loop and not self._loop.is_closed():
                    pending = self.engine.shutdown()
                    # shutdown() may or may not be a coroutine function;
                    # only drive the loop when there is something to await.
                    if asyncio.iscoroutine(pending):
                        self._loop.run_until_complete(pending)
            except Exception:
                pass
            self.engine = None

        try:
            from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
            cleanup_dist_env_and_memory(shutdown_ray=False)
        except Exception:
            # Fall back to the two separate teardown helpers when the
            # combined one is missing or fails.
            try:
                from vllm.distributed.parallel_state import (
                    destroy_model_parallel, destroy_distributed_environment,
                )
                destroy_model_parallel()
                destroy_distributed_environment()
            except Exception:
                pass

        try:
            if torch.distributed.is_initialized():
                torch.distributed.destroy_process_group()
        except Exception:
            pass

    def parse_args(self):
        """Parse base args plus this runner's parallelism / eager flags.

        Sets ``self._enforce_eager``, ``self._parallelism``,
        ``self._chip_count`` and ``self._precision``, then returns the
        namespace produced by the base class.
        """
        args = super().parse_args()
        cfg = self._runner_config

        import argparse
        parser = argparse.ArgumentParser(add_help=False)
        parser.add_argument("--tensor-parallel-size", type=int, default=None,
                            dest="tensor_parallel_size")
        parser.add_argument("--pipeline-parallel-size", type=int, default=None,
                            dest="pipeline_parallel_size")
        parser.add_argument("--expert-parallel-size", type=int, default=None,
                            dest="expert_parallel_size")
        parser.add_argument("--enforce-eager", action="store_true", default=False,
                            dest="enforce_eager")
        # parse_known_args: ignore options not declared on this parser.
        extra, _ = parser.parse_known_args()

        tp_size, _tp_source = self._resolve_tensor_parallel_size(
            extra.tensor_parallel_size
        )

        # CLI value wins; otherwise the runner config; otherwise 1.
        if extra.pipeline_parallel_size is not None:
            pp_size = extra.pipeline_parallel_size
        else:
            pp_size = cfg.get("pipeline_parallel_size", 1)
        if extra.expert_parallel_size is not None:
            ep_size = extra.expert_parallel_size
        else:
            ep_size = cfg.get("expert_parallel_size", 1)
        self._enforce_eager = extra.enforce_eager or cfg.get("enforce_eager", False)

        print(f" tensor_parallel_size = {tp_size} [{_tp_source}]")
        if ep_size > 1:
            print(f" expert_parallel_size = {ep_size} [cli/yaml]")

        if not self.SUPPORTS_MULTI_CHIP and tp_size * pp_size > 1:
            print(f"Warning: {self.__class__.__name__} does not support multi-chip. "
                  f"Ignoring tensor_parallel_size={tp_size}, using 1.")
            tp_size = 1
            pp_size = 1
            ep_size = 1

        self._parallelism = {
            "tensor_parallel_size": tp_size,
            "pipeline_parallel_size": pp_size,
            "expert_parallel_size": ep_size,
            "data_parallel_size": 1,
        }
        self._chip_count = tp_size * pp_size
        self._precision = getattr(args, "precision", None)
        return args

    def get_extra_subprocess_args(self, args) -> list[str]:
        """Return CLI flags that reproduce the resolved parallelism settings.

        The tensor-parallel size is always emitted; pipeline / expert sizes
        only when greater than 1; ``--enforce-eager`` only when enabled.
        """
        extra = [
            "--tensor-parallel-size",
            str(self._parallelism.get("tensor_parallel_size", 1)),
        ]
        if self._parallelism.get("pipeline_parallel_size", 1) > 1:
            extra += ["--pipeline-parallel-size",
                      str(self._parallelism["pipeline_parallel_size"])]
        if self._parallelism.get("expert_parallel_size", 1) > 1:
            extra += ["--expert-parallel-size",
                      str(self._parallelism["expert_parallel_size"])]
        if self._enforce_eager:
            extra += ["--enforce-eager"]
        return extra