FreedomIntelligence · JuhaoLiang1997 · May 18, 2026 · May 15, 2026 · May 18, 2026 · May 18, 2026
diff --git a/README.md b/README.md
@@ -89,6 +89,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t
 |---|---|---|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
 | NVIDIA GPU | `nvidia_sglang_c43a8309` | SGLang | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
 | NVIDIA GPU | `nvidia_vllm_47f5d58e` | vLLM | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| NVIDIA V100 (SM70) | `nvidia_onecat_vllm_12a253c2` | 1Cat-vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | ⋯ |
 | AMD GPU | `amd_vllm_rocm_6c18cd8f` | vLLM-ROCm | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
 | Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — |
 | Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — |

diff --git a/configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml.example b/configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml.example
@@ -0,0 +1,19 @@
+# AccelMark runner config — nvidia_onecat_vllm_12a253c2 (1Cat-vLLM on V100)
+# Copy this file to runner_nvidia_onecat_vllm_12a253c2.yaml (gitignored) and edit it there. See the runner's README for details.
+
+tensor_parallel_size: 1
+enforce_eager: false
+max_num_seqs: 512
+gpu_memory_utilization: 0.90
+
+engine_kwargs:
+  enable_prefix_caching: false
+  enable_chunked_prefill: false
+  kv_cache_auto_trim_ratio: 0.0
+
+suites:
+  suite_D:
+    max_num_seqs: 1
+    gpu_memory_utilization: 0.85
+  suite_C:
+    max_num_seqs: 1
diff --git a/...esla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json b/...esla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json
@@ -0,0 +1,8 @@
+{
+  "subset_score": 0.61,
+  "baseline_delta": null,
+  "valid": true,
+  "framework": "1Cat-vLLM",
+  "precision": "FP16",
+  "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark."
+}
diff --git a/...mmunity/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json b/...mmunity/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json
@@ -0,0 +1,33 @@
+{
+  "collected_at": "2026-05-18T09:38:50.346241+00:00",
+  "accelerators": [
+    {
+      "index": 0,
+      "name": "Tesla V100-PCIE-32GB",
+      "vendor": "NVIDIA",
+      "memory_gb": 32.0,
+      "driver_version": "580.82.07",
+      "firmware_version": null,
+      "compute_capability": "7.0",
+      "supports_bf16": false
+    }
+  ],
+  "accelerator_platform": "nvidia",
+  "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+  "intra_node_interconnect": null,
+  "cpu": {
+    "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+    "physical_cores": 26,
+    "logical_cores": 52,
+    "numa_nodes": 2
+  },
+  "system_memory_gb": 214.5,
+  "pcie_generation": "PCIe Gen 3",
+  "cpu_accelerator_bandwidth_gbs": null,
+  "network_interfaces": null,
+  "os": "Ubuntu 22.04.5 LTS",
+  "python_version": "3.12.13",
+  "kernel_version": "5.4.0-149-generic",
+  "runtime_version": "CUDA 12.8",
+  "pytorch_version": "2.9.1+cu128"
+}
diff --git a/...y/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline/result.json b/...y/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline/result.json
@@ -0,0 +1,159 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_A",
+  "implementation_id": "nvidia_onecat_vllm_12a253c2",
+  "chip": {
+    "name": "Tesla V100-PCIE-32GB",
+    "vendor": "NVIDIA",
+    "count": 1,
+    "memory_gb": 32.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T09:38:50.346241+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "Tesla V100-PCIE-32GB",
+        "vendor": "NVIDIA",
+        "memory_gb": 32.0,
+        "driver_version": "580.82.07",
+        "firmware_version": null,
+        "compute_capability": "7.0",
+        "supports_bf16": false
+      }
+    ],
+    "accelerator_platform": "nvidia",
+    "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+      "physical_cores": 26,
+      "logical_cores": 52,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 214.5,
+    "pcie_generation": "PCIe Gen 3",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": null,
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13",
+    "kernel_version": "5.4.0-149-generic",
+    "runtime_version": "CUDA 12.8",
+    "pytorch_version": "2.9.1+cu128"
+  },
+  "software": {
+    "framework": "1Cat-vLLM",
+    "framework_version": "1.0.0+flash_attn_v100-1.0.0",
+    "driver_version": "580.82.07",
+    "runtime_version": "CUDA 12.8",
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13"
+  },
+  "model": {
+    "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 8.0,
+    "precision": "FP16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "offline",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": {
+      "tensor_parallel_size": 1,
+      "enforce_eager": false,
+      "max_num_seqs": 512,
+      "gpu_memory_utilization": 0.9,
+      "engine_kwargs": {
+        "enable_prefix_caching": false,
+        "enable_chunked_prefill": false,
+        "kv_cache_auto_trim_ratio": 0.0
+      }
+    },
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "offline": {
+      "results_by_concurrency": [
+        {
+          "client_concurrency": 8,
+          "throughput_tokens_per_sec": 671.32,
+          "throughput_tokens_per_sec_per_chip": 671.32,
+          "throughput_tokens_per_sec_total": 1168.67,
+          "elapsed_seconds_median": 51.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 32,
+          "throughput_tokens_per_sec": 670.99,
+          "throughput_tokens_per_sec_per_chip": 670.99,
+          "throughput_tokens_per_sec_total": 1168.09,
+          "elapsed_seconds_median": 51.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 128,
+          "throughput_tokens_per_sec": 671.43,
+          "throughput_tokens_per_sec_per_chip": 671.43,
+          "throughput_tokens_per_sec_total": 1168.44,
+          "elapsed_seconds_median": 51.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "18:03:39",
+    "run_id": "4e0e6eba",
+    "run_name": "tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba",
+    "flagged": null,
+    "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T09:53:19.928949+00:00",
+    "benchmark_end_time": "2026-05-18T10:03:39.512440+00:00",
+    "benchmark_elapsed_minutes": 10.3,
+    "model_load_seconds": 47.8
+  }
+}
diff --git a/...ty/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json b/...ty/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json
@@ -0,0 +1,158 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_A",
+  "implementation_id": "nvidia_onecat_vllm_12a253c2",
+  "chip": {
+    "name": "Tesla V100-PCIE-32GB",
+    "vendor": "NVIDIA",
+    "count": 1,
+    "memory_gb": 32.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T09:38:50.346241+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "Tesla V100-PCIE-32GB",
+        "vendor": "NVIDIA",
+        "memory_gb": 32.0,
+        "driver_version": "580.82.07",
+        "firmware_version": null,
+        "compute_capability": "7.0",
+        "supports_bf16": false
+      }
+    ],
+    "accelerator_platform": "nvidia",
+    "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+      "physical_cores": 26,
+      "logical_cores": 52,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 214.5,
+    "pcie_generation": "PCIe Gen 3",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": null,
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13",
+    "kernel_version": "5.4.0-149-generic",
+    "runtime_version": "CUDA 12.8",
+    "pytorch_version": "2.9.1+cu128"
+  },
+  "software": {
+    "framework": "1Cat-vLLM",
+    "framework_version": "1.0.0+flash_attn_v100-1.0.0",
+    "driver_version": "580.82.07",
+    "runtime_version": "CUDA 12.8",
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13"
+  },
+  "model": {
+    "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 8.0,
+    "precision": "FP16",
+    "effective_dtype": null,
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "online",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": {
+      "tensor_parallel_size": 1,
+      "enforce_eager": false,
+      "max_num_seqs": 512,
+      "gpu_memory_utilization": 0.9,
+      "engine_kwargs": {
+        "enable_prefix_caching": false,
+        "enable_chunked_prefill": false,
+        "kv_cache_auto_trim_ratio": 0.0
+      }
+    },
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "online": {
+      "sla_ttft_ms": 500,
+      "max_valid_qps": 0.0,
+      "results_by_qps": [
+        {
+          "target_qps": 5,
+          "achieved_qps": 5.0,
+          "ttft_ms_p50": 113119.0,
+          "ttft_ms_p90": 832380.28,
+          "ttft_ms_p99": 872316.46,
+          "tpot_ms_p50": 1274.2,
+          "tpot_ms_p90": 1801.34,
+          "tpot_ms_p99": 4289.09,
+          "elapsed_seconds_median": 968.7,
+          "sla_met": false
+        },
+        {
+          "target_qps": 25,
+          "achieved_qps": 25.0,
+          "ttft_ms_p50": 130646.03,
+          "ttft_ms_p90": 865522.04,
+          "ttft_ms_p99": 901339.26,
+          "tpot_ms_p50": 1262.15,
+          "tpot_ms_p90": 1785.02,
+          "tpot_ms_p99": 4287.18,
+          "elapsed_seconds_median": 936.5,
+          "sla_met": false
+        },
+        {
+          "target_qps": 100,
+          "achieved_qps": 100.0,
+          "ttft_ms_p50": 132710.0,
+          "ttft_ms_p90": 863880.66,
+          "ttft_ms_p99": 888527.06,
+          "tpot_ms_p50": 1248.86,
+          "tpot_ms_p90": 1740.58,
+          "tpot_ms_p99": 4225.34,
+          "elapsed_seconds_median": 921.5,
+          "sla_met": false
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "20:25:39",
+    "run_id": "4e0e6eba",
+    "run_name": "tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba",
+    "flagged": null,
+    "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T10:04:46.235502+00:00",
+    "benchmark_end_time": "2026-05-18T12:25:39.450279+00:00",
+    "benchmark_elapsed_minutes": 140.9,
+    "model_load_seconds": 45.2
+  }
+}