diff --git a/README.md b/README.md
index 3007966..922c479 100644
--- a/README.md
+++ b/README.md
@@ -89,6 +89,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t
 |---|---|---|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
 | NVIDIA GPU | `nvidia_sglang_c43a8309` | SGLang | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
 | NVIDIA GPU | `nvidia_vllm_47f5d58e` | vLLM | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| NVIDIA V100 (SM70) | `nvidia_onecat_vllm_12a253c2` | 1Cat-vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | ⋯ |
 | AMD GPU | `amd_vllm_rocm_6c18cd8f` | vLLM-ROCm | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
 | Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — |
 | Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — |
diff --git a/configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml.example b/configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml.example
new file mode 100644
index 0000000..6644d79
--- /dev/null
+++ b/configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml.example
@@ -0,0 +1,19 @@
+# AccelMark runner config — nvidia_onecat_vllm_12a253c2 (1Cat-vLLM on V100)
+# Copy to runner_nvidia_onecat_vllm_12a253c2.yaml (gitignored). See runner README.
+
+tensor_parallel_size: 1
+enforce_eager: false
+max_num_seqs: 512
+gpu_memory_utilization: 0.90
+
+engine_kwargs:
+  enable_prefix_caching: false
+  enable_chunked_prefill: false
+  kv_cache_auto_trim_ratio: 0.0
+
+suites:
+  suite_D:
+    max_num_seqs: 1
+    gpu_memory_utilization: 0.85
+  suite_C:
+    max_num_seqs: 1
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json
new file mode 100644
index 0000000..304c3db
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json
@@ -0,0 +1,8 @@
+{
+  "subset_score": 0.61,
+  "baseline_delta": null,
+  "valid": true,
+  "framework": "1Cat-vLLM",
+  "precision": "FP16",
+  "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark."
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json
new file mode 100644
index 0000000..52c2fdc
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json
@@ -0,0 +1,33 @@
+{
+  "collected_at": "2026-05-18T09:38:50.346241+00:00",
+  "accelerators": [
+    {
+      "index": 0,
+      "name": "Tesla V100-PCIE-32GB",
+      "vendor": "NVIDIA",
+      "memory_gb": 32.0,
+      "driver_version": "580.82.07",
+      "firmware_version": null,
+      "compute_capability": "7.0",
+      "supports_bf16": false
+    }
+  ],
+  "accelerator_platform": "nvidia",
+  "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+  "intra_node_interconnect": null,
+  "cpu": {
+    "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+    "physical_cores": 26,
+    "logical_cores": 52,
+    "numa_nodes": 2
+  },
+  "system_memory_gb": 214.5,
+  "pcie_generation": "PCIe Gen 3",
+  "cpu_accelerator_bandwidth_gbs": null,
+  "network_interfaces": null,
+  "os": "Ubuntu 22.04.5 LTS",
+  "python_version": "3.12.13",
+  "kernel_version": "5.4.0-149-generic",
+  "runtime_version": "CUDA 12.8",
+  "pytorch_version": "2.9.1+cu128"
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline/result.json
new file mode 100644
index 0000000..2e6fc7f
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline/result.json
@@ -0,0 +1,159 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_A",
+  "implementation_id": "nvidia_onecat_vllm_12a253c2",
+  "chip": {
+    "name": "Tesla V100-PCIE-32GB",
+    "vendor": "NVIDIA",
+    "count": 1,
+    "memory_gb": 32.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T09:38:50.346241+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "Tesla V100-PCIE-32GB",
+        "vendor": "NVIDIA",
+        "memory_gb": 32.0,
+        "driver_version": "580.82.07",
+        "firmware_version": null,
+        "compute_capability": "7.0",
+        "supports_bf16": false
+      }
+    ],
+    "accelerator_platform": "nvidia",
+    "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+      "physical_cores": 26,
+      "logical_cores": 52,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 214.5,
+    "pcie_generation": "PCIe Gen 3",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": null,
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13",
+    "kernel_version": "5.4.0-149-generic",
+    "runtime_version": "CUDA 12.8",
+    "pytorch_version": "2.9.1+cu128"
+  },
+  "software": {
+    "framework": "1Cat-vLLM",
+    "framework_version": "1.0.0+flash_attn_v100-1.0.0",
+    "driver_version": "580.82.07",
+    "runtime_version": "CUDA 12.8",
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13"
+  },
+  "model": {
+    "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 8.0,
+    "precision": "FP16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "offline",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": {
+      "tensor_parallel_size": 1,
+      "enforce_eager": false,
+      "max_num_seqs": 512,
+      "gpu_memory_utilization": 0.9,
+      "engine_kwargs": {
+        "enable_prefix_caching": false,
+        "enable_chunked_prefill": false,
+        "kv_cache_auto_trim_ratio": 0.0
+      }
+    },
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "offline": {
+      "results_by_concurrency": [
+        {
+          "client_concurrency": 8,
+          "throughput_tokens_per_sec": 671.32,
+          "throughput_tokens_per_sec_per_chip": 671.32,
+          "throughput_tokens_per_sec_total": 1168.67,
+          "elapsed_seconds_median": 51.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 32,
+          "throughput_tokens_per_sec": 670.99,
+          "throughput_tokens_per_sec_per_chip": 670.99,
+          "throughput_tokens_per_sec_total": 1168.09,
+          "elapsed_seconds_median": 51.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 128,
+          "throughput_tokens_per_sec": 671.43,
+          "throughput_tokens_per_sec_per_chip": 671.43,
+          "throughput_tokens_per_sec_total": 1168.44,
+          "elapsed_seconds_median": 51.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "18:03:39",
+    "run_id": "4e0e6eba",
+    "run_name": "tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba",
+    "flagged": null,
+    "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T09:53:19.928949+00:00",
+    "benchmark_end_time": "2026-05-18T10:03:39.512440+00:00",
+    "benchmark_elapsed_minutes": 10.3,
+    "model_load_seconds": 47.8
+  }
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json
new file mode 100644
index 0000000..66aeb48
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json
@@ -0,0 +1,158 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_A",
+  "implementation_id": "nvidia_onecat_vllm_12a253c2",
+  "chip": {
+    "name": "Tesla V100-PCIE-32GB",
+    "vendor": "NVIDIA",
+    "count": 1,
+    "memory_gb": 32.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T09:38:50.346241+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "Tesla V100-PCIE-32GB",
+        "vendor": "NVIDIA",
+        "memory_gb": 32.0,
+        "driver_version": "580.82.07",
+        "firmware_version": null,
+        "compute_capability": "7.0",
+        "supports_bf16": false
+      }
+    ],
+    "accelerator_platform": "nvidia",
+    "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+      "physical_cores": 26,
+      "logical_cores": 52,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 214.5,
+    "pcie_generation": "PCIe Gen 3",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": null,
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13",
+    "kernel_version": "5.4.0-149-generic",
+    "runtime_version": "CUDA 12.8",
+    "pytorch_version": "2.9.1+cu128"
+  },
+  "software": {
+    "framework": "1Cat-vLLM",
+    "framework_version": "1.0.0+flash_attn_v100-1.0.0",
+    "driver_version": "580.82.07",
+    "runtime_version": "CUDA 12.8",
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13"
+  },
+  "model": {
+    "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 8.0,
+    "precision": "FP16",
+    "effective_dtype": null,
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "online",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": {
+      "tensor_parallel_size": 1,
+      "enforce_eager": false,
+      "max_num_seqs": 512,
+      "gpu_memory_utilization": 0.9,
+      "engine_kwargs": {
+        "enable_prefix_caching": false,
+        "enable_chunked_prefill": false,
+        "kv_cache_auto_trim_ratio": 0.0
+      }
+    },
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "online": {
+      "sla_ttft_ms": 500,
+      "max_valid_qps": 0.0,
+      "results_by_qps": [
+        {
+          "target_qps": 5,
+          "achieved_qps": 5.0,
+          "ttft_ms_p50": 113119.0,
+          "ttft_ms_p90": 832380.28,
+          "ttft_ms_p99": 872316.46,
+          "tpot_ms_p50": 1274.2,
+          "tpot_ms_p90": 1801.34,
+          "tpot_ms_p99": 4289.09,
+          "elapsed_seconds_median": 968.7,
+          "sla_met": false
+        },
+        {
+          "target_qps": 25,
+          "achieved_qps": 25.0,
+          "ttft_ms_p50": 130646.03,
+          "ttft_ms_p90": 865522.04,
+          "ttft_ms_p99": 901339.26,
+          "tpot_ms_p50": 1262.15,
+          "tpot_ms_p90": 1785.02,
+          "tpot_ms_p99": 4287.18,
+          "elapsed_seconds_median": 936.5,
+          "sla_met": false
+        },
+        {
+          "target_qps": 100,
+          "achieved_qps": 100.0,
+          "ttft_ms_p50": 132710.0,
+          "ttft_ms_p90": 863880.66,
+          "ttft_ms_p99": 888527.06,
+          "tpot_ms_p50": 1248.86,
+          "tpot_ms_p90": 1740.58,
+          "tpot_ms_p99": 4225.34,
+          "elapsed_seconds_median": 921.5,
+          "sla_met": false
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "20:25:39",
+    "run_id": "4e0e6eba",
+    "run_name": "tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba",
+    "flagged": null,
+    "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T10:04:46.235502+00:00",
+    "benchmark_end_time": "2026-05-18T12:25:39.450279+00:00",
+    "benchmark_elapsed_minutes": 140.9,
+    "model_load_seconds": 45.2
+  }
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/result.json
new file mode 100644
index 0000000..07930da
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/result.json
@@ -0,0 +1,210 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_A",
+  "implementation_id": "nvidia_onecat_vllm_12a253c2",
+  "chip": {
+    "name": "Tesla V100-PCIE-32GB",
+    "vendor": "NVIDIA",
+    "count": 1,
+    "memory_gb": 32.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T09:38:50.346241+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "Tesla V100-PCIE-32GB",
+        "vendor": "NVIDIA",
+        "memory_gb": 32.0,
+        "driver_version": "580.82.07",
+        "firmware_version": null,
+        "compute_capability": "7.0",
+        "supports_bf16": false
+      }
+    ],
+    "accelerator_platform": "nvidia",
+    "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+      "physical_cores": 26,
+      "logical_cores": 52,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 214.5,
+    "pcie_generation": "PCIe Gen 3",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": null,
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13",
+    "kernel_version": "5.4.0-149-generic",
+    "runtime_version": "CUDA 12.8",
+    "pytorch_version": "2.9.1+cu128"
+  },
+  "software": {
+    "framework": "1Cat-vLLM",
+    "framework_version": "1.0.0+flash_attn_v100-1.0.0",
+    "driver_version": "580.82.07",
+    "runtime_version": "CUDA 12.8",
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13"
+  },
+  "model": {
+    "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 8.0,
+    "precision": "FP16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenarios_run": [
+      "offline",
+      "online"
+    ],
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "num_runs": 3,
+    "extra_config": {
+      "tensor_parallel_size": 1,
+      "enforce_eager": false,
+      "max_num_seqs": 512,
+      "gpu_memory_utilization": 0.9,
+      "engine_kwargs": {
+        "enable_prefix_caching": false,
+        "enable_chunked_prefill": false,
+        "kv_cache_auto_trim_ratio": 0.0
+      }
+    }
+  },
+  "metrics": {
+    "derived": {},
+    "offline": {
+      "results_by_concurrency": [
+        {
+          "client_concurrency": 8,
+          "throughput_tokens_per_sec": 671.32,
+          "throughput_tokens_per_sec_per_chip": 671.32,
+          "throughput_tokens_per_sec_total": 1168.67,
+          "elapsed_seconds_median": 51.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 32,
+          "throughput_tokens_per_sec": 670.99,
+          "throughput_tokens_per_sec_per_chip": 670.99,
+          "throughput_tokens_per_sec_total": 1168.09,
+          "elapsed_seconds_median": 51.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 128,
+          "throughput_tokens_per_sec": 671.43,
+          "throughput_tokens_per_sec_per_chip": 671.43,
+          "throughput_tokens_per_sec_total": 1168.44,
+          "elapsed_seconds_median": 51.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        }
+      ]
+    },
+    "online": {
+      "sla_ttft_ms": 500,
+      "max_valid_qps": 0.0,
+      "results_by_qps": [
+        {
+          "target_qps": 5,
+          "achieved_qps": 5.0,
+          "ttft_ms_p50": 113119.0,
+          "ttft_ms_p90": 832380.28,
+          "ttft_ms_p99": 872316.46,
+          "tpot_ms_p50": 1274.2,
+          "tpot_ms_p90": 1801.34,
+          "tpot_ms_p99": 4289.09,
+          "elapsed_seconds_median": 968.7,
+          "sla_met": false
+        },
+        {
+          "target_qps": 25,
+          "achieved_qps": 25.0,
+          "ttft_ms_p50": 130646.03,
+          "ttft_ms_p90": 865522.04,
+          "ttft_ms_p99": 901339.26,
+          "tpot_ms_p50": 1262.15,
+          "tpot_ms_p90": 1785.02,
+          "tpot_ms_p99": 4287.18,
+          "elapsed_seconds_median": 936.5,
+          "sla_met": false
+        },
+        {
+          "target_qps": 100,
+          "achieved_qps": 100.0,
+          "ttft_ms_p50": 132710.0,
+          "ttft_ms_p90": 863880.66,
+          "ttft_ms_p99": 888527.06,
+          "tpot_ms_p50": 1248.86,
+          "tpot_ms_p90": 1740.58,
+          "tpot_ms_p99": 4225.34,
+          "elapsed_seconds_median": 921.5,
+          "sla_met": false
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": 0.61,
+    "baseline_delta": null,
+    "valid": true,
+    "framework": "1Cat-vLLM",
+    "precision": "FP16",
+    "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "18:03:39",
+    "run_id": "4e0e6eba",
+    "run_name": "tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba",
+    "flagged": null,
+    "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T09:53:19.928949+00:00",
+    "benchmark_end_time": "2026-05-18T10:03:39.512440+00:00",
+    "benchmark_elapsed_minutes": 151.2,
+    "model_load_seconds": 47.8,
+    "benchmark_elapsed_minutes_note": "Total across ['offline', 'online'] scenarios.",
+    "scenario_dirs": {
+      "offline": "results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline",
+      "online": "results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online"
+    }
+  }
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/accuracy/accuracy.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/accuracy/accuracy.json
new file mode 100644
index 0000000..94e5547
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/accuracy/accuracy.json
@@ -0,0 +1,8 @@
+{
+  "subset_score": 0.37,
+  "baseline_delta": 0.0,
+  "valid": true,
+  "framework": "1Cat-vLLM",
+  "precision": "FP16",
+  "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark."
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/env_info.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/env_info.json
new file mode 100644
index 0000000..1f8b6bd
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/env_info.json
@@ -0,0 +1,33 @@
+{
+  "collected_at": "2026-05-18T12:26:03.593928+00:00",
+  "accelerators": [
+    {
+      "index": 0,
+      "name": "Tesla V100-PCIE-32GB",
+      "vendor": "NVIDIA",
+      "memory_gb": 32.0,
+      "driver_version": "580.82.07",
+      "firmware_version": null,
+      "compute_capability": "7.0",
+      "supports_bf16": false
+    }
+  ],
+  "accelerator_platform": "nvidia",
+  "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+  "intra_node_interconnect": null,
+  "cpu": {
+    "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+    "physical_cores": 26,
+    "logical_cores": 52,
+    "numa_nodes": 2
+  },
+  "system_memory_gb": 214.5,
+  "pcie_generation": "PCIe Gen 3",
+  "cpu_accelerator_bandwidth_gbs": null,
+  "network_interfaces": null,
+  "os": "Ubuntu 22.04.5 LTS",
+  "python_version": "3.12.13",
+  "kernel_version": "5.4.0-149-generic",
+  "runtime_version": "CUDA 12.8",
+  "pytorch_version": "2.9.1+cu128"
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive/result.json
new file mode 100644
index 0000000..f017bc2
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive/result.json
@@ -0,0 +1,126 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_F",
+  "implementation_id": "nvidia_onecat_vllm_12a253c2",
+  "chip": {
+    "name": "Tesla V100-PCIE-32GB",
+    "vendor": "NVIDIA",
+    "count": 1,
+    "memory_gb": 32.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T12:26:03.593928+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "Tesla V100-PCIE-32GB",
+        "vendor": "NVIDIA",
+        "memory_gb": 32.0,
+        "driver_version": "580.82.07",
+        "firmware_version": null,
+        "compute_capability": "7.0",
+        "supports_bf16": false
+      }
+    ],
+    "accelerator_platform": "nvidia",
+    "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+      "physical_cores": 26,
+      "logical_cores": 52,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 214.5,
+    "pcie_generation": "PCIe Gen 3",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": null,
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13",
+    "kernel_version": "5.4.0-149-generic",
+    "runtime_version": "CUDA 12.8",
+    "pytorch_version": "2.9.1+cu128"
+  },
+  "software": {
+    "framework": "1Cat-vLLM",
+    "framework_version": "1.0.0+flash_attn_v100-1.0.0",
+    "driver_version": "580.82.07",
+    "runtime_version": "CUDA 12.8",
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13"
+  },
+  "model": {
+    "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+    "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 0.5,
+    "precision": "FP16",
+    "effective_dtype": null,
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "interactive",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": {
+      "tensor_parallel_size": 1,
+      "enforce_eager": false,
+      "max_num_seqs": 512,
+      "gpu_memory_utilization": 0.9,
+      "engine_kwargs": {
+        "enable_prefix_caching": false,
+        "enable_chunked_prefill": false,
+        "kv_cache_auto_trim_ratio": 0.0
+      }
+    },
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "interactive": {
+      "ttft_ms_p50": 26.76,
+      "ttft_ms_p90": 29.57,
+      "ttft_ms_p99": 40.69,
+      "tpot_ms_p50": 3.51,
+      "tpot_ms_p90": 3.76,
+      "tpot_ms_p99": 3.81,
+      "peak_memory_gb": null,
+      "elapsed_seconds_median": 116.9
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "20:45:36",
+    "run_id": "419b138c",
+    "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c",
+    "flagged": null,
+    "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T12:39:46.224469+00:00",
+    "benchmark_end_time": "2026-05-18T12:45:36.498231+00:00",
+    "benchmark_elapsed_minutes": 5.8,
+    "model_load_seconds": 27.8
+  }
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline/result.json
new file mode 100644
index 0000000..da8126b
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline/result.json
@@ -0,0 +1,159 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_F",
+  "implementation_id": "nvidia_onecat_vllm_12a253c2",
+  "chip": {
+    "name": "Tesla V100-PCIE-32GB",
+    "vendor": "NVIDIA",
+    "count": 1,
+    "memory_gb": 32.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T12:26:03.593928+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "Tesla V100-PCIE-32GB",
+        "vendor": "NVIDIA",
+        "memory_gb": 32.0,
+        "driver_version": "580.82.07",
+        "firmware_version": null,
+        "compute_capability": "7.0",
+        "supports_bf16": false
+      }
+    ],
+    "accelerator_platform": "nvidia",
+    "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+      "physical_cores": 26,
+      "logical_cores": 52,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 214.5,
+    "pcie_generation": "PCIe Gen 3",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": null,
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13",
+    "kernel_version": "5.4.0-149-generic",
+    "runtime_version": "CUDA 12.8",
+    "pytorch_version": "2.9.1+cu128"
+  },
+  "software": {
+    "framework": "1Cat-vLLM",
+    "framework_version": "1.0.0+flash_attn_v100-1.0.0",
+    "driver_version": "580.82.07",
+    "runtime_version": "CUDA 12.8",
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13"
+  },
+  "model": {
+    "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+    "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 0.5,
+    "precision": "FP16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "offline",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": {
+      "tensor_parallel_size": 1,
+      "enforce_eager": false,
+      "max_num_seqs": 512,
+      "gpu_memory_utilization": 0.9,
+      "engine_kwargs": {
+        "enable_prefix_caching": false,
+        "enable_chunked_prefill": false,
+        "kv_cache_auto_trim_ratio": 0.0
+      }
+    },
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "offline": {
+      "results_by_concurrency": [
+        {
+          "client_concurrency": 4,
+          "throughput_tokens_per_sec": 6234.82,
+          "throughput_tokens_per_sec_per_chip": 6234.82,
+          "throughput_tokens_per_sec_total": 9303.11,
+          "elapsed_seconds_median": 6.8,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 16,
+          "throughput_tokens_per_sec": 6292.79,
+          "throughput_tokens_per_sec_per_chip": 6292.79,
+          "throughput_tokens_per_sec_total": 9356.18,
+          "elapsed_seconds_median": 6.7,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 64,
+          "throughput_tokens_per_sec": 6243.51,
+          "throughput_tokens_per_sec_per_chip": 6243.51,
+          "throughput_tokens_per_sec_total": 9267.55,
+          "elapsed_seconds_median": 6.8,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "20:28:55",
+    "run_id": "419b138c",
+    "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c",
+    "flagged": null,
+    "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T12:27:34.502139+00:00",
+    "benchmark_end_time": "2026-05-18T12:28:55.745031+00:00",
+    "benchmark_elapsed_minutes": 1.4,
+    "model_load_seconds": 31.7
+  }
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online/result.json
new file mode 100644
index 0000000..170f9d0
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online/result.json
@@ -0,0 +1,146 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_F",
+  "implementation_id": "nvidia_onecat_vllm_12a253c2",
+  "chip": {
+    "name": "Tesla V100-PCIE-32GB",
+    "vendor": "NVIDIA",
+    "count": 1,
+    "memory_gb": 32.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T12:26:03.593928+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "Tesla V100-PCIE-32GB",
+        "vendor": "NVIDIA",
+        "memory_gb": 32.0,
+        "driver_version": "580.82.07",
+        "firmware_version": null,
+        "compute_capability": "7.0",
+        "supports_bf16": false
+      }
+    ],
+    "accelerator_platform": "nvidia",
+    "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+      "physical_cores": 26,
+      "logical_cores": 52,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 214.5,
+    "pcie_generation": "PCIe Gen 3",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": null,
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13",
+    "kernel_version": "5.4.0-149-generic",
+    "runtime_version": "CUDA 12.8",
+    "pytorch_version": "2.9.1+cu128"
+  },
+  "software": {
+    "framework": "1Cat-vLLM",
+    "framework_version": "1.0.0+flash_attn_v100-1.0.0",
+    "driver_version": "580.82.07",
+    "runtime_version": "CUDA 12.8",
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13"
+  },
+  "model": {
+    "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+    "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 0.5,
+    "precision": "FP16",
+    "effective_dtype": null,
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "online",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": {
+      "tensor_parallel_size": 1,
+      "enforce_eager": false,
+      "max_num_seqs": 512,
+      "gpu_memory_utilization": 0.9,
+      "engine_kwargs": {
+        "enable_prefix_caching": false,
+        "enable_chunked_prefill": false,
+        "kv_cache_auto_trim_ratio": 0.0
+      }
+    },
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "online": {
+      "sla_ttft_ms": 500,
+      "max_valid_qps": 0.0,
+      "results_by_qps": [
+        {
+          "target_qps": 10,
+          "achieved_qps": 10.0,
+          "ttft_ms_p50": 6316.13,
+          "ttft_ms_p90": 53409.43,
+          "ttft_ms_p99": 67932.56,
+          "tpot_ms_p50": 206.23,
+          "tpot_ms_p90": 291.3,
+          "tpot_ms_p99": 636.32,
+          "elapsed_seconds_median": 103.3,
+          "sla_met": false
+        },
+        {
+          "target_qps": 40,
+          "achieved_qps": 40.0,
+          "ttft_ms_p50": 19238.78,
+          "ttft_ms_p90": 56898.27,
+          "ttft_ms_p99": 75398.9,
+          "tpot_ms_p50": 189.24,
+          "tpot_ms_p90": 300.17,
+          "tpot_ms_p99": 582.22,
+          "elapsed_seconds_median": 86.3,
+          "sla_met": false
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "20:38:56",
+    "run_id": "419b138c",
+    "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c",
+    "flagged": null,
+    "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T12:29:46.673625+00:00",
+    "benchmark_end_time": "2026-05-18T12:38:56.798553+00:00",
+    "benchmark_elapsed_minutes": 9.2,
+    "model_load_seconds": 28.7
+  }
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/result.json
new file mode 100644
index 0000000..12baab4
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/result.json
@@ -0,0 +1,210 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_F",
+  "implementation_id": "nvidia_onecat_vllm_12a253c2",
+  "chip": {
+    "name": "Tesla V100-PCIE-32GB",
+    "vendor": "NVIDIA",
+    "count": 1,
+    "memory_gb": 32.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T12:26:03.593928+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "Tesla V100-PCIE-32GB",
+        "vendor": "NVIDIA",
+        "memory_gb": 32.0,
+        "driver_version": "580.82.07",
+        "firmware_version": null,
+        "compute_capability": "7.0",
+        "supports_bf16": false
+      }
+    ],
+    "accelerator_platform": "nvidia",
+    "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+      "physical_cores": 26,
+      "logical_cores": 52,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 214.5,
+    "pcie_generation": "PCIe Gen 3",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": null,
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13",
+    "kernel_version": "5.4.0-149-generic",
+    "runtime_version": "CUDA 12.8",
+    "pytorch_version": "2.9.1+cu128"
+  },
+  "software": {
+    "framework": "1Cat-vLLM",
+    "framework_version": "1.0.0+flash_attn_v100-1.0.0",
+    "driver_version": "580.82.07",
+    "runtime_version": "CUDA 12.8",
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13"
+  },
+  "model": {
+    "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+    "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 0.5,
+    "precision": "FP16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenarios_run": [
+      "offline",
+      "online",
+      "interactive"
+    ],
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "num_runs": 3,
+    "extra_config": {
+      "tensor_parallel_size": 1,
+      "enforce_eager": false,
+      "max_num_seqs": 512,
+      "gpu_memory_utilization": 0.9,
+      "engine_kwargs": {
+        "enable_prefix_caching": false,
+        "enable_chunked_prefill": false,
+        "kv_cache_auto_trim_ratio": 0.0
+      }
+    }
+  },
+  "metrics": {
+    "derived": {},
+    "offline": {
+      "results_by_concurrency": [
+        {
+          "client_concurrency": 4,
+          "throughput_tokens_per_sec": 6234.82,
+          "throughput_tokens_per_sec_per_chip": 6234.82,
+          "throughput_tokens_per_sec_total": 9303.11,
+          "elapsed_seconds_median": 6.8,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 16,
+          "throughput_tokens_per_sec": 6292.79,
+          "throughput_tokens_per_sec_per_chip": 6292.79,
+          "throughput_tokens_per_sec_total": 9356.18,
+          "elapsed_seconds_median": 6.7,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 64,
+          "throughput_tokens_per_sec": 6243.51,
+          "throughput_tokens_per_sec_per_chip": 6243.51,
+          "throughput_tokens_per_sec_total": 9267.55,
+          "elapsed_seconds_median": 6.8,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        }
+      ]
+    },
+    "online": {
+      "sla_ttft_ms": 500,
+      "max_valid_qps": 0.0,
+      "results_by_qps": [
+        {
+          "target_qps": 10,
+          "achieved_qps": 10.0,
+          "ttft_ms_p50": 6316.13,
+          "ttft_ms_p90": 53409.43,
+          "ttft_ms_p99": 67932.56,
+          "tpot_ms_p50": 206.23,
+          "tpot_ms_p90": 291.3,
+          "tpot_ms_p99": 636.32,
+          "elapsed_seconds_median": 103.3,
+          "sla_met": false
+        },
+        {
+          "target_qps": 40,
+          "achieved_qps": 40.0,
+          "ttft_ms_p50": 19238.78,
+          "ttft_ms_p90": 56898.27,
+          "ttft_ms_p99": 75398.9,
+          "tpot_ms_p50": 189.24,
+          "tpot_ms_p90": 300.17,
+          "tpot_ms_p99": 582.22,
+          "elapsed_seconds_median": 86.3,
+          "sla_met": false
+        }
+      ]
+    },
+    "interactive": {
+      "ttft_ms_p50": 26.76,
+      "ttft_ms_p90": 29.57,
+      "ttft_ms_p99": 40.69,
+      "tpot_ms_p50": 3.51,
+      "tpot_ms_p90": 3.76,
+      "tpot_ms_p99": 3.81,
+      "peak_memory_gb": null,
+      "elapsed_seconds_median": 116.9
+    }
+  },
+  "accuracy": {
+    "subset_score": 0.37,
+    "baseline_delta": 0.0,
+    "valid": true,
+    "framework": "1Cat-vLLM",
+    "precision": "FP16",
+    "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "20:28:55",
+    "run_id": "419b138c",
+    "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c",
+    "flagged": null,
+    "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T12:27:34.502139+00:00",
+    "benchmark_end_time": "2026-05-18T12:28:55.745031+00:00",
+    "benchmark_elapsed_minutes": 16.4,
+    "model_load_seconds": 31.7,
+    "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive'] scenarios.",
+    "scenario_dirs": {
+      "offline": "results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline",
+      "online": "results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online",
+      "interactive": "results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive"
+    }
+  }
+}
\ No newline at end of file
diff --git a/runners/nvidia_onecat_vllm_12a253c2/README.md b/runners/nvidia_onecat_vllm_12a253c2/README.md
new file mode 100644
index 0000000..0556214
--- /dev/null
+++ b/runners/nvidia_onecat_vllm_12a253c2/README.md
@@ -0,0 +1,227 @@
+# nvidia_onecat_vllm_12a253c2 — 1Cat-vLLM Runner (Tesla V100 / SM70)
+
+AccelMark runner for **Tesla V100 / V100S only**, using
+[1Cat-vLLM](https://github.com/1CatAI/1Cat-vLLM) (community vLLM fork for Volta).
+
+> **Hardware:** Use this runner only on V100 / V100S (SM70). On Ampere or newer,
+> use upstream `nvidia_vllm_*`.
+
+> **Third-party software:** 1Cat-vLLM is maintained by [1CatAI](https://github.com/1CatAI/1Cat-vLLM)
+> under its own license. AccelMark ships only the thin `runner.py` wrapper; install
+> 1Cat-vLLM separately as described below.
+
+## Why 1Cat-vLLM
+
+| Limitation on stock vLLM + V100 | 1Cat-vLLM |
+|--------------------------------|-----------|
+| AWQ kernels need SM75+ | SM70 AWQ via lmdeploy TurboMind |
+| FlashAttention 2/3 need Ampere+ | `FLASH_ATTN_V100` backend |
+| Qwen3.5 / Qwen3.6 on V100 | Fork model/runtime fixes |
+| Long-context on Volta | SM70 paged-attention path |
+
+Release notes: [1Cat-vLLM v1.0.0](https://github.com/1CatAI/1Cat-vLLM/releases/tag/v1.0.0).
+
+## Runner defaults (code)
+
+| Setting | Default |
+|---------|---------|
+| `attention_backend` | `FLASH_ATTN_V100` (auto unless overridden) |
+| `SUPPORTED_PRECISIONS` | `fp16`, `fp32` (no BF16 on V100) |
+| `SUPPORTED_QUANTIZATION_BACKENDS` | `awq` only |
+| `max_num_seqs` | `512` global default (same as upstream vLLM); use `1` for suite D / long-context |
+| `gpu_memory_utilization` | `0.90` |
+
+## Supported suites
+
+| Suite | Notes |
+|-------|-------|
+| A | Runs on 1× V100; upstream `nvidia_vllm_*` + `--enforce-eager` is often enough |
+| B | **Primary** — use `--tensor-parallel-size 4` on 4× V100 32GB |
+| C | **Primary** — AWQ |
+| D | **Primary** — long context + `FLASH_ATTN_V100` |
+| E | Multi-chip scaling (same TP guidance as B) |
+| F | Not recommended — marked `unsupported` in `meta.json` (edge model; use upstream runner) |
+| G | **Primary** — MoE + AWQ (Qwen3.5/3.6 class models) |
+
+---
+
+## Environment setup
+
+### Reference stack (1Cat-vLLM 1.0.0)
+
+| Component | Version |
+|-----------|---------|
+| GPU | Tesla V100 / V100S (SM70) |
+| Python | **3.12** (`cp312` wheels only) |
+| CUDA toolkit | **12.8** |
+| Driver | 570.x recommended (CUDA 12.8) |
+| PyTorch | **2.9.1+cu128** (from 1Cat wheels or build env) |
+
+### Path A — Prebuilt wheels (Ubuntu 24.04+, glibc ≥ 2.38)
+
+Official wheels require **glibc 2.38+** (e.g. Ubuntu 24.04). On Ubuntu 22.04,
+`pip install` may succeed but `import vllm` fails with `GLIBC_2.38 not found`
+— use Path B instead.
+
+```bash
+conda create -y -n onecat-vllm python=3.12
+conda activate onecat-vllm
+python -m pip install --upgrade pip setuptools wheel
+
+# Install BOTH wheels together — never `pip install vllm` from PyPI
+python -m pip install --prefer-binary --no-cache-dir \
+    --extra-index-url https://download.pytorch.org/whl/cu128 \
+    "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/flash_attn_v100-1.0.0-cp312-cp312-linux_x86_64.whl" \
+    "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/vllm-1.0.0-cp312-cp312-linux_x86_64.whl"
+
+cd /path/to/AccelMark
+pip install -r runners/nvidia_onecat_vllm_12a253c2/requirements.txt
+```
+
+### Path B — Build from source (Ubuntu 22.04 / glibc 2.35)
+
+Build on the **host glibc** so binaries link against 2.35. Typical AutoDL /
+Ubuntu 22.04 V100 boxes use this path.
+
+**Prerequisites:** CUDA 12.8 toolkit (`nvcc` on PATH), conda Python 3.12, ~20GB
+free disk for build tree + wheels.
+
+```bash
+conda create -y -n onecat-vllm python=3.12
+conda activate onecat-vllm
+export CUDA_HOME=/usr/local/cuda-12.8
+export PATH="$CUDA_HOME/bin:$PATH"
+export TORCH_CUDA_ARCH_LIST="7.0"
+export MAX_JOBS=6
+export PIP_CACHE_DIR=/path/to/fast/disk/pip-cache   # optional
+
+git clone --depth 1 --branch v1.0.0 https://github.com/1CatAI/1Cat-vLLM.git
+cd 1Cat-vLLM
+pip install -r requirements/build.txt -r requirements/cuda.txt -r requirements/common.txt
+pip install cmake build ninja
+
+DIST=/path/to/dist-cu128-sm70-v1.0.0
+mkdir -p "$DIST"
+
+# 1) flash_attn_v100 wheel
+pushd flash-attention-v100
+python -m build --wheel --no-isolation --outdir "$DIST"
+popd
+
+# 2) vllm wheel (30–90 min on V100 host)
+export VLLM_TARGET_DEVICE=cuda
+python -m build --wheel --no-isolation --outdir "$DIST"
+
+# 3) Install — run from /tmp so Python does not import the source tree
+pip install "$DIST"/flash_attn_v100-*.whl
+cd /tmp && pip install --no-deps --force-reinstall "$DIST"/vllm-*.whl
+
+cd /path/to/AccelMark
+pip install -r runners/nvidia_onecat_vllm_12a253c2/requirements.txt
+```
+
+Do **not** run AccelMark from inside the cloned `1Cat-vLLM/` directory; Python
+may import the local `vllm/` package instead of the installed wheel.
+
+### Smoke test
+
+Run from `/tmp` or the AccelMark repo root (not inside `1Cat-vLLM/`):
+
+```bash
+python - <<'PY'
+import torch, vllm
+print("torch:", torch.__version__, "vllm:", vllm.__version__)
+import flash_attn_v100_cuda
+print("flash_attn_v100: ok")
+from vllm import LLM
+print("LLM import: ok")
+PY
+```
+
+---
+
+## AccelMark runner config (required on V100)
+
+Copy and edit:
+
+```bash
+cp configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml.example \
+   configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml
+```
+
+**Single V100 32GB** — recommended `engine_kwargs` (avoids SM70
+`Shared memory exceeds 96KB` in `prefill_paged_fwd`):
+
+```yaml
+tensor_parallel_size: 1
+max_num_seqs: 512
+gpu_memory_utilization: 0.90
+engine_kwargs:
+  enable_prefix_caching: false
+  enable_chunked_prefill: false
+  kv_cache_auto_trim_ratio: 0.0
+
+suites:
+  suite_D:
+    max_num_seqs: 1
+    gpu_memory_utilization: 0.85
+```
+
+If it still crashes, export before `python run.py`:
+
+```bash
+export VLLM_FLASH_V100_DISABLE_PAGED_PREFILL=1
+```
+
+That forces the slower paged-KV gather fallback instead of `prefill_paged_fwd`.
+
+**4× V100 32GB** — set `tensor_parallel_size: 4`; keep the same `engine_kwargs`
+unless you are deliberately testing 1Cat's MTP / prefix-cache profile (not
+enabled by the example config; consult the 1Cat-vLLM documentation for it).
+
+Other tuning:
+
+| Symptom | Try |
+|---------|-----|
+| `Shared memory exceeds 96KB` | `enable_chunked_prefill: false` + `enable_prefix_caching: false` (above); then `export VLLM_FLASH_V100_DISABLE_PAGED_PREFILL=1` |
+| First request hangs (CUDA graph) | `enforce_eager: true` or `--enforce-eager` |
+| OOM at engine init | Lower `gpu_memory_utilization` (e.g. `0.85`) |
+| `GLIBC_2.38 not found` | Path B source build, or Ubuntu 24.04+ |
+
+---
+
+## Basic usage
+
+```bash
+cp configs/submitter.yaml.example configs/submitter.yaml   # once
+cp configs/models_local.yaml.example configs/models_local.yaml   # map local model paths
+
+export PYTHONPATH=/path/to/AccelMark   # if pip install -e . is unavailable
+
+# Suite A smoke (1× V100)
+python run.py --runner nvidia_onecat_vllm_12a253c2 \
+    --suite suite_A --scenario accuracy --tensor-parallel-size 1
+
+# Suite B (4× V100)
+python run.py --runner nvidia_onecat_vllm_12a253c2 \
+    --suite suite_B --tensor-parallel-size 4
+```
+
+---
+
+## Known limitations
+
+- Prefix caching and **chunked prefill** (even with prefix caching off) can hit the
+  `prefill_paged_fwd` kernel (>96KB shared memory on SM70). Disable both in config;
+  use `VLLM_FLASH_V100_DISABLE_PAGED_PREFILL=1` if needed (see above).
+- The per-suite `max_num_seqs: 1` override (e.g. suite D) limits batch throughput
+  vs the global default of `512` — intentional for 1Cat's long-context V100 profile.
+- Suite F is marked unsupported in `meta.json` (use upstream runner on V100 if needed).
+- End-to-end validation on 4× V100 reference hardware is still community-pending in
+  `meta.json`; single-GPU smoke (Suite A accuracy) has been exercised on V100 32GB.
+
+## Requirements
+
+See `requirements.txt`. Install `torch`, `flash_attn_v100`, and the `vllm` fork
+from 1Cat-vLLM **before** the AccelMark extras file. Do not install upstream
+`vllm` from PyPI after the fork.
diff --git a/runners/nvidia_onecat_vllm_12a253c2/meta.json b/runners/nvidia_onecat_vllm_12a253c2/meta.json
new file mode 100644
index 0000000..394601f
--- /dev/null
+++ b/runners/nvidia_onecat_vllm_12a253c2/meta.json
@@ -0,0 +1,21 @@
+{
+  "id": "nvidia_onecat_vllm_12a253c2",
+  "platform": "nvidia",
+  "name": "1Cat-vLLM (V100 / SM70 fork) on NVIDIA",
+  "framework": "1Cat-vLLM",
+  "submitted_by": "JuhaoLiang1997",
+  "description": "AccelMark runner for Tesla V100 (SM70) using 1Cat-vLLM 1.0.0 — community vLLM fork with FLASH_ATTN_V100 and SM70 AWQ kernels. Use nvidia_vllm_* on Ampere or newer.",
+  "supersedes_chain": ["nvidia_onecat_vllm_4a9ca6c3", "nvidia_onecat_vllm_a43d1bcf"],
+  "notes": "Auto-injects attention_backend=FLASH_ATTN_V100 unless overridden. V100: disable prefix caching and chunked prefill in runner config (see README). External dependency: https://github.com/1CatAI/1Cat-vLLM",
+  "created": "2026-05-15",
+  "hardware_label": "NVIDIA V100 (SM70)",
+  "suite_support": {
+    "A": "pending",
+    "B": "pending",
+    "C": "pending",
+    "D": "pending",
+    "E": "pending",
+    "F": "unsupported",
+    "G": "pending"
+  }
+}
diff --git a/runners/nvidia_onecat_vllm_12a253c2/requirements.txt b/runners/nvidia_onecat_vllm_12a253c2/requirements.txt
new file mode 100644
index 0000000..b6d4c62
--- /dev/null
+++ b/runners/nvidia_onecat_vllm_12a253c2/requirements.txt
@@ -0,0 +1,17 @@
+# AccelMark extras for nvidia_onecat_vllm_12a253c2.
+# Install 1Cat-vLLM (flash_attn_v100 + vllm fork) first — see README.md.
+
+transformers==4.57.6
+tokenizers==0.22.2
+huggingface-hub==0.35.0
+accelerate==1.10.1
+safetensors==0.6.2
+
+numpy==1.26.4
+jsonschema==4.25.1
+psutil==7.1.0
+tqdm==4.67.1
+
+nvidia-ml-py==13.580.82
+aiohttp==3.12.15
+PyYAML==6.0.2
diff --git a/runners/nvidia_onecat_vllm_12a253c2/runner.py b/runners/nvidia_onecat_vllm_12a253c2/runner.py
new file mode 100644
index 0000000..3462765
--- /dev/null
+++ b/runners/nvidia_onecat_vllm_12a253c2/runner.py
@@ -0,0 +1,382 @@
+"""
+AccelMark — NVIDIA 1Cat-vLLM (SM70 / V100) benchmark script.
+
+Thin vLLM runner wrapper for the 1Cat-vLLM fork on Tesla V100 / V100S.
+See README.md in this folder for install, hardware scope, and tuning.
+"""
+
+import asyncio
+import sys
+import time
+from pathlib import Path
+from typing import Optional
+
+_REPO_ROOT = Path(__file__).resolve().parent.parent.parent  # three directories above this file
+sys.path.insert(0, str(_REPO_ROOT))  # searched first for the `runners.` / `loadgen.` imports below
+
+import torch
+from vllm import LLM, AsyncLLMEngine, SamplingParams
+from vllm.engine.arg_utils import AsyncEngineArgs
+from transformers import AutoTokenizer
+
+from runners.benchmark_runner import BenchmarkRunner, InferenceRequest
+from loadgen.types import InferenceResult
+
+
+import logging
+logging.getLogger("vllm.engine.async_llm_engine").setLevel(logging.WARNING)  # drop records below WARNING
+logging.getLogger("vllm.engine.llm_engine").setLevel(logging.WARNING)  # same threshold for this logger
+
+
+class OneCatVLLMRunner(BenchmarkRunner):
+    """1Cat-vLLM on NVIDIA V100 / V100S (SM70). Use nvidia_vllm_* on newer GPUs."""
+
+    SUPPORTS_STREAMING = True
+    SUPPORTS_BATCHING = True
+    SUPPORTS_ONLINE = True
+    SUPPORTS_MULTI_CHIP = True  # when False, parse_args rejects tensor/pipeline sizes whose product exceeds 1
+
+    SUPPORTED_PRECISIONS = ["fp16", "fp32"]  # no bf16 entry — presumably unsupported on SM70; TODO confirm
+    SUPPORTED_QUANTIZATION_BACKENDS = ["awq"]
+
+    def __init__(self):  # NOTE(review): super().__init__() is not called — confirm the base class needs no init
+        self.llm: Optional[LLM] = None  # offline engine; assigned by load_model when use_async is false
+        self.engine: Optional[AsyncLLMEngine] = None  # async engine; assigned by load_model when use_async is true
+        self.tokenizer: Optional[AutoTokenizer] = None  # assigned by load_model
+        self.sampling_params: Optional[SamplingParams] = None  # assigned by load_model
+        self._loop: Optional[asyncio.AbstractEventLoop] = None  # event loop created for the async engine
+
+    def _get_chip_count(self) -> int:
+        try:
+            import torch
+            n = torch.cuda.device_count()
+            return n if n > 0 else 1
+        except Exception:
+            return 1
+
+    def _get_framework_name(self) -> str:  # framework label for this runner
+        return "1Cat-vLLM"  # same string as the "framework" field in this runner's meta.json
+
+    def _get_framework_version(self) -> str:
+        core = "unknown"
+        try:
+            import vllm
+            core = vllm.__version__
+        except Exception:
+            pass
+
+        fa_v100 = None
+        try:
+            from importlib.metadata import version as _pkg_version
+            fa_v100 = _pkg_version("flash_attn_v100")
+        except Exception:
+            try:
+                import flash_attn_v100_cuda  # type: ignore  # noqa: F401
+                fa_v100 = "installed"
+            except Exception:
+                fa_v100 = None
+
+        if fa_v100:
+            return f"{core}+flash_attn_v100-{fa_v100}"
+        return core
+
+    def load_model(self, model_path: str, parallelism: dict) -> None:
+        tp_size = parallelism["tensor_parallel_size"]
+        pp_size = parallelism["pipeline_parallel_size"]
+        ep_size = parallelism.get("expert_parallel_size", 1)
+        assert pp_size <= 1, "Pipeline parallelism is not supported in OneCatVLLMRunner"
+
+        max_tokens    = parallelism["max_tokens"]
+        max_model_len = parallelism["max_model_len"]
+        use_async     = parallelism["use_async"]
+        enforce_eager = getattr(self, "_enforce_eager", False)
+
+        cfg             = getattr(self, "_runner_config", {})
+        max_num_seqs    = cfg.get("max_num_seqs", 512)
+        gpu_memory_util = cfg.get("gpu_memory_utilization", 0.90)
+        extra_kwargs    = dict(cfg.get("engine_kwargs") or {})
+
+        import os
+        if (
+            "attention_backend" not in extra_kwargs
+            and "VLLM_ATTENTION_BACKEND" not in os.environ
+        ):
+            extra_kwargs["attention_backend"] = "FLASH_ATTN_V100"
+
+        try:
+            import dataclasses
+            from vllm.engine.arg_utils import EngineArgs as _EngineArgs
+            _valid = {f.name for f in dataclasses.fields(_EngineArgs)}
+            _dropped = {k: v for k, v in extra_kwargs.items() if k not in _valid}
+            if _dropped:
+                print(f"  Warning: engine_kwargs keys not supported by this "
+                      f"1Cat-vLLM version and will be ignored: {list(_dropped)}")
+            extra_kwargs = {k: v for k, v in extra_kwargs.items() if k in _valid}
+        except Exception:
+            pass
+
+        effective_precision = getattr(self, "_effective_precision", "BF16").upper()
+        precision           = getattr(self, "_precision", None) or effective_precision
+
+        _dtype_override  = getattr(self, "_precision_dtype_override", None)
+        _prec_eng_kwargs = dict(getattr(self, "_precision_engine_kwargs", None) or {})
+
+        quantization = _prec_eng_kwargs.pop("quantization", None)
+
+        _NATIVE_DTYPE_MAP = {
+            "BF16":  "bfloat16",
+            "FP16":  "float16",
+            "FP32":  "float32",
+        }
+        dtype = _NATIVE_DTYPE_MAP.get(precision, "auto")
+        self._quantization_method = quantization
+
+        if _dtype_override:
+            dtype = _dtype_override
+
+        if _prec_eng_kwargs:
+            _prec_eng_kwargs.update(extra_kwargs)
+            extra_kwargs = _prec_eng_kwargs
+
+        print(f"Loading model: precision={precision}, dtype={dtype}"
+              + (f", quantization_method={self._quantization_method}"
+                 if self._quantization_method else ""))
+
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=False
+        )
+
+        self.sampling_params = SamplingParams(
+            max_tokens=max_tokens,
+            temperature=0.0,
+        )
+
+        if not use_async:
+            llm_kwargs = dict(
+                model=model_path,
+                dtype=dtype,
+                tensor_parallel_size=tp_size,
+                trust_remote_code=False,
+                enforce_eager=enforce_eager,
+                max_num_seqs=max_num_seqs,
+                gpu_memory_utilization=gpu_memory_util,
+                **extra_kwargs,
+            )
+            if ep_size > 1:
+                llm_kwargs["enable_expert_parallel"] = True
+                llm_kwargs["tensor_parallel_size"]   = tp_size
+            if quantization:
+                llm_kwargs["quantization"] = quantization
+            if max_model_len:
+                llm_kwargs["max_model_len"] = max_model_len
+            self.llm = LLM(**llm_kwargs)
+        else:
+            self._loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(self._loop)
+            engine_kwargs = dict(
+                model=model_path,
+                dtype=dtype,
+                tensor_parallel_size=tp_size,
+                trust_remote_code=False,
+                enforce_eager=enforce_eager,
+                gpu_memory_utilization=gpu_memory_util,
+                **extra_kwargs,
+            )
+            if ep_size > 1:
+                engine_kwargs["enable_expert_parallel"] = True
+            if max_model_len:
+                engine_kwargs["max_model_len"] = max_model_len
+            engine_args = AsyncEngineArgs(**engine_kwargs)
+            self.engine = AsyncLLMEngine.from_engine_args(engine_args)
+
+    def get_effective_dtype(self) -> Optional[str]:
+        try:
+            if self.llm is not None:
+                dtype = self.llm.llm_engine.model_config.dtype
+                return str(dtype).replace("torch.", "")
+            elif self.engine is not None:
+                dtype = self.engine.engine.model_config.dtype
+                return str(dtype).replace("torch.", "")
+        except Exception:
+            pass
+        return getattr(self, "_effective_dtype", None)
+
+    def inference_fn_offline(self, requests: list[InferenceRequest]) -> list[InferenceResult]:
+        formatted = [self._format_prompt(r.prompt) for r in requests]
+        t_start = time.perf_counter()
+        outputs = self.llm.generate(formatted, self.sampling_params)
+        elapsed = time.perf_counter() - t_start
+
+        self._last_accuracy_outputs = [o.outputs[0].text for o in outputs]
+
+        results = []
+        for output in outputs:
+            results.append(InferenceResult(
+                first_token_time_ms=None,
+                total_time_ms=elapsed * 1000,
+                output_tokens=len(output.outputs[0].token_ids),
+                input_tokens=len(output.prompt_token_ids),
+                success=True,
+                output_text=output.outputs[0].text,
+            ))
+        return results
+
+    async def inference_fn_streaming(self, request: InferenceRequest) -> InferenceResult:
+        from vllm.utils import random_uuid
+
+        formatted = self._format_prompt(request.prompt)
+        request_id = random_uuid()
+        t_start = time.perf_counter()
+        first_token_time_ms = None
+        output_tokens = 0
+        output_text = ""
+
+        async for output in self.engine.generate(
+            formatted, self.sampling_params, request_id
+        ):
+            if (
+                first_token_time_ms is None
+                and len(output.outputs[0].token_ids) > 0
+            ):
+                first_token_time_ms = (time.perf_counter() - t_start) * 1000
+            output_tokens = len(output.outputs[0].token_ids)
+            output_text = output.outputs[0].text
+
+        total_time_ms = (time.perf_counter() - t_start) * 1000
+        return InferenceResult(
+            first_token_time_ms=first_token_time_ms,
+            total_time_ms=total_time_ms,
+            output_tokens=output_tokens,
+            input_tokens=0,
+            success=True,
+            output_text=output_text,
+        )
+
+    async def inference_fn_token_stream(self, request: InferenceRequest):
+        from vllm.utils import random_uuid
+
+        formatted   = self._format_prompt(request.prompt)
+        request_id  = random_uuid()
+        prev_length = 0
+
+        async for output in self.engine.generate(
+            formatted, self.sampling_params, request_id
+        ):
+            current_text = output.outputs[0].text
+            delta = current_text[prev_length:]
+            if delta:
+                yield delta
+                prev_length = len(current_text)
+
+    def get_peak_memory_gb(self) -> Optional[float]:  # None when the CUDA query raises
+        try:
+            return torch.cuda.max_memory_allocated() / (1024 ** 3)  # bytes -> GiB, torch's default device
+        except Exception:
+            return None
+
+    def release_resources(self) -> None:
+        """Best-effort teardown of both engines and distributed state; each step swallows its errors."""
+        if self.llm is not None:
+            try:
+                del self.llm
+            except Exception:
+                pass
+            self.llm = None
+        # Async engine: run its shutdown on the loop created in load_model, then drop it.
+        if self.engine is not None:
+            try:
+                if self._loop and not self._loop.is_closed():
+                    self._loop.run_until_complete(self.engine.shutdown())
+            except Exception:
+                pass
+            try:
+                del self.engine
+            except Exception:
+                pass
+            self.engine = None
+        # Prefer vLLM's combined cleanup helper; on any failure try the two destroy_* calls.
+        try:
+            from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
+            cleanup_dist_env_and_memory(shutdown_ray=False)
+        except Exception:
+            try:
+                from vllm.distributed.parallel_state import (
+                    destroy_model_parallel, destroy_distributed_environment,
+                )
+                destroy_model_parallel()
+                destroy_distributed_environment()
+            except Exception:
+                pass
+        try:  # last, destroy torch's process group if one is still initialized
+            if torch.distributed.is_initialized():
+                torch.distributed.destroy_process_group()
+        except Exception:
+            pass
+
+    def parse_args(self):
+        args = super().parse_args()
+        cfg = self._runner_config
+
+        import argparse
+        parser = argparse.ArgumentParser(add_help=False)
+        parser.add_argument("--tensor-parallel-size", type=int, default=None,
+                            dest="tensor_parallel_size")
+        parser.add_argument("--pipeline-parallel-size", type=int, default=None,
+                            dest="pipeline_parallel_size")
+        parser.add_argument("--expert-parallel-size", type=int, default=None,
+                            dest="expert_parallel_size")
+        parser.add_argument("--enforce-eager", action="store_true", default=False,
+                            dest="enforce_eager")
+        extra, _ = parser.parse_known_args()
+
+        tp_size, _tp_source = self._resolve_tensor_parallel_size(
+            extra.tensor_parallel_size
+        )
+
+        pp_size = (extra.pipeline_parallel_size
+                   if extra.pipeline_parallel_size is not None
+                   else cfg.get("pipeline_parallel_size", 1))
+        ep_size = (extra.expert_parallel_size
+                   if extra.expert_parallel_size is not None
+                   else cfg.get("expert_parallel_size", 1))
+        self._enforce_eager = extra.enforce_eager or cfg.get("enforce_eager", False)
+
+        print(f"  tensor_parallel_size = {tp_size}  [{_tp_source}]")
+        if ep_size > 1:
+            print(f"  expert_parallel_size = {ep_size}  [cli/yaml]")
+
+        if not self.SUPPORTS_MULTI_CHIP and tp_size * pp_size > 1:
+            print(f"Warning: {self.__class__.__name__} does not support multi-chip. "
+                  f"Ignoring tensor_parallel_size={tp_size}, using 1.")
+            tp_size = 1
+            pp_size = 1
+            ep_size = 1
+
+        self._parallelism = {
+            "tensor_parallel_size":   tp_size,
+            "pipeline_parallel_size": pp_size,
+            "expert_parallel_size":   ep_size,
+            "data_parallel_size":     1,
+        }
+        self._chip_count = tp_size * pp_size
+        self._precision  = getattr(args, "precision", None)
+        return args
+
+    def get_extra_subprocess_args(self, args) -> list[str]:
+        extra = [
+            "--tensor-parallel-size",
+            str(self._parallelism.get("tensor_parallel_size", 1)),
+        ]
+        if self._parallelism.get("pipeline_parallel_size", 1) > 1:
+            extra += ["--pipeline-parallel-size",
+                      str(self._parallelism["pipeline_parallel_size"])]
+        if self._parallelism.get("expert_parallel_size", 1) > 1:
+            extra += ["--expert-parallel-size",
+                      str(self._parallelism["expert_parallel_size"])]
+        if self._enforce_eager:
+            extra += ["--enforce-eager"]
+        return extra
+
+
+if __name__ == "__main__":  # runs only when this file is executed as a script
+    OneCatVLLMRunner().main()  # main() is not defined in this file — presumably inherited from BenchmarkRunner