Skip to content

[Submission] nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd #37

@Gong-K

Description

@Gong-K
{
  "schema_version": "1.0",
  "suite_id": "suite_A",
  "implementation_id": "nvidia_sglang_c43a8309",
  "chip": {
    "name": "NVIDIA A100-SXM4-40GB",
    "vendor": "NVIDIA",
    "count": 1,
    "memory_gb": 40.0,
    "interconnect_intra_node": null,
    "interconnect_inter_node": null
  },
  "environment": {
    "collected_at": "2026-05-06T11:15:11.081772+00:00",
    "accelerators": [
      {
        "index": 0,
        "name": "NVIDIA A100-SXM4-40GB",
        "vendor": "NVIDIA",
        "memory_gb": 40.0,
        "driver_version": "565.57.01",
        "firmware_version": null,
        "compute_capability": "8.0",
        "supports_bf16": true
      }
    ],
    "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tNIC3\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tPXB\tNODE\tSYS\t0-31,64-95\t0\t\tN/A\nNIC0\tPXB\t X \tPIX\tNODE\tSYS\t\t\t\t\nNIC1\tPXB\tPIX\t X \tNODE\tSYS\t\t\t\t\nNIC2\tNODE\tNODE\tNODE\t X \tSYS\t\t\t\t\nNIC3\tSYS\tSYS\tSYS\tSYS\t X \t\t\t\t\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n  NIC0: mlx5_0\n  NIC1: mlx5_1\n  NIC2: mlx5_2\n  NIC3: mlx5_3\n\n",
    "intra_node_interconnect": null,
    "cpu": {
      "model": "AMD EPYC 7532 32-Core Processor",
      "physical_cores": 64,
      "logical_cores": 128,
      "numa_nodes": 2
    },
    "system_memory_gb": 1007.7,
    "pcie_generation": "PCIe Gen 4",
    "cpu_accelerator_bandwidth_gbs": null,
    "network_interfaces": [
      {
        "name": "mlx5_0",
        "type": "InfiniBand/RoCE",
        "bandwidth_gbps": null
      },
      {
        "name": "mlx5_1",
        "type": "InfiniBand/RoCE",
        "bandwidth_gbps": null
      },
      {
        "name": "mlx5_2",
        "type": "InfiniBand/RoCE",
        "bandwidth_gbps": null
      },
      {
        "name": "mlx5_3",
        "type": "InfiniBand/RoCE",
        "bandwidth_gbps": null
      }
    ],
    "os": "Ubuntu 22.04.4 LTS",
    "python_version": "3.10.20",
    "kernel_version": "5.15.0-60-generic",
    "runtime_version": "CUDA 12.8",
    "pytorch_version": "2.9.1+cu128"
  },
  "software": {
    "framework": "SGLang",
    "framework_version": "0.5.6",
    "driver_version": "565.57.01",
    "runtime_version": "CUDA 12.8",
    "os": "Ubuntu 22.04.4 LTS",
    "python_version": "3.10.20"
  },
  "model": {
    "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
    "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
    "model_name": null,
    "model_note": null,
    "model_source": "local",
    "architecture": "dense",
    "parameter_count_b": 8.0,
    "precision": "BF16",
    "effective_dtype": "bfloat16",
    "quantization_method": null,
    "model_format": "HuggingFace original"
  },
  "task": {
    "scenarios_run": [
      "offline",
      "online",
      "interactive",
      "sustained",
      "speculative",
      "burst"
    ],
    "parallelism": {
      "tensor_parallel_size": 1,
      "pipeline_parallel_size": 1,
      "expert_parallel_size": 1,
      "data_parallel_size": 1
    },
    "num_runs": 3,
    "extra_config": null
  },
  "metrics": {
    "derived": {},
    "offline": {
      "results_by_concurrency": [
        {
          "client_concurrency": 8,
          "throughput_tokens_per_sec": 3144.73,
          "throughput_tokens_per_sec_per_chip": 3144.73,
          "elapsed_seconds_median": 11.2,
          "peak_memory_gb": null,
          "power_watts_avg": null,
          "power_watts_peak": null,
          "oom": false,
          "_throughput_note": "output_only",
          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
        },
        {
          "client_concurrency": 32,
          "throughput_tokens_per_sec": 3146.66,
          "throughput_tokens_per_sec_per_chip": 3146.66,
          "elapsed_seconds_median": 11.2,
          "peak_memory_gb": null,
          "power_watts_avg": null,
          "power_watts_peak": null,
          "oom": false,
          "_throughput_note": "output_only",
          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
        },
        {
          "client_concurrency": 128,
          "throughput_tokens_per_sec": 3146.09,
          "throughput_tokens_per_sec_per_chip": 3146.09,
          "elapsed_seconds_median": 11.2,
          "peak_memory_gb": null,
          "power_watts_avg": null,
          "power_watts_peak": null,
          "oom": false,
          "_throughput_note": "output_only",
          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
        }
      ]
    },
    "online": {
      "sla_ttft_ms": 500,
      "max_valid_qps": 100,
      "results_by_qps": [
        {
          "target_qps": 5,
          "achieved_qps": 5.0,
          "ttft_ms_p50": 43.91,
          "ttft_ms_p90": 62.26,
          "ttft_ms_p99": 972.47,
          "tpot_ms_p50": 15.63,
          "tpot_ms_p90": 17.36,
          "tpot_ms_p99": 18.58,
          "elapsed_seconds_median": 66.1,
          "sla_met": false
        },
        {
          "target_qps": 25,
          "achieved_qps": 25.0,
          "ttft_ms_p50": 52.85,
          "ttft_ms_p90": 67.65,
          "ttft_ms_p99": 80.71,
          "tpot_ms_p50": 36.16,
          "tpot_ms_p90": 41.45,
          "tpot_ms_p99": 57.42,
          "elapsed_seconds_median": 17.1,
          "sla_met": true
        },
        {
          "target_qps": 100,
          "achieved_qps": 100.0,
          "ttft_ms_p50": 50.85,
          "ttft_ms_p90": 62.88,
          "ttft_ms_p99": 245.1,
          "tpot_ms_p50": 41.47,
          "tpot_ms_p90": 53.07,
          "tpot_ms_p99": 177.42,
          "elapsed_seconds_median": 10.8,
          "sla_met": true
        }
      ]
    },
    "interactive": {
      "ttft_ms_p50": 32.52,
      "ttft_ms_p90": 44.71,
      "ttft_ms_p99": 61.84,
      "tpot_ms_p50": 12.93,
      "tpot_ms_p90": 12.98,
      "tpot_ms_p99": 13.03,
      "peak_memory_gb": null,
      "elapsed_seconds_median": 381.4
    },
    "sustained": {
      "sustained_concurrency": 8,
      "duration_minutes": 30,
      "warmup_minutes": 2,
      "sample_interval_seconds": 60,
      "samples": [
        {
          "minute": 1.0,
          "is_warmup": true,
          "throughput_tokens_per_sec": 477.0,
          "tokens_out": 28638,
          "tokens_in": 0,
          "requests_completed": 154,
          "ttft_ms_p50": 49.6,
          "ttft_ms_p99": 6750.3
        },
        {
          "minute": 2.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 566.2,
          "tokens_out": 33972,
          "tokens_in": 0,
          "requests_completed": 180,
          "ttft_ms_p50": 44.6,
          "ttft_ms_p99": 61.2
        },
        {
          "minute": 3.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 560.8,
          "tokens_out": 33639,
          "tokens_in": 0,
          "requests_completed": 177,
          "ttft_ms_p50": 44.5,
          "ttft_ms_p99": 61.3
        },
        {
          "minute": 4.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 565.3,
          "tokens_out": 33929,
          "tokens_in": 0,
          "requests_completed": 181,
          "ttft_ms_p50": 44.3,
          "ttft_ms_p99": 60.8
        },
        {
          "minute": 5.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 561.4,
          "tokens_out": 33685,
          "tokens_in": 0,
          "requests_completed": 179,
          "ttft_ms_p50": 44.2,
          "ttft_ms_p99": 61.6
        },
        {
          "minute": 6.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 561.9,
          "tokens_out": 33707,
          "tokens_in": 0,
          "requests_completed": 180,
          "ttft_ms_p50": 44.3,
          "ttft_ms_p99": 60.8
        },
        {
          "minute": 7.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 570.0,
          "tokens_out": 34190,
          "tokens_in": 0,
          "requests_completed": 179,
          "ttft_ms_p50": 44.3,
          "ttft_ms_p99": 61.4
        },
        {
          "minute": 8.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 558.3,
          "tokens_out": 33498,
          "tokens_in": 0,
          "requests_completed": 177,
          "ttft_ms_p50": 44.6,
          "ttft_ms_p99": 62.6
        },
        {
          "minute": 9.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 563.3,
          "tokens_out": 33801,
          "tokens_in": 0,
          "requests_completed": 180,
          "ttft_ms_p50": 44.3,
          "ttft_ms_p99": 61.8
        },
        {
          "minute": 10.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 552.7,
          "tokens_out": 33163,
          "tokens_in": 0,
          "requests_completed": 176,
          "ttft_ms_p50": 44.5,
          "ttft_ms_p99": 52.4
        },
        {
          "minute": 11.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 569.3,
          "tokens_out": 34157,
          "tokens_in": 0,
          "requests_completed": 181,
          "ttft_ms_p50": 44.1,
          "ttft_ms_p99": 60.4
        },
        {
          "minute": 12.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 558.9,
          "tokens_out": 33526,
          "tokens_in": 0,
          "requests_completed": 177,
          "ttft_ms_p50": 44.2,
          "ttft_ms_p99": 47.9
        },
        {
          "minute": 13.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 568.4,
          "tokens_out": 34113,
          "tokens_in": 0,
          "requests_completed": 180,
          "ttft_ms_p50": 44.5,
          "ttft_ms_p99": 139.2
        },
        {
          "minute": 14.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 557.2,
          "tokens_out": 33424,
          "tokens_in": 0,
          "requests_completed": 178,
          "ttft_ms_p50": 44.5,
          "ttft_ms_p99": 61.5
        },
        {
          "minute": 15.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 565.5,
          "tokens_out": 33942,
          "tokens_in": 0,
          "requests_completed": 181,
          "ttft_ms_p50": 44.3,
          "ttft_ms_p99": 61.7
        },
        {
          "minute": 16.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 554.1,
          "tokens_out": 33238,
          "tokens_in": 0,
          "requests_completed": 175,
          "ttft_ms_p50": 44.3,
          "ttft_ms_p99": 61.2
        },
        {
          "minute": 17.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 563.7,
          "tokens_out": 33832,
          "tokens_in": 0,
          "requests_completed": 178,
          "ttft_ms_p50": 44.4,
          "ttft_ms_p99": 61.7
        },
        {
          "minute": 18.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 563.3,
          "tokens_out": 33783,
          "tokens_in": 0,
          "requests_completed": 180,
          "ttft_ms_p50": 44.5,
          "ttft_ms_p99": 62.1
        },
        {
          "minute": 19.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 565.0,
          "tokens_out": 33912,
          "tokens_in": 0,
          "requests_completed": 180,
          "ttft_ms_p50": 44.6,
          "ttft_ms_p99": 62.2
        },
        {
          "minute": 20.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 563.3,
          "tokens_out": 33773,
          "tokens_in": 0,
          "requests_completed": 179,
          "ttft_ms_p50": 44.6,
          "ttft_ms_p99": 61.6
        },
        {
          "minute": 21.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 564.7,
          "tokens_out": 33889,
          "tokens_in": 0,
          "requests_completed": 178,
          "ttft_ms_p50": 44.3,
          "ttft_ms_p99": 61.3
        },
        {
          "minute": 22.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 564.3,
          "tokens_out": 33853,
          "tokens_in": 0,
          "requests_completed": 179,
          "ttft_ms_p50": 44.6,
          "ttft_ms_p99": 61.1
        },
        {
          "minute": 23.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 562.3,
          "tokens_out": 33744,
          "tokens_in": 0,
          "requests_completed": 180,
          "ttft_ms_p50": 44.5,
          "ttft_ms_p99": 61.2
        },
        {
          "minute": 24.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 569.5,
          "tokens_out": 34180,
          "tokens_in": 0,
          "requests_completed": 180,
          "ttft_ms_p50": 44.3,
          "ttft_ms_p99": 61.3
        },
        {
          "minute": 25.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 550.8,
          "tokens_out": 33047,
          "tokens_in": 0,
          "requests_completed": 176,
          "ttft_ms_p50": 44.5,
          "ttft_ms_p99": 61.5
        },
        {
          "minute": 26.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 562.5,
          "tokens_out": 33749,
          "tokens_in": 0,
          "requests_completed": 178,
          "ttft_ms_p50": 44.5,
          "ttft_ms_p99": 63.9
        },
        {
          "minute": 27.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 561.7,
          "tokens_out": 33689,
          "tokens_in": 0,
          "requests_completed": 179,
          "ttft_ms_p50": 44.7,
          "ttft_ms_p99": 61.6
        },
        {
          "minute": 28.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 566.5,
          "tokens_out": 34010,
          "tokens_in": 0,
          "requests_completed": 181,
          "ttft_ms_p50": 44.7,
          "ttft_ms_p99": 141.7
        },
        {
          "minute": 29.0,
          "is_warmup": false,
          "throughput_tokens_per_sec": 558.2,
          "tokens_out": 33464,
          "tokens_in": 0,
          "requests_completed": 178,
          "ttft_ms_p50": 44.3,
          "ttft_ms_p99": 60.9
        }
      ],
      "sustained_throughput_tokens_per_sec": 562.5,
      "throttle_ratio": 0.966,
      "throttle_onset_minute": null,
      "ttft_p99_drift_ms": -0.3
    },
    "speculative": {
      "results_by_concurrency": [
        {
          "client_concurrency": 8,
          "throughput_tokens_per_sec": 705.16,
          "throughput_tokens_per_sec_per_chip": 705.16,
          "elapsed_seconds_median": 49.7,
          "peak_memory_gb": null,
          "power_watts_avg": null,
          "power_watts_peak": null,
          "oom": false,
          "_throughput_note": "output_only",
          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
        },
        {
          "client_concurrency": 32,
          "throughput_tokens_per_sec": 703.58,
          "throughput_tokens_per_sec_per_chip": 703.58,
          "elapsed_seconds_median": 49.8,
          "peak_memory_gb": null,
          "power_watts_avg": null,
          "power_watts_peak": null,
          "oom": false,
          "_throughput_note": "output_only",
          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
        },
        {
          "client_concurrency": 128,
          "throughput_tokens_per_sec": 704.29,
          "throughput_tokens_per_sec_per_chip": 704.29,
          "elapsed_seconds_median": 49.7,
          "peak_memory_gb": null,
          "power_watts_avg": null,
          "power_watts_peak": null,
          "oom": false,
          "_throughput_note": "output_only",
          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
        }
      ]
    },
    "burst": {
      "sla_ttft_ms": 500,
      "burst_steady_qps": 5,
      "burst_peak_qps": 25,
      "burst_duration_seconds": 30,
      "burst_interval_seconds": 120,
      "steady_requests_total": 1812,
      "burst_requests_total": 2245,
      "steady_ttft_p50_ms": 43.06,
      "steady_ttft_p99_ms": 3985.36,
      "burst_ttft_p50_ms": 57.82,
      "burst_ttft_p99_ms": 99.11,
      "sla_met_during_burst": true,
      "burst_degradation_ratio": 0.025,
      "results_by_cycle": [
        {
          "cycle": 1,
          "steady_requests": 581,
          "burst_requests": 760,
          "steady_ttft_p99_ms": 5093.98,
          "burst_ttft_p99_ms": 103.16
        },
        {
          "cycle": 2,
          "steady_requests": 595,
          "burst_requests": 734,
          "steady_ttft_p99_ms": 63.56,
          "burst_ttft_p99_ms": 90.45
        },
        {
          "cycle": 3,
          "steady_requests": 636,
          "burst_requests": 751,
          "steady_ttft_p99_ms": 65.44,
          "burst_ttft_p99_ms": 85.05
        }
      ]
    }
  },
  "accuracy": {
    "subset_score": 0.61,
    "baseline_delta": 0.01,
    "valid": true,
    "framework": "SGLang",
    "precision": "BF16",
    "notes": "Integrated accuracy check \u2014 used same SGLang instance as benchmark."
  },
  "meta": {
    "submitted_by": "Gong-K",
    "submission_type": "individual",
    "date": "2026-05-06",
    "time": "11:21:34",
    "run_id": "958afbbd",
    "run_name": "nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd",
    "flagged": null,
    "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py",
    "env_info_file": "../env_info.json",
    "log_file": "run.log",
    "samples_file": "samples.jsonl",
    "notes": null,
    "benchmark_start_time": "2026-05-06T11:19:15.947406+00:00",
    "benchmark_end_time": "2026-05-06T11:21:34.758403+00:00",
    "benchmark_elapsed_minutes": 74.2,
    "model_load_seconds": 50.0,
    "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive', 'sustained', 'speculative', 'burst'] scenarios.",
    "scenario_dirs": {
      "offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/offline",
      "online": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/online",
      "interactive": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/interactive",
      "sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/sustained",
      "speculative": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/speculative",
      "burst": "results/community/nvidia_a100_sxm4_40gbx1_suite_A_nvidia_sglang_c43a8309_958afbbd/burst"
    }
  }
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    community-submissionResult submitted via OpenClaw AccelMark Skill

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions