From 305cf0f503edc4b7c2778809fdf897cc208bd7a4 Mon Sep 17 00:00:00 2001
From: Sergey Chernov
Date: Mon, 11 May 2026 18:21:12 -0700
Subject: [PATCH 1/3] fix compare result

---
 .github/workflows/benchmarks.yml | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 64f513d97..95d973f33 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -121,6 +121,16 @@
         with:
           ref: ${{ env.CHC_BRANCH }}
 
+      # The benchmark code runs against the PR's working tree (see the
+      # PR checkout below), but the comparison tooling lives in this
+      # workflow's checkout of `main`. Stash a copy now so the compare
+      # step still works even when the PR branch was forked before
+      # `.github/scripts/compare-jmh.py` existed.
+      - name: Stash comparison tooling from main
+        run: |
+          mkdir -p "$RUNNER_TEMP/jmh-tools"
+          cp -v .github/scripts/compare-jmh.py "$RUNNER_TEMP/jmh-tools/"
+
       - name: Check out PR
         if: steps.pr.outputs.number != ''
         run: |
@@ -196,7 +206,7 @@
         if: steps.pr.outputs.number != '' && steps.baseline.outputs.found == 'true'
         continue-on-error: true
         run: |
-          python3 .github/scripts/compare-jmh.py \
+          python3 "$RUNNER_TEMP/jmh-tools/compare-jmh.py" \
             --baseline baseline-results \
             --current performance \
             --baseline-run-id "${{ steps.baseline.outputs.run_id }}" \

From 940e83747759943a8f3c0d02efefb5f4bb21086c Mon Sep 17 00:00:00 2001
From: Sergey Chernov
Date: Mon, 11 May 2026 20:15:22 -0700
Subject: [PATCH 2/3] Changed PR report

---
 .github/scripts/compare-jmh.py | 99 +++++++++++++++++++++++++++++-----
 1 file changed, 87 insertions(+), 12 deletions(-)

diff --git a/.github/scripts/compare-jmh.py b/.github/scripts/compare-jmh.py
index 39c1a5d0f..f174e2211 100644
--- a/.github/scripts/compare-jmh.py
+++ b/.github/scripts/compare-jmh.py
@@ -30,6 +30,7 @@
 import json
 import os
 import sys
+from collections import defaultdict
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional, Tuple
 
@@ -247,6 +248,35 @@ def build_rows(
 # ---------------------------------------------------------------------------
 
 
+def compute_discriminators(
+    rows: List[Row], current: Dict[Key, Dict[str, Any]]
+) -> Dict[Key, str]:
+    """For each row, return a short string of only the params that differ
+    between rows sharing the same fully-qualified benchmark name.
+
+    When a benchmark is run with only one combination of params, its
+    discriminator is the empty string (no need to disambiguate).
+    """
+    by_bench: Dict[str, List[Row]] = defaultdict(list)
+    for r in rows:
+        bench, _ = r.key
+        by_bench[bench].append(r)
+
+    out: Dict[Key, str] = {}
+    for bench, group in by_bench.items():
+        if len(group) <= 1:
+            out[group[0].key] = ""
+            continue
+        params_dicts = [(current.get(r.key) or {}).get("params") or {} for r in group]
+        all_keys = sorted({k for p in params_dicts for k in p.keys()})
+        varying = [
+            k for k in all_keys if len({p.get(k) for p in params_dicts}) > 1
+        ]
+        for r, p in zip(group, params_dicts):
+            out[r.key] = ", ".join(f"{k}={p[k]}" for k in varying if k in p)
+    return out
+
+
 def build_markdown(
     rows: List[Row],
     only_current: List[Key],
@@ -266,14 +296,19 @@
         if not any(d.regression(threshold) for d in r.deltas)
         and any(d.improvement(threshold) for d in r.deltas)
     )
+    discriminators = compute_discriminators(rows, current)
 
     out: List[str] = [""]
     if regressions:
-        out.append(f"## ❌ JMH benchmark comparison — {regressions} regression(s) over {threshold:g}%")
+        out.append(
+            f"## ❌ JMH benchmark comparison — {regressions} regression(s) over {threshold:g}%"
+        )
     elif improvements:
-        out.append(f"## ✅ JMH benchmark comparison — {improvements} improvement(s) over {threshold:g}%")
+        out.append(
+            f"## ✅ JMH benchmark comparison — no regressions, {improvements} improvement(s) over {threshold:g}%"
+        )
     else:
-        out.append(f"## JMH benchmark comparison — no changes over {threshold:g}%")
+        out.append(f"## ✅ JMH benchmark comparison — no changes over {threshold:g}%")
     out.append("")
 
     if repo and baseline_run_id and current_run_id:
@@ -285,15 +320,56 @@
         )
         out.append("")
 
-    out.append(
-        f"Threshold: **±{threshold:g}%**. "
-        f"Metrics: **Time** (`primaryMetric.score`, `SampleTime` — proxy for CPU work) and "
-        f"**Alloc/op** (`{ALLOC_NORM_KEY}`, GC allocations per op — memory pressure). "
-        "Both are lower-is-better, so a positive Δ% means the PR is worse than baseline."
-    )
-    out.append("")
+    # ---- brief per-benchmark summary --------------------------------------
+    if rows:
+        # Stable order: regressions first, then improvements, then noise.
+        # Within each bucket, biggest |Δ| first.
+        def bucket(r: Row) -> int:
+            if any(d.regression(threshold) for d in r.deltas):
+                return 0
+            if any(d.improvement(threshold) for d in r.deltas):
+                return 1
+            return 2
+
+        rows = sorted(rows, key=lambda r: (bucket(r), -r.sort_key()))
+        for r in rows:
+            bench, _ = r.key
+            disc = discriminators.get(r.key, "")
+            b = bucket(r)
+            icon = "❌" if b == 0 else "✅"
+
+            # In the brief view, only mention metrics that actually crossed
+            # the threshold — keeps noisy rows to a single line.
+            bits: List[str] = []
+            for d in r.deltas:
+                if d.delta_pct is None:
+                    continue
+                if d.regression(threshold):
+                    bits.append(f"**{d.metric.label} {fmt_delta(d.delta_pct)}**")
+                elif d.improvement(threshold):
+                    bits.append(f"{d.metric.label} {fmt_delta(d.delta_pct)}")
+
+            line = f"- {icon} `{short_bench(bench)}`"
+            if disc:
+                line += f" `[{disc}]`"
+            if bits:
+                line += " — " + ", ".join(bits)
+            out.append(line)
+        out.append("")
+    else:
+        out.append("_No benchmarks matched between baseline and PR._")
+        out.append("")
+
+    # ---- detailed table (collapsed) ---------------------------------------
 
     if rows:
+        out.append(
+            f"<details><summary>Detailed metrics "
+            f"(threshold ±{threshold:g}%, Time = primaryMetric.score from SampleTime, "
+            f"Alloc/op = {ALLOC_NORM_KEY}; positive Δ% = worse than baseline)"
+            "</summary>"
+        )
+        out.append("")
         header = "| Benchmark | Params | " + " | ".join(m.label for m in METRICS) + " | Status |"
         sep = "|---|---|" + "|".join(["---"] * len(METRICS)) + "|---|"
         out.append(header)
@@ -305,8 +381,7 @@ def build_markdown(
             f"| `{short_bench(bench)}` | {params or '—'} | {cells} | {r.status(threshold)} |"
         )
         out.append("")
-    else:
-        out.append("_No benchmarks matched between baseline and PR._")
") out.append("") if only_current: From c0bd3816243983e79775bd6d5836765a3a5d009c Mon Sep 17 00:00:00 2001 From: Sergey Chernov Date: Mon, 11 May 2026 21:33:02 -0700 Subject: [PATCH 3/3] Fixed diff render --- .github/scripts/compare-jmh.py | 62 ++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 11 deletions(-) diff --git a/.github/scripts/compare-jmh.py b/.github/scripts/compare-jmh.py index f174e2211..e0a6bd17d 100644 --- a/.github/scripts/compare-jmh.py +++ b/.github/scripts/compare-jmh.py @@ -11,9 +11,11 @@ mean sampled latency per op; it's our best available proxy for CPU work since no dedicated CPU profiler is configured in `BenchmarkRunner`. -* `Alloc/op` — `secondaryMetrics["·gc.alloc.rate.norm"]`, populated by +* `Alloc/op` — `secondaryMetrics["gc.alloc.rate.norm"]`, populated by JMH's `GCProfiler`. This is bytes allocated per benchmark op and is - the standard, low-noise JMH memory metric. + the standard, low-noise JMH memory metric. (Note: JMH 1.37 stores the + key with no prefix; some visualizer tools display it as + `·gc.alloc.rate.norm` but the raw JSON does not contain that dot.) Both metrics are "lower is better", so a positive delta indicates the PR is worse than the baseline. A run is considered failed when **any** @@ -42,9 +44,25 @@ # --------------------------------------------------------------------------- # JMH's `GCProfiler` reports allocation rate normalised per op under this -# secondary metric key (the leading char is U+00B7 MIDDLE DOT, not a regular -# dot — that's JMH's convention for profiler-emitted metrics). -ALLOC_NORM_KEY = "\u00b7gc.alloc.rate.norm" +# secondary metric key. Verified against jmh-core 1.37 sources: the key +# is the literal `gc.alloc.rate.norm` with no prefix. Some visualizers +# render it as `·gc.alloc.rate.norm`, which is a display convention, not +# what JMH writes to JSON. +ALLOC_NORM_KEY = "gc.alloc.rate.norm" + +# Tolerated prefix-bearing variants we'll fall back to if the canonical +# key ever moves. Lets a future JMH (or a non-Oracle JVM profiler that +# decorates the label) keep working without us being aware. +ALLOC_NORM_VARIANTS = ( + ALLOC_NORM_KEY, + "\u00b7gc.alloc.rate.norm", # pre-emptive middle-dot variant + "+gc.alloc.rate.norm", +) + +# Set to True the first time we fail to find any allocation data; used +# to emit a one-time diagnostic listing the secondary keys present so +# the cause is obvious in workflow logs. +_alloc_warn_printed = False @dataclass(frozen=True) @@ -70,13 +88,35 @@ def _secondary(record: Dict[str, Any], key: str) -> Optional[Dict[str, Any]]: return val if isinstance(val, dict) else None +def _alloc(record: Dict[str, Any]) -> Optional[Dict[str, Any]]: + """Pull the GC allocation-per-op metric, tolerating prefix variants. + + Falls back to a suffix match so even an unrecognised prefix (e.g. a + third-party JMH profiler that decorates the label) still works.""" + global _alloc_warn_printed + sm = record.get("secondaryMetrics") or {} + for k in ALLOC_NORM_VARIANTS: + v = sm.get(k) + if isinstance(v, dict): + return v + # Last-resort suffix match. + for k, v in sm.items(): + if isinstance(v, dict) and k.lower().endswith("gc.alloc.rate.norm"): + return v + if sm and not _alloc_warn_printed: + print( + "warn: no `gc.alloc.rate.norm` (or prefix-variant) found in " + "secondaryMetrics. 
+            "secondaryMetrics. Available keys: "
+            + ", ".join(sorted(sm.keys())),
+            file=sys.stderr,
+        )
+        _alloc_warn_printed = True
+    return None
+
+
 METRICS: List[Metric] = [
     Metric(id="time", label="Time", extract=_primary),
-    Metric(
-        id="alloc",
-        label="Alloc/op",
-        extract=lambda r: _secondary(r, ALLOC_NORM_KEY),
-    ),
+    Metric(id="alloc", label="Alloc/op", extract=_alloc),
 ]
 
 
@@ -391,7 +431,7 @@ def bucket(r: Row) -> int:
             bench, params = k
             rec = current[k]
             time_d = _primary(rec) or {}
-            alloc_d = _secondary(rec, ALLOC_NORM_KEY) or {}
+            alloc_d = _alloc(rec) or {}
             out.append(
                 f"- `{short_bench(bench)}` ({params or '—'}): "
                 f"time={fmt_score(_float(time_d, 'score'), _float(time_d, 'scoreError'), time_d.get('scoreUnit', ''))}, "