In [None]:
import json
from typing import Any, Dict
from matplotlib import pyplot
from matplotlib.axes._axes import Axes  # For typing purposes
import numpy as np
import pyperf
from scipy import stats

In [None]:
def get_data(result: Dict[str, Any]) -> Dict[str, np.ndarray]:
    """Collect the measured values of every benchmark in a parsed result file.

    Args:
        result: Parsed JSON document. It must contain a "benchmarks" list whose
            entries each carry a "runs" list; a run may hold a "values" list.

    Returns:
        Mapping of benchmark name to a float64 array holding all values of
        that benchmark, sorted in ascending order.

    Raises:
        KeyError: If "benchmarks" or a benchmark's "runs" is missing, or if a
            name is found neither on the benchmark nor at the top level.
    """
    results = {}

    for benchmark in result["benchmarks"]:
        # Prefer the benchmark's own name. Fall back to the top-level metadata
        # both when the benchmark has no metadata at all and when its metadata
        # simply lacks a "name" entry.
        bench_meta = benchmark.get("metadata", {})
        if "name" in bench_meta:
            name = bench_meta["name"]
        else:
            name = result["metadata"]["name"]

        # Runs without a "values" entry contribute nothing.
        values = [value for run in benchmark["runs"] for value in run.get("values", [])]
        results[name] = np.sort(np.array(values, dtype=np.float64))

    return results

In [None]:
# Pick the benchmark run and the two result files inside it to compare
run = "run6"

file1 = "python3-11-3-low-01"
file2 = "python3-11-3-low-02"

# Result files are stored as results/<run>/bm-<file>.json
file1_path = f"results/{run}/bm-{file1}.json"
file2_path = f"results/{run}/bm-{file2}.json"

# Both files are opened before either one is parsed
with open(file1_path) as first_handle:
    with open(file2_path) as second_handle:
        run1 = get_data(json.load(first_handle))
        run2 = get_data(json.load(second_handle))

In [None]:
# Make sure that both result files cover exactly the same benchmarks.
# The comparison is on benchmark names (the dict keys), so report the names
# that appear in only one of the two files.
if run1.keys() != run2.keys():
    mismatched = sorted(run1.keys() ^ run2.keys(), key=str)
    raise ValueError(
        f"The benchmarking suites do not contain the same benchmarks; found in only one of them: {mismatched}"
    )

In [None]:
# Mix data: each run keeps its even-indexed values and takes the odd-indexed
# values of the other run; both results are re-sorted in ascending order.
for name, first in list(run1.items()):
    second = run2[name]

    run1[name] = np.sort(np.concatenate((first[::2], second[1::2])))
    run2[name] = np.sort(np.concatenate((second[::2], first[1::2])))

In [None]:
# Naive outlier removal: the arrays are sorted in ascending order, so keeping
# only the leading values drops the largest measurements.
max_kept = 45

for name in run1.keys():
    # Clamp to the shorter array so both runs keep the same number of values;
    # the element-wise subtraction of the two runs needs equal lengths.
    kept = min(max_kept, len(run1[name]), len(run2[name]))
    run1[name] = run1[name][:kept]
    run2[name] = run2[name][:kept]

In [None]:
# Element-wise difference (run1 minus run2) for every benchmark
diffs = {name: values - run2[name] for name, values in run1.items()}

In [None]:
def generate_dist_plot(ax: Axes, data: np.ndarray) -> None:
    """Draw ``data`` on ``ax`` as a normalised, semi-transparent histogram.

    The x-axis tick labels get an "s" suffix.
    """

    def _with_unit(val, _pos):
        # Width 4, one significant digit, followed by the unit suffix.
        return f"{val:4.1g}s"

    # density=True rescales the bars so that the histogram integrates to 1.
    hist_options = {"alpha": 0.5, "density": True, "bins": 20}
    ax.hist(data, **hist_options)
    ax.xaxis.set_major_formatter(_with_unit)

In [None]:
def generate_table(ax: Axes, data: np.ndarray, t_score: float) -> None:
    """Draw a one-column table of summary statistics onto ``ax``.

    Args:
        ax: Axes that receives the table; its axis decorations are switched off.
        data: Values to summarise (minimum, maximum, mean, variance).
        t_score: T-score shown in the table. The "Significant" row is true
            when its absolute value is at least 2.

    Returns:
        None. The table is added to ``ax`` as a side effect.
    """
    r_labels = [
        "Minimum",
        "Maximum",
        "Mean",
        "Variance",
        "T-Score",
        "Significant",
    ]

    # `>= 2` matches the threshold used when the significant benchmarks are
    # counted for the figure title.
    significant = abs(t_score) >= 2
    numeric_rows = [data.min(), data.max(), data.mean(), data.var(), t_score]

    # "4.2g" pads to a width of 4 and keeps 2 significant digits. The boolean
    # is rendered separately: pushed through the same numeric format it would
    # show up as "1"/"0" instead of "True"/"False".
    cell_text = [[f"{value:4.2g}"] for value in numeric_rows]
    cell_text.append([str(significant)])

    table = ax.table(cellText=cell_text, loc='center', cellLoc='center', rowLabels=r_labels, colWidths=[.4])
    table.set_fontsize(12)

    ax.axis("off")

In [None]:
debug = False
debug_rows = 3  # Number of figure rows (benchmarks) drawn when `debug` is on

# Paired t-test per benchmark, then order by |t-score| so the largest
# differences are plotted first.
pairs = []
significant = 0
for name in run1.keys():
    t_score, _p_val = stats.ttest_rel(run1[name], run2[name])
    pairs.append((name, t_score))
    if abs(t_score) >= 2:
        significant += 1
pairs = sorted(pairs, key=lambda x: abs(x[1]), reverse=True)

n_rows = debug_rows if debug else len(run1)

# squeeze=False keeps `axs` two-dimensional even when there is a single row;
# without it a one-benchmark comparison yields a flat array of axes and the
# per-row unpacking below would fail.
fig, axs = pyplot.subplots(
    n_rows,
    2,
    figsize=(12, 3 * n_rows),
    layout="constrained",
    width_ratios=[.6, .3],
    squeeze=False,
)

# One row per benchmark: distribution of the differences on the left, summary
# table on the right. zip() stops at the shorter input, so debug mode draws at
# most `debug_rows` benchmarks.
for (dist_ax, table_ax), (name, t_score) in zip(axs, pairs):
    dist_ax.set_title(name)
    generate_dist_plot(dist_ax, diffs[name])
    generate_table(table_ax, diffs[name], t_score)

title = f"""Comparison of: {file1} and {file2}
    Significant: {significant}, out of: {len(run1)}
"""
fig.suptitle(title, fontsize=16)
pyplot.show()