In [None]:
import json
from typing import Any, Dict
from matplotlib import pyplot
from matplotlib.axes._axes import Axes  # For typing purposes
import numpy as np
import pyperf
from scipy import stats

In [None]:
def get_data(result: Dict[str, Any]) -> Dict[str, np.ndarray]:
    """Collect the timing values of every benchmark in a parsed result file.

    Args:
        result: Decoded JSON document. It must contain a ``"benchmarks"`` list
            whose entries each hold a ``"runs"`` list. Runs without a
            ``"values"`` entry contribute nothing.

    Returns:
        Mapping from benchmark name to a ``float64`` array holding the values
        of all runs of that benchmark, sorted in ascending order.

    Raises:
        KeyError: If a benchmark name is found neither in the benchmark's own
            ``"metadata"`` nor in the top-level ``"metadata"``.
    """
    results = {}

    for benchmark in result["benchmarks"]:
        # Prefer the benchmark's own name. Fall back to the top-level metadata
        # both when the benchmark has no metadata at all and when its metadata
        # exists but carries no "name" entry.
        metadata = benchmark.get("metadata", {})
        if "name" in metadata:
            name = metadata["name"]
        else:
            name = result["metadata"]["name"]

        values = [value for run in benchmark["runs"] for value in run.get("values", [])]
        results[name] = np.sort(np.array(values, dtype=np.float64))

    return results

In [None]:
# Load data
# Both result files are read from results/<run>/bm-<file>.json.
run = "run6"

file1 = "python3-11-3-low-01"
file2 = "python3-11-3-low-02"

file1_path = f"results/{run}/bm-{file1}.json"
file2_path = f"results/{run}/bm-{file2}.json"

# JSON text is UTF-8; passing the encoding explicitly keeps the load
# independent of the platform's default locale encoding.
with open(file1_path, encoding="utf-8") as fb, open(file2_path, encoding="utf-8") as fh:
    run1 = get_data(json.load(fb))
    run2 = get_data(json.load(fh))

In [None]:
# Make sure that benchmarks match
if run1.keys() != run2.keys():
    # The comparison is on benchmark names, not on the number of benchmarks,
    # so report the names that appear in only one of the two runs.
    only_in_run1 = sorted(run1.keys() - run2.keys())
    only_in_run2 = sorted(run2.keys() - run1.keys())
    raise ValueError(
        "The benchmarking suites do not contain the same benchmarks "
        f"(only in run 1: {only_in_run1}; only in run 2: {only_in_run2})"
    )

In [None]:
# Naive outlier removal: keep at most the first 45 values of every benchmark.
# get_data returns the values sorted in ascending order, so on freshly loaded
# data this trims from the upper end.
keep = slice(45)
for name in run1:
    run1[name] = run1[name][keep]
    run2[name] = run2[name][keep]

In [None]:
# Original outlier removal
def remove_outliers(values: np.ndarray, m: float = 2) -> np.ndarray:
    """Drop values lying ``m`` or more standard deviations from the mean.

    Args:
        values: Samples to filter.
        m: Width of the accepted band around the mean, in standard deviations.

    Returns:
        A new array with the values whose absolute distance to the mean is
        strictly below ``m * std``. An empty input, or one without any spread
        (standard deviation of exactly zero), is returned as an unfiltered copy.
    """
    if values.size == 0:
        return values.copy()

    spread = np.std(values)
    if spread == 0:
        # Every distance is 0 and the threshold is 0 as well, so the strict
        # "<" below would reject all values and wipe out the whole series.
        return values.copy()

    return values[np.abs(values - np.mean(values)) < m * spread]

def _evenly_spaced_subset(values, size):
    """Pick ``size`` entries of ``values`` at evenly spaced, rounded positions."""
    positions = np.linspace(0, len(values) - 1, size)
    return values[np.round(positions).astype(int)]


for name in run1.keys():
    first = run1[name] = remove_outliers(run1[name])
    second = run2[name] = remove_outliers(run2[name])

    # Give both runs the same number of values by thinning out the longer one.
    shortest = min(len(first), len(second))
    if len(first) > shortest:
        run1[name] = _evenly_spaced_subset(first, shortest)
    elif len(second) > shortest:
        run2[name] = _evenly_spaced_subset(second, shortest)

In [None]:
def generate_dist_plot(ax: Axes, base: np.ndarray, head: np.ndarray) -> None:
    """Draw both runs as histograms with a fitted normal curve on top of each.

    Args:
        ax: Axes that receives the histograms, the curves and the legend.
        base: Values shown as "Run 1".
        head: Values shown as "Run 2".
    """
    series = (
        (base, 'Run 1', 'Run 1 (Normal Dist.)', "blue"),
        (head, 'Run 2', 'Run 2 (Normal Dist.)', "orange"),
    )

    # density=True normalises each histogram so that it integrates to 1,
    # which puts it on the same scale as the probability densities below.
    for samples, hist_label, _, _ in series:
        ax.hist(samples, alpha=0.5, label=hist_label, density=True)

    # Tick labels: general format, width 4, one significant digit, "s" suffix.
    ax.xaxis.set_major_formatter(lambda val, _: f"{val:4.1g}s")

    # Normal density using each run's mean and standard deviation, evaluated
    # at 100 points spanning three standard deviations on either side.
    for samples, _, curve_label, colour in series:
        centre = samples.mean()
        spread = samples.std()
        grid = np.linspace(centre - 3 * spread, centre + 3 * spread, 100)
        density = stats.norm.pdf(grid, centre, spread)
        ax.plot(grid, density, label=curve_label, color=colour)

    ax.legend()


In [None]:
def add_cell(table, row: int, name: str, val: str) -> None:
    """Append a labelled row whose single value spans both data columns.

    Args:
        table: Table to extend. It must already contain a cell at ``(0, 0)``;
            the size of that cell is reused for the new cells.
        row: Row index of the new cells.
        name: Text of the label cell, placed in column ``-1``.
        val: Text of the value cell. Values that are not strings are
            converted with ``str``.
    """
    # Use the public accessors rather than the private _cells/_height/_width.
    reference = table[0, 0]
    c_height = reference.get_height()
    c_width = reference.get_width()

    # I think matplotlib enforces the same number of cells in each row.
    # Therefore two cells are used and the border between them is hidden, so
    # that they read as one wide cell.
    table.add_cell(row, -1, text=name, loc='left', width=c_width, height=c_height)

    value_cell = table.add_cell(row, 0, text=str(val), loc='center', width=c_width, height=c_height)
    value_cell.visible_edges = 'BTL'

    filler_cell = table.add_cell(row, 1, text='', width=c_width, height=c_height)
    filler_cell.visible_edges = 'BRT'

In [None]:
def generate_table(ax: Axes, base: np.ndarray, head: np.ndarray) -> None:
    """Render summary statistics and t-test results of two runs as a table.

    The upper part of the table has one column per run (mean, variance,
    skewness, coefficient of variation, minimum, maximum). Below it, rows
    added with ``add_cell`` hold values that describe both runs together.

    Args:
        ax: Axes that receives the table; its axis decoration is switched off.
        base: Values of run 1.
        head: Values of run 2.
    """
    c_labels = ["Run 1", "Run 2"]
    r_labels = [
        "Mean",
        "Variance",
        "Skewness",
        "Variation",
        "Minimum",
        "Maximum",
    ]

    summary = [
        [base.mean(), head.mean()],  # Empirical mean
        [base.var(), head.var()],  # Variance
        [stats.skew(base), stats.skew(head)],  # Sample Skewness
        [stats.variation(base), stats.variation(head)],  # Coefficient of variation
        [base.min(), head.min()],  # Minimum
        [base.max(), head.max()],  # Maximum
    ]
    # General format, minimum width 4, one significant digit.
    cell_text = [[f"{b:4.1g}", f"{h:4.1g}"] for b, h in summary]

    table = ax.table(cellText=cell_text, loc='center', cellLoc='center', rowLabels=r_labels, colLabels=c_labels, colWidths=[.3, .3])
    table.set_fontsize(11)

    _, p_val = stats.ttest_ind(base, head)  # P-value
    deg_freedom = len(base) + len(head) - 2
    # NOTE(review): pyperf._utils is a private pyperf module; the meaning of
    # these helpers is taken from their names -- confirm against pyperf.
    critical_value = pyperf._utils.tdist95conf_level(deg_freedom)
    t_score = pyperf._utils.tscore(base, head)
    is_significant, _ = pyperf._utils.is_significant(base, head)

    combined_rows = [
        ('P-value', f"{p_val:4.1g}"),
        ('Deg. of freedom', str(deg_freedom)),
        ('Critical value', str(critical_value)),
        ('T-Score', f"{t_score:4.1g}"),
        ('Significant', str(is_significant)),
    ]
    # Row 0 holds the column labels, so the first free row comes right after
    # the statistic rows; deriving it keeps both parts of the table in sync.
    first_free_row = len(cell_text) + 1
    for row, (label, text) in enumerate(combined_rows, start=first_free_row):
        add_cell(table, row, label, text)

    ax.axis("off")

In [None]:
# Replace every value by the magnitude of its natural logarithm.
# NOTE(review): the results overwrite the entries of run1/run2, so running
# this cell a second time transforms the already-transformed values.
for name in run1.keys():
    for runs in (run1, run2):
        runs[name] = np.absolute(np.log(runs[name]))

In [None]:
debug = False

# One figure row per benchmark; debug mode draws at most the first three
# benchmarks of the ranking computed below.
n_rows = min(3, len(run1)) if debug else len(run1)

# squeeze=False keeps `axs` two-dimensional even when there is a single row,
# so iterating over it row by row also works for a one-benchmark result file.
fig, axs = pyplot.subplots(
    n_rows,
    2,
    figsize=(13, 3 * n_rows),
    layout="constrained",
    width_ratios=[.7, .3],
    squeeze=False,
)

# Sort by t-score (largest absolute value first) and count the benchmarks
# that pyperf reports as significant.
pairs = []
significant = 0
for name in run1.keys():
    t_score = pyperf._utils.tscore(run1[name], run2[name])
    pairs.append((name, t_score))
    sig, _ = pyperf._utils.is_significant(run1[name], run2[name])
    if sig:
        significant += 1
pairs = sorted(pairs, key=lambda x: abs(x[1]), reverse=True)

# zip stops after n_rows entries, which limits debug mode to the top rows.
for (name, _), (dist_ax, table_ax) in zip(pairs, axs):
    dist_ax.set_title(name)
    generate_dist_plot(dist_ax, run1[name], run2[name])
    generate_table(table_ax, run1[name], run2[name])

title = f"""Comparision of: {file1} and {file2}
    Significant: {significant}, out of: {len(run1)}
"""
fig.suptitle(title, fontsize=16)
pyplot.show()

In [None]:
# P-value of every benchmark for the arrays as they currently are.
without_removal = {
    name: stats.ttest_ind(run1[name], run2[name])[1] for name in run1
}

# Keep only the values closer than two standard deviations to the mean of
# their own array. The filtered arrays replace the entries of run1/run2.
for name in run1:
    for runs in (run1, run2):
        samples = runs[name]
        distance = np.abs(samples - np.mean(samples))
        runs[name] = samples[distance < 2 * np.std(samples)]

# P-value of every benchmark after the filter above.
naive_removal = {
    name: stats.ttest_ind(run1[name], run2[name])[1] for name in run1
}

In [None]:
# List every benchmark whose p-value is larger after the removal step than
# before it, followed by the number of such benchmarks.
counter = 0

for name, p_before in without_removal.items():
    p_after = naive_removal[name]
    if not p_before < p_after:
        continue
    print(f"Name: {name}", p_before, p_after, sep="\n")
    counter += 1

print(f"Improved: {counter}", f"Out of {len(without_removal)}", sep="\n")