In [None]:
# Batch-run launcher. The whole cell is skipped once `running_all` exists in the
# namespace, which is the state the button handler below creates before re-running
# the notebook, so the button is only built on a manual (stand-alone) run.
if "running_all" not in globals():
    from ipywidgets import widgets
    # Names of every matrix the batch run iterates over, in execution order
    matrices = ["ex10", "msc04515", "s1rmq4m1", "Na5", "bcsstk18",
                "vibrobox", "cbuckle", "Pres_Poisson", "raefsky4", "vanbody",
                "ct20stif", "cant", "bcircuit", "apache1", "consph"]
    b = widgets.Button(description="Run over all matrices", button_style="success")
    # Output widget that captures everything printed/displayed by the batch run
    output = widgets.Output()

    display(b, output)

    # Click handler: `button` is the clicked widget (unused). Sets the module-level
    # `running_all` flag, then rebinds the module-level `matrix` to each entry of
    # `matrices` in turn and executes the notebook once per matrix.
    def run_over_all_matrices(button):
        global running_all
        global matrix
        running_all = True
        with output:
            # `matrix` is declared global above, so this loop variable is what the
            # executed notebook reads as the matrix to process
            for matrix in matrices:
                print(f"Running {matrix}...")
                # NOTE(review): presumably this is the notebook's own filename — confirm
                %run ./model_overhead_comparison.ipynb # will output at this cell rather than later
            print("Finished!")
    b.on_click(run_over_all_matrices)

In [None]:
# Without the batch flag in the namespace this is a stand-alone run, so pick the
# matrix by hand here; during a batch run `matrix` has already been bound.
if not ("running_all" in globals()):
    matrix = "bcsstk18"  # edit to analyse a different single matrix
matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
import os
from joblib import load
from json import load as json_load
from glob import glob
sys.path.append(os.path.join(os.getcwd(), os.pardir))
from io_utils import load_matrices_from_dir

In [None]:
def get_test_data_size(name):
    """Return the size suffix used in the results CSV filename for matrix `name`.

    "vanbody" and "cant" use 100; every other name uses 1000.
    """
    return 100 if name in ("vanbody", "cant") else 1000


# Per-matrix (lower, upper) limits applied to the y-axis of the overhead plot.
# Values are overhead fractions (the plot's tick formatter multiplies them by 100).
ylims = dict(
    apache1=(0, 0.25),
    bcircuit=(0, 0.8),
    bcsstk18=(0.2, 1.5),
    cant=(0.6, 1.3),
    cbuckle=(0, 1.0),
    consph=(0, 0.02),
    ct20stif=(0, 1.5),
    ex10=(0.4, 1.5),
    msc04515=(0, 0.4),
    Na5=(0, 0.6),
    Pres_Poisson=(0, 0.5),
    raefsky4=(0, 0.2),
    s1rmq4m1=(0, 0.6),
    vanbody=(0.9, 1.05),
    vibrobox=(0.2, 1.2),
)

# Read the results CSV for the selected matrix; the filename carries the data size
csv_path = f"data/{matrix}_{get_test_data_size(matrix)}.csv"
df = pd.read_csv(csv_path)

# These two columns hold the same value on every row, so row 0 is representative
errorfree_iterations = df.loc[0, "errorfree_iterations"]
n_rows = df.loc[0, "n_rows"]
df.head()

In [None]:
# Legend label for each estimator class name found at the end of a loaded model
real_names = {
    "Ridge": "Polynomial Regression",
    "RandomForestRegressor": "Random Forest",
    "KNeighborsRegressor": "K-Nearest Neighbors",
    "XGBRegressor": "XGBoost",
    "LinearSVR": "Support Vector Machine"
}

# glob returns paths in arbitrary, filesystem-dependent order. Sorting makes the
# model order reproducible, and with it the legend order and the colour each model
# is given when the lines are plotted by position.
# NOTE(review): joblib's load unpickles arbitrary objects; only point this at
# model files you trust.
model_paths = sorted(glob(f"./models/{matrix}/*.pkl"))
models = [load(fn) for fn in model_paths]
# Each model is expected to expose `.steps`; the class of its last step's estimator
# selects the display name (an unknown class raises KeyError here).
model_names = [real_names[model.steps[-1][1].__class__.__name__] for model in models]
# Values of p that are swept: 1/98 followed by 0.02, 0.03, ... in steps of 0.01
ps = [1/98] + list(np.arange(0.02, 1.01, 0.01))

models

In [None]:
# Two-column feature array (error iteration, 2-norm of the position) fed to every model
X = df.loc[:, ["error_iter", "pos_2norm"]].to_numpy()

# Store each model's prediction for the recorded runs in its own prot_score_<name> column
for model, name in zip(models, model_names):
    df[f"prot_score_{name}"] = model.predict(X)

df.head()

In [None]:
# Load just the selected matrix; the loader returns a mapping, of which the first
# (and, with a one-element subset, presumably only) value is kept
mats = load_matrices_from_dir("../matrices/raw", subset=[matrix])
mat = [*mats.values()][0]

# JSON mapping from row position (as a string key) to that position's 2-norm
norms_path = f"../matrices/2norms/{matrix}_pos_2norms.json"
with open(norms_path) as f:
    pos_2norms = json_load(f)

mat

In [None]:
# One row per (matrix row, iteration) pair, in row-major order: every iteration of
# row 0 first, then every iteration of row 1, and so on. The columns are assembled
# with NumPy rather than a nested Python comprehension, which had to materialise
# mat.shape[0] * errorfree_iterations small lists before pandas could convert them.
n_matrix_rows = mat.shape[0]
# 2-norm of each row position, looked up once per row (keys in the JSON are strings)
row_2norms = np.array([pos_2norms[str(pos)] for pos in range(n_matrix_rows)])
df_preds = pd.DataFrame({
    "i": np.tile(np.arange(errorfree_iterations), n_matrix_rows),
    "2norm": np.repeat(row_2norms, errorfree_iterations),
    "rowid": np.repeat(np.arange(n_matrix_rows), errorfree_iterations),
})

In [None]:
# The feature columns do not change inside the loop, so select them once. They are
# passed as a plain array, matching how the same models are called when the
# prot_score_<name> columns are computed from `df`.
pred_features = df_preds[["i", "2norm"]].to_numpy()
for name, model in zip(model_names, models):
    # Model output for every (row, iteration) pair, stored per model
    df_preds[f"output_{name}"] = model.predict(pred_features)
df_preds.head()

In [None]:
# Values of p that are swept (same list as in the model-loading cell)
ps = [1/98] + list(np.arange(0.02, 1.01, 0.01))
percentages = np.arange(0.01, 1.0, 0.01)

n_error_runs = len(df)
# For each p, the number of non-error runs to add so that the recorded error runs
# make up a fraction p of the padded dataset (truncated to an integer)
nonerror_runs_by_p = {
    frac: int((n_error_runs / frac) - n_error_runs)
    for frac in ps
}
# The smallest p needs the most padding; size the padded arrays for that case
max_nonerror_runs = int((n_error_runs / min(ps)) - n_error_runs)

# Padded runs behave like error-free solves: baseline iteration count, slowdown of 1
solve_iterations = np.append(
    df["solve_iterations"],
    [errorfree_iterations] * max_nonerror_runs,
)
slowdowns = np.append(
    df["slowdown"],
    [1] * max_nonerror_runs,
)
# Operation count of an error-free solve: iterations times rows
errorfree_op_count = errorfree_iterations * n_rows

In [None]:
def compute_overheads(error_iterations, n_protections):
    """Return the relative operation overhead compared with an error-free solve.

    The operation count is `error_iterations * n_rows + n_protections`; the result
    is its excess over the module-level `errorfree_op_count`, divided by
    `errorfree_op_count`. `n_rows` is also read from module scope.
    """
    total_ops = error_iterations * n_rows + n_protections
    extra_ops = total_ops - errorfree_op_count
    return extra_ops / errorfree_op_count


def protect(error_iterations, protections, baseline_iterations=None):
    """Replace the iteration count of every protected run with the error-free count.

    Parameters
    ----------
    error_iterations : array-like
        Iteration count of each run.
    protections : array-like
        Truthy where the corresponding run was protected. Entries beyond
        ``len(error_iterations)`` are ignored.
    baseline_iterations : number, optional
        Iteration count assigned to protected runs. Defaults to the module-level
        ``errorfree_iterations``.

    Returns
    -------
    numpy.ndarray
        ``error_iterations`` with protected entries replaced by the baseline.

    Notes
    -----
    Uses ``np.where`` instead of ``np.vectorize``: without ``otypes``,
    ``np.vectorize`` takes the output dtype from the first element (which can
    truncate later float values) and raises on empty input.
    """
    if baseline_iterations is None:
        baseline_iterations = errorfree_iterations
    error_iterations = np.asarray(error_iterations)
    # Truthiness test mirrors `not protections[i]` for bool and numeric flags alike
    protected = np.asarray(protections)[:len(error_iterations)].astype(bool)
    return np.where(protected, baseline_iterations, error_iterations)


def make_p_overhead_dataframe(ohs_by_p):
    """Stack per-p overhead arrays into one long DataFrame.

    `ohs_by_p[k]` holds the overheads computed for the module-level `ps[k]`. The
    result has one row per overhead value, with columns "p" and "overhead" and a
    fresh 0..n-1 index.
    """
    frames = []
    for idx, overheads in enumerate(ohs_by_p):
        p_column = [ps[idx]] * len(overheads)
        frames.append(pd.DataFrame({"p": p_column, "overhead": overheads}))
    return pd.concat(frames, ignore_index=True)

In [None]:
# Long-format (p, overhead) DataFrame per model display name
dfs = {}

for name in model_names:
    score_col = f"prot_score_{name}"
    output_col = f"output_{name}"
    prot_overheads_by_p = []

    for p in ps:
        # A run/position counts as protected when the model output exceeds this value
        threshold = 1 + (1 / p)

        # The dataset is padded with non-error runs for this p
        n_nonerror_runs = nonerror_runs_by_p[p]
        data_size = len(df) + n_nonerror_runs

        # Padded non-error runs are always marked unprotected here. That only decides
        # between solve_iterations and errorfree_iterations (identical for those runs);
        # the protection count that enters the overhead is computed separately below.
        did_protect = df[score_col] > threshold
        protections = np.append(did_protect, [False] * n_nonerror_runs)
        prot_iterations = protect(solve_iterations[:data_size], protections)

        # Number of (row, iteration) positions the model would protect at this p
        n_protections = (df_preds[output_col] > threshold).sum()

        prot_overheads_by_p.append(compute_overheads(prot_iterations, n_protections))

    dfs[name] = make_p_overhead_dataframe(prot_overheads_by_p)

In [None]:
# Six tab10 colours with the first one dropped, leaving five line colours
palette = sns.color_palette("tab10", 6)[1:]
# One line per model; the colour is chosen by the model's position in `dfs`
for idx, name in enumerate(dfs):
    sns.lineplot(dfs[name], x="p", y="overhead", c=palette[idx], label=name)


def formatter(x, pos):
    """Format an overhead fraction `x` as a percentage tick label.

    `pos` (the tick position) is part of the formatter callback signature and is
    ignored. The value is shown with up to two decimals and no trailing zeros, so
    ticks on a narrow axis range (e.g. 0.0025 steps) get distinct labels instead of
    all being rounded to the same integer.
    """
    del pos
    # Adding 0.0 turns a negative zero into 0.0 so it is not rendered as "-0"
    percent = round(x * 100, 2) + 0.0
    return f"{percent:g}"


# Tick values are overhead fractions; `formatter` renders them as percentages
plt.gca().yaxis.set_major_formatter(formatter)
plt.gcf().set_size_inches(8, 6)
plt.gcf().set_dpi(100)

plt.xlabel("$p$")
plt.ylabel("Mean Overhead (%)")
plt.title(f"{matrix}", weight="bold")
plt.legend()
plt.grid()
plt.xlim(0.01, 1)
# Per-matrix y-range taken from the `ylims` table
plt.ylim(*ylims[matrix])
plt.tight_layout()
# savefig does not create missing directories, so make sure the per-matrix output
# folder exists before writing (a missing folder would otherwise abort the run)
figure_dir = f"./figures/{matrix}"
os.makedirs(figure_dir, exist_ok=True)
plt.savefig(f"{figure_dir}/model_overhead_comparison.png")
plt.show()