# Utilities/Imports

In [1]:
import matplotlib as mpl
mpl.use('pgf')
import numpy as np
import pandas as pd
import statsmodels.api as sm
from bisect import bisect_left
from collections import defaultdict
from matplotlib import pyplot as plt, ticker
from pathlib import Path
from tqdm.notebook import tqdm

In [308]:
# LaTeX preamble shared by the pgf backend and the usetex text renderer.
# Built from adjacent raw-string literals, one package per line, separated by
# explicit newlines (no trailing newline).
preamble = (
    r'\usepackage{amsmath}' '\n'
    r'\usepackage{amssymb}' '\n'
    r'\usepackage{amsthm}' '\n'
    r'\usepackage{dsfont}' '\n'
    r'\usepackage[libertine,slantedGreek,vvarbb,libaltvw]{newtxmath}' '\n'
    r'\usepackage{url}' '\n'
    r'\usepackage{bm}' '\n'
    r'\usepackage[no-math]{fontspec}' '\n'
    r'\usepackage[ttscale=0.85]{libertine}' '\n'
    r'\usepackage[binary-units]{siunitx}'
)
# Route text rendering through LaTeX and hand the same preamble to both the
# pgf backend ('pgf.preamble') and the usetex renderer ('text.latex.preamble').
plt.rcParams.update({
    'text.usetex': True,
    # Do not let matplotlib inject its rc font settings into the pgf preamble;
    # fonts come from the packages listed in `preamble`.
    'pgf.rcfonts': False,
    'pgf.texsystem': 'lualatex',
    'pgf.preamble': preamble,
    'text.latex.preamble': preamble,
})

In [714]:
%%html
<style>
    /* Apply display: inline-block to every <table> rendered in the notebook page. */
    table {
        display: inline-block
    }
</style>

# Data loading & sanitizing

In [310]:
# Maps the top-level directory an instance file lives in to the category label
# used in plots and tables.
CATEGORY_DIR_MAP = {
    'cm': 'BS20',
    'murakami_uno': 'MU13',
    'thomas': 'Bir+20',
    'vlrg': 'GV17',
}
instance_dir = Path('../../instances')
# File name -> path relative to `instance_dir`, for every regular `.dat` file
# below `instance_dir` whose top-level directory is not 'outdated'.
# Keys are bare file names, so two files with the same name in different
# directories would collapse into one entry.
full_path = {p.name: p.relative_to(instance_dir)
             for p in instance_dir.glob('**/*')
             if p.is_file()
                 and p.suffix == '.dat'
                 and p.relative_to(instance_dir).parts[0] != 'outdated'}
# File name -> category label; raises KeyError for a top-level directory that
# is missing from CATEGORY_DIR_MAP.
instance_category = {name: CATEGORY_DIR_MAP[p.parts[0]] for name, p in full_path.items()}
instances = list(instance_category.keys())
categories = set(instance_category.values())

In [716]:
# All solver variant names, in the order used throughout the notebook.
variants = [
    'base',
    'abs-pos', 'abs-neg', 'abs-sum', 'abs-max',
    'rel-pos', 'rel-neg', 'rel-max', 'rel-sum',
]
# The two variants compared in this notebook: the first two entries of `variants`.
two_variants = variants[:2]


def load_csv(path):
    """Load ``results/<path>.csv`` restricted to the known instance files.

    The CSV is indexed by its ``file_name`` column; rows whose file name is not
    a key of ``full_path`` are dropped.

    The stored ``iterations`` value counts branching nodes plus leaf nodes of a
    binary tree (2 * branching nodes + 1), so it is floor-halved to obtain the
    number of branching nodes.

    :param path: CSV path relative to ``results/``, without the ``.csv`` suffix
    :returns: a new DataFrame (a copy) with ``iterations`` already halved
    """
    raw = pd.read_csv(f'results/{path}.csv', index_col='file_name')
    is_known = raw.index.isin(full_path.keys())
    df = raw[is_known].copy()
    df.iterations = df.iterations // 2
    return df


# Classification runs: each variant's results are spread over several CSV files
# (distinguished by suffix) that are concatenated into one frame.
df_class_base = pd.concat([load_csv(f'classification/base{suff}') for suff in ('', '-slow', '-really-slow', '-new')])
df_class_abspos = pd.concat([load_csv(f'classification/abs-pos{suff}') for suff in ('', '-new')])

# Add preliminary data for unfinished instance
# NOTE(review): the values are positional and must match the CSV column order;
# this row is added after load_csv, so it is NOT halved like loaded rows are.
df_class_base.loc['cost_matrix_component_nr_118_size_204_cutoff_10.0.cm.dat'] = [None, None, None, 1443219, 648126579, None]

# Repeated-run experiments ("fast-candidates-10" and "easy-10" result folders).
df_fast_cand_base = pd.concat([load_csv(f'fast-candidates-10/base{suff}') for suff in ('', '-new')])
df_fast_cand_abspos = pd.concat([load_csv(f'fast-candidates-10/abs-pos{suff}') for suff in ('', '-new')])

df_easy_base = load_csv('easy-10/base')
df_easy_abspos = load_csv('easy-10/abs-pos')

# All runs per variant; an instance (index label) can appear in several rows,
# one per run.
df_base = pd.concat([df_class_base, df_fast_cand_base, df_easy_base])
df_abspos = pd.concat([df_class_abspos, df_fast_cand_abspos, df_easy_abspos])

# Hard: has abs-pos results and no base classification run with fewer than
# 500 branching nodes.
hard_instances = set(df_abspos.index) - set(df_class_base[df_class_base.iterations < 500].index)
all_easy_instances = set(df_base.index) - hard_instances
# Hard-fast: hard instances with more than one base run recorded.
base_counts = df_base[df_base.index.isin(hard_instances)].iterations.groupby('file_name').count()
hard_fast_instances = set(base_counts[base_counts > 1].index)
# Fully reduced: a base classification run needed zero branching nodes.
fully_reduced = set(df_class_base[df_class_base.iterations == 0].index)
easy_instances = all_easy_instances - fully_reduced

# Hard instances without any base result at all; expected to be empty.
unfinished_instances = hard_instances - set(df_base.index)
assert unfinished_instances == set(), f'hard instances without base results: {sorted(unfinished_instances)}'

# Later cells read `hard_finished_instances`, which no cell defined, so they
# failed with NameError on a fresh kernel.
# NOTE(review): reconstructed as "hard instances that have base results";
# confirm this matches the definition originally used.
hard_finished_instances = hard_instances - unfinished_instances

# Boxplots

In [695]:
def five_instance_boxplots(ax, before, after, prop='iterations'):
    """Draw before/after boxplots for five representative instances on ``ax``.

    Instances are ranked by improvement (per-instance median of ``prop`` in
    ``before`` divided by the one in ``after``). The ones at the minimum, the
    25%, 50% and 75% rank positions, and the maximum are selected. For each,
    two boxes are drawn: all ``before`` runs and all ``after`` runs, both
    divided by that instance's ``before`` median. A dashed line marks 1.0.

    The per-instance medians of ``prop`` for the selected instances are
    printed for both frames.

    :param ax: matplotlib axes to draw on
    :param before: DataFrame indexed by ``file_name`` with a ``prop`` column,
        several rows (runs) per instance
    :param after: same layout as ``before``
    :param prop: name of the column to compare
    """
    medians_before = before[prop].groupby('file_name').median()
    medians_after = after[prop].groupby('file_name').median()
    improvements = (medians_before / medians_after).sort_values()
    n = len(improvements)

    # round(q * n) reaches n for very small n (e.g. n == 2, q == 0.75), which
    # would index past the end; clamp to the last valid position.
    picked = [improvements.index[0]]
    picked.extend(improvements.index[min(round(q * n), n - 1)] for q in (0.25, 0.5, 0.75))
    picked.append(improvements.index[-1])

    # Report the property that is actually plotted (not always 'iterations').
    print(before.loc[picked][prop].groupby('file_name').median())
    print(after.loc[picked][prop].groupby('file_name').median())

    data, pos = [], []
    for i, name in enumerate(picked):
        norm = before[prop].loc[name].median()
        data.extend([
            before[prop].loc[name] / norm,
            after[prop].loc[name] / norm,
        ])
        # Two adjacent boxes per instance, one free slot between instances.
        pos.extend([3 * i + 1, 3 * i + 2])
    ax.boxplot(data, positions=pos, flierprops=dict(markersize=2, markeredgewidth=0.25))
    # Major ticks sit between each pair (labels hidden below); minor ticks
    # carry the per-box variant labels.
    ax.set_xticks([3 * i + 1.5 for i in range(5)])
    ax.set_xticks(pos, minor=True)
    ax.set_xticklabels([r'\texttt{base}', '\\texttt{abs-}\n\\texttt{incl}'] * 5, fontdict=dict(fontsize='xx-small'), minor=True)
    ax.tick_params(axis='x', which='minor', length=0)
    ax.tick_params(axis='x', which='major', labelbottom=False)
    ax.tick_params(axis='y', labelsize='xx-small')
    # Reference line at relative value 1.0 (the `before` median).
    ax.plot([0.5, 14.5], [1.0, 1.0], dashes=[3, 5], linewidth=0.5, zorder=-1, color='gray')
    # Different vertical offsets for the one-line and the two-line labels.
    for i, t in enumerate(ax.get_xticklabels(minor=True)):
        t.set_y(-0.02 if i % 2 else -0.06)

    
# Two stacked panels sharing both axes: hard-fast instances on top, easy
# instances (neither hard nor fully reduced) below.
fig, axs = plt.subplots(2, 1, sharex=True, sharey=True, figsize=(4.858, 3), constrained_layout=True)
five_instance_boxplots(axs[0],
                       df_base[df_base.index.isin(hard_fast_instances)],
                       df_abspos[df_abspos.index.isin(hard_fast_instances)])
easy_before = df_base[~df_base.index.isin(hard_instances | fully_reduced)]
easy_after = df_abspos[~df_abspos.index.isin(hard_instances | fully_reduced)]
#names = {name for name in easy_before.index if df_class_base.loc[name].iterations >= 10}
five_instance_boxplots(axs[1],
                       easy_before,
                       easy_after)

axs[0].set_ylabel(r'\texttt{hard-fast}', labelpad=19, fontsize='small')
axs[1].set_ylabel(r'\texttt{easy}', labelpad=19, fontsize='small')

# Frameless overlay axes without ticks; it only carries the y-label shared by
# both panels.
fig.add_subplot(111, frameon=False)
plt.tick_params(bottom=False, left=False, labelsize=0)
plt.gca().set_ylabel(r'\#br.\ nodes (rel.\ to median)', labelpad=19)

# Incredibly hacky...
# Two-pass save to the same file: the first savefig runs while constrained
# layout is still enabled (presumably to have the panel positions computed).
# Afterwards the overlay axes is stretched over both panels, the panel labels
# get a smaller pad, constrained layout is switched off, and the file is
# written again.
plt.savefig('box-five-instance-both.pgf')
bbox_top = axs[0].get_position()
bbox_bot = axs[1].get_position()
plt.gca().set_position([bbox_top.x0, bbox_bot.y0, bbox_top.x1 - bbox_top.x0, bbox_top.y1 - bbox_bot.y0])
axs[0].set_ylabel(r'\texttt{hard-fast}', labelpad=4)
axs[1].set_ylabel(r'\texttt{easy}', labelpad=4)
fig.set_constrained_layout(False)
plt.savefig('box-five-instance-both.pgf')
plt.close()

file_name
cost_matrix_component_nr_1032_size_115_cutoff_10.0.cm.dat    1046788.0
cost_matrix_component_nr_1112_size_78_cutoff_10.0.cm.dat       15084.0
cost_matrix_component_nr_362_size_99_cutoff_10.0.cm.dat         1456.0
cost_matrix_component_nr_504_size_96_cutoff_10.0.cm.dat          775.0
cost_matrix_component_nr_96_size_139_cutoff_10.0.cm.dat         8246.0
Name: iterations, dtype: float64
file_name
cost_matrix_component_nr_1032_size_115_cutoff_10.0.cm.dat    12708
cost_matrix_component_nr_1112_size_78_cutoff_10.0.cm.dat      1835
cost_matrix_component_nr_362_size_99_cutoff_10.0.cm.dat       1070
cost_matrix_component_nr_504_size_96_cutoff_10.0.cm.dat       1287
cost_matrix_component_nr_96_size_139_cutoff_10.0.cm.dat       2833
Name: iterations, dtype: int64
file_name
cost_matrix_component_nr_201_size_22_cutoff_10.0.cm.dat      1.0
cost_matrix_component_nr_2183_size_43_cutoff_10.0.cm.dat    75.0
cost_matrix_component_nr_2356_size_33_cutoff_10.0.cm.dat     9.0
cost_matrix_component

In [696]:
# Boxplots of the per-instance coefficient of variation (std / mean of the
# branching-node count over repeated runs) for base and abs-pos, on the easy
# and the hard-fast instances.
#
# Rows are selected with a boolean mask rather than `.loc[<set>]`: passing a
# set as an indexer is deprecated since pandas 1.5 and raises TypeError from
# pandas 2.0 on. The per-instance groups are the same either way.
dfs = [
    df_base[df_base.index.isin(easy_instances)],
    df_abspos[df_abspos.index.isin(easy_instances)],
    df_base[df_base.index.isin(hard_fast_instances)],
    df_abspos[df_abspos.index.isin(hard_fast_instances)],
]
data = [df.iterations.groupby('file_name') for df in dfs]
data = [df.std() / df.mean() for df in data]
plt.figure(figsize=(1.8, 1.8))
plt.ylabel(r'coefficient of variation', fontsize='small')
ax = plt.gca()
plt.boxplot(data, positions=[1.2, 1.8, 3.2, 3.8], flierprops=dict(markersize=2, markeredgewidth=0.25))
# Major ticks label the instance class, minor ticks label the variant.
ax.set_xticks([1.5, 3.5])
ax.set_xticklabels([r'\texttt{easy}', r'\texttt{hard-fast}'], fontdict=dict(fontsize='small'))
ax.set_xticks([1.1, 1.9, 3.1, 3.9], minor=True)
ax.set_xticklabels([r'\texttt{base}', '\\texttt{abs-}\n\\texttt{incl}', r'\texttt{base}', '\\texttt{abs-}\n\\texttt{incl}'],
                   fontdict=dict(fontsize='xx-small'),
                   minor=True)
ax.tick_params(axis='x', which='minor', length=0)
ax.tick_params(axis='y', labelsize='small')
# Manual vertical offsets for the two label rows.
for i, t in enumerate(ax.get_xticklabels()):
    t.set_y(-0.1 if i else -0.115)
for i, t in enumerate(ax.get_xticklabels(minor=True)):
    t.set_y(0 if i % 2 else -0.03)
plt.tight_layout()
plt.savefig('box-coefficient-of-variation.pgf')
plt.close()

In [711]:
# Per-instance median branching-node counts for base and abs-pos.
# `df_base.index` holds one label per run, so the `.loc` lookup repeats rows;
# that leaves the per-instance medians unchanged.
before_medians = df_base.iterations.groupby('file_name').median()
after_medians = df_abspos.loc[df_base.index].iterations.groupby('file_name').median()

plt.figure(figsize=(1.8, 2.5))
ax = plt.gca()

def at_least_names(mn):
    # Names of all instances whose base median is at least `mn`.
    return set(before_medians[before_medians >= mn].index)

# Improvement factor (base / abs-pos), restricted to instances that are
# neither hard nor fully reduced.
improv = before_medians / after_medians
improv = improv[~improv.index.isin(hard_instances | fully_reduced)]
# Number of instances whose improvement lies within a factor of two of 1.
print(len(improv[(0.5 <= improv) & (improv <= 2.0)]))
# One box per threshold on the base median: >= 1, >= 10, >= 100.
data = [np.log2(improv[improv.index.isin(at_least_names(10**i))]) for i in range(3)]
labels = [r'$\geq 1$', r'$\geq 10$', r'$\geq 100$']
plt.boxplot(data, labels=labels, flierprops=dict(markersize=2, markeredgewidth=0.25))

# Data is log2-transformed; show tick value x as 2^x (x is truncated to int).
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: rf'$2^{{{int(x)}}}$'))
plt.xlabel(r'\texttt{base} [\#br.\ nodes]')
plt.ylabel(r'improvement')

plt.tight_layout()
plt.savefig('box-improvements-easy.pgf')
plt.close('all')

1098


# Statistical Analysis

In [6]:
# https://gist.github.com/jacksonpradolima/f9b19d65b7f16603c837024d5f8c8a65

import itertools as it

from bisect import bisect_left
from typing import List

import numpy as np
import pandas as pd
import scipy.stats as ss

from pandas import Categorical


def VD_A(treatment: List[float], control: List[float]):
    """
    Computes Vargha and Delaney A index
    A. Vargha and H. D. Delaney.
    A critique and improvement of the CL common language
    effect size statistics of McGraw and Wong.
    Journal of Educational and Behavioral Statistics, 25(2):101-132, 2000
    The formula to compute A has been transformed to minimize accuracy errors
    See: http://mtorchiano.wordpress.com/2014/05/19/effect-size-of-r-precision/

    Any sequence of numbers is accepted (list, tuple, NumPy array, pandas
    Series); both inputs are copied into lists before ranking.

    :param treatment: a numeric sequence
    :param control: another numeric sequence of the same length
    :returns the value estimate and the magnitude
        ("negligible", "small", "medium" or "large")
    :raises ValueError: if the two samples differ in length
    """
    # Concatenate as lists: `+` on NumPy arrays / pandas Series adds
    # elementwise instead of concatenating and would silently produce a
    # wrong rank sum.
    treatment = list(treatment)
    control = list(control)
    m = len(treatment)
    n = len(control)

    if m != n:
        raise ValueError("treatment and control must have the same length")

    # Joint ranks (ties get average ranks); the first m belong to `treatment`.
    r = ss.rankdata(treatment + control)
    r1 = r[:m].sum()

    # Compute the measure
    # A = (r1/m - (m+1)/2)/n # formula (14) in Vargha and Delaney, 2000
    A = (2 * r1 - m * (m + 1)) / (2 * n * m)  # equivalent formula to avoid accuracy errors

    levels = [0.147, 0.33, 0.474]  # effect sizes from Hess and Kromrey, 2004
    magnitude = ["negligible", "small", "medium", "large"]
    # Rescale A from [0, 1] to [-1, 1] so the thresholds apply symmetrically.
    scaled_A = (A - 0.5) * 2

    magnitude = magnitude[bisect_left(levels, abs(scaled_A))]
    estimate = A

    return estimate, magnitude

In [None]:
def analyze(name, bef, aft, prop):
    """Compare the ``prop`` samples of one instance before and after.

    Runs two one-sided Mann-Whitney U tests on ``aft[prop]`` against
    ``bef[prop]`` (alternative 'less' and alternative 'greater', both with
    continuity correction) and computes the Vargha-Delaney A effect size with
    ``aft[prop]`` as the treatment sample.

    :param name: instance file name
    :param bef: runs of the instance before the change
    :param aft: runs of the instance after the change
    :param prop: name of the column to compare
    :returns: ``(shorten_name(name), u_less, u_greater, (A, magnitude))``
    """
    after = aft[prop]
    base = bef[prop]
    # `mannwhitneyu` is reached through the `ss` alias because only
    # `scipy.stats as ss` is imported; the options are passed by keyword so
    # the call does not depend on their positional order.
    u = ss.mannwhitneyu(after, base, use_continuity=True, alternative='less')
    u_inv = ss.mannwhitneyu(after, base, use_continuity=True, alternative='greater')
    return (shorten_name(name), u, u_inv, VD_A(list(after), list(base)))

def analyze_all(bef, aft):
    """Run ``analyze`` for every instance in ``bef`` on two properties.

    :param bef: runs before the change, indexed by instance file name
    :param aft: runs after the change, indexed by instance file name
    :returns: ``(data_iterations, data_time)``: the ``analyze`` results for
        ``'iterations'`` and for ``'solve_time'``, each sorted by ascending
        p-value of the first ('less') test
    """
    def ranked(prop):
        # One result tuple per distinct instance, ordered by the 'less' p-value.
        rows = [analyze(name, bef.loc[name], aft.loc[name], prop)
                for name in bef.index.unique()]
        rows.sort(key=lambda row: row[1].pvalue)
        return rows

    return ranked('iterations'), ranked('solve_time')

# Show the branching-node ('iterations') analysis, most significant first.
# NOTE(review): `base_fast`, `abspos_fast` and `shorten_name` are not defined
# in the visible part of the notebook; this cell needs them from elsewhere.
analyze_all(base_fast, abspos_fast)[0]

### Fast candidates (194 instances, 11 runs per instance)

#### Iterations

* 4x: significant large increase
* 15x: insignificant change (medium increase to medium reduction)
* 2x: significant medium reduction
* 173x: significant large reduction

# Before/After Scatterplots

In [697]:
from matplotlib.patches import Patch
from matplotlib.lines import Line2D

def before_after_scatterplot(before,
                             after,
                             prop='iterations',
                             agg='median',
                             log=True,
                             mn=1,
                             xlabel=r'\texttt{base} [\#br.\ nodes]',
                             ylabel=r'\texttt{abs-incl} [\#br.\ nodes]',
                             legendloc='best',
                             highlight=None,
                             highlight_label=None,
                             no_highlight_label=None,
                             figsize=(3, 3),
                             legendfontsize='medium',
                             linear_regression=False):
    """Scatter the per-instance aggregate of ``prop`` before (x) vs. after (y).

    A new pyplot figure is created with square, equally scaled axes. Points
    are coloured by instance category. The diagonal is drawn solid, and on log
    scale dashed lines mark each factor of ten away from it. The figure is
    neither saved nor closed.

    :param before: DataFrame indexed by ``file_name`` with a ``prop`` column
    :param after: same layout and the same instances as ``before``
    :param prop: name of the column to plot
    :param agg: aggregation applied to the runs of each instance; anything
        ``GroupBy.agg`` accepts (default: the median)
    :param log: use logarithmic axes (symlog if ``mn <= 0``)
    :param mn: lower limit of both axes
    :param xlabel: x-axis label
    :param ylabel: y-axis label
    :param legendloc: legend location passed to ``plt.legend``
    :param highlight: instance names drawn with square markers
    :param highlight_label: legend label for highlighted instances
    :param no_highlight_label: legend label for the remaining instances
    :param figsize: figure size in inches
    :param legendfontsize: legend font size
    :param linear_regression: fit ``log(y) ~ const + log(x)`` by OLS, print the
        summary, and draw the fit with its 95% confidence band; requires
        ``log=True``
    """
    COLORS = ['#e66101','#fdb863','#b2abd2','#5e3c99']

    # Honour `agg` instead of hard-coding the median; the default
    # ('median') reproduces the previous behaviour.
    x = before[prop].groupby('file_name').agg(agg).sort_index()
    y = after[prop].groupby('file_name').agg(agg).sort_index()
    highlight = set(highlight or [])
    x_no_highlight, y_no_highlight = x[~x.index.isin(highlight)], y[~y.index.isin(highlight)]
    x_highlight, y_highlight = x[x.index.isin(highlight)], y[y.index.isin(highlight)]

    # Fixed category -> colour mapping (case-insensitive alphabetical order) so
    # colours are consistent between plots; the legend only lists categories
    # that occur in this plot.
    category_colors = {cat: COLORS[i] for i, cat in enumerate(sorted(categories, key=lambda s: s.lower()))}
    appearing_categories = {instance_category[name] for name in x.index}
    # Upper limit of both axes, with some headroom above the largest value.
    mx = (1.5 if log else 1.05) * max(x.max(), y.max())

    plt.figure(figsize=figsize)
    ax = plt.gca()
    if log:
        logtype = 'symlog' if mn <= 0 else 'log'
        plt.xscale(logtype)
        plt.yscale(logtype)
    plt.xlim([mn, mx])
    plt.ylim([mn, mx])
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    # Major ticks at every power of ten, unlabelled minor ticks in between.
    locmaj = ticker.LogLocator(base=10, numticks=100)
    locmin = ticker.LogLocator(base=10, subs=np.arange(1, 10) / 10, numticks=100)
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_locator(locmaj)
        axis.set_minor_locator(locmin)
        axis.set_minor_formatter(ticker.NullFormatter())
    ax.set_aspect('equal', 'box')

    plt.scatter(x_no_highlight,
                y_no_highlight,
                c=[category_colors[instance_category[name]] for name in x_no_highlight.index],
                s=3.0**2,
                linewidth=0.25,
                edgecolor='black',
                alpha=1)
    plt.scatter(x_highlight,
                y_highlight,
                c=[category_colors[instance_category[name]] for name in x_highlight.index],
                s=3.0**2,
                marker='s',
                linewidth=0.4,
                edgecolor='black',
                alpha=1)

    def line(x1, y1, x2, y2, **kwargs):
        # Black guide line drawn behind the data points.
        plt.plot([x1, x2], [y1, y2], color='black', zorder=-1, **kwargs)
    # Diagonal: no change between before and after.
    line(mn, mn, mx, mx, linewidth=0.8)
    base=10
    if log:
        # Dashed parallels at factors 10, 100, ... above and below the diagonal.
        for i in range(1, 100):
            if base**i > mx:
                break
            line(mn, mn * base**i, mx / base**i, mx, dashes=[3, 5], linewidth=.5)
            line(mn * base**i, mn, mx, mx / base**i, dashes=[3, 5], linewidth=.5)

    legend_handles = [Patch(color=col, label=r'\texttt{' + cat + '}') for cat, col in category_colors.items() if cat in appearing_categories]
    if highlight:
        # Grey proxy markers explaining the circle / square distinction.
        for edgewidth, marker, label in [(0.5, 'o', no_highlight_label), (0.8, 's', highlight_label)]:
            legend_handles.append(Line2D([], [], color='lightgrey', marker=marker, linestyle='None',
                                         markeredgecolor='black', markeredgewidth=edgewidth,
                                         markersize=6, label=label))
    plt.legend(handles=legend_handles,
               loc=legendloc,
               fontsize=legendfontsize)

    if linear_regression:
        assert log
        # OLS in log-log space: log(y) = const + slope * log(x).
        regr_x = sm.add_constant(np.log(x))
        regr_y = np.log(y)
        model = sm.OLS(regr_y, regr_x)
        res = model.fit()
        print(res.summary())
        # Evaluate the fit over the visible x-range and transform back.
        pred_x = np.linspace(np.log(mn), np.log(mx), 100)
        sf = res.get_prediction(sm.add_constant(pred_x)).summary_frame(alpha=0.05)
        plt.plot(np.exp(pred_x), np.exp(sf['mean']), zorder=1, linewidth=0.8, color=COLORS[3])
        plt.fill_between(np.exp(pred_x),
                         np.exp(sf['mean_ci_lower']),
                         np.exp(sf['mean_ci_upper']),
                         alpha=0.5,
                         zorder=1,
                         color=COLORS[3],
                         linewidth=0)

    plt.tight_layout()

In [698]:
# Branching nodes, base vs. abs-pos, on the hard instances.
# NOTE(review): `hard_finished_instances` must have been defined by an earlier
# cell before this one runs.
before = df_base[df_base.index.isin(hard_finished_instances)]
after = df_abspos[df_abspos.index.isin(hard_finished_instances)]
# Instances with a single base run are highlighted and labelled hard-slow.
counts = before.iterations.groupby('file_name').count()
only_single_run = list(counts[counts == 1].index)
before_after_scatterplot(before, 
                         after,
                         mn = 100,
                         legendloc='upper left',
                         legendfontsize='x-small',
                         highlight=only_single_run,
                         no_highlight_label=r'\texttt{hard-fast}',
                         highlight_label=r'\texttt{hard-slow}',
                         figsize=(2.429, 2.429))
plt.savefig('scatter-branching-nodes-hard.pgf')
plt.close()

In [699]:
# Branching nodes on the hard instances outside category MU13, with a log-log
# OLS fit; the regression summary is printed by the plotting function.
# NOTE(review): `hard_finished_instances` must have been defined by an earlier
# cell before this one runs.
names = {name for name in hard_finished_instances if instance_category[name] != 'MU13'}
before = df_base[df_base.index.isin(names)]
after = df_abspos[df_abspos.index.isin(names)]
# Instances with a single base run are highlighted and labelled hard-slow.
counts = before.iterations.groupby('file_name').count()
only_single_run = list(counts[counts == 1].index)
before_after_scatterplot(before, 
                         after,
                         mn = 100,
                         legendloc='upper left',
                         legendfontsize='x-small',
                         highlight=only_single_run,
                         no_highlight_label=r'\texttt{hard-fast}',
                         highlight_label=r'\texttt{hard-slow}',
                         linear_regression=True,
                         figsize=(2.429, 2.429))
plt.savefig('scatter-branching-nodes-hard-regression.pgf')
plt.close()

                            OLS Regression Results                            
Dep. Variable:             iterations   R-squared:                       0.892
Model:                            OLS   Adj. R-squared:                  0.892
Method:                 Least Squares   F-statistic:                     1508.
Date:                Fri, 05 Feb 2021   Prob (F-statistic):           5.33e-90
Time:                        11:15:08   Log-Likelihood:                -179.15
No. Observations:                 184   AIC:                             362.3
Df Residuals:                     182   BIC:                             368.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.0462      0.158     12.976      0.0

In [700]:
# Branching nodes, base vs. abs-pos, on every instance that is neither hard
# nor fully reduced.
before = df_base[~df_base.index.isin(hard_instances | fully_reduced)]
after = df_abspos[~df_abspos.index.isin(hard_instances | fully_reduced)]
before_after_scatterplot(before,
                         after,
                         figsize=(2.429, 2.429),
                         legendloc='upper left',
                         legendfontsize='x-small')
plt.savefig('scatter-branching-nodes-easy.pgf')
plt.close()

In [701]:
# Same selection as the easy-instance plot (neither hard nor fully reduced),
# additionally leaving out every instance of category BS20.
bs20 = {name for name in instances if instance_category[name] == 'BS20'}
before = df_base[~df_base.index.isin(hard_instances | fully_reduced | bs20)]
after = df_abspos[~df_abspos.index.isin(hard_instances | fully_reduced | bs20)]
before_after_scatterplot(before,
                         after,
                         figsize=(2.429, 2.429),
                         legendloc='upper left',
                         legendfontsize='x-small')
plt.savefig('scatter-branching-nodes-easy-no-bs20.pgf')
plt.close()

In [702]:
# Solve time, base vs. abs-pos, on the hard instances (axes start at 0.1).
# NOTE(review): `hard_finished_instances` must have been defined by an earlier
# cell before this one runs.
before = df_base[df_base.index.isin(hard_finished_instances)]
after = df_abspos[df_abspos.index.isin(hard_finished_instances)]
# Instances with a single base run are highlighted and labelled hard-slow.
counts = before.iterations.groupby('file_name').count()
only_single_run = list(counts[counts == 1].index)
before_after_scatterplot(before, 
                         after,
                         mn = 0.1,
                         prop='solve_time',
                         legendloc='upper left',
                         legendfontsize='xx-small',
                         xlabel=r'\texttt{base} [s]',
                         ylabel=r'\texttt{abs-incl} [s]',
                         highlight=only_single_run,
                         no_highlight_label=r'\texttt{hard-fast}',
                         highlight_label=r'\texttt{hard-slow}',
                         figsize=(2.429, 2.429))
plt.savefig('scatter-runtime-hard.pgf')
plt.close()

# Instance Classification

In [47]:
from collections import Counter

# NOTE(review): `dfs_class` is not defined in the visible part of the
# notebook; this cell presumably relies on state from an earlier notebook
# version (it looks like a dict of per-variant classification frames).
bef = dfs_class['base'].iterations
aft = dfs_class['abs-pos'].iterations
bef_time = dfs_class['base'].solve_time
aft_time = dfs_class['abs-pos'].solve_time

# Candidates: instances with an abs-pos result whose base run either is
# missing or took at least 1000 iterations.
candidates = [name for name in instances if name in aft.index and (name not in bef.index or bef.loc[name] >= 1000)]
print(len(candidates))
# Total solve times over the candidates; a missing base run is counted as
# 12 hours (12 * 60 * 60 seconds).
print(sum(aft_time[name] for name in candidates))
print(sum(bef_time.get(name, 12 * 60 * 60) for name in candidates))
# The same totals restricted to candidates that do have a base result.
print(sum(aft_time[name] for name in candidates if name in bef.index))
print(sum(bef_time[name] for name in candidates if name in bef.index))

# Fast candidates have a base result, slow candidates do not.
fast_candidates = [name for name in candidates if name in bef.index]
slow_candidates = [name for name in candidates if name not in bef.index]
#Path('../fast-candidates.txt').write_text('\n'.join(str(full_path[name]) for name in fast_candidates))
#Path('../slow-candidates.txt').write_text('\n'.join(str(full_path[name]) for name in slow_candidates))

# Trivial: base result with fewer than 1000 iterations (a missing base result
# defaults to 1000 and is therefore excluded).
trivial = [name for name in instances if bef.get(name, 1000) < 1000]
trivial_least_10 = [name for name in trivial if bef.loc[name] >= 10]
trivial_least_100 = [name for name in trivial if bef.loc[name] >= 100]
# Arithmetic mean of the per-instance ratio base / abs-pos.
print(f'Avg iterations improvement for trivial:', sum(bef.get(name) / aft.get(name) for name in trivial) / len(trivial))
print(f'Avg iterations improvement for trivial >= 10:', sum(bef.get(name) / aft.get(name) for name in trivial_least_10) / len(trivial_least_10))
print(f'Avg iterations improvement for trivial >= 100:', sum(bef.get(name) / aft.get(name) for name in trivial_least_100) / len(trivial_least_100))

214
154196.94061529092
1009069.7600713499
80821.333990683
145069.76007135015
Avg iterations improvement for trivial: 1.0424554931154455
Avg iterations improvement for trivial >= 10: 1.2797001743086387
Avg iterations improvement for trivial >= 100: 1.5357183287806604


## Instance classification

### (Too) few iterations

* 3793x: fewer than 100 its w/o activity
* 4016x: fewer than 1000 its w/o activity

### Long runtime

* 63x: did not finish in 3h w/ or w/o activity
* 20x: did not finish w/o activity but finished with
  * 8x: did not finish in 1h w/ activity

-> 206 candidates for running experiments (finished w/ activity in 1h and >= 1000 its w/o activity)

### Instance types

| | vlrg | murakami_uno | konect_bip | cm | thomas | enumdat |
|-|------|--------------|------------|----|--------|---------|
| candidate | 0 | 30 | 0 | 173 | 3 | 0 |
| <1000 its | 29 | 101 | 16 | 3727 | 131 | 12 |
| too slow | 0 | 9 | 10 | 52 | 0 | 0 |

In [None]:
# Raw base results from 'base-slow.csv'. This is read directly rather than via
# load_csv, so rows are not filtered and `iterations` is not halved.
df_slow_24 = pd.read_csv('./results/classification/base-slow.csv', index_col='file_name')
def format_runtime(secs):
    """Format a duration in seconds as ``HH:MM``, rounded to the nearest minute.

    Fractional seconds are truncated first. Hours are zero-padded to at least
    two digits and are not wrapped at 24.

    :param secs: duration in seconds (int or float)
    :returns: string such as ``'03:14'``
    """
    # Round the total to whole minutes before splitting off the hours, so a
    # remainder of 59.5+ minutes carries into the hour ('10:00') instead of
    # being rendered as an invalid ':60'.
    total_minutes = int(round(int(secs) / 60))
    hours, minutes = divmod(total_minutes, 60)
    return f'{hours:02}:{minutes:02}'

# Print one markdown table row per slow candidate:
# name | time before | time after | speed up | its before | its after | its speed up
# NOTE(review): `dfs_class`, `slow_candidates` and `shorten_name` must be
# provided by other cells before this one runs.
for name in slow_candidates:
    its_aft = dfs_class['abs-pos'].loc[name].iterations
    time_aft = dfs_class['abs-pos'].loc[name].solve_time
    if name in df_slow_24.index:
        its_bef = df_slow_24.loc[name].iterations
        time_bef = df_slow_24.loc[name].solve_time
        # Ratios are computed from the raw numbers before they are turned
        # into display strings.
        time_speed_up = f'{time_bef / time_aft:.2f}'
        its_speed_up = f'{its_bef / its_aft:.2f}'
        its_bef, time_bef = str(its_bef), format_runtime(time_bef)
    else:
        # No base result: report 24 hours as a lower bound on the base time.
        time_bef = '>24:00'
        its_bef = '-'
        time_speed_up = f'>{24 * 60**2 / time_aft:.2f}'
        its_speed_up = '-'
    print('|', ' | '.join([shorten_name(name), time_bef, format_runtime(time_aft), time_speed_up, its_bef, str(its_aft), its_speed_up]), '|')

| name | before | after | speed up | before | after | speed up |
|-|-|-|-|-|-|-|
| nr_130_size_289 | 03:14 | 00:39 | 4.99 | 448641 | 21825 | 20.56 |
| nr_487_size_113 | 03:17 | 00:01 | 175.79 | 15774187 | 90889 | 173.55 |
| nr_493_size_263 | 04:59 | 00:43 | 6.96 | 646907 | 17965 | 36.01 |
| nr_54_size_279 | 06:25 | 01:38 | 3.93 | 1395789 | 64857 | 21.52 |
| nr_1751_size_144 | 07:51 | 00:09 | 51.30 | 5820373 | 82849 | 70.25 |
| nr_758_size_127 | 08:57 | 00:08 | 67.22 | 19003071 | 220359 | 86.24 |
| nr_95_size_134 | 10:00 | 00:23 | 25.94 | 34298865 | 1348593 | 25.43 |
| nr_241_size_252 | 11:24 | 02:48 | 4.08 | 14767953 | 119031 | 124.07 |
| nr_385_size_217 | 12:32 | 00:15 | 49.32 | 4570381 | 39207 | 116.57 |
| nr_794_size_163 | 13:48 | 01:01 | 13.54 | 13060125 | 869011 | 15.03 |
| nr_163_size_122 | 17:17 | 00:20 | 52.00 | 71952883 | 809753 | 88.86 |
| nr_789_size_150 | 19:26 | 00:28 | 41.14 | 36261777 | 465771 | 77.85 |
| nr_525_size_215 | 20:38 | 02:17 | 9.03 | 7043685 | 254567 | 27.67 |
| nr_279_size_209 | 22:46 | 01:30 | 15.14 | 7123757 | 173723 | 41.01 |
| nr_209_size_160 | 31:13 | 00:37 | 51.14 | 69918283 | 809703 | 86.35 |
| nr_123_size_148 | 45:31 | 00:08 | 341.42 | 64041167 | 76451 | 837.68 |
| nr_43_size_345 | 47:41 | 02:56 | 16.23 | 4520973 | 125247 | 36.1 |
| nr_92_size_194 | 61:56 | 01:48 | 34.26 | 27503137 | 480469 | 57.24 |
| nr_74_size_223 | 145:14 | 01:54 | 76.24 | 39129927 | 322911 | 121.18 |
| nr_118_size_204 | >24:00 | 00:39 | >37.31 | - | 2774675 | - |

# Improvement vs. Speed up

In [712]:
plt.figure(figsize=(2.429, 2.429))

# Per-instance medians over all runs, split into hard-fast and hard-slow
# (hard-slow = hard-finished instances that are not hard-fast).
# Rows are selected with a boolean mask rather than `.loc[<set>]`: passing a
# set as an indexer is deprecated since pandas 1.5 and raises TypeError from
# pandas 2.0 on. The per-instance groups are the same either way.
before_hard_fast = df_base[df_base.index.isin(hard_fast_instances)].groupby('file_name').median()
after_hard_fast = df_abspos[df_abspos.index.isin(hard_fast_instances)].groupby('file_name').median()
before_hard_slow = df_base[df_base.index.isin(hard_finished_instances - hard_fast_instances)].groupby('file_name').median()
after_hard_slow = df_abspos[df_abspos.index.isin(hard_finished_instances - hard_fast_instances)].groupby('file_name').median()

# "Improvement" is the ratio of branching nodes (`*_it`), "speed up" the ratio
# of solve times (`*_ti`); both are base / abs-pos.
fast_it = before_hard_fast.iterations / after_hard_fast.iterations
fast_ti = before_hard_fast.solve_time / after_hard_fast.solve_time
slow_it = before_hard_slow.iterations / after_hard_slow.iterations
slow_ti = before_hard_slow.solve_time / after_hard_slow.solve_time

# Log-log scatter of speed up (x) against improvement (y) on square, equally
# scaled axes from 0.1 to 1.5 times the largest ratio.
mn = 0.1
mx = 1.5 * max(fast_it.max(), fast_ti.max(), slow_it.max(), slow_ti.max())
ax = plt.gca()
plt.yscale('log')
plt.xscale('log')
plt.xlim([mn, mx])
plt.ylim([mn, mx])
# Major ticks at every power of ten, unlabelled minor ticks in between.
locmaj = ticker.LogLocator(base=10, numticks=100)
locmin = ticker.LogLocator(base=10, subs=np.arange(1, 10) / 10, numticks=100)
for axis in (ax.xaxis, ax.yaxis):
    axis.set_major_locator(locmaj)
    axis.set_minor_locator(locmin)
    axis.set_minor_formatter(ticker.NullFormatter())
ax.set_aspect('equal', 'box')

# Category -> colour mapping in case-insensitive alphabetical order; the
# legend only lists categories that occur among the plotted instances.
# `Patch` and `Line2D` come from the imports in the scatter-plot helper cell.
COLORS = ['#e66101','#fdb863','#b2abd2','#5e3c99']
category_colors = {cat: COLORS[i] for i, cat in enumerate(sorted(categories, key=lambda s: s.lower()))}
appearing_categories = {instance_category[name] for name in hard_finished_instances}
legend_handles = [Patch(color=col, label=r'\texttt{' + cat + '}') for cat, col in category_colors.items() if cat in appearing_categories]
# Grey proxy markers explaining the circle / square distinction.
for edgewidth, marker, label in [(0.5, 'o', r'\texttt{hard-fast}'), (0.8, 's', r'\texttt{hard-slow}')]:
    legend_handles.append(Line2D([], [], color='lightgrey', marker=marker, linestyle='None',
                                 markeredgecolor='black', markeredgewidth=edgewidth,
                                 markersize=6, label=label))
plt.legend(handles=legend_handles,
           loc='lower right',
           fontsize='xx-small')

# Circles: hard-fast instances; squares: hard-slow instances.
plt.scatter(fast_ti,
            fast_it, 
            c=[category_colors[instance_category[name]] for name in before_hard_fast.index],
            s=3.0**2,
            linewidth=0.25,
            edgecolor='black')
plt.scatter(slow_ti,
            slow_it,
            c=[category_colors[instance_category[name]] for name in before_hard_slow.index],
            s=3.0**2,
            marker='s',
            linewidth=0.4,
            edgecolor='black')

plt.xlabel('speed up')
plt.ylabel('improvement')

def line(x1, y1, x2, y2, **kwargs):
    # Black guide line drawn behind the data points.
    plt.plot([x1, x2], [y1, y2], color='black', zorder=-1, **kwargs)
# Solid diagonal (speed up == improvement) plus dashed parallels at factors
# 10, 100, ... above and below it.
line(mn, mn, mx, mx, linewidth=0.8)
base=10
for i in range(1, 100):
    if base**i > mx:
        break
    line(mn, mn * base**i, mx / base**i, mx, dashes=[3, 5], linewidth=.5)
    line(mn * base**i, mn, mx, mx / base**i, dashes=[3, 5], linewidth=.5)

plt.tight_layout()
plt.savefig('scatter-improvement-vs-speed-up-hard.pgf')
plt.close()

# Single log-scale boxplot of (speed up / improvement) over all hard-fast and
# hard-slow instances; the two concatenated Series are divided label-wise.
# NOTE(review): despite its name, `improvement_speedup_ratio` is speed up
# divided by improvement. 'test2.pdf' looks like a scratch output file.
plt.figure(figsize=(2.4, 5))
improvement_speedup_ratio = pd.concat([fast_ti, slow_ti]) / pd.concat([fast_it, slow_it])
plt.yscale('log')
plt.boxplot([improvement_speedup_ratio], flierprops=dict(markersize=2, markeredgewidth=0.25))
plt.tight_layout()
plt.savefig('test2.pdf')
plt.close()

# Summary statistics of the ratio (speed up / improvement), restricted to
# instances outside category MU13.
f = np.vectorize(lambda name: instance_category[name])
improvement_speedup_ratio = improvement_speedup_ratio[(f(improvement_speedup_ratio.index) != 'MU13')]
print(improvement_speedup_ratio.describe())
# Second-smallest ratio. `.iloc` makes the positional lookup explicit: plain
# `[1]` on a label-indexed Series only works through the positional fallback
# that pandas 2.1 deprecates.
print(improvement_speedup_ratio.sort_values().iloc[1])
print('>=1:', len(improvement_speedup_ratio[improvement_speedup_ratio >= 1]))
print('>=0.75:', len(improvement_speedup_ratio[improvement_speedup_ratio >= 0.75]))
print('>=0.5:', len(improvement_speedup_ratio[improvement_speedup_ratio >= 0.5]))
print(improvement_speedup_ratio.sort_values().iloc[1])

count    184.000000
mean       0.718298
std        0.266358
min        0.032887
25%        0.552681
50%        0.719602
75%        0.881303
max        1.475097
dtype: float64
0.1523072282865475
>=1: 25
>=0.75: 82
>=0.5: 148
0.1523072282865475


# Various statistics

In [55]:
# Number of hard instances that belong to category MU13.
sum(1 for name in hard_instances if instance_category[name] == 'MU13')

32

In [672]:
# Total number of hard instances.
len(hard_instances)

216

In [709]:
# Improvement in branching nodes (base median / abs-pos median) on the hard
# instances, summarised per instance category.
# `df_base.index` holds one label per run, so the `.loc` lookup repeats rows;
# that leaves the per-instance medians unchanged.
before_medians = df_base.iterations.groupby('file_name').median()
after_medians = df_abspos.loc[df_base.index].iterations.groupby('file_name').median()
improv = before_medians / after_medians
improv[improv.index.isin(hard_instances)].groupby(lambda name: instance_category[name]).describe()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
BS20,181.0,22.289923,74.365691,0.602176,2.171582,4.207729,15.043373,837.686933
Bir+20,3.0,5.315677,3.222153,1.596585,4.339713,7.08284,7.175223,7.267606
MU13,32.0,1.223185,0.797749,0.761491,0.999865,1.013387,1.047971,5.21652


In [708]:
# Improvement in branching nodes (base median / abs-pos median) on the hard
# instances outside category MU13.
before_medians = df_base.iterations.groupby('file_name').median()
after_medians = df_abspos.loc[df_base.index].iterations.groupby('file_name').median()
improv = before_medians / after_medians
# Element-wise category lookup over the instance names in the index.
f = np.vectorize(lambda name: instance_category[name])
non_random = improv[(f(improv.index) != 'MU13') & improv.index.isin(hard_instances)]
# Instances that got worse, and instances that improved at least tenfold.
print('<1:', len(non_random[non_random < 1.0]))
print('>=10:', len(non_random[non_random >= 10]))
print(non_random.describe())

<1: 5
>=10: 59
count    184.000000
mean      22.013169
std       73.785879
min        0.602176
25%        2.166705
50%        4.239436
75%       15.032397
max      837.686933
Name: iterations, dtype: float64


In [710]:
# Speed up in solve time (base median / abs-pos median) on the hard instances
# outside category MU13.
before_medians = df_base.solve_time.groupby('file_name').median()
after_medians = df_abspos.loc[df_base.index].solve_time.groupby('file_name').median()
improv = before_medians / after_medians
# Element-wise category lookup over the instance names in the index.
f = np.vectorize(lambda name: instance_category[name])
non_random = improv[(f(improv.index) != 'MU13') & improv.index.isin(hard_instances)]
# Instances that got slower, and instances that got at least ten times faster.
print('<1:', len(non_random[non_random < 1.0]))
print('>=10:', len(non_random[non_random >= 10]))
print(non_random.describe())

<1: 22
>=10: 42
count    184.000000
mean      14.820053
std       54.684237
min        0.445792
25%        1.507324
50%        3.134512
75%        8.801426
max      623.200692
Name: solve_time, dtype: float64
