In [None]:
def parse(data):
    """Convert every series in ``data`` to a list of floats divided by ten.

    Each entry of ``data`` is indexed with ``'values'``; from every item of
    that sequence only element 1 is read, converted with ``float`` and
    divided by 10.

    Returns:
        A list with one list of floats per entry, in input order.
    """
    return [
        [float(sample[1]) / 10 for sample in series['values']]
        for series in data
    ]

def parse_full(data, remove_sink=False):
    """Collect the trimmed samples of every series, grouped by task name.

    For each entry the task name is read from ``entry["metric"]["task_name"]``.
    Entries whose name contains ``"Source"`` are always skipped; entries whose
    name contains the discard-sink identifier are skipped as well when
    ``remove_sink`` is true. The first and last 12 items of ``entry["values"]``
    are dropped, and entries sharing a task name are concatenated in input
    order.

    Args:
        data: Iterable of entries with ``"metric"`` and ``"values"`` keys.
        remove_sink: When true, also drop the discard-sink series.

    Returns:
        A dict mapping task name to a list of ``float(item[1]) / 10.0``
        values, with keys in order of first appearance.
    """
    discard_sink = "Sink:_Sink_table__default_catalog_default_database_discard_sink"

    grouped = {}
    for series in data:
        task_name = series["metric"]["task_name"]
        if "Source" in task_name:
            continue
        if remove_sink and discard_sink in task_name:
            continue
        # The slice makes a fresh list, so the input is never mutated.
        grouped.setdefault(task_name, []).extend(series["values"][12:-12])

    return {
        task_name: [float(sample[1]) / 10.0 for sample in samples]
        for task_name, samples in grouped.items()
    }


In [None]:
import json
import numpy as np
import random

# Not referenced again in this cell; the draw is kept so NumPy's global
# random stream is consumed in the same order as before.
e1 = np.random.normal(80, 5, size=500)

# Each result file is parsed into {task_name: samples}. The *_full lists pick
# the series in a hand-chosen order by position in the parsed dict's key
# order, so they depend on the order in which task names first appear in the
# file. NOTE(review): presumably this matches the operator order used for the
# plot labels -- confirm against the result files.
with open("../busyness-results/q1-full.json") as f:
    data = json.load(f)
    q1_full_U = parse_full(data)
    q1_full = list(q1_full_U.values())

with open("../busyness-results/q2-full.json") as f:
    data = json.load(f)
    q2_full_U = parse_full(data)
    keys = list(q2_full_U.keys())
    q2_full = [q2_full_U[keys[pos]] for pos in (1, 0)]

# From here on the discard-sink series is filtered out while parsing.
with open("../busyness-results/q5-full.json") as f:
    data = json.load(f)
    q5_full_U = parse_full(data, True)
    keys = list(q5_full_U.keys())
    q5_full = [q5_full_U[keys[pos]] for pos in (3, 1, 5, 2, 4, -1, -3, -2, 0)]

with open("../busyness-results/q8-full.json") as f:
    data = json.load(f)
    q8_full_U = parse_full(data, True)
    keys = list(q8_full_U.keys())
    q8_full = [q8_full_U[keys[pos]] for pos in (-2, -3, 2, 3, -1, 4, 5, 1, 0)]

with open("../busyness-results/q11-full.json") as f:
    data = json.load(f)
    q11_full_U = parse_full(data, True)
    keys = list(q11_full_U.keys())
    q11_full = [q11_full_U[keys[pos]] for pos in (2, 1, 3, 0)]

# Flatten the per-query lists into one list of series, dropping ten more
# samples from both ends of each.
tasks = []
for query_series in (q1_full, q2_full, q5_full, q8_full, q11_full):
    tasks.extend(series[10:-10] for series in query_series)

# One extra synthetic series: normal samples restricted to the open
# interval (10, 90).
dummy = np.random.normal(50, 20, 100)
dummy = [sample for sample in dummy if 10 < sample < 90]
tasks.append(dummy)


In [None]:
import matplotlib.pyplot as plt

def adjacent_values(vals, q1, q3):
    """Return the lower and upper whisker ends for one set of samples.

    The whiskers reach 1.5 interquartile ranges beyond the quartiles and are
    then clamped: the upper end to ``[q3, max(vals)]`` and the lower end to
    ``[min(vals), q1]``.

    Args:
        vals: Non-empty sequence of samples, in any order.
        q1: First quartile of ``vals``.
        q3: Third quartile of ``vals``.

    Returns:
        A ``(lower_adjacent_value, upper_adjacent_value)`` tuple.
    """
    # Take the real extremes instead of vals[0] / vals[-1]. The positional
    # form is only right for sorted input; with unsorted samples it clamps
    # the whiskers to the first/last sample and can put them inside the box.
    smallest, largest = np.min(vals), np.max(vals)
    reach = (q3 - q1) * 1.5

    upper_adjacent_value = np.clip(q3 + reach, q3, largest)
    lower_adjacent_value = np.clip(q1 - reach, smallest, q1)
    return lower_adjacent_value, upper_adjacent_value

fig, ax = plt.subplots(figsize=(22, 4))

# Number of violins in each of the five consecutive groups.
ops = [2, 2, 9, 9, 4]

# Map every x position (1-based, running across all groups) to the violin's
# 1-based rank inside its own group.
dic = {}
for group_size in ops:
    first_slot = len(dic) + 1
    for rank in range(1, group_size + 1):
        dic[first_slot + rank - 1] = rank
xticks = list(dic)
index = len(xticks) + 1
# Extra tick at x = 27, one past the 26 grouped positions.
xticks.append(27)

# One colour per group, repeated for each violin of that group.
colors = ["#e69f00", "#0072b2", "#009e73", "#cc79a7", "#56b4e9"]
cmap = [
    group_color
    for group_color, group_size in zip(colors, ops)
    for _ in range(group_size)
]

bplot = ax.violinplot(tasks, widths=0.75)

# zip stops at the shorter sequence, so any body beyond len(cmap) keeps
# matplotlib's default face colour.
for body, face_color in zip(bplot['bodies'], cmap):
    body.set_facecolor(face_color)

# Quartiles and median of every series. The lists are built from `tasks`
# itself rather than pre-sized to a fixed count, so they always have exactly
# one entry per plotted violin.
quartile1, medians, quartile3 = [], [], []
for series in tasks:
    q1, m, q3 = np.percentile(series, [25, 50, 75], axis=0)
    quartile1.append(q1)
    medians.append(m)
    quartile3.append(q3)

# Each series is sorted before computing whiskers: the samples in `tasks`
# are not ordered by value, and the whisker clamp needs the smallest value
# first and the largest last.
whiskers = np.array([
    adjacent_values(np.sort(series), q1, q3)
    for series, q1, q3 in zip(tasks, quartile1, quartile3)])
whiskers_min, whiskers_max = whiskers[:, 0], whiskers[:, 1]

# Violins sit at x = 1..N; overlay the box (thick), whiskers (thin) and
# median marker on each.
inds = np.arange(1, len(medians) + 1)
ax.vlines(inds, quartile1, quartile3, color='k', linestyle='-', lw=3)
ax.vlines(inds, whiskers_min, whiskers_max, color='k', linestyle='-', lw=1)
ax.scatter(inds, medians, marker='o', color='red', s=30, zorder=2, alpha=0.5)


ax.set_yticks([0, 50, 100])
ax.set_xticks(xticks)

# Tick label for each x position, left to right (one per tick).
labels = ["M", "SINK", "F", "SINK", "F", "GBW", "M", "M", "GB", "M", "J", "M", "SINK",  "F", "F", "GBW", "GBW", "M", "M", "J", "M" , "SINK", "F", "GBW", "M", "SINK", "OPE"]

ax.set_xticklabels(labels, rotation=0)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

# Thick separators between the groups, then a dotted one before x = 27.
for boundary in (2.5, 4.5, 13.5, 22.5):
    plt.axvline(x=boundary, color='k', linewidth=3.0)
plt.axvline(x=26.5, color='k', linestyle='dotted')

# Horizontal guides: a full-width line at 100 and shortened grey lines at
# 25/50/75 (xmax is a fraction of the axes width).
plt.axhline(y=100, color='g', linestyle='dotted')
for level in (25, 50, 75):
    plt.axhline(y=level, xmax=25.5/28, color='lightgray', linestyle='dotted')

plt.ylim(0, 110)
plt.xlim(0.5, 29)

# Text written at y = 105 above x = 0, 1, 2, ...; the leading "" keeps
# x = 0 empty so the numbers line up with the violins starting at x = 1.
parallelisms = ["", 12, 4, 5, 1, 3, 35, 1, 1, 2, 1, 3, 1, 1, 4, 4, 8, 10, 1, 1, 18, 1, 1, 4, 42, 1, 1, "//"]
for position, parallelism in enumerate(parallelisms):
    plt.text(position, 105, parallelism, fontsize=15, horizontalalignment='center')

# Group captions below the axis.
for x_center, caption in ((1.5, "Q1"), (3.5, "Q2"), (9, "Q5"), (18, "Q8"), (24.5, "Q11")):
    plt.text(x_center, -15, caption, fontsize=15, horizontalalignment='center', weight='semibold')

ax.set_ylabel("Busyness (%)", fontsize=15)
ax.xaxis.set_label_position('top')

# Hand-placed call-outs next to the violin at x = 27; all coordinates are
# fixed constants.
# NOTE(review): that violin appears to be drawn from np.random samples with
# no seed visible here, so these y positions may not match the drawn
# median/quartiles/extremes on every run -- confirm.
ax.plot([27, 27.75], [50.5, 50.5], "r--")
ax.text(27.85, 49.3, "Median", fontsize=12)

ax.plot([27, 27.75], [64, 64], "r--")
ax.text(28.2, 60, "Third\nquartile", horizontalalignment='center', fontsize=12)

ax.plot([27, 27.75], [40, 40], "r--")
ax.text(28.2, 36, "First\nquartile", horizontalalignment='center', fontsize=12)

ax.plot([27.18, 27.6], [107, 107], "r--")
ax.text(28.3, 106, "Parallelism", horizontalalignment='center', fontsize=12)

ax.plot([27, 27.75], [89, 89], "r--")
ax.text(27.85, 87.8, "Max", fontsize=12)

ax.plot([27, 27.75], [14.2, 14.2], "r--")
ax.text(27.85, 13, "Min", fontsize=12)

# Two-segment pointer (slanted, then horizontal) for the density call-out.
ax.plot([27.2, 27.6], [70, 75], "r--")
ax.plot([27.6, 27.75], [75, 75], "r--")
ax.text(27.85, 73.8, "Density", fontsize=12)

plt.tight_layout(pad=-1)
plt.savefig("boxplot2.pdf", format="pdf", bbox_inches="tight")
plt.show()
