In [None]:
import pandas as pd
import numpy as np
import json
import os
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

In [None]:
# Make seaborn's "colorblind" palette the default colour cycle for the plots drawn afterwards.
sns.set_palette("colorblind")

# Data Loading

In [None]:
# Root directory that contains the per-configuration `stats-*` result folders.
pwd = "."
# Directory every generated figure / CSV is written to.
figs_dir = "./figs"
# Create the output directory from Python instead of a shell escape: the
# created path always follows `figs_dir`, and no POSIX shell is required.
os.makedirs(figs_dir, exist_ok=True)

In [None]:
# Benchmark notebooks as `<owner>/<notebook-name>` slugs. The position in this
# list is the notebook index used in the `C_<nb>_<cell>` and `NB_<nb>` keys.
# The first ten entries (before the blank line) are the ones that
# `only_hits_nb` / `only_hits_cell` keep.
nbs = [
  "lextoumbourou/feedback3-eda-hf-custom-trainer-sift",
  "paultimothymooney/kaggle-survey-2022-all-results",
  "dataranch/supermarket-sales-prediction-xgboost-fastai",
  "kkhandekar/environmental-vs-ai-startups-india-eda",
  "ampiiere/animal-crossing-villager-popularity-analysis",
  "aieducation/what-course-are-you-going-to-take",
  "saisandeepjallepalli/adidas-retail-eda-data-visualization",
  "joshuaswords/netflix-data-visualization",
  "spscientist/student-performance-in-exams",
  "ibtesama/getting-started-with-a-movie-recommendation-system",

  "nickwan/creating-player-stats-using-tracking-data",
  "erikbruin/nlp-on-student-writing-eda",
  "madhurpant/beautiful-kaggle-2022-analysis",
  "pmarcelino/comprehensive-data-exploration-with-python",
  "gksriharsha/eda-speedtests",
  "mpwolke/just-you-wait-rishi-sunak",
  "sanket7994/imdb-dataset-eda-project",
  "roopacalistus/retail-supermarket-store-analysis",
  "sandhyakrishnan02/indian-startup-growth-analysis",
  "roopacalistus/exploratory-data-analysis-retail-supermarket"
]
# Short display names used as tick labels; entry i labels `nbs[i]`.
nb_labels = [
  "feedb-eda-hf-sift",
  "kaggle-survey-2022",
  "sales-pred-xgboost",
  "env-ai-startups-eda",
  "animal-crossing",
  "course-prediction",
  "adidas-retail-eda",
  "netflix-data-viz",
  "student-perf",
  "movie-recomm",

  "player-stats-tracking",
  "nlp-stud-writ-eda",
  "kaggle-2022-analysis",
  "data-expl-with-python",
  "eda-speedtests",
  "just-you-wait-rishi-sunak",
  "imdb-dataset-eda",
  "smarket-store-analysis",
  "indian-startup-growth",
  "eda-retail-supermarket"
]
# The plots look labels up by notebook index, so the two lists must stay parallel.
assert len(nbs) == len(nb_labels), "nbs and nb_labels must have one entry per notebook"

def only_hits_nb(df, num_hits=10):
  """Keep only the leading `num_hits` entries of a per-notebook result.

  Args:
    df: DataFrame or Series with one row/entry per notebook, in notebook-index
      order.
    num_hits: number of leading notebooks to keep (default 10, the size of the
      first group in `nbs` - presumably the notebooks the rewriter hits).

  Returns:
    The first `num_hits` rows/entries of `df`.

  Raises:
    AssertionError: if `df` is neither a DataFrame nor a Series.
  """
  # isinstance (rather than an exact type comparison) also accepts subclasses.
  assert isinstance(df, (pd.DataFrame, pd.Series))
  return df[:num_hits]

def only_hits_cell(df, num_hits=10):
  """Keep only the entries whose cell key belongs to one of the leading notebooks.

  Args:
    df: DataFrame or Series indexed by cell keys of the form
      `C_<nb_idx>_<cell_idx>`.
    num_hits: keep keys whose `<nb_idx>` is strictly below this value
      (default 10).

  Returns:
    `df` restricted (via `.loc`) to the matching index labels, in their
    original order.

  Raises:
    AssertionError: if `df` is neither a DataFrame nor a Series.
  """
  # isinstance (rather than an exact type comparison) also accepts subclasses.
  assert isinstance(df, (pd.DataFrame, pd.Series))
  hits = [key for key in df.index if int(key.split('_')[1]) < num_hits]
  return df.loc[hits]
  

# Number of repetitions per configuration; the loaders read sub-folders s-0 .. s-(num_runs - 1).
num_runs = 10
# Cell output: how many benchmark notebooks are listed.
len(nbs)

In [None]:
def read_stats(version, read_mem=True):
  """Load per-cell wall times (and optionally peak memory) of one configuration.

  For every notebook in `nbs` and every run `r` in `range(num_runs)` this reads
  `{pwd}/{version}/s-{r}/<owner>_<notebook>.json`.

  Args:
    version: name of the results folder below `pwd`.
    read_mem: when True, also collect the `max-mem-in-mb` entry of every run.

  Returns:
    A `(time_df, mem_df)` tuple. `time_df` has one column per cell, named
    `C_<nb_idx>_<cell_idx>`, and one row per run. `mem_df` has one column per
    notebook, named `NB_<nb_idx>`, and one row per run; it is None when
    `read_mem` is False.

  Raises:
    AssertionError: if the cells do not all have the same number of timings.
    OSError, KeyError: propagated when a result file or an expected key is
      missing.
  """
  cell_to_all_runs = {}
  nb_to_mem_runs = {}
  for nb_idx, nb in enumerate(nbs):
    # Both names depend only on the notebook, not on the run.
    nb_filename = '_'.join(nb.split('/')) + '.json'
    nb_mem_key = f'NB_{nb_idx}'
    for r in range(num_runs):
      filepath = f"{pwd}/{version}/s-{r}/{nb_filename}"
      # The context manager closes the file even if the JSON is malformed.
      with open(filepath, 'r') as fp:
        d = json.load(fp)
      if read_mem:
        # Store Memory Results. One per notebook
        nb_to_mem_runs.setdefault(nb_mem_key, []).append(d['max-mem-in-mb'])
      # Store timing results. One per cell.
      for c_idx, c in enumerate(d['cells']):
        key = f"C_{nb_idx}_{c_idx}"
        cell_to_all_runs.setdefault(key, []).append(c['wall-time'])

  ### Verify all cells have same number of timings ###
  # All pairs have equal length exactly when there is at most one distinct length.
  timings_per_cell = {len(runs) for runs in cell_to_all_runs.values()}
  assert len(timings_per_cell) <= 1, \
    f"cells have differing numbers of timings: {sorted(timings_per_cell)}"

  time_df = pd.DataFrame(cell_to_all_runs)
  mem_df = pd.DataFrame(nb_to_mem_runs) if read_mem else None
  return time_df, mem_df

In [None]:
# All versions have the raw cells. Pick one at random and get the raw.
def find_raw(cell_name):
  """Return the raw source of the cell identified by `cell_name`.

  Args:
    cell_name: cell key of the form `C_<nb_idx>_<cell_idx>`.

  Returns:
    The `raw` entry of that cell, read from run 4 of the
    `stats-rewr_OFF-modin_OFF-repl_STD-sliced_exec_ON` results.

  Raises:
    AssertionError: if the notebook has no cell with that index.

  Side effects:
    Prints the notebook slug.
  """
  spl = cell_name.split('_')
  search_nb_idx = int(spl[1])
  search_cell_idx = int(spl[2])
  nb = nbs[search_nb_idx]
  print(f"--- {nb} ---")
  nb_filename = '_'.join(nb.split('/')) + '.json'
  # Any run would do; a fixed one keeps the lookup deterministic.
  random_run = 4
  filepath = f"{pwd}/stats-rewr_OFF-modin_OFF-repl_STD-sliced_exec_ON/s-{random_run}/{nb_filename}"
  # The context manager closes the file even if the JSON is malformed.
  with open(filepath, 'r') as fp:
    d = json.load(fp)
  cells = d['cells']
  # Explicit bounds check so a negative index cannot silently wrap around.
  assert 0 <= search_cell_idx < len(cells), \
    f"{cell_name}: notebook has no cell with index {search_cell_idx}"
  return cells[search_cell_idx]['raw']

# Rewriter vs Pandas

In [None]:
# Per-cell timings with the rewriter OFF (baseline) and ON; the memory frames are discarded here.
orig, _ = read_stats("stats-rewr_OFF-modin_OFF-repl_STD-sliced_exec_ON")
rewr, _ = read_stats("stats-rewr_ON-modin_OFF-repl_STD-sliced_exec_ON")

## Discard Very Fast Cells

In [None]:
# Cells whose mean baseline wall time is below 50 are removed from both
# frames; `cells_to_drop` remembers which ones were removed.
cells_to_drop = [c for c in orig.columns if orig[c].mean() < 50]
orig = orig.drop(cells_to_drop, axis=1)
rewr = rewr.drop(cells_to_drop, axis=1)

# Number of cells left for the comparison.
print(len(rewr.columns))

## Coefficient of Variation

In [None]:
# Coefficient of variation
def coef_of_var(df):
  """Return, for each column of `df`, its standard deviation divided by its mean."""
  def _cv(cell):
    spread = cell.std()
    centre = cell.mean()
    return spread / centre

  return df.apply(_cv)

# Run-to-run variability of the baseline timings, one value per cell column of `orig`.
CV_orig = coef_of_var(orig)
# Histogram of those per-cell coefficients of variation.
CV_orig.hist(bins=60, figsize=(20, 4))

## Mean Difference

In [None]:
# Per-cell mean wall time across runs, for the baseline and for the rewritten runs.
mean_orig = orig.apply(lambda cell: cell.mean())
mean_rewr = rewr.apply(lambda cell: cell.mean())
# Positive difference: the rewritten run was faster on average for that cell.
mean_diff = mean_orig - mean_rewr
mean_df = pd.DataFrame({'Mean Diff': mean_diff})

In [None]:
# For each row (its label is a cell key), attach the standard deviation of that cell's baseline timings.
mean_df['Std. Dev.'] = mean_df.apply(lambda c: orig[c.name].std(), axis=1)

### Largest Speedups

In [None]:
# The 20 cells with the largest absolute time saved (baseline mean minus rewritten mean).
mean_df.nlargest(n=20, columns='Mean Diff')

### Largest Slowdowns

In [None]:
# The 10 cells with the smallest mean difference, i.e. where the rewritten run lost the most time.
largest_abs_slowdowns = mean_df.nsmallest(n=10, columns='Mean Diff')
largest_abs_slowdowns

## Relative Speedups

In [None]:
# Relative speedup per cell: a value above 1 means the rewritten run was faster.
mean_speedup = mean_orig / mean_rewr
# Restrict to cells of notebooks with index below 10.
mean_speedup = only_hits_cell(mean_speedup)
# Replace the cell-key labels with positions 0..n-1; the speedups end up in column 0.
index_to_nums = mean_speedup.reset_index().drop('index', axis=1)
# Sort ascending and keep that column as a Series.
index_to_nums = index_to_nums.sort_values(by=0)[0]
# Leave out cells whose speedup lies strictly between 0.9 and 1.1.
drop_within_10_perc = index_to_nums.drop(index_to_nums[(0.9 < index_to_nums) & (index_to_nums < 1.1)].index)

# Number of cells that remain in the plot.
print(len(drop_within_10_perc))

thing_to_plot = drop_within_10_perc
ax = thing_to_plot.plot(kind="bar", figsize=(20, 4) )

# Reference lines: red at 1 (no change), dashed at 2 and 5.
ax.axhline(1, color='red')
ax.axhline(2, color=(0.1, 0.2, 0.5, 0.6), dashes=(5, 2, 1, 2))
ax.axhline(5, color=(0.1, 0.2, 0.5, 0.6), dashes=(5, 2, 1, 2))

ax.yaxis.set_tick_params(labelsize=18)

ax.set_xlabel("Cells", fontsize=20)
ax.set_ylabel("Relative Speedup", fontsize=20)

# Log scale so speedups and slowdowns of the same factor have the same bar length.
ax.set_yscale("log")

# One bar per cell: individual x tick labels are hidden.
ax.set_xticks([])
# Fixed ticks, printed as plain numbers instead of powers of ten.
ax.set_yticks([0.2, 0.5, 1, 2, 5, 10, 20, 50])
ax.get_yaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
# ax.set_xticks([])
plt.savefig(f'{figs_dir}/cell_level.pdf', bbox_inches='tight')
ax

In [None]:
# Drop every cell whose speedup exceeds 0.9, leaving only cells with speedup <= 0.9.
only_cell_slowdowns = mean_speedup.drop(mean_speedup[(0.9 < mean_speedup)].index)
only_cell_slowdowns = only_cell_slowdowns.sort_values()

# ax = only_cell_slowdowns.plot(kind="bar", figsize=(6, 2), color=(56/255, 176/255, 194/255, 1))
# ax = only_cell_slowdowns.plot(kind="bar", figsize=(8, 2), color=(183/255, 19/255, 104/255, 1))
ax = only_cell_slowdowns.plot(kind="bar", figsize=(12, 4))
ax.set_ylabel('Relative Speedup', fontsize=17)
ax.set_xlabel('Cells', fontsize=17)
# ax.axhline(1, color='red')
ax.yaxis.set_tick_params(labelsize=15)
ax.grid(visible=True, axis='y')
# The y axis only shows the 0.6 - 1.0 range.
plt.ylim(top=1.0, bottom=0.6)
ax.set_xticks([])
ax.set_yticks([x for x in np.arange(0.6, 1.0 + 0.001, 0.1)])

# One rectangle per bar, in plotting order.
rects = ax.patches

# Negated mean difference = average time lost by the rewritten run for each plotted cell.
abs_values = [-mean_df.loc[i]['Mean Diff'] for i in only_cell_slowdowns.index]

# Annotate each bar, just above its top, with that value truncated to an integer and suffixed "ms".
for rect, label in zip(rects, abs_values):
    height = rect.get_height()
    ax.text(
        rect.get_x() + rect.get_width() / 2, height + 0.01, str(int(label)) + "ms", ha="center", va="bottom", fontsize=15
    )


plt.savefig(f'{figs_dir}/cells_only_slowdowns.pdf', bbox_inches='tight', pad_inches=0)
ax

In [None]:
# The ten highest per-cell relative speedups.
mean_speedup.nlargest(10)

In [None]:
# The ten lowest per-cell relative speedups.
mean_speedup.nsmallest(10)

## Geomean Per-Cell Relative Speedup

In [None]:
# Geometric mean of the per-cell speedups (already restricted by only_hits_cell above).
stats.gmean(mean_speedup)

## Whole Notebooks

In [None]:
# Highest notebook index among the columns of `orig` (column names look like `C_<nb>_<cell>`).
max_nb = max([int(nb.split("_")[1]) for nb in orig.columns])
def get_whole_mean(df):
    """Mean whole-notebook time for notebooks 0 .. `max_nb`.

    `df` has one column per cell (named `C_<nb>_<cell>`) and one row per run.
    For each notebook index, the cell columns of that notebook are summed per
    run and the per-run totals are averaged.

    Returns a Series with one entry per notebook index.
    """
    def _nb_mean(nb_idx):
        wanted = str(nb_idx)
        # All cells belonging to this notebook.
        nb_cols = [name for name in df.columns if name.split("_")[1] == wanted]
        # Row-wise sum = one whole-notebook time per run; then average the runs.
        return df[nb_cols].sum(axis=1).mean()

    return pd.Series([_nb_mean(nb_idx) for nb_idx in range(max_nb + 1)])

In [None]:
# Mean whole-notebook time (sum over the cell columns) for the baseline and the rewritten runs.
orig_whole = get_whole_mean(orig)
rewr_whole = get_whole_mean(rewr)

In [None]:
# Whole-notebook relative speedup: a value above 1 means the rewritten run was faster.
mean_whole_speedup = orig_whole / rewr_whole

# Keep the first ten notebooks only.
mean_whole_speedup = only_hits_nb(mean_whole_speedup)

# Replace the integer notebook indices with their short display names.
new_index = []
for nb_idx in mean_whole_speedup.index:
  new_index.append(nb_labels[nb_idx])
mean_whole_speedup.index = pd.Series(new_index)

# Bars sorted from the lowest to the highest speedup; the red line marks "no change".
to_plot_sorted = mean_whole_speedup.sort_values()
ax = to_plot_sorted.plot(kind="bar", figsize=(10, 6) )
ax.set_ylabel('Relative Speedup', fontsize=20)
ax.set_xlabel('Notebooks', fontsize=20)
ax.axhline(1, color='red')
ax.grid(visible=True, axis='y')
ax.yaxis.set_tick_params(labelsize=18)
plt.xticks(rotation=70, fontsize=14)
plt.savefig(f'{figs_dir}/nb_level.pdf', bbox_inches='tight')
ax

In [None]:
# The plotted per-notebook speedups as a table, ascending.
mean_whole_speedup.sort_values()

## Geomean Per-NB Relative Speedup

In [None]:
# Geometric mean of the per-notebook speedups (first ten notebooks only, see only_hits_nb above).
stats.gmean(mean_whole_speedup)

# Compare with Modin

In [None]:
# Timings and peak memory of the `repl_LESS` configurations: baseline, rewriter ON,
# and three Modin settings (12 / 8 / 4 - presumably the number of workers; TODO confirm).
base_less_repl, mem_base = read_stats("stats-rewr_OFF-modin_OFF-repl_LESS-sliced_exec_ON")
rewr_less_repl, mem_rewr = read_stats("stats-rewr_ON-modin_OFF-repl_LESS-sliced_exec_ON")
modin_12, mem_modin12 = read_stats("stats-rewr_OFF-modin_12-repl_LESS-sliced_exec_ON")
modin_8, mem_modin8 = read_stats("stats-rewr_OFF-modin_8-repl_LESS-sliced_exec_ON")
modin_4, mem_modin4 = read_stats("stats-rewr_OFF-modin_4-repl_LESS-sliced_exec_ON")

In [None]:
# Per-cell mean wall time across runs for each configuration.
_mod_mean_base = base_less_repl.apply(lambda cell: cell.mean())
_mod_mean_rewr = rewr_less_repl.apply(lambda cell: cell.mean())
_mod_mean_modin12 = modin_12.apply(lambda cell: cell.mean())
_mod_mean_modin8 = modin_8.apply(lambda cell: cell.mean())
_mod_mean_modin4 = modin_4.apply(lambda cell: cell.mean())


# Time saved relative to the baseline (positive = faster than baseline), per cell.
_mod_rewr_diff = _mod_mean_base - _mod_mean_rewr
_mod_modin12_diff = _mod_mean_base - _mod_mean_modin12
_mod_mean_df = pd.DataFrame({'Mean Rewr': _mod_rewr_diff, 'Mean Modin-12': _mod_modin12_diff})

In [None]:
# The ten cells where Modin-12 saves the most time over the baseline.
_mod_mean_df.nlargest(n=10, columns='Mean Modin-12')

In [None]:
# Largest per-cell ratio of Modin-12 mean time to rewritten mean time.
(_mod_mean_modin12 / _mod_mean_rewr).max()

## Whole Notebooks

In [None]:
# Mean whole-notebook time for each configuration of the Modin comparison.
_mod_base_whole = get_whole_mean(base_less_repl)
_mod_rewr_whole = get_whole_mean(rewr_less_repl)
_mod_mod12_whole = get_whole_mean(modin_12)
_mod_mod8_whole = get_whole_mean(modin_8)
_mod_mod4_whole = get_whole_mean(modin_4)

In [None]:
# Whole-notebook speedup over the baseline for each configuration.
# Despite the "gmean" in the names, these are ratios of the arithmetic means
# returned by get_whole_mean.
_mod_gmean_rewr_whole_speedup = _mod_base_whole / _mod_rewr_whole
_mod_gmean_mod12_whole_speedup = _mod_base_whole / _mod_mod12_whole
_mod_gmean_mod8_whole_speedup = _mod_base_whole / _mod_mod8_whole
_mod_gmean_mod4_whole_speedup = _mod_base_whole / _mod_mod4_whole
# One column per system; 'Dias' is the legend label used for the rewriter-ON runs.
whole_nb_df = pd.DataFrame({'Dias': _mod_gmean_rewr_whole_speedup, 
                            'Modin-4': _mod_gmean_mod4_whole_speedup,
                            'Modin-8': _mod_gmean_mod8_whole_speedup,
                            'Modin-12': _mod_gmean_mod12_whole_speedup
                            })

# Keep the first ten notebooks only.
whole_nb_df = only_hits_nb(whole_nb_df)

# Replace the integer notebook indices with their short display names.
new_index = []
for nb_idx in whole_nb_df.index:
  new_index.append(nb_labels[nb_idx])
whole_nb_df = whole_nb_df.set_index(pd.Series(new_index))


# Grouped bars (one group per notebook); the red line marks "same speed as the baseline".
ax = whole_nb_df.plot(kind="bar", width=0.7, figsize=(20, 4) )
ax.set_ylabel('Relative Speedup', fontsize=20)
ax.set_xlabel('Notebooks', fontsize=20)
ax.axhline(1, color='red')
ax.set_yticks([x for x in np.arange(0, 3.6, 0.5)])
ax.grid(visible=True, axis='y')
ax.yaxis.set_tick_params(labelsize=20)
ax.legend(fontsize=20)
plt.xticks(rotation=70, fontsize=14)
plt.savefig(f'{figs_dir}/modin_nb.pdf', bbox_inches='tight')
ax

## Summary Numbers vs Modin

In [None]:
# Whole-notebook time of Modin-12 divided by that of the rewritten run (above 1: the rewritten run is faster).
# Computed over every notebook index; only_hits_nb is not applied here.
speedup_vs_modin = (_mod_mod12_whole / _mod_rewr_whole)

In [None]:
# Best whole-notebook ratio against Modin-12.
speedup_vs_modin.max()

In [None]:
# Geometric mean of the whole-notebook ratios against Modin-12.
stats.gmean(speedup_vs_modin)

# Memory Consumption

In [None]:
# Mean of the per-run `max-mem-in-mb` values, one entry per notebook (`NB_<idx>`).
mean_mem_base = mem_base.apply(lambda nb: nb.mean())
mean_mem_rewr = mem_rewr.apply(lambda nb: nb.mean())
mean_mem_modin4 = mem_modin4.apply(lambda nb: nb.mean())
# Extra memory relative to the baseline (positive = uses more than the baseline).
rewr_mem_diff = mean_mem_rewr - mean_mem_base
# NOTE(review): despite its name, this is computed from the Modin-4 numbers, not Modin-12.
modin12_mem_diff = mean_mem_modin4 - mean_mem_base
# mean_mem_df = pd.DataFrame({'SYSTEM': rewr_mem_diff, 'Modin-4': modin12_mem_diff})

In [None]:
# Mem Usage .csv
# One row per notebook, one column per system; the full table is written to `<figs_dir>/test.csv`.
mean_mem_df = pd.DataFrame({'Pandas': mean_mem_base, 'Modin-4': mean_mem_modin4, 'Dias': mean_mem_rewr})
mean_mem_df.to_csv(f'{figs_dir}/test.csv')

In [None]:
# Keep the first ten notebooks only.
mean_mem_df = only_hits_nb(mean_mem_df)

In [None]:
# Remove those we could not measure
# Rows are dropped when the Modin-4 mean equals -1.
# NOTE(review): presumably -1 is recorded when a measurement failed; the mean is
# exactly -1 only if every run is -1 - confirm that failures are all-or-nothing.
mean_mem_df = mean_mem_df.loc[mean_mem_modin4 != -1]
mean_mem_df

In [None]:
# gmean_mem_df['Modin-4'] = 1 - gmean_mem_df['Modin-4']


In [None]:
# Translate every remaining `NB_<idx>` row label into the notebook's short display name.
new_index = [nb_labels[int(nb.split('_')[1])] for nb in mean_mem_df.index]
# Relabel the rows and scale every column by 1/1024 (the inputs are `max-mem-in-mb` values).
mean_mem_df_names = mean_mem_df.set_index(pd.Series(new_index)) / 1024
mean_mem_df_names

In [None]:
# Memory-usage figure with a broken y axis: the same bars are drawn on two
# stacked axes, the upper one showing 40-90 and the lower one 0-20.
import matplotlib.gridspec as gridspec

nrows = 2
ncols = 1
# The upper (cut-off) panel takes 20% of the height, the lower one 80%.
gs = gridspec.GridSpec(nrows, ncols, height_ratios=[0.2, 0.8])
fig = plt.figure(figsize=(16, 6), dpi=100)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])
fig.subplots_adjust(hspace=0.2)

# -.5 otherwise some bars are hidden on the left.
xlim = [-0.5, len(mean_mem_df_names)]
top_y = [40, 90]
bott_y = [0, 20]
# Index 1 holds the limits of ax1, index 2 those of ax2; index 0 is an unused placeholder.
ylim = [[], top_y, bott_y]

bar_width = 0.8

# three_colors = [(0.643, 0.424, 0.718, 1), (0.478, 0.643, 0.341, 1), (0.796, 0.416, 0.286, 1)]
# mean_mem_df_names.plot(kind='bar', width=bar_width, ax=ax1, legend=False, color=three_colors)
# Upper panel: same bars, no legend.
mean_mem_df_names.plot(kind='bar', width=bar_width, ax=ax1, legend=False)

ax1.set_xlim(*xlim)
ax1.set_ylim(*ylim[1])
# ax1.grid(color='lightgrey')

# mean_mem_df_names.plot(kind='bar', width=bar_width, ax=ax2, legend=True, color=three_colors)
# Lower panel: same bars, carries the legend and the axis labels.
mean_mem_df_names.plot(kind='bar', width=bar_width, ax=ax2, legend=True)
ax2.legend(fontsize=20)
ax2.set_xlim(*xlim)
ax2.set_ylim(*ylim[2])
ax2.set_xlabel('Notebooks', labelpad=20, fontsize=25)
ax2.set_ylabel('Memory+Disk Usage (GB)', fontsize=25)
# ax2.grid(color='lightgrey')
# Shift the y label upwards so it sits next to both panels.
ax2.yaxis.set_label_coords(-0.08, 0.7)

ax2.xaxis.set_tick_params(labelsize=20, rotation=70)

ax1.yaxis.set_tick_params(labelsize=25)
ax2.yaxis.set_tick_params(labelsize=25)
# ax2.yaxis.offsetText.set_visible(False)
# ax2.yaxis.get_major_ticks()[0].label1.set_visible(False)
# ax2.ticklabel_format(axis='y', style='sci', scilimits=(6, 6))

# Hide the facing spines and the upper panel's x ticks so the two panels read as one chart.
ax1.spines['bottom'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax1.set_xticks([])

ax1.set_yticks([40, 90])
ax2.set_yticks([0, 5, 10, 20])

# Dashed guide lines; the +/-0.1 offsets keep the lines at 40 and 20 just inside their axes.
ax1.axhline(40+0.1, color=(0.1, 0.2, 0.5, 0.3), dashes=(5, 2, 1, 2))
ax2.axhline(20 - 0.1, color=(0.1, 0.2, 0.5, 0.3), dashes=(5, 2, 1, 2))
ax2.axhline(10, color=(0.1, 0.2, 0.5, 0.3), dashes=(5, 2, 1, 2))
ax2.axhline(5, color=(0.1, 0.2, 0.5, 0.3), dashes=(5, 2, 1, 2))

# Slanted tick marks at the left and right ends of the cut, drawn in axes coordinates.
slant = 0.5
kwargs = dict(marker=[(-1, -slant), (1, slant)], markersize=12,
        linestyle='none', color='k', mec='k', mew=1, clip_on=False)
ax1.plot([0, 1], [0, 0], transform=ax1.transAxes, **kwargs)
ax2.plot([0, 1], [1, 1], transform=ax2.transAxes, **kwargs)

plt.savefig(f'{figs_dir}/mem_usage.pdf', bbox_inches='tight')

In [None]:
# Per-notebook difference in mean peak memory, rewritten minus baseline (MB).
rewr_mem_diff

In [None]:
# The same difference for the notebook at index 7 of `nbs`.
rewr_mem_diff['NB_7']

# Ablation Sliced Execution

In [None]:
# Timings with the rewriter ON but sliced execution OFF; the memory frame is discarded.
rewr_se_off, _ = read_stats("stats-rewr_ON-modin_OFF-repl_STD-sliced_exec_OFF")

In [None]:
# Keep exactly the cells that are still present in `orig`, i.e. the ones that
# survived the "Discard Very Fast Cells" step (which was applied to `orig` and
# `rewr` alike). Re-testing `orig[c].mean() < 50` here can never match, because
# `orig` no longer contains the fast cells. The sliced-exec-off frame would then
# keep them, and its whole-notebook sums would not be comparable with
# `rewr_whole`.
rewr_se_off = rewr_se_off.loc[:, rewr_se_off.columns.isin(orig.columns)]

In [None]:
# Per-cell mean wall time across runs with sliced execution OFF.
mean_rewr_se_off = rewr_se_off.apply(lambda cell: cell.mean())

In [None]:
# Whole-notebook time with sliced execution OFF, divided by the time with it ON
# (a value above 1 means turning sliced execution off is slower).
rewr_se_off_whole = get_whole_mean(rewr_se_off)
rewr_se_off_speedup = rewr_se_off_whole / rewr_whole

In [None]:
# Keep the first ten notebooks only.
rewr_se_off_speedup = only_hits_nb(rewr_se_off_speedup)

# Replace the integer notebook indices with their short display names.
new_index = []
for nb_idx in rewr_se_off_speedup.index:
  new_index.append(nb_labels[int(nb_idx)])
rewr_se_off_speedup.index = pd.Series(new_index)

# ax = rewr_se_off_speedup.plot(kind="bar", figsize=(6, 2), color=(0.2, 0.4, 0.6, 0.6))
ax = rewr_se_off_speedup.plot(kind="bar", figsize=(6, 2))
ax.set_ylabel('Relative Slowdown')
ax.set_xlabel('Notebooks')
# The red line marks "no difference"; the y axis is clipped to 0.9 - 1.15.
ax.axhline(1, color='red')
ax.grid(visible=True, axis='y')
ax.xaxis.set_tick_params(labelsize=8, rotation=70)
plt.ylim(top=1.15, bottom=0.9)
plt.savefig(f'{figs_dir}/no_sliced_exec.pdf', bbox_inches='tight', pad_inches=0)
ax

# Rewriter Stats

In [None]:
# For every cell key `C_<nb_idx>_<cell_idx>`, collect that cell's record from
# each run of the `stats-rewr_stats` results (one list entry per run).
cell_rewr_stats = dict()
for nb_idx, nb in enumerate(nbs):
  # The file name depends only on the notebook, not on the run.
  nb_filename = '_'.join(nb.split('/')) + '.json'
  for r in range(num_runs):
    filepath = f"{pwd}/stats-rewr_stats/s-{r}/{nb_filename}"
    # The context manager closes the file even if the JSON is malformed.
    with open(filepath, 'r') as fp:
      d = json.load(fp)
    for c_idx, c in enumerate(d['cells']):
      key = f"C_{nb_idx}_{c_idx}"
      # setdefault creates the list on first sight without a catch-all except.
      cell_rewr_stats.setdefault(key, []).append(c)

## How Many Patterns We Needed

In [None]:
# Union of the `patts-hit` keys over every run of every cell.
patts_needed = {
  patt
  for runs in cell_rewr_stats.values()
  for run in runs
  for patt in run['patts-hit'].keys()
}

# Number of distinct patterns that were hit at least once.
len(patts_needed)

## Max and Mean Absolute Overheads (in any run)

In [None]:
# NOTE: We find the max overhead _in any run_, not means.

# Overhead of one run of one cell: its wall time minus its `rewritten-exec-time`.
all_ohs = [
  run['wall-time'] - run['rewritten-exec-time']
  for cell in cell_rewr_stats.values()
  for run in cell
]

max_oh = max(all_ohs)
print(f"Maximum Overhead: {max_oh: .2f}ms")
# NOTE(review): a geometric mean is only meaningful for strictly positive
# values - confirm that no run reports a zero or negative overhead.
mean_oh = stats.gmean(all_ohs)
print(f"GMean Overhead: {mean_oh: .2f}ms")

## Do we ever slow down a rewritten cell (beyond interactive latency)?

In [None]:
# Pretty small value for interactive latency
interactive_latency = 10
slowdown_rewr_cell = False

# Visit every cell that has rewriter stats. For cells where a pattern hit,
# compare the rewritten mean time with the original mean time and report the
# cell when the rewritten one is slower by more than the interactive latency.
for cell_key, runs in cell_rewr_stats.items():
  # Dropped as very fast. Skip
  if cell_key not in orig.columns:
    continue
  # Get only one run. If we hit, we hit in all the runs.
  cell = runs[0]
  if cell['patts-hit'] == {}:
    continue
  diff = mean_rewr[cell_key] - mean_orig[cell_key]
  if diff > interactive_latency:
    print(f"Slowdown: {diff: .2f}")
    slowdown_rewr_cell = True

if not slowdown_rewr_cell:
  print("NO!")