### Results for: "Cloud-compatible pipelines for processing and evaluating large-scale electrophysiology data"

This notebook reproduces the panels for Figures 4, 5 and S1 of the *Cloud-compatible pipelines for processing and evaluating large-scale electrophysiology data* manuscript.

In [None]:
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from itertools import combinations
from functools import reduce

import warnings

from utils import plot_aggregated_results, plot_performance_curves, stat_test

%matplotlib ipympl

In [None]:
# Silence every warning category for the rest of the notebook.
# NOTE(review): this blanket filter also hides deprecation/future warnings
# raised by the plotting and data libraries — consider narrowing it.
warnings.filterwarnings("ignore")

In [None]:
# When True, the plotting cells below also write their figures to disk as PDFs.
save_figs = True

In [None]:
# Input CSVs are read from `data_folder`; figures are written under `results_folder`.
# Both are relative paths, so they resolve against the current working directory.
data_folder = Path("../data")
results_folder = Path("../results")

# Sorters

In [None]:
# Destination folder for the sorter-comparison figures (created if missing).
figures_sorters = results_folder.joinpath("sorters")
figures_sorters.mkdir(exist_ok=True, parents=True)

In [None]:
# Load the per-probe metrics tables of the sorter comparison and pool them.
sorters_metrics_dir = data_folder / "sorters"
np1_metrics, np2_metrics = (
    pd.read_csv(sorters_metrics_dir / probe_name / "metrics_sorted.csv", index_col=False)
    for probe_name in ("np1", "np2")
)
all_metrics_sorters = pd.concat([np1_metrics, np2_metrics])

In [None]:
# Fixed colour per sorter (matplotlib colour-cycle aliases).
colors_sorters = dict(kilosort25="C3", kilosort4="C4")
# Sorted unique sorting cases; used as the category order in the plots below.
order_sorters = np.unique(all_metrics_sorters["sorting_case"])

In [None]:
# Boxen plot of the "rp_contamination" metric per sorting case (log-scaled axis).
metric_res_name = "rp_contamination"
fig, ax = plt.subplots(figsize=(12, 7))
# seaborn >= 0.13 deprecates passing `palette` without `hue` (removal announced
# for v0.14). Assigning the x variable to `hue` and disabling the legend is the
# documented replacement and yields the same per-category colouring.
sns.boxenplot(
    data=all_metrics_sorters,
    x="sorting_case",
    y=metric_res_name,
    hue="sorting_case",
    order=order_sorters,
    palette=colors_sorters,
    legend=False,
    showfliers=False,
    log_scale=True,
    ax=ax,
)
sns.despine(fig)
# Labels and tick labels are blanked in the exported panel.
ax.set_ylabel("")
ax.set_xlabel("")
ax.set_xticklabels([])
if save_figs:
    fig.savefig(figures_sorters / "rp_violations.pdf")

In [None]:
# Outline histogram of the "presence_ratio" metric, one curve per sorting case.
metric_res_name = "presence_ratio"
fig, ax = plt.subplots(figsize=(12, 7))
sns.histplot(
    data=all_metrics_sorters,
    x=metric_res_name,
    hue="sorting_case",
    stat="probability",
    palette=colors_sorters,
    bins=20,
    fill=False,
    ax=ax,
)
sns.despine(fig)
# Drop the legend and blank both axis labels for the exported panel.
ax.legend().remove()
ax.set(ylabel="", xlabel="")
if save_figs:
    fig.savefig(figures_sorters / "presence_ratio.pdf")

In [None]:
# Unpaired statistical comparison of the metrics across sorting cases.
# NOTE(review): the boxen plot above shows "rp_contamination" whereas this test
# uses "rp_violations" — confirm that testing a different column is intended.
test_res = stat_test(
    all_metrics_sorters,
    column_group_by="sorting_case",
    test_columns=["rp_violations", "presence_ratio"],
    paired=False,
    verbose=True,
)

In [None]:
# Number of rows per sorting case in the pooled sorter metrics.
(
    all_metrics_sorters
    .groupby("sorting_case")["sorting_case"]
    .count()
)

In [None]:
# Read every CSV in each probe folder of the sorter comparison.
# Result layout: {probe: {csv file stem: DataFrame}}.
dataframes_sorters_all = {
    probe: {
        csv_file.stem: pd.read_csv(csv_file, index_col=False)
        for csv_file in (data_folder / "sorters" / probe).iterdir()
        if csv_file.suffix == ".csv"
    }
    for probe in ["np1", "np2"]
}

In [None]:
# Per probe: build the aggregated figures, run the paired tests on the
# performance table, and optionally export each figure as a PDF.
figs_probe = {}
for probe in dataframes_sorters_all:
    dataframes = dataframes_sorters_all[probe]
    figs, dfs_merged = plot_aggregated_results(dataframes, colors_sorters)
    # test
    print(f"\n\nTests for probe {probe}:")
    test_res = stat_test(
        dataframes["performances"],
        column_group_by="sorting_case",
        test_columns=["accuracy", "precision", "recall"],
        paired=True,
        verbose=True,
    )
    figs_probe[probe] = figs
    if not save_figs:
        continue
    for fname, fig in figs.items():
        fig.savefig(figures_sorters / f"{probe}_{fname}.pdf")

In [None]:
# Plot run times per sorting case, one figure per probe.
for probe, df_all in dataframes_sorters_all.items():
    print(f"Probe {probe}")
    df_runtimes = df_all["run_times"]
    print(df_runtimes.groupby("sorting_case").count()["run_times"])
    # Relative run time = run_times / duration. The column is added to the frame
    # stored in `dataframes_sorters_all`, so it persists after this cell.
    df_runtimes.loc[:, "run_times_rel"] = df_runtimes["run_times"] / df_runtimes["duration"]
    fig, ax = plt.subplots(figsize=(7, 7))
    # seaborn >= 0.13 deprecates `palette` without `hue` (removal announced for
    # v0.14); assign the x variable to `hue` and disable the legend instead.
    sns.boxenplot(
        data=df_runtimes,
        x="sorting_case",
        y="run_times_rel",
        hue="sorting_case",
        order=order_sorters,
        palette=colors_sorters,
        legend=False,
        showfliers=True,
        log_scale=False,
        ax=ax,
    )
    sns.despine(fig)
    # Labels and tick labels are blanked in the exported panel.
    ax.set_ylabel("")
    ax.set_xlabel("")
    ax.set_xticklabels([])
    if save_figs:
        fig.savefig(figures_sorters / f"run_times_{probe}.pdf")

# Lossy

In [None]:
# Destination folder for the lossy-compression figures (created if missing).
figures_lossy = results_folder.joinpath("lossy")
figures_lossy.mkdir(exist_ok=True, parents=True)

In [None]:
# Load the per-probe metrics tables of the lossy comparison.
# Note: this rebinds `np1_metrics` / `np2_metrics` from the sorter section.
lossy_metrics_dir = data_folder / "lossy"
np1_metrics, np2_metrics = (
    pd.read_csv(lossy_metrics_dir / probe_name / "metrics_sorted.csv", index_col=False)
    for probe_name in ("np1", "np2")
)

In [None]:
# Pool both probes into one table (row labels of the inputs are kept as-is).
all_metrics_lossy = pd.concat((np1_metrics, np2_metrics))

In [None]:
# Number of rows per sorting case in the pooled lossy metrics.
(
    all_metrics_lossy
    .groupby("sorting_case")["sorting_case"]
    .count()
)

In [None]:
# Category order: first sorted case stays first, the remaining ones are reversed.
# NOTE(review): presumably the first sorted case is "lossless" — confirm.
lossy_cases = np.unique(all_metrics_lossy["sorting_case"])
order_lossy = [lossy_cases[0], *lossy_cases[1:][::-1]]

# One shade of the reversed "Greens" colormap per non-first case. The sample
# positions span 0.2-0.6 with as many points as there are cases, so the last
# position is left unused.
cmap_is = np.linspace(0.2, 0.6, len(order_lossy))
cmap = plt.get_cmap("Greens_r")
colors_lossy = {case_name: cmap(level) for case_name, level in zip(order_lossy[1:], cmap_is)}
colors_lossy["lossless"] = "C1"

In [None]:
# Boxen plot of the "rp_contamination" metric per compression case (log-scaled axis).
metric_res_name = "rp_contamination"
fig, ax = plt.subplots(figsize=(12, 7))
# seaborn >= 0.13 deprecates passing `palette` without `hue` (removal announced
# for v0.14). Assigning the x variable to `hue` and disabling the legend is the
# documented replacement and yields the same per-category colouring.
sns.boxenplot(
    data=all_metrics_lossy,
    x="sorting_case",
    y=metric_res_name,
    hue="sorting_case",
    order=order_lossy,
    palette=colors_lossy,
    legend=False,
    showfliers=False,
    log_scale=True,
    ax=ax,
)
sns.despine(fig)
# Labels and tick labels are blanked in the exported panel.
ax.set_ylabel("")
ax.set_xlabel("")
ax.set_xticklabels([])
if save_figs:
    fig.savefig(figures_lossy / "rp_violations.pdf")

In [None]:
# Outline histogram of the "presence_ratio" metric, one curve per compression case.
metric_res_name = "presence_ratio"
fig, ax = plt.subplots(figsize=(12, 7))
sns.histplot(
    data=all_metrics_lossy,
    x=metric_res_name,
    hue="sorting_case",
    stat="probability",
    palette=colors_lossy,
    bins=20,
    fill=False,
    ax=ax,
)
sns.despine(fig)
# Drop the legend and blank both axis labels for the exported panel.
ax.legend().remove()
ax.set(ylabel="", xlabel="")
# Optional zoom on the x axis (disabled):
# ax.set_xlim(-0.02, 0.3)
if save_figs:
    fig.savefig(figures_lossy / "presence_ratio.pdf")

In [None]:
# Statistical comparison of the metrics across compression cases (sig=0.01).
# NOTE(review): the boxen plot above shows "rp_contamination" whereas this test
# uses "rp_violations" — confirm that testing a different column is intended.
test_res = stat_test(
    all_metrics_lossy,
    column_group_by="sorting_case",
    test_columns=["rp_violations", "presence_ratio"],
    verbose=True,
    sig=0.01,
)

In [None]:
# Read every CSV in each probe folder of the lossy comparison.
# Result layout: {probe: {csv file stem: DataFrame}}.
dataframes_lossy_all = {
    probe: {
        csv_file.stem: pd.read_csv(csv_file, index_col=False)
        for csv_file in (data_folder / "lossy" / probe).iterdir()
        if csv_file.suffix == ".csv"
    }
    for probe in ["np1", "np2"]
}

In [None]:
# Per probe: build the aggregated lossy-compression figures, overlay the
# kilosort25 curve from the sorter comparison, run paired tests on the
# performance table, and optionally export each figure as a PDF.

# Columns that together identify one sample to be paired across sorting cases.
pair_keys = ["stream_name", "session", "case", "gt_unit_id"]

figs_probe = {}
for probe, dataframes in dataframes_lossy_all.items():
    figs, _ = plot_aggregated_results(dataframes, colors_lossy, include_string_in_pair="lossless")
    # add Kilosort25 line
    axes_perf = figs["performance"].get_axes()
    df_units_ks25 = dataframes_sorters_all[probe]["performances"].query("sorting_case == 'kilosort25'")
    _ = plot_performance_curves(df_units_ks25, ["accuracy", "precision", "recall"], ["kilosort25"], colors_sorters, axes_perf, lw=1.5, ls="--", alpha=0.8)

    figs_probe[probe] = figs
    print(f"\n\nTests for probe {probe}:")
    # A paired test needs the same samples in every sorting case, so when the
    # per-case row counts differ the table is restricted to shared samples.
    df = dataframes["performances"]
    case_counts = df["sorting_case"].value_counts()
    if len(set(case_counts.values)) > 1:
        print("Filtering entries to ensure paired samples")
        # Distinct key combinations observed in each sorting case.
        keys_per_case = [
            df.loc[df["sorting_case"] == case_name, pair_keys].drop_duplicates()
            for case_name in df["sorting_case"].dropna().unique()
        ]
        # Keep only the combinations present in *every* case. Taking the keys of
        # the smallest case alone would leave the groups unpaired whenever that
        # case contains a sample that another case lacks.
        valid_keys = reduce(
            lambda left, right: left.merge(right, on=pair_keys, how="inner"),
            keys_per_case,
        )
        # Inner merge preserves the row order of `df` and resets the index.
        filtered_df = df.merge(valid_keys, on=pair_keys, how="inner")
    else:
        filtered_df = df
    test_res = stat_test(filtered_df, column_group_by="sorting_case", test_columns=["accuracy", "precision", "recall"], paired=True, verbose=True)
    if save_figs:
        for fname, fig in figs.items():
            fig.savefig(figures_lossy / f"{probe}_{fname}.pdf")