# Semi-synthetic benchmark

In [None]:
#!python -m pip install numpy pandas scikit-learn spellmatch[benchmark] tqdm

In [None]:
from functools import partial
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import ParameterGrid
from tqdm.auto import tqdm

from spellmatch.assignment import assign
from spellmatch.benchmark import run_benchmark
from spellmatch.benchmark.metrics import default_metrics
from spellmatch.io import write_scores
from spellmatch.matching.algorithms.spellmatch import Spellmatch

# Notebook-wide random generator with a fixed seed; it is handed to run_benchmark as `seed`.
rng = np.random.default_rng(123)

## Section thickness benchmark

In [None]:
# Input folders; the run cell globs every *.csv file inside each of them.
source_points_dir, source_intensities_dir, source_clusters_dir = (
    "source_points",
    "source_intensities",
    "source_clusters",
)

# Fixed Simutome keyword arguments (for the values, see ../kuett_catena_2022/parameters.ipynb).
simutome_kwargs = dict(
    # cell exclusion
    exclude_cells=True,
    cell_diameter_mean=7.931,
    cell_diameter_std=1.768,
    # cell displacement
    displace_cells=True,
    cell_displacement_mean=0,
    cell_displacement_var=1.0,
)
# No Simutome parameter grid is used for this benchmark.
simutome_param_grid = None
# Section thickness is the quantity varied here: 2.0, 4.0, ..., 10.0.
section_thicknesses = [float(thickness) for thickness in range(2, 11, 2)]
num_sections = 1

# Algorithms to benchmark, keyed by name. Each value is a 4-tuple holding, in order:
# the algorithm class, a dict of fixed keyword arguments, a parameter grid (None here)
# and a dict of named assignment functions built from `assign`.
# NOTE(review): how run_benchmark consumes each tuple element is not visible in this notebook.
algorithm_dict = {
    "spellmatch": (
        Spellmatch,
        dict(
            intensity_transform=np.log1p,
            scores_tol=1e-6,
            adj_radius=15,
            alpha=0.8,
            spatial_cdist_prior_thres=25,
            intensity_weight=1,
            intensity_interp_lmd=1,
            shared_intensity_pca_n_components=10,
        ),
        None,
        {
            # every assignment function is `assign` with a strategy, a direction and as_matrix=True
            assignment_name: partial(assign, **strategy_kwargs, direction=direction, as_matrix=True)
            for assignment_name, strategy_kwargs, direction in (
                ("min_score_q25_union", {"min_score_quantile": 0.25}, "union"),
                ("max_only_intersect", {"max_only": True}, "intersect"),
                ("linear_sum_forward", {"linear_sum": True}, "forward"),
            )
        },
    ),
}

# Evaluate with the metrics shipped in spellmatch.benchmark.metrics.
metric_dict = default_metrics

In [None]:
# Collect the input files; sorting gives each list a stable, filename-based order.
# NOTE(review): presumably the three lists must line up by index (same file names in all
# three folders) -- confirm against run_benchmark.
source_points_files = sorted(Path(source_points_dir).glob("*.csv"))
source_intensities_files = sorted(Path(source_intensities_dir).glob("*.csv"))
source_clusters_files = sorted(Path(source_clusters_dir).glob("*.csv"))

# Output layout: benchmark/section_thickness/{infos.csv, results.csv, scores/scoresNNNNNN.nc}
section_thickness_benchmark_dir = Path("benchmark") / "section_thickness"
section_thickness_benchmark_scores_dir = section_thickness_benchmark_dir / "scores"
section_thickness_benchmark_scores_dir.mkdir(exist_ok=True, parents=True)

benchmark = run_benchmark(
    source_points_files,
    source_intensities_files,
    source_clusters_files,
    simutome_kwargs,
    simutome_param_grid,
    section_thicknesses,
    num_sections,
    algorithm_dict,
    metric_dict,
    seed=rng,
)

# Iterating `benchmark` yields one (info, scores, results) triple per run.
infos = []
all_results = []
for i, (info, scores, results) in tqdm(enumerate(benchmark)):
    # Persist the scores of run i under a zero-padded, index-based file name.
    scores_file_name = f"scores{i:06d}.nc"
    write_scores(section_thickness_benchmark_scores_dir / scores_file_name, scores)
    infos.append(info)
    for result in results:
        # Annotate each result row with the info of the *current* run only. Passing the
        # accumulated `infos` list here would make dict.update treat it as a sequence of
        # key/value pairs, which either raises or writes bogus columns.
        result.update(info)
    all_results += results

# One row per run in infos.csv, one row per result in results.csv.
infos = pd.DataFrame(data=infos)
infos.to_csv(section_thickness_benchmark_dir / "infos.csv", index=False)
all_results = pd.DataFrame(data=all_results)
all_results.to_csv(section_thickness_benchmark_dir / "results.csv", index=False)

## Full benchmark

In [None]:
# Input folders; the run cell globs every *.csv file inside each of them.
source_points_dir = "source_points"
source_intensities_dir = "source_intensities"
source_clusters_dir = "source_clusters"

# Fixed Simutome keyword arguments (for the values, see ../kuett_catena_2022/parameters.ipynb).
simutome_kwargs = dict(
    # cell exclusion
    exclude_cells=True,
    cell_diameter_mean=7.931,
    cell_diameter_std=1.768,
    # cell displacement
    displace_cells=True,
    cell_displacement_mean=0,
    cell_displacement_var=1.0,
)
# Grid of varied Simutome settings: 4 image transforms x 2 cell-division settings
# x 2 cell-swapping settings. Every grid value is itself a dict of keyword arguments.
# NOTE(review): presumably run_benchmark merges these dicts into the Simutome kwargs -- confirm.
simutome_param_grid = ParameterGrid(
    dict(
        # minor mis-alignment: every combination of rotation (given in degrees, stored in
        # radians) and translation
        image_transform=[
            dict(
                image_scale=(1.0, 1.0),
                image_rotation=rotation_degrees * np.pi / 180,
                image_shear=0.0,
                image_translation=translation,
            )
            for rotation_degrees in (0.0, 2.0)
            for translation in ((0.0, 0.0), (5.0, 5.0))
        ],
        # mis-segmentation / physical separation of "U-shaped" cells
        cell_division=[
            dict(cell_division_probab=0.0),
            dict(
                cell_division_probab=0.05,
                cell_division_dist_mean=6.0,
                cell_division_dist_std=1.0,
            ),
        ],
        # permute a fraction of cells to capture intensity-related effects
        cell_swapping=[
            dict(cell_swapping_probab=swapping_probab)
            for swapping_probab in (0.0, 0.2)
        ],
    )
)
# A single section thickness and a single section for the full benchmark.
section_thicknesses = [2.0]
num_sections = 1

# Algorithms to benchmark, keyed by name. Each value is a 4-tuple holding, in order:
# the algorithm class, a dict of fixed keyword arguments, a grid of varied keyword
# arguments and a dict of named assignment functions built from `assign`.
# NOTE(review): how run_benchmark consumes each tuple element is not visible in this notebook.
algorithm_dict = {
    "spellmatch": (
        Spellmatch,
        dict(
            intensity_transform=np.log1p,
            scores_tol=1e-6,
            adj_radius=15,
            alpha=0.8,
            spatial_cdist_prior_thres=25,
        ),
        # 2 degree settings x 5 intensity settings x 2 distance settings
        ParameterGrid(
            dict(
                degrees=[
                    dict(degree_weight=0.0),
                    dict(degree_weight=1.0, degree_cdiff_thres=3),
                ],
                # intensities switched off, or switched on with intensity_interp_lmd in {0, 1/3, 2/3, 1}
                intensities=[dict(intensity_weight=0.0)]
                + [
                    dict(intensity_weight=1.0, intensity_interp_lmd=thirds / 3)
                    for thirds in range(4)
                ],
                distances=[
                    dict(distance_weight=0.0),
                    dict(distance_weight=1.0, distance_cdiff_thres=5),
                ],
            ),
        ),
        dict(
            min_score_q25_union=partial(assign, min_score_quantile=0.25, direction="union", as_matrix=True),
            max_only_intersect=partial(assign, max_only=True, direction="intersect", as_matrix=True),
            linear_sum_forward=partial(assign, linear_sum=True, direction="forward", as_matrix=True),
        ),
    )
}

# Evaluate with the metrics shipped in spellmatch.benchmark.metrics.
metric_dict = default_metrics

In [None]:
# Collect the input files; sorting gives each list a stable, filename-based order.
# NOTE(review): presumably the three lists must line up by index (same file names in all
# three folders) -- confirm against run_benchmark.
source_points_files = sorted(Path(source_points_dir).glob("*.csv"))
source_intensities_files = sorted(Path(source_intensities_dir).glob("*.csv"))
source_clusters_files = sorted(Path(source_clusters_dir).glob("*.csv"))

# Output layout: benchmark/full/{infos.csv, results.csv, scores/scoresNNNNNN.nc}
full_benchmark_dir = Path("benchmark") / "full"
full_benchmark_scores_dir = full_benchmark_dir / "scores"
full_benchmark_scores_dir.mkdir(exist_ok=True, parents=True)

benchmark = run_benchmark(
    source_points_files,
    source_intensities_files,
    source_clusters_files,
    simutome_kwargs,
    simutome_param_grid,
    section_thicknesses,
    num_sections,
    algorithm_dict,
    metric_dict,
    seed=rng,
)

# Iterating `benchmark` yields one (info, scores, results) triple per run.
infos = []
all_results = []
for i, (info, scores, results) in tqdm(enumerate(benchmark)):
    # Persist the scores of run i under a zero-padded, index-based file name.
    scores_file_name = f"scores{i:06d}.nc"
    write_scores(full_benchmark_scores_dir / scores_file_name, scores)
    infos.append(info)
    for result in results:
        # Annotate each result row with the info of the *current* run only. Passing the
        # accumulated `infos` list here would make dict.update treat it as a sequence of
        # key/value pairs, which either raises or writes bogus columns.
        result.update(info)
    all_results += results

# One row per run in infos.csv, one row per result in results.csv.
infos = pd.DataFrame(data=infos)
infos.to_csv(full_benchmark_dir / "infos.csv", index=False)
all_results = pd.DataFrame(data=all_results)
all_results.to_csv(full_benchmark_dir / "results.csv", index=False)