In [None]:
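# Optional setup: uncomment the lines in this cell to install the parent
# directory as an editable package into the running kernel's environment.
# import sys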

# from pathlib import Path

# !{sys.executable} -m pip install -e {Path.cwd().parent}

In [None]:
import numpy as np
import pandas as pd
import shutil
import sys

from pathlib import Path
from tqdm.contrib import itertools as tqdm_itertools

from simutome import Simutome

# Synthetic data generation

In [None]:
from tifffile import imread

# Reference inputs for a single BaselTMA acquisition. Nothing is simulated
# here: the image is read so that `img.shape` can be used to derive the image
# size, and the panel table is read so that its "keep"/"name" columns can be
# used to select the intensity columns.
img = imread(
    "../data/datasets/jackson_fischer_2020/BaselTMA/raw/img/BaselTMA_SP41_15.475kx12.665ky_10000x8500_5_20170905_101_141_X12Y2_104_a0_full.tiff"
)
panel = pd.read_csv(
    "../data/datasets/jackson_fischer_2020/BaselTMA/panel.csv"
)

In [None]:
# --- Input: per-cell table of the reference image and its column layout -----
cell_data_file = "../data/datasets/jackson_fischer_2020/BaselTMA/cell_data/BaselTMA_SP41_15.475kx12.665ky_10000x8500_5_20170905_101_141_X12Y2_104_a0_full.csv"
cell_id_col = "cell_id"
cell_coord_cols = ["centroid_x", "centroid_y"]
cell_cluster_col = "phenograph_cluster"
# Intensity columns are the "name" entries of the panel rows with keep == 1.
kept_panel_rows = panel["keep"].eq(1)
cell_intensity_cols = panel.loc[kept_panel_rows, "name"].tolist()

# --- Simulation settings -----------------------------------------------------
# Image shape with the first axis dropped and the remaining axes reversed
# (presumably (channels, height, width) -> (width, height); TODO confirm the
# axis order of the TIFF).
image_size = tuple(reversed(img.shape[1:]))
section_thickness = 3.0
seed = 123

# --- Output ------------------------------------------------------------------
dest_dir = "../data/models/jackson_fischer_2020/BaselTMA/BaselTMA_SP41_15.475kx12.665ky_10000x8500_5_20170905_101_141_X12Y2_104_a0_full"

In [None]:
# Parameter grids for the simulation sweep. Each list holds the alternative
# keyword-argument groups for one aspect of the simulation; the generation
# cell takes the Cartesian product of these lists and merges one group from
# each list into a single set of Simutome arguments. The key order inside each
# group is kept stable because the merged dicts later become the rows of the
# parameter table.
image_occlusion = [{"image_occlusion": fraction} for fraction in (0.0, 0.01)]

# Every rotation is combined with every translation (rotation varies slowest);
# the same translation value is used for both components of the offset.
image_transform = [
    {
        "image_scale": (1.0, 1.0),
        "image_rotation": rotation,
        "image_shear": 0.0,
        "image_translation": (shift, shift),
    }
    for rotation in (0.0, 1.0, 2.0, 3.0)
    for shift in (0.0, 5.0, 10.0, 15.0)
]

# In each of the following groups the first entry leaves the effect off (its
# distribution parameters are None) and the second entry turns it on.
cell_exclusion = [
    {"exclude_cells": enabled, "cell_radius_mean": mean, "cell_radius_std": std}
    for enabled, mean, std in ((False, None, None), (True, 4.0, 1.0))
]

cell_displacement = [
    {"displace_cells": enabled, "cell_displacement_mean": mean, "cell_displacement_var": var}
    for enabled, mean, var in ((False, None, None), (True, 0.0, 1.0))
]

cell_division = [
    {"cell_division_probab": probability, "cell_division_dist_mean": mean, "cell_division_dist_std": std}
    for probability, mean, std in ((0.0, None, None), (0.01, 4.0, 1.0))
]

cell_swapping = [{"cell_swapping_probab": probability} for probability in (0.0, 0.2)]

# Passed as `n` to Simutome.generate_sections for every parameter combination.
n = 10

In [None]:
# Prepare the output folder, keep a copy of the original cell table in it, and
# create the sub-folder that receives one CSV per simulated section.
dest_dir_path = Path(dest_dir)
dest_dir_path.mkdir(exist_ok=True, parents=True)
shutil.copyfile(cell_data_file, dest_dir_path / "orig_cell_data.csv")
simulated_cell_data_dir_path = dest_dir_path / "simulated_cell_data"
simulated_cell_data_dir_path.mkdir(exist_ok=True)

# Read only the columns that are needed; the intensity and cluster columns are
# optional and are skipped when their configuration value is empty.
usecols = [
    cell_id_col,
    *cell_coord_cols,
    *(cell_intensity_cols or []),
    *([cell_cluster_col] if cell_cluster_col else []),
]
cell_data = pd.read_csv(cell_data_file, usecols=usecols)
cell_ids = cell_data[cell_id_col].values
cell_coords = cell_data[cell_coord_cols].values
cell_intensities = None
if cell_intensity_cols:
    cell_intensities = cell_data[cell_intensity_cols].values
cell_clusters = None
if cell_cluster_col:
    cell_clusters = cell_data[cell_cluster_col].values

# A single seeded generator supplies one seed per parameter combination; each
# seed is stored with its parameters in the table written at the end.
seed_rng = np.random.default_rng(seed=seed)

simulated_cell_data_params = []
simulated_cell_data_file_names = []
for param_set in tqdm_itertools.product(
    image_occlusion,
    image_transform,
    cell_exclusion,
    cell_displacement,
    cell_division,
    cell_swapping,
):
    # Merge one group from every grid into a flat keyword-argument dict.
    params = {}
    for param_group in param_set:
        params.update(param_group)
    params["shuffle_cells"] = True
    params["seed"] = seed_rng.integers(sys.maxsize)

    sections = Simutome(**params).generate_sections(
        cell_coords,
        section_thickness,
        image_size=image_size,
        cell_intensities=cell_intensities,
        cell_clusters=cell_clusters,
        n=n,
    )
    for section_number, section in enumerate(sections):
        section_cell_indices, section_cell_coords, section_cell_intensities = section

        # The section's indices select the ids of the originating cells.
        section_columns = {f"orig_{cell_id_col}": cell_ids[section_cell_indices]}
        section_columns.update(
            (col, section_cell_coords[:, i]) for i, col in enumerate(cell_coord_cols)
        )
        if section_cell_intensities is not None:
            section_columns.update(
                (col, section_cell_intensities[:, i]) for i, col in enumerate(cell_intensity_cols)
            )
        simulated_cell_data = pd.DataFrame(data=section_columns)

        # Files are numbered consecutively across all parameter combinations.
        file_name = f"simulated_cell_data_{len(simulated_cell_data_file_names):06d}.csv"
        simulated_cell_data_file = simulated_cell_data_dir_path / file_name
        simulated_cell_data.to_csv(simulated_cell_data_file, index=False)

        simulated_cell_data_params.append({**params, "section_number": section_number})
        simulated_cell_data_file_names.append(file_name)

# Index table: one row per written file with the parameters that produced it.
simulated_cell_data_info = pd.DataFrame(
    data=simulated_cell_data_params,
    index=pd.Index(simulated_cell_data_file_names, name="file"),
)
simulated_cell_data_info.to_csv(dest_dir_path / "simulated_cell_data.csv", index=True)
simulated_cell_data_info