# Jackson & Fischer et al. (Nature, 2020)

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# --- Input files (paths are relative to the working directory) -------------
raw_points_file = "raw/Basel_SC_locations.csv"
raw_features_file = "raw/Data_publication/BaselTMA/SC_dat.csv"
raw_clusters_file = "raw/Data_publication/BaselTMA/PG_final_k20.csv"

# --- Cores to export --------------------------------------------------------
# Maps an output label (used as the CSV file name) to the core identifier
# found in the "core" column of the input tables.
cores = {
    "dense_homogeneous": "BaselTMA_SP41_191_X15Y7",
    "dense_heterogeneous": "BaselTMA_SP43_115_X4Y8",
    "sparse_homogeneous": "BaselTMA_SP42_84_X8Y1",
    "sparse_heterogeneous": "BaselTMA_SP43_17_X12Y4",
}

# --- Intensity channels -----------------------------------------------------
# Maps the raw value of the "channel" column in the features file to the
# short marker name used as the output column header.
# The insertion order below is also the column order of the exported tables.
intensity_features = {
    "1261726In113Di Histone": "HH3",
    "473968La139Di Histone": "H3K27me3",
    "651779Pr141Di Cytoker": "CK5",
    "3281668Nd142Di Fibrone": "Fibronectin",
    "3111576Nd143Di Cytoker": "CK19",
    "971099Nd144Di Cytoker": "CK8/18",
    "Nd145Di Twist": "Twist",
    "77877Nd146Di CD68": "CD68",
    "346876Sm147Di Keratin": "CK14",
    "174864Nd148Di SMA": "SMA",
    "1921755Sm149Di Vimenti": "Vimentin",
    "322787Nd150Di cMyc": "cMYC",
    "201487Eu151Di cerbB": "HER2",
    "8001752Sm152Di CD3epsi": "CD3",
    "phospho Histone": "p-HH3",
    "3521227Gd155Di Slug": "Slug",
    "112475Gd156Di Estroge": "ER",
    "312878Gd158Di Progest": "PR",
    "207736Tb159Di p53": "p53",
    "6967Gd160Di CD44": "CD44",
    "71790Dy162Di CD45": "CD45",
    "117792Dy163Di GATA3": "GATA3",
    "361077Dy164Di CD20": "CD20",
    "92964Er166Di Carboni": "CAIX",
    "1031747Er167Di ECadhe": "E/P-Cadherin",
    "1441101Er168Di Ki67": "Ki-67",
    "1021522Tm169Di EGFR": "EGFR",
    "phospho S6": "p-S6",
    "378871Yb172Di vWF": "vWF/CD31",
    "phospho mTOR": "p-mTOR",
    "98922Yb174Di Cytoker": "CK7",
    "234832Lu175Di panCyto": "pan-CK",
    "198883Yb176Di cleaved": "CC3/cPARP",
    "10331253Ir191Di Iridium": "DNA1",
    "10331254Ir193Di Iridium": "DNA2",
}

## Data download

1. Download the files `SingleCell_and_Metadata.zip` and `singlecell_locations.zip` from https://zenodo.org/record/4607374 (version 2)
2. Extract the contents of both archives into the `raw` directory (create the directory if it does not exist), so that `raw/Basel_SC_locations.csv` and `raw/Data_publication/BaselTMA/` are available

## Data extraction

In [3]:
# Load the location table and keep only the centre coordinates, indexed by
# (core, cell) with the coordinate columns renamed to "x" and "y".
# All cores are kept here; the cores of interest are selected at export time.
points = (
    pd.read_csv(raw_points_file)[
        ["core", "ObjectNumber_renamed", "Location_Center_X", "Location_Center_Y"]
    ]
    .set_index(["core", "ObjectNumber_renamed"])
    .rename(columns={"Location_Center_X": "x", "Location_Center_Y": "y"})
    .rename_axis(index=["core", "cell"])
)

In [4]:
# Reshape the long features table (one "mc_counts" value per core, CellId and
# channel) into a wide table with one row per (core, CellId) and one column
# per channel. All cores are kept; selection happens at export time.
features = pd.read_csv(raw_features_file)[
    ["core", "CellId", "channel", "mc_counts"]
].pivot(index=["core", "CellId"], columns="channel", values="mc_counts")

# Keep only the configured channels, in the order they are listed in
# `intensity_features`, and expose them under their short marker names.
intensities = (
    features[list(intensity_features)]
    .rename(columns=intensity_features)
    .rename_axis(index=["core", "cell"])
)

# The full wide table is no longer needed; drop the reference to free memory.
del features

In [5]:
# Load the cluster assignment of every cell, indexed by (core, cell), with the
# "PhenoGraphBasel" column exposed as "cluster".
# All cores are kept here; the cores of interest are selected at export time.
clusters = (
    pd.read_csv(raw_clusters_file)[["core", "CellId", "PhenoGraphBasel"]]
    .set_index(["core", "CellId"])
    .rename(columns={"PhenoGraphBasel": "cluster"})
    .rename_axis(index=["core", "cell"])
)

In [6]:
# One output directory per table type; each receives one CSV per selected core.
points_dir = Path("points")
clusters_dir = Path("clusters")
intensities_dir = Path("intensities")

for output_dir in (points_dir, clusters_dir, intensities_dir):
    output_dir.mkdir(exist_ok=True)

for core_label, core in cores.items():
    # The sorted cell index of the points table defines the row order of all
    # three exported tables, so rows line up across the files of one core.
    current_points = points.loc[core].sort_index()
    cell_order = current_points.index
    current_clusters = clusters.loc[core].loc[cell_order]
    current_intensities = intensities.loc[core].loc[cell_order]

    file_name = f"{core_label}.csv"
    for output_dir, table in (
        (points_dir, current_points),
        (clusters_dir, current_clusters),
        (intensities_dir, current_intensities),
    ):
        table.to_csv(output_dir / file_name)