# Phenotype Data

## Library Import

In [None]:
import json
import os

import numpy as np
import pandas as pd
import scanpy as sc
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Diagnosis labels matched against the "DX.name" column.
# The first three entries (CANCER_TYPES[:3]) are the only ones that
# `filter_for_cancer_types` accepts.
# NOTE(review): spellings such as "Mesotheliom" presumably mirror the raw
# annotation values -- confirm against the data before "correcting" them.
CANCER_TYPES = [
    # --- accepted by filter_for_cancer_types ---
    "Adenocarcinoma",
    "Squamous cell carcinoma",
    "Adeno squamous cell carcinoma",
    # --- remaining labels ---
    "Control",
    "Large cell carcinoma",
    "Large cell neuroendocrine carcinoma",
    "NSCLC",
    "Mesotheliom",
    "Basaloides Ca",
    "NA",
]

# Clinical annotation column names.
# NOTE(review): presumably the full set of columns usable as prediction
# targets; this list is not referenced by the visible cells, which pass
# their own subset (e.g. ["Relapse", "Ev.O", "LN.Met"]).
TARGETS = [
    "Relapse", "Ev.O",
    "Grade", "Stage",
    "T.new", "N", "M.new", "R",
    "Chemo", "Radio",
    "DFS", "OS",
    "LN.Met", "Dist.Met",
]

# Multiplier applied to the cell coordinates before they are written to disk.
IMC_TO_CODEX_SCALE_FACTOR = 2.75

## NSCLC Dataset

In [None]:
# Load the annotated NSCLC dataset and show its summary as the cell output.
# NOTE(review): `load_adata` reads this same file again, and `adata_raw` is
# not referenced by any other visible cell -- consider dropping this cell if
# the second in-memory copy is not needed.
adata_raw = sc.read_h5ad("../data/phenotype/nsclc/sce_all_annotated.h5ad")
adata_raw

In [None]:
def filter_for_cancer_types(cell_information, cancer_types):
    """Keep only the rows whose ``DX.name`` is one of ``cancer_types``.

    Args:
        cell_information: DataFrame with a ``DX.name`` column.
        cancer_types: Diagnoses to keep. Every entry must be one of the
            first three entries of ``CANCER_TYPES``.

    Returns:
        The rows of ``cell_information`` with a matching ``DX.name``.

    Raises:
        AssertionError: If ``cancer_types`` contains any other label.
    """
    selectable = set(CANCER_TYPES[:3])
    assert selectable.issuperset(cancer_types)

    keep_rows = cell_information["DX.name"].isin(cancer_types)
    return cell_information.loc[keep_rows]


def load_adata(adata_path, cancer_types, targets):
    """Load an annotated ``.h5ad`` file and return it with cleaned cell metadata.

    The cleaning steps, in order: replace the ``-2147483648`` sentinel with
    NaN, keep only the requested diagnoses, scale ``Area`` by its 95th
    percentile, lower-case the cell annotation columns, drop cells without a
    complete annotation, recode the metastasis / neo-adjuvant categories to
    0/1, and drop cells with a missing value in any target column.

    Args:
        adata_path: Path of the ``.h5ad`` file to read.
        cancer_types: Diagnoses (``DX.name`` values) to keep; validated and
            applied by ``filter_for_cancer_types``.
        targets: Column names; rows with NaN in any of them are dropped.

    Returns:
        Tuple ``(adata, cell_information_clean)``: the AnnData object as read
        from disk and the cleaned copy of its ``obs`` table.
    """
    adata = sc.read_h5ad(adata_path)

    # -2147483648 is the minimum 32-bit signed integer.
    # NOTE(review): presumably a missing-value sentinel from the export --
    # confirm. `replace` returns a new frame, so `adata.obs` is untouched.
    cell_information_clean = adata.obs.replace(-2147483648, np.nan)

    # Keep the requested diagnoses. `filter_for_cancer_types` already limits
    # the selection to the three supported carcinoma types, so no separate
    # hard-coded filter is needed. The explicit copy makes the column
    # assignments below operate on a frame of our own instead of on a slice
    # (avoids SettingWithCopyWarning / chained-assignment ambiguity).
    cell_information_clean = filter_for_cancer_types(
        cell_information_clean, cancer_types
    ).copy()

    # Normalize area by the 95th percentile of the retained cells.
    area = cell_information_clean["Area"]
    cell_information_clean["Area"] = area / area.quantile(0.95)

    # Lower-case cell category, type and subtype (e.g. "T cell" -> "t cell").
    annotation_columns = ["cell_category", "cell_type", "cell_subtype"]
    for column in annotation_columns:
        cell_information_clean[column] = cell_information_clean[column].str.lower()

    # Drop cells that lack any of the three annotations.
    cell_information_clean = cell_information_clean.dropna(subset=annotation_columns)

    # Recode the two-level categorical columns to 0/1 by renaming categories.
    binary_recodings = {
        "LN.Met": {"No LN Metastases": 0, "LN Metastases": 1},
        "Dist.Met": {"No Dist. Metastases": 0, "Dist. Metastases": 1},
        "NeoAdj": {"NoNeoAdjuvantTherapy": 0, "NeoAdjuvantTherapy": 1},
    }
    for column, recoding in binary_recodings.items():
        cell_information_clean[column] = cell_information_clean[
            column
        ].cat.rename_categories(recoding)

    # Drop cells with a missing value in any target column.
    cell_information_clean = cell_information_clean.dropna(subset=targets)

    return adata, cell_information_clean

In [None]:
# Input file, and the target columns for which `load_adata` drops rows with
# missing values.
adata_path = "../data/phenotype/nsclc/sce_all_annotated.h5ad"
target_names = ["Relapse", "Ev.O", "LN.Met"]

# CANCER_TYPES[:3] are the three diagnoses `filter_for_cancer_types` accepts.
adata, cell_information_clean = load_adata(adata_path, CANCER_TYPES[:3], target_names)

In [None]:
# Display the cleaned per-cell metadata as the cell output.
cell_information_clean

In [None]:
def separate_regions(adata, cell_information, targets):
    """Split the cell table into one record per ``Tma_ac`` region.

    Args:
        adata: Object indexable by cell ids whose selection exposes the
            ``c_counts_asinh_scaled`` layer and which has ``var_names``.
        cell_information: Per-cell DataFrame indexed by cell id with the
            columns ``Tma_ac``, ``Center_X``, ``Center_Y``, ``cell_subtype``,
            ``Area`` and every column named in ``targets``.
        targets: Names of the target columns to collect per region.

    Returns:
        Tuple ``(regions, targets, biomarker_names)``:
        ``regions`` maps region name -> dict with the keys ``cell_ids``,
        ``coordinates``, ``cell_types``, ``sizes`` and ``biomarkers``;
        ``targets`` maps target name -> list with one value per region, in
        the same order as ``regions``; ``biomarker_names`` is
        ``adata.var_names.values``.

    Raises:
        AssertionError: If a target takes more than one distinct value
            within a region.
    """
    # biomarker names = variable names of the expression object
    biomarker_names = adata.var_names.values

    regions = {}
    per_target_values = {target_name: [] for target_name in targets}

    for region_name, region_cells in cell_information.groupby("Tma_ac", observed=True):
        ids = region_cells.index.values
        record = {
            "cell_ids": ids,
            "coordinates": region_cells[["Center_X", "Center_Y"]].values,
            "cell_types": region_cells["cell_subtype"].values,
            "sizes": region_cells["Area"].values,
            # densify the expression layer for this region's cells
            "biomarkers": adata[ids].layers["c_counts_asinh_scaled"].toarray(),
        }

        # each target has to be constant within a region
        for target_name, collected in per_target_values.items():
            distinct = region_cells[target_name].unique()
            assert len(distinct) == 1
            collected.append(distinct[0])

        regions[region_name] = record

    return regions, per_target_values, biomarker_names

In [None]:
# Build the per-region records, the per-region target values and the list of
# biomarker names from the cleaned cell table.
regions, targets, biomarker_names = separate_regions(adata, cell_information_clean, target_names)

In [None]:
def train_valid_split(cell_information, target_columns, valid_fraction):
    """Split regions into train and validation sets, stratified per patient.

    Patients (not cells or regions) are the unit that is split, so all
    regions of one patient end up on the same side.

    Args:
        cell_information: Per-cell DataFrame with the columns ``Patient_ID``,
            ``Tma_ac`` and every column in ``target_columns``. It is not
            modified.
        target_columns: Columns whose string-concatenated values form the
            stratification class of a patient. Missing values become the
            string produced by ``astype(str)`` and thus form their own class.
        valid_fraction: Passed to ``train_test_split`` as ``test_size``.

    Returns:
        Tuple ``(train_regions, valid_regions)``: lists of the unique
        ``Tma_ac`` values belonging to the train and validation patients.
    """
    # Concatenate all target columns into one string used for stratification.
    # Built as a separate Series so the caller's frame does not silently gain
    # a "stratify" column.
    stratify_key = cell_information[target_columns].astype(str).agg("".join, axis=1)

    # Exactly one row per patient. De-duplicating on the patient alone (rather
    # than on patient + key) guarantees a patient cannot be listed twice and
    # thereby be drawn into both train and valid when target values differ
    # between that patient's cells.
    patient_df = (
        cell_information[["Patient_ID"]]
        .assign(stratify=stratify_key)
        .drop_duplicates(subset="Patient_ID")
    )

    # Fixed random_state keeps the split reproducible across runs.
    train_patients, valid_patients = train_test_split(
        list(patient_df["Patient_ID"]),
        test_size=valid_fraction,
        stratify=list(patient_df["stratify"]),
        random_state=44,
    )

    # Map the patient split back to region ids.
    patient_ids = cell_information["Patient_ID"]
    train_regions = list(
        cell_information.loc[patient_ids.isin(train_patients), "Tma_ac"].unique()
    )
    valid_regions = list(
        cell_information.loc[patient_ids.isin(valid_patients), "Tma_ac"].unique()
    )

    return train_regions, valid_regions

In [None]:
# Share of patients held out for validation, and where the outputs go.
valid_fraction = 0.2
output_dir = "../data/phenotype/regions"

# Patient-level split, stratified on the combination of Relapse and Grade.
train_regions, valid_regions = train_valid_split(
    cell_information_clean, ["Relapse", "Grade"], valid_fraction
)

# Persist both region lists as JSON files inside output_dir.
os.makedirs(output_dir, exist_ok=True)
for split_file, split_regions in (
    ("train_regions_grade.json", train_regions),
    ("valid_regions_grade.json", valid_regions),
):
    with open(os.path.join(output_dir, split_file), "w") as f:
        json.dump(split_regions, f)

In [None]:
def store_regions(regions, targets, biomarker_names, output_dir):
    """Write every region and the target table to CSV files below ``output_dir``.

    For each region a sub-directory named after the region is created with
    four files, each carrying a trailing ``CELL_ID`` column:
    ``coordinates.csv`` (``X``, ``Y``; coordinates multiplied by
    ``IMC_TO_CODEX_SCALE_FACTOR``), ``cell_types.csv`` (``CELL_TYPE``),
    ``cell_sizes.csv`` (``SIZE``) and ``expression.csv`` (one column per
    biomarker). ``targets.csv`` in ``output_dir`` holds one row per region
    with upper-cased target names as columns plus ``REGION_ID``.

    Args:
        regions: Mapping region name -> dict with the keys ``cell_ids``,
            ``coordinates``, ``cell_types``, ``sizes`` and ``biomarkers``.
            The arrays are not modified.
        targets: Mapping target name -> list of values, one per region in
            the iteration order of ``regions``.
        biomarker_names: Column names for the expression table.
        output_dir: Directory that receives all output; created if missing.
    """
    # Make sure the root exists even when there are no regions to write.
    os.makedirs(output_dir, exist_ok=True)

    for name, region in tqdm(regions.items()):
        region_dir = os.path.join(output_dir, name)
        os.makedirs(region_dir, exist_ok=True)
        cell_ids = region["cell_ids"]

        # Scale into a NEW array. An in-place `*=` would rescale the array
        # held in `regions`, so calling this function twice (e.g. re-running
        # the notebook cell) would scale the coordinates twice; it would also
        # fail for integer-typed coordinate arrays.
        scaled_coords = region["coordinates"] * IMC_TO_CODEX_SCALE_FACTOR

        # (file name, values, column names) for each per-cell table
        tables = (
            ("coordinates.csv", scaled_coords, ["X", "Y"]),
            ("cell_types.csv", region["cell_types"], ["CELL_TYPE"]),
            ("cell_sizes.csv", region["sizes"], ["SIZE"]),
            ("expression.csv", region["biomarkers"], biomarker_names),
        )
        for file_name, values, columns in tables:
            table = pd.DataFrame(values, columns=columns)
            table["CELL_ID"] = cell_ids
            table.to_csv(os.path.join(region_dir, file_name), index=False)

    # Store target values as csv: columns are target keys, rows are regions.
    targets_df = pd.DataFrame(targets)
    targets_df.columns = [col.upper() for col in targets_df.columns]
    targets_df["REGION_ID"] = list(regions)
    targets_df.to_csv(os.path.join(output_dir, "targets.csv"), index=False)

In [None]:
# Write the per-region CSV files and targets.csv below output_dir.
store_regions(regions, targets, biomarker_names, output_dir)