# Changes of cell type abundancies in PBMCs reflect the cancer and treatment effect of proton therapy (all cancers)

---
This notebook summarizes an essential part of the analysis corresponding to the results presented in figure 5 of the paper for all cancer patients. It can be used to rerun the analysis and regenerate the corresponding panels.

---

## 0. Environmental setup

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
import os
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import matplotlib as mpl
from matplotlib.collections import PolyCollection
from matplotlib.legend_handler import HandlerTuple
from matplotlib.colors import to_rgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, StratifiedGroupKFold

mpl.rcParams["figure.dpi"] = 300

import sys

sys.path.append("../..")
from src.utils.notebooks.eda import *
from src.utils.notebooks.figure3 import *
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Single source of truth for every random number generator used in this
# notebook; the value is reused for the estimators further below.
seed = 1234
random.seed(seed)
np.random.seed(seed)

%reload_ext nb_black

In [None]:
def plot_joint_markers_celltype(
    data,
    markers,
    marker_labels,
    label_col="condition",
    cut=0,
    palette=None,
    plot_type="violin",
    figsize=[6, 3],
):
    """Plot several min-max scaled markers side by side, split by CD3 status.

    Every column listed in ``markers`` is rescaled to [0, 1] independently and
    stacked into one long table (columns ``norm_value``, ``condition`` and
    ``marker``), which is then handed to ``plot_marker_distribution``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the marker columns and the ``label_col`` column.
    markers : list of str
        Column names of the markers to plot.
    marker_labels : list of str
        Display names, aligned with ``markers``; also defines the x-axis order.
    label_col : str
        Column whose values are used as hue. The plot compares the two groups
        ``"CD3+"`` and ``"CD3-"``.
    cut, palette, plot_type, figsize
        Forwarded unchanged to ``plot_marker_distribution``.

    Returns
    -------
    (fig, ax) as returned by ``plot_marker_distribution``.
    """
    group_labels = np.array(data.loc[:, label_col])
    display_names = dict(zip(markers, marker_labels))

    frames = []
    for name in markers:
        raw_values = np.array(data.loc[:, name]).reshape(-1, 1)
        scaled = MinMaxScaler().fit_transform(raw_values)
        frame = pd.DataFrame(scaled, columns=["norm_value"])
        frame["condition"] = group_labels
        frame["marker"] = name
        frames.append(frame)

    long_table = pd.concat(frames)
    long_table["marker"] = long_table["marker"].map(display_names)

    # One CD3+ vs. CD3- comparison per (display-named) marker.
    comparisons = [
        ((shown_name, "CD3+"), (shown_name, "CD3-"))
        for shown_name in np.unique(long_table["marker"])
    ]

    fig, ax = plot_marker_distribution(
        data=long_table,
        marker="norm_value",
        label_col="marker",
        hue="condition",
        order=marker_labels,
        hue_order=["CD3+", "CD3-"],
        palette=palette,
        plot_type=plot_type,
        box_pairs=comparisons,
        figsize=figsize,
        cut=cut,
    )
    return fig, ax

In [None]:
# Lookup tables derived from the chrometric feature description file:
#   feature_name_dict : short feature name -> long (human-readable) name
#   feature_color_dict: long feature name  -> matplotlib color of its category
nuc_feature_desc = pd.read_csv(
    "../../data/chrometric_feature_description.csv", index_col=0
)
feature_name_dict = dict(
    zip(
        nuc_feature_desc.loc[:, "feature"],
        nuc_feature_desc.loc[:, "long_name"],
    )
)

# One matplotlib color code per feature category.
category_color_dict = {
    "morphology": "b",
    "intensity": "g",
    "boundary": "r",
    "texture": "c",
    "chromatin condensation": "m",
    "moments": "y",
}
# Features without a category (missing value in the CSV) are drawn in black.
uncategorized_color = "k"

# A missing category is detected with ``pd.isna`` instead of using NaN as a
# dictionary key: NaN never compares equal to itself, so a key lookup only
# succeeds when the very same NaN object is used. Unknown (non-missing)
# categories still raise a KeyError.
feature_color_dict = {
    feature: (
        uncategorized_color if pd.isna(category) else category_color_dict[category]
    )
    for (feature, category) in zip(
        nuc_feature_desc.loc[:, "long_name"],
        nuc_feature_desc.loc[:, "category"],
    )
}

In [None]:
def get_ct_abundance_data(data, normalize=True):
    """Summarize the cell type composition for every ``id`` in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per cell with at least the columns ``id``, ``sample``,
        ``timepoint``, ``cancer`` and ``celltype``.
    normalize : bool
        If True, report each cell type as a fraction of all cells of the id
        (``n_pbmc``); otherwise report raw counts.

    Returns
    -------
    pandas.DataFrame
        One row per id (sorted by id) with the columns ``id``, ``sample``,
        ``timepoint``, ``cancer``, ``n_pbmc`` followed by one column per cell
        type (sorted). ``sample``, ``timepoint`` and ``cancer`` are taken from
        the first row of the id.
    """
    # Missing cell type labels cannot be sorted together with strings by
    # ``np.unique``. They are left out of the reported cell types but the
    # corresponding cells still count towards ``n_pbmc``.
    all_celltypes = np.unique(data.loc[:, "celltype"].dropna())

    ct_abundance_data = {
        "id": [],
        "sample": [],
        "timepoint": [],
        "cancer": [],
        "n_pbmc": [],
    }
    for ct in all_celltypes:
        ct_abundance_data.setdefault(ct, [])

    # A single grouped pass replaces one boolean mask per id and cell type.
    for idx, id_data in data.groupby("id", sort=True, observed=True):
        n_pbmc = len(id_data)

        ct_abundance_data["id"].append(idx)
        ct_abundance_data["sample"].append(id_data["sample"].iloc[0])
        ct_abundance_data["timepoint"].append(id_data["timepoint"].iloc[0])
        ct_abundance_data["cancer"].append(id_data["cancer"].iloc[0])
        ct_abundance_data["n_pbmc"].append(n_pbmc)

        ct_counts = id_data["celltype"].value_counts()
        for ct in all_celltypes:
            count = int(ct_counts.get(ct, 0))
            if normalize:
                count /= n_pbmc
            ct_abundance_data[ct].append(count)

    return pd.DataFrame(ct_abundance_data)

In [None]:
def plot_celltype_abundancies(
    data,
    celltypes,
    celltype_labels=None,
    cut=2,
    figsize=[4, 4],
    plot_type="violin",
    quantiles=None,
    test="Mann-Whitney",
):
    """Plot the abundance of each cell type per cancer type and timepoint.

    One figure is created (and shown) per entry of ``celltypes``. The x-axis
    groups by cancer type, the hue by treatment timepoint, and all pairwise
    timepoint comparisons within each cancer type are passed as ``box_pairs``
    to ``plot_marker_distribution``. Afterwards the artists are recolored
    with three shades per cancer type and a combined legend is added.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with the columns ``cancer``, ``timepoint`` and one column per
        entry of ``celltypes``.
    celltypes : sequence of str
        Columns to plot.
    celltype_labels : sequence of str, optional
        y-axis labels aligned with ``celltypes``.
    cut, figsize, quantiles, test
        Forwarded unchanged to ``plot_marker_distribution``.
    plot_type : {"violin", "bar"}
        Kind of plot to draw.

    Raises
    ------
    NotImplementedError
        If ``plot_type`` is neither ``"violin"`` nor ``"bar"``.
    """
    # Fail before any figure is created instead of after the first plot.
    if plot_type not in ("violin", "bar"):
        raise NotImplementedError

    cancer_order = ["Meningioma", "Glioma", "Head & Neck"]
    # Defined locally so that hue order and legend labels cannot diverge and
    # the function does not depend on a notebook-level global.
    tp_order = ["prior", "during", "end"]
    tp_pairs = [("prior", "during"), ("prior", "end"), ("during", "end")]
    box_pairs = [
        ((cancer, tp_a), (cancer, tp_b))
        for cancer in cancer_order
        for (tp_a, tp_b) in tp_pairs
    ]

    mg_colors = ["lightsteelblue", "royalblue", "midnightblue"]
    gl_colors = ["orange", "gold", "saddlebrown"]
    hn_colors = ["plum", "deeppink", "indigo"]

    # Violin colors are grouped by cancer type; bar colors are interleaved
    # (first shade of every cancer type, then the second, ...).
    violin_all_colors = mg_colors + gl_colors + hn_colors
    bar_all_colors = [
        color for shades in zip(mg_colors, gl_colors, hn_colors) for color in shades
    ]

    for i, celltype in enumerate(celltypes):
        fig, ax = plot_marker_distribution(
            data,
            figsize=figsize,
            marker=celltype,
            label_col="cancer",
            order=list(cancer_order),
            box_pairs=list(box_pairs),
            quantiles=quantiles,
            cut=cut,
            plot_type=plot_type,
            palette="gray",
            hue="timepoint",
            hue_order=list(tp_order),
            test=test,
        )
        ax.set_xlabel("Condition")
        if celltype_labels is not None:
            ax.set_ylabel(celltype_labels[i])

        # Recolor the drawn artists and collect one legend proxy per artist.
        # The proxies always follow the cancer-grouped color order so that the
        # strided slices below yield one tuple per timepoint.
        handles = []
        if plot_type == "violin":
            for ind, obj in enumerate(ax.findobj(PolyCollection)):
                rgb = to_rgb(violin_all_colors[ind])
                obj.set_facecolor(rgb)
                handles.append(
                    plt.Rectangle((0, 0), 0, 0, facecolor=rgb, edgecolor="black")
                )
        else:
            for ind, bar in enumerate(ax.patches):
                bar.set_color(to_rgb(bar_all_colors[ind]))
                handles_color = to_rgb(violin_all_colors[ind])
                handles.append(
                    plt.Rectangle(
                        (0, 0), 0, 0, facecolor=handles_color, edgecolor="black"
                    )
                )

        ax.legend(
            handles=[
                tuple(handles[0::3]),
                tuple(handles[1::3]),
                tuple(handles[2::3]),
            ],
            labels=tp_order,
            title="Timepoint",
            handlelength=4,
            handler_map={tuple: HandlerTuple(ndivide=None, pad=0)},
            bbox_to_anchor=(1.02, 0.5),
            loc="center left",
            borderaxespad=0,
        )

        plt.show()
        plt.close()

In [None]:
def plot_cell_type_markers_dist(
    data,
    markers,
    marker_labels,
    quantiles=None,
    cut=2,
    plot_type="violin",
    palette=None,
    figsize=[4, 4],
    hue=None,
    hue_order=None,
    test="t-test_ind",
):
    """Show one CD3+ vs. CD3- distribution plot per marker.

    For every entry of ``markers`` a separate figure is drawn via
    ``plot_marker_distribution`` (grouped by the ``celltype`` column), labelled
    with the matching entry of ``marker_labels``, displayed and closed.
    All remaining arguments are forwarded unchanged.
    """
    celltype_order = ("CD3+", "CD3-")
    forwarded = dict(
        figsize=figsize,
        label_col="celltype",
        quantiles=quantiles,
        cut=cut,
        plot_type=plot_type,
        palette=palette,
        hue=hue,
        hue_order=hue_order,
        test=test,
    )
    for position, marker in enumerate(markers):
        fig, ax = plot_marker_distribution(
            data,
            marker=marker,
            order=list(celltype_order),
            box_pairs=[celltype_order],
            **forwarded,
        )
        ax.set_xlabel("Cell type")
        ax.set_ylabel(marker_labels[position])
        plt.show()
        plt.close()

In [None]:
def plot_timepoint_cancer_celltype_markers_dist(
    data,
    markers,
    marker_labels,
    celltypes,
    quantiles=None,
    cut=2,
    plot_type="violin",
    palette=None,
    figsize=[4, 4],
    hue=None,
    hue_order=None,
):
    """Plot each marker per cancer type and timepoint, one panel per cell type.

    For every marker one figure with ``len(celltypes)`` side-by-side panels
    (shared y-axis) is created. Each panel shows the marker for the cells of
    one cell type, grouped by cancer type (x-axis) and timepoint (hue), with
    all pairwise timepoint comparisons within a cancer type passed as
    ``box_pairs``. The artists are recolored with three shades per cancer type
    and a combined legend is attached to the last panel.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with the columns ``celltype``, ``cancer``, ``timepoint`` and the
        marker columns.
    markers, marker_labels : sequence of str
        Marker columns and their aligned y-axis labels.
    celltypes : sequence of str
        Values of the ``celltype`` column, one panel each.
    quantiles, cut, figsize
        Forwarded to ``plot_marker_distribution`` (``figsize`` also sizes the
        figure).
    plot_type : {"violin", "bar"}
        Kind of plot to draw.
    palette, hue, hue_order
        Currently unused: the palette is fixed to ``"gray"`` and the hue to
        the timepoint. Kept so that existing calls remain valid.

    Raises
    ------
    NotImplementedError
        If ``plot_type`` is neither ``"violin"`` nor ``"bar"``.
    """
    # Fail before any figure is created instead of after the first plot.
    if plot_type not in ("violin", "bar"):
        raise NotImplementedError

    cancer_order = ["Meningioma", "Glioma", "Head & Neck"]
    # Defined locally so that hue order and legend labels cannot diverge and
    # the function does not depend on a notebook-level global.
    tp_order = ["prior", "during", "end"]
    tp_pairs = [("prior", "during"), ("prior", "end"), ("during", "end")]
    box_pairs = [
        ((cancer, tp_a), (cancer, tp_b))
        for cancer in cancer_order
        for (tp_a, tp_b) in tp_pairs
    ]

    mg_colors = ["lightsteelblue", "royalblue", "midnightblue"]
    gl_colors = ["orange", "gold", "saddlebrown"]
    hn_colors = ["plum", "deeppink", "indigo"]

    # Violin colors are grouped by cancer type; bar colors are interleaved
    # (first shade of every cancer type, then the second, ...).
    violin_all_colors = mg_colors + gl_colors + hn_colors
    bar_all_colors = [
        color for shades in zip(mg_colors, gl_colors, hn_colors) for color in shades
    ]

    for i in range(len(markers)):
        # squeeze=False always yields a 2D array of axes, so a single cell
        # type (one panel) is handled like several.
        fig, axes = plt.subplots(
            figsize=figsize, ncols=len(celltypes), sharey=True, squeeze=False
        )
        ax = axes.flatten()
        for j, celltype in enumerate(celltypes):
            fig, ax[j] = plot_marker_distribution(
                data.loc[data.loc[:, "celltype"] == celltype],
                figsize=figsize,
                marker=markers[i],
                label_col="cancer",
                order=list(cancer_order),
                box_pairs=list(box_pairs),
                quantiles=quantiles,
                cut=cut,
                plot_type=plot_type,
                palette="gray",
                hue="timepoint",
                hue_order=list(tp_order),
                fig=fig,
                ax=ax[j],
            )

        for j in range(len(ax)):
            ax[j].set_xlabel("Cancer type")
            ax[j].set_ylabel(marker_labels[i])
            # Only the last panel keeps a legend.
            if j != len(ax) - 1:
                ax[j].legend().remove()

            # Recolor the artists of this panel. The legend proxies are
            # rebuilt for every panel; the ones of the last panel are used.
            handles = []
            if plot_type == "violin":
                for ind, obj in enumerate(ax[j].findobj(PolyCollection)):
                    rgb = to_rgb(violin_all_colors[ind])
                    obj.set_facecolor(rgb)
                    handles.append(
                        plt.Rectangle((0, 0), 0, 0, facecolor=rgb, edgecolor="black")
                    )
            else:
                for ind, bar in enumerate(ax[j].patches):
                    bar.set_color(to_rgb(bar_all_colors[ind]))
                    handles_color = to_rgb(violin_all_colors[ind])
                    handles.append(
                        plt.Rectangle(
                            (0, 0), 0, 0, facecolor=handles_color, edgecolor="black"
                        )
                    )

        ax[-1].legend(
            handles=[tuple(handles[::3]), tuple(handles[1::3]), tuple(handles[2::3])],
            labels=tp_order,
            title="Timepoint",
            handlelength=4,
            handler_map={tuple: HandlerTuple(ndivide=None, pad=0)},
            bbox_to_anchor=(1.02, 0.5),
            loc="center left",
            borderaxespad=0,
        )

        plt.show()
        plt.close()

In [None]:
# Base color per cancer type; passed as ``palette`` argument in later cells.
color_palette = {
    "Meningioma": "cornflowerblue",
    "Glioma": "orange",
    "Head & Neck": "orchid",
}

In [None]:
# Three color shades each, prefixed mg/gl/hn (presumably Meningioma, Glioma,
# Head & Neck).
# NOTE(review): the plotting helpers define identical local lists; these
# module-level copies are not referenced elsewhere in the visible notebook.
mg_colors = ["lightsteelblue", "royalblue", "midnightblue"]
gl_colors = ["orange", "gold", "saddlebrown"]
hn_colors = ["plum", "deeppink", "indigo"]

---

## 1. Read in data

In this notebook we assess the differences of the cell states of PBMCs at three different time points of the proton therapy treatment: before, during (~3 weeks in) and at the end of the treatment (final week of treatment). To this end, we obtained PBMCs of 8 Meningioma, 8 Glioma and 6 Head & Neck cancer patients, stained them for DNA and CD3 (alongside gH2AX and Lamin A/C) and obtained fluorescent images.

First, we read in the required data set that describes each PBMC by a number of hand-crafted features extracted from the fluorescent images of the cells.

In [None]:
# Load the per-cell feature table; the first CSV column becomes the index.
# NOTE(review): unlike the feature description file, this path does not point
# into ``../../data/`` - confirm the file location.
all_data = pd.read_csv("../../treated_population_data_w_cd3.csv", index_col=0)

In [None]:
# Run the project's preprocessing helper (constant features are kept here) and
# rename the feature columns from their short to their long names.
all_data = preprocess_data(all_data, remove_constant_features=False)
all_data = all_data.rename(columns=feature_name_dict)

In [None]:
# Drop all rows whose ``id`` is listed in ``deselected_ids``.
deselected_ids = ["p421", "p723"]

all_data = all_data.loc[~all_data.loc[:, "id"].isin(deselected_ids)]

In [None]:
# Overview of the full data set: number of cells per biological sample (split
# by timepoint) and number of cells per timepoint.
fig, ax = plt.subplots(figsize=[12, 4], ncols=2)
tp_order = ["prior", "during", "end"]
sample_order = np.unique(all_data.loc[:, "sample"])
ax = ax.flatten()
ax[0] = sns.countplot(
    x="sample",
    data=all_data,
    ax=ax[0],
    order=sample_order,
    hue_order=tp_order,
    hue="timepoint",
    palette="gray",
)
# The timepoint legend is only shown in the second panel.
ax[0].legend([], [], frameon=False)
ax[0].set_xlabel("ID of the biological sample")
ax[0].set_title("Distribution of biological samples")
for tick in ax[0].get_xticklabels():
    tick.set_rotation(45)

ax[1] = sns.countplot(
    x="timepoint",
    hue="timepoint",
    data=all_data,
    ax=ax[1],
    order=tp_order,
    # Fix the hue order as well so that every timepoint gets the same shade
    # as in the first panel, independent of the row order of the data.
    hue_order=tp_order,
    dodge=False,
    palette="gray",
)
ax[1].set_xlabel("Timepoint with respect to the treatment")
ax[1].set_title("Distribution of the different timepoints")
ax[1].legend(loc="lower right")

plt.show()
plt.close()

---

#### Add cell type identities

In [None]:
# Translate the binary CD3 indicator (0/1) into "-"/"+" and derive the cell
# type label ("CD3-"/"CD3+") from it. Values other than 0 and 1 become NaN.
celltype_cols = ["cd3"]
coding_dict = {0: "-", 1: "+"}
for celltype_col in celltype_cols:
    # Plain column assignment replaces the column together with its dtype;
    # ``.loc[:, col] = ...`` would write the strings into the existing
    # numeric column in place, which newer pandas versions deprecate.
    all_data[celltype_col] = all_data[celltype_col].map(coding_dict)

all_data["celltype"] = "CD3" + all_data["cd3"]
celltypes = np.unique(all_data.loc[:, "celltype"].dropna())

In [None]:
# Per-id fraction of every cell type among all PBMCs of that id.
ct_abundance_data = get_ct_abundance_data(all_data, normalize=True)

#### Subsampling

We next subsample the data set such that for each timepoint we have the same number of nuclei in the data set. Additionally, we ensure that the individual timepoint populations are approximately uniformly represented by the different biological (patient) samples.

In [None]:
# Subsample the cells taken before the treatment ("prior") with the project
# helper, using ``sample`` as id column and ``cancer`` as condition column.
# NOTE(review): the exact stratification is defined in get_stratified_data.
sampled_tp1_data = get_stratified_data(
    all_data.loc[all_data.timepoint == "prior"],
    id_column="sample",
    cond_column="cancer",
    seed=1234,
)

In [None]:
# Same subsampling for the cells taken during the treatment ("during").
sampled_tp2_data = get_stratified_data(
    all_data.loc[all_data.timepoint == "during"],
    id_column="sample",
    cond_column="cancer",
    seed=1234,
)

In [None]:
# Same subsampling for the cells taken at the end of the treatment ("end").
sampled_tp3_data = get_stratified_data(
    all_data.loc[all_data.timepoint == "end"],
    id_column="sample",
    cond_column="cancer",
    seed=1234,
)

In [None]:
# Stack the three per-timepoint subsamples into one data set.
sampled_data = pd.concat([sampled_tp1_data, sampled_tp2_data, sampled_tp3_data])

In [None]:
# Overview of the subsampled data set: cells per biological sample (split by
# timepoint), cells per timepoint and cells per cancer type.
fig, ax = plt.subplots(figsize=[18, 4], ncols=3)
tp_order = ["prior", "during", "end"]
cancer_order = ["Meningioma", "Glioma", "Head & Neck"]
# The sample order is taken from the full data set, not the subsample.
sample_order = np.unique(all_data.loc[:, "sample"])
ax = ax.flatten()
ax[0] = sns.countplot(
    x="sample",
    data=sampled_data,
    ax=ax[0],
    order=sample_order,
    hue_order=tp_order,
    hue="timepoint",
    palette="gray",
)
# The timepoint legend is only shown in the second panel.
ax[0].legend([], [], frameon=False)
ax[0].set_xlabel("ID of the biological sample")
ax[0].set_title("Distribution of biological samples")
for tick in ax[0].get_xticklabels():
    tick.set_rotation(45)

ax[1] = sns.countplot(
    x="timepoint",
    hue="timepoint",
    data=sampled_data,
    ax=ax[1],
    order=tp_order,
    hue_order=tp_order,
    dodge=False,
    palette="gray",
)
ax[1].set_xlabel("Timepoint with respect to the treatment")
ax[1].set_title("Distribution of the different timepoints")
ax[1].legend(loc="lower right")

ax[2] = sns.countplot(
    x="cancer",
    hue="cancer",
    data=sampled_data,
    ax=ax[2],
    order=cancer_order,
    hue_order=cancer_order,
    dodge=False,
    palette=color_palette,
)
ax[2].set_xlabel("Cancer type")
ax[2].set_title("Distribution of the different cancer types")
ax[2].legend(loc="lower right")

plt.show()
plt.close()

----

#### Sample and feature selection

We now filter out constant features and nuclei with missing features.

In [None]:
# Preprocess the subsample again, this time also removing constant features.
data = preprocess_data(sampled_data, remove_constant_features=True)

---

#### Data preparation

After sampling the data, we will now prepare the data for the subsequent analysis, i.e. extracting only chrometric features and corresponding metadata information.

In [None]:
# Extract the chrometric feature matrix with the project helper.
# NOTE(review): presumably ``proteins`` lists the channels whose features are
# excluded and ``exclude_dna_int`` drops DNA intensity features - confirm
# against get_chrometric_data.
all_chrometric_data = get_chrometric_data(
    data,
    proteins=["gh2ax", "lamin", "cd3"],
    exclude_dna_int=True,
)

# Metadata columns kept alongside the features; they share the index of data.
sample_labels = data.loc[:, "sample"]
tp_labels = data.loc[:, "timepoint"]
cancer_labels = data.loc[:, "cancer"]
ct_labels = data.loc[:, "celltype"]

Finally, we remove highly correlated features (Pearson $\rho > 0.8$) from the chrometric features.

In [None]:
# Drop highly correlated features using a threshold of 0.8.
chrometric_data = remove_correlated_features(all_chrometric_data, threshold=0.8)

---

## 3. Panels

Now we generate the individual panels for figure 4 of the paper.

---

### 3.a. T cell abundancies changes during the treatment

At first we look at the relative frequencies of CD3+/- PBMCs in our samples at the different treatment timepoints.

In [None]:
# One y-axis label per cell type, in the same order as ``celltypes``.
celltype_labels = []
for celltype in celltypes:
    celltype_labels.append("Fraction of {} among all PBMCs".format(celltype))

# Bar plots of the per-id cell type fractions by cancer type and timepoint.
plot_celltype_abundancies(
    ct_abundance_data,
    celltypes=celltypes,
    celltype_labels=celltype_labels,
    figsize=[4, 4],
    plot_type="bar",
)

The above plot indicates that there are no significant differences between abundancies of CD3+ T cells between cancer types and/or the different treatment time points. 

___

### 3.b. Chrometric differences between CD3+/- PBMCs

We will now check whether the previously identified chromatin biomarkers that capture the treatment effects differ significantly between CD3+ and CD3- PBMCs.

#### Classification

To this end, we will first assess how well a RandomForest classifier can distinguish between the CD3+/- PBMCs based on the chrometric features. We first obtain a random balanced subset of CD3+ and CD3- PBMCs and then evaluate the class separability using a stratified, sample-grouped cross-validation approach with one fold per biological sample.

In [None]:
# Random forest used for the CD3+ vs. CD3- classification.
rfc = RandomForestClassifier(
    n_estimators=500, n_jobs=10, random_state=seed, class_weight="balanced"
)
# Resample the row indices (not the features) against the cell type labels and
# use the selected indices to subset features and metadata consistently.
# NOTE(review): RandomUnderSampler is not imported explicitly in this
# notebook; presumably it is provided by one of the star imports - confirm.
idc = np.array(chrometric_data.index).reshape(-1, 1)
sampled_idc, _ = RandomUnderSampler(random_state=seed).fit_resample(idc, ct_labels)
sampled_ct_data = chrometric_data.loc[sampled_idc.flatten()]
sampled_ct_labels = ct_labels.loc[sampled_idc.flatten()]
sampled_sample_labels = sample_labels.loc[sampled_idc.flatten()]

In [None]:
# Number of cells in the resampled subset.
len(sampled_sample_labels)

In [None]:
# Cross-validated confusion matrix with the biological sample as group and as
# many folds as there are distinct samples.
ct_cv_conf_mtx_nuclei = get_cv_conf_mtx(
    estimator=rfc,
    features=sampled_ct_data,
    labels=sampled_ct_labels,
    groups=sampled_sample_labels,
    scale_features=False,
    n_folds=len(set(sampled_sample_labels)),
    order=["CD3+", "CD3-"],
)
# Divide every row by its row sum so that each row adds up to 1.
normalized_ct_cv_conf_mtx_nuclei = ct_cv_conf_mtx_nuclei.divide(
    ct_cv_conf_mtx_nuclei.sum(axis=1), axis=0
)

In [None]:
# Heatmap of the row-normalized confusion matrix (rows are labelled as true,
# columns as predicted cell type); the color scale is fixed to [0, 1].
fig, ax = plt.subplots(figsize=[5, 4])
ax = sns.heatmap(
    normalized_ct_cv_conf_mtx_nuclei,
    annot=True,
    fmt=".4f",
    cmap="viridis",
    vmin=0,
    vmax=1,
    # cbar=False,
    annot_kws={"size": 16, "weight": "bold"},
)
ax.set_xlabel("Predicted cell type")
ax.set_ylabel("True cell type")
plt.show()

In [None]:
# Balanced accuracy per fold, using one fold per distinct biological sample
# and the sample as group so that cells of one sample never end up in both
# the training and the test split.
groupkfold = StratifiedGroupKFold(n_splits=len(set(sampled_sample_labels)))
cv_bacs = cross_val_score(
    rfc,
    cv=groupkfold,
    X=sampled_ct_data,
    y=sampled_ct_labels,
    groups=sampled_sample_labels,
    scoring="balanced_accuracy",
    n_jobs=10,
)
# Report mean and standard deviation over the folds.
print("Balanced accuracy: {} (+/- {})".format(np.mean(cv_bacs), np.std(cv_bacs)))

We notice that the classifier can accurately distinguish between CD3+ and CD3- PBMCs based on the chrometric profiles as indicated in the above confusion matrix.

---

#### Feature importance

The feature importance plot shown below identifies the chrometric markers which are most indicative of the cell type.

In [None]:
def plot_feature_importance(
        importance,
        names,
        model_type,
        figsize=[6, 4],
        cmap=["gray"],
        n_features=10,
        feature_color_dict=None,
        labelsize=6,
):
    """Draw a horizontal bar chart of the most important features.

    Parameters
    ----------
    importance : array-like
        Importance value of every feature.
    names : array-like
        Feature names aligned with ``importance``.
    model_type : str
        Prefix of the plot title.
    figsize : sequence of two numbers
        Size of the created figure.
    cmap : palette
        Passed to seaborn as ``palette`` of the bar plot.
    n_features : int
        Number of top-ranked features to show.
    feature_color_dict : dict, optional
        Maps a feature name to the color of its tick label. Every shown
        feature must be contained, otherwise a KeyError is raised.
    labelsize : number
        Font size of the tick labels of both axes.

    Returns
    -------
    (fig, ax) of the created plot.
    """
    fi_df = pd.DataFrame(
        {
            "feature_names": np.array(names),
            "feature_importance": np.array(importance),
        }
    )
    # Keep the ``n_features`` most important features, most important first.
    fi_df = fi_df.sort_values(by=["feature_importance"], ascending=False).head(
        n_features
    )

    fig, ax = plt.subplots(figsize=figsize)
    ax = sns.barplot(
        x=fi_df["feature_importance"], y=fi_df["feature_names"], palette=cmap, ax=ax
    )
    if feature_color_dict is not None:
        # Color every feature name on the y-axis.
        for yticklabel in ax.get_yticklabels():
            yticklabel.set_color(feature_color_dict[yticklabel.get_text()])
    ax.tick_params(axis="x", labelsize=labelsize)
    ax.tick_params(axis="y", labelsize=labelsize)

    # Separate the model name from the fixed part of the title.
    ax.set_title("{} FEATURE IMPORTANCE".format(model_type))
    ax.set_xlabel("")
    ax.set_ylabel("")
    return fig, ax



def plot_feature_importance_for_estimator(
        estimator,
        features,
        labels,
        scale_features=True,
        cmap=["gray"],
        figsize=[6, 4],
        n_features=10,
        feature_color_dict=None,
        labelsize=6,
):
    """Fit ``estimator`` and plot its most important features.

    Parameters
    ----------
    estimator : estimator object
        Must provide ``fit`` and, once fitted, ``feature_importances_``.
        Note that the passed object itself is fitted.
    features : pandas.DataFrame
        Feature matrix; the column names are used as feature names.
    labels : array-like
        Target labels aligned with ``features``.
    scale_features : bool
        If True, standardize the features with ``StandardScaler`` before
        fitting.
    cmap, figsize, n_features, feature_color_dict, labelsize
        Forwarded unchanged to ``plot_feature_importance``.

    Returns
    -------
    (fig, ax) of the created plot; its title is cleared.
    """
    if scale_features:
        # Imported here so that the function does not rely on the name being
        # provided by a star import elsewhere in the notebook.
        from sklearn.preprocessing import StandardScaler

        sc = StandardScaler()
        features = pd.DataFrame(
            sc.fit_transform(features), index=features.index, columns=features.columns
        )
    estimator = estimator.fit(features, labels)
    fig, ax = plot_feature_importance(
        estimator.feature_importances_,
        features.columns,
        "RFC",
        figsize=figsize,
        cmap=cmap,
        n_features=n_features,
        feature_color_dict=feature_color_dict,
        labelsize=labelsize,
    )
    # The title set by plot_feature_importance is not wanted here.
    ax.set_title("")
    return fig, ax


In [None]:
# Fit the random forest on the resampled subset (without feature scaling) and
# show its 15 most important features; tick labels are colored per feature.
fig, ax = plot_feature_importance_for_estimator(
    rfc,
    sampled_ct_data,
    sampled_ct_labels,
    scale_features=False,
    cmap=["gray"],
    figsize=[2, 2],
    feature_color_dict=feature_color_dict,
    n_features=15,
)
# plt.yticks(rotation=90)
# plt.xticks(rotation=90)
plt.show()

In [None]:
# Display any figure that is still open.
plt.show()

#### CD3+ marker screen

The results of a t-test based marker screen validate that, in particular, shape features like the solidity, the nuclear size as well as the curvature of the boundary of the projected nuclear mask carry information that distinguishes the individual cell types.

In [None]:
# Marker screen of the chrometric features against the cell type labels; show
# the first ten result rows labelled "CD3+".
marker_screen_results_ct = find_markers(chrometric_data, ct_labels)
marker_screen_results_ct.loc[marker_screen_results_ct.label == "CD3+"].head(10)

We note that the CD3+ cells show on average a more curved nuclear boundary with fewer changes of the curvature and a more convex shape as opposed to CD3- cells.

In [None]:
# Violin plots (default plot type) of three selected markers, CD3+ vs. CD3-,
# based on the full (not subsampled) data set.
markers = ["volume", "std_curvature", "concavity_3d"]
marker_labels = [
    r"Nuclear volume in $px^3$",
    "Standard deviation of the curvature",
    "Concavity",
]
plot_cell_type_markers_dist(
    data=all_data, markers=markers, marker_labels=marker_labels, cut=0, palette="gray"
)

In [None]:
# Bar plots of four selected markers, CD3+ vs. CD3-, based on the full (not
# subsampled) data set.
markers = [
    "volume",
    "hetero_to_euchromatin_volume_ratio",
    "std_curvature",
    "concavity_3d",
]
marker_labels = [
    r"Nuclear volume in $px^3$",
    "HC/EC ratio",
    "Standard deviation of the curvature",
    "Concavity",
]
plot_cell_type_markers_dist(
    data=all_data,
    markers=markers,
    marker_labels=marker_labels,
    cut=0,
    palette="gray",
    plot_type="bar",
)

In [None]:
# Joint plot of the four markers (each min-max scaled) for CD3+ vs. CD3-,
# based on the subsampled and preprocessed data set.
markers = [
    "volume",
    "hetero_to_euchromatin_volume_ratio",
    "std_curvature",
    "concavity_3d",
]
marker_labels = [
    r"Nuclear volume in $px^3$",
    "HC/EC ratio",
    "Standard deviation of the curvature",
    "Concavity",
]

fig, ax = plot_joint_markers_celltype(
    data,
    markers,
    marker_labels,
    label_col="celltype",
    palette="gray",
    figsize=[8, 4],
    cut=0,
)
ax.set_ylabel("Normalized marker value")
ax.set_xlabel("Chrometric Marker")
# Place the legend above the axes, without title and frame.
sns.move_legend(
    ax,
    "lower center",
    bbox_to_anchor=(0.5, 1),
    ncol=2,
    title=None,
    frameon=False,
)
plt.show()

---

### 3.c. Parametric analysis captures differences of PBMCs at different timepoints of the proton therapy treatment

Since there are significant differences of the chrometric phenotypes between CD3+ and CD3- cells, we will now look at the previously identified features that change most significantly for both CD3+ and CD3- cells.

In [None]:
# Violin plots (default plot type) of the selected chrometric markers per
# cancer type and timepoint, with one panel per cell type.
markers = [
    "volume",
    "hetero_to_euchromatin_area_ratio",
    "radial_chromatin_content_p10",
    "std_curvature",
    "glcm_contrast_5px",
    "glcm_correlation_5px",
    "skewness_absolute_dna_intensity_distribution",
    "kurtosis_absolute_dna_intensity_distribution",
]
marker_labels = [
    r"Nuclear volume in px$^3$",
    "relative HC/EC ratio",
    "Fraction of the overall DNA intensity \n within the inner 10% of the nuclear volume",
    "Standard deviation of the curvature",
    "Contrast of the GLCM \n with a shift of 5px (2D)",
    "Correlation of the GLCM \n with a shift of 5px (2D)",
    "Skewness of the DNA\n intensity distribution (2D)",
    "Kurtosis of the DNA\n intensity distribution (2D)",
]
# NOTE: the helper uses a fixed palette internally, so ``palette`` has no
# effect on the resulting colors.
plot_timepoint_cancer_celltype_markers_dist(
    data,
    markers,
    marker_labels,
    celltypes=["CD3+", "CD3-"],
    cut=0,
    palette=color_palette,
    figsize=[9, 4],
)

In [None]:
# Same markers as in the previous cell, drawn as bar plots.
plot_timepoint_cancer_celltype_markers_dist(
    data,
    markers,
    marker_labels,
    celltypes=["CD3+", "CD3-"],
    cut=0,
    palette=color_palette,
    figsize=[12, 4],
    plot_type="bar",
)

---

### 3d. Proteomic differences of PBMCs in cancer

Finally, we also assess the proteomic differences between the different treatment timepoint populations. To this end, we plot the relative Lamin and gH2AX expression measured by the sum of the intensities of the corresponding imaging channels normalized by the nuclear volume. Additionally, we plot the number of identified gH2AX foci which are computed as the local maxima peaks found in the corresponding channel images.

Note that those features are only available for the first data set that was stained for those proteins.

In [None]:
# Bar plots of the Lamin A/C and gH2AX derived markers per cancer type and
# timepoint, with one panel per cell type.
markers = [
    "rel_lamin_3d_int",
    "rel_gh2ax_3d_int",
    "gh2ax_foci_count",
    "gh2ax_sum_foci_area",
    "gh2ax_avg_foci_area",
]
marker_labels = [
    "Volume-normalized nuclear\nLamin A/C intensity",
    "Normalized nuclear\n" r"$\gamma$H2AX intensity",
    r"Number of $\gamma$H2AX foci",
    r"Sum of the $\gamma$H2AX foci area",
    r"Average size of the $\gamma$H2AX foci",
]
plot_timepoint_cancer_celltype_markers_dist(
    data,
    markers,
    marker_labels,
    quantiles=None,
    celltypes=["CD3+", "CD3-"],
    cut=0,
    plot_type="bar",
    palette=color_palette,
    figsize=[7, 4],
)

In [None]:
# Bar plots of the same protein markers for CD3+ vs. CD3- cells, based on the
# full (not subsampled) data set.
markers = [
    "rel_lamin_3d_int",
    "rel_gh2ax_3d_int",
    "gh2ax_foci_count",
    "gh2ax_sum_foci_area",
    "gh2ax_avg_foci_area",
]
marker_labels = [
    "Volume-normalized nuclear\nLamin A/C intensity",
    "Normalized nuclear\n" r"$\gamma$H2AX intensity",
    r"Number of $\gamma$H2AX foci",
    r"Sum of the $\gamma$H2AX foci area",
    r"Average size of the $\gamma$H2AX foci",
]
plot_cell_type_markers_dist(
    data=all_data,
    markers=markers,
    marker_labels=marker_labels,
    cut=0,
    palette="gray",
    plot_type="bar",
)

---

## 4. Supplemental

#### Tumor type classification with CD3+ PBMCs only

In [None]:
# Narrow the index down step by step: CD3+ cells ...
idc = ct_labels[ct_labels == "CD3+"].index
idc = tp_labels.loc[idc]
# ... taken before the treatment ("prior") ...
tp_idc = idc[idc == "prior"].index
idc = sample_labels.loc[tp_idc]
# ... excluding sample "p41".
# NOTE(review): the reason for this exclusion is not stated here, and the
# CD3- analysis further below keeps all samples - confirm it is intended.
idc = idc[idc != "p41"].index

In [None]:
# Number of CD3+ "prior" cells per biological sample (before the exclusion of
# individual samples). The labels are passed explicitly as ``x`` and drawn on
# the axes created above, independent of how seaborn treats positional input.
fig, ax = plt.subplots(figsize=[6, 4])
ax = sns.countplot(x=sample_labels.loc[tp_idc], palette=["b"], ax=ax)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Fresh random forest for the tumor type classification on CD3+ cells.
rfc = RandomForestClassifier(
    n_estimators=500, n_jobs=10, random_state=seed, class_weight="balanced"
)

In [None]:
# Number of distinct biological samples among the selected cells; used as the
# number of cross-validation folds below.
sample_labels.loc[idc].nunique()

In [None]:
# Cross-validated confusion matrix of the tumor type prediction on the
# selected CD3+ cells, with the biological sample as group and one fold per
# distinct sample.
ct_cv_conf_mtx_nuclei = get_cv_conf_mtx(
    estimator=rfc,
    features=chrometric_data.loc[idc],
    labels=cancer_labels.loc[idc],
    groups=sample_labels.loc[idc],
    scale_features=False,
    n_folds=sample_labels.loc[idc].nunique(),
    order=["Meningioma", "Glioma", "Head & Neck"],
)
# Divide every row by its row sum so that each row adds up to 1.
normalized_ct_cv_conf_mtx_nuclei = ct_cv_conf_mtx_nuclei.divide(
    ct_cv_conf_mtx_nuclei.sum(axis=1), axis=0
)

In [None]:
# Heatmap of the row-normalized confusion matrix with the color scale fixed
# to [0, 1]; axis labels are left empty.
fig, ax = plt.subplots(figsize=[5, 4])
ax = sns.heatmap(
    normalized_ct_cv_conf_mtx_nuclei,
    annot=True,
    fmt=".4f",
    cmap="viridis",
    vmin=0,
    vmax=1,
    # cbar=False,
    annot_kws={"size": 16, "weight": "bold"},
)
ax.set_xlabel("")
ax.set_ylabel("")
plt.show()

In [None]:
# Balanced accuracy per fold for the tumor type prediction on the selected
# CD3+ cells (one fold per distinct sample, grouped by sample).
groupkfold = StratifiedGroupKFold(n_splits=sample_labels.loc[idc].nunique())
cv_bacs = cross_val_score(
    rfc,
    cv=groupkfold,
    X=chrometric_data.loc[idc],
    y=cancer_labels.loc[idc],
    groups=sample_labels.loc[idc],
    scoring="balanced_accuracy",
    n_jobs=10,
)
# Report mean and standard deviation over the folds.
print("Balanced accuracy: {} (+/- {})".format(np.mean(cv_bacs), np.std(cv_bacs)))

---

#### Tumor type classification with CD3- PBMCs only

In [None]:
# Select the CD3- cells taken before the treatment ("prior"); no sample is
# excluded here, so ``idc`` ends up identical to ``tp_idc``.
idc = ct_labels[ct_labels == "CD3-"].index
idc = tp_labels.loc[idc]
tp_idc = idc[idc == "prior"].index
idc = sample_labels.loc[tp_idc].index

In [None]:
# Number of CD3- "prior" cells per biological sample. The labels are passed
# explicitly as ``x`` and drawn on the axes created above, independent of how
# seaborn treats positional input.
fig, ax = plt.subplots(figsize=[6, 4])
ax = sns.countplot(x=sample_labels.loc[tp_idc], palette=["b"], ax=ax)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Fresh random forest for the tumor type classification on CD3- cells.
rfc = RandomForestClassifier(
    n_estimators=500, n_jobs=10, random_state=seed, class_weight="balanced"
)

In [None]:
# Number of distinct biological samples among the selected cells; used as the
# number of cross-validation folds below.
sample_labels.loc[idc].nunique()

In [None]:
# Cross-validated confusion matrix of the tumor type prediction on the
# selected CD3- cells, with the biological sample as group and one fold per
# distinct sample.
ct_cv_conf_mtx_nuclei = get_cv_conf_mtx(
    estimator=rfc,
    features=chrometric_data.loc[idc],
    labels=cancer_labels.loc[idc],
    groups=sample_labels.loc[idc],
    scale_features=False,
    n_folds=sample_labels.loc[idc].nunique(),
    order=["Meningioma", "Glioma", "Head & Neck"],
)
# Divide every row by its row sum so that each row adds up to 1.
normalized_ct_cv_conf_mtx_nuclei = ct_cv_conf_mtx_nuclei.divide(
    ct_cv_conf_mtx_nuclei.sum(axis=1), axis=0
)

In [None]:
# Heatmap of the row-normalized confusion matrix with the color scale fixed
# to [0, 1]; axis labels are left empty.
fig, ax = plt.subplots(figsize=[5, 4])
ax = sns.heatmap(
    normalized_ct_cv_conf_mtx_nuclei,
    annot=True,
    fmt=".4f",
    cmap="viridis",
    vmin=0,
    vmax=1,
    # cbar=False,
    annot_kws={"size": 16, "weight": "bold"},
)
ax.set_xlabel("")
ax.set_ylabel("")
plt.show()

In [None]:
# Balanced accuracy per fold for the tumor type prediction on the selected
# CD3- cells (one fold per distinct sample, grouped by sample).
groupkfold = StratifiedGroupKFold(n_splits=sample_labels.loc[idc].nunique())
cv_bacs = cross_val_score(
    rfc,
    cv=groupkfold,
    X=chrometric_data.loc[idc],
    y=cancer_labels.loc[idc],
    groups=sample_labels.loc[idc],
    scoring="balanced_accuracy",
    n_jobs=10,
)
# Report mean and standard deviation over the folds.
print("Balanced accuracy: {} (+/- {})".format(np.mean(cv_bacs), np.std(cv_bacs)))