# Nuclear chromatin phenotypes of PBMCs reflect the treatment effect of proton therapy (all cancers)

---
This notebook summarizes the analysis corresponding to the results presented in figure 4 of the paper for all cancer patients. It can be used to rerun the analysis and regenerate the corresponding panels.

---

## 0. Environmental setup

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
import os
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import matplotlib as mpl
from matplotlib.collections import PolyCollection
from matplotlib.legend_handler import HandlerTuple
from matplotlib.colors import to_rgb
from sklearn.model_selection import StratifiedGroupKFold, cross_val_score

mpl.rcParams["figure.dpi"] = 1200

# SMALL_SIZE = 16
# MEDIUM_SIZE = 18
# BIGGER_SIZE = 20

# mpl.rc("font", size=SMALL_SIZE, weight="normal")  # controls default text sizes
# mpl.rc("axes", titlesize=SMALL_SIZE)  # fontsize of the axes title
# mpl.rc("axes", labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
# mpl.rc("xtick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
# mpl.rc("ytick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
# mpl.rc("legend", fontsize=SMALL_SIZE)  # legend fontsize
# mpl.rc("figure", titlesize=BIGGER_SIZE)  # fontsize of the figure title

import sys

sys.path.append("../..")
from src.utils.notebooks.eda import *
from src.utils.notebooks.figure3 import *
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

seed = 1234
random.seed(1234)
np.random.seed(1234)

%reload_ext nb_black

In [None]:
# Feature description table; the columns "feature", "long_name" and "category"
# are read below.
nuc_feature_desc = pd.read_csv(
    "../../data/chrometric_feature_description.csv", index_col=0
)

# Short feature name -> long name; used later to rename data frame columns.
feature_name_dict = dict(
    zip(nuc_feature_desc.loc[:, "feature"], nuc_feature_desc.loc[:, "long_name"])
)

# Feature category -> matplotlib color code.
category_color_dict = {
    "morphology": "b",
    "intensity": "g",
    "boundary": "r",
    "texture": "c",
    "chromatin condensation": "m",
    "moments": "y",
}
# Color for features whose category is missing (NaN) in the description table.
missing_category_color = "k"

# Long feature name -> color of its category. A missing category is detected
# with pd.isna rather than a dict lookup keyed by np.nan: NaN never compares
# equal to itself, so such a lookup only succeeds when the value read from the
# CSV happens to be the very same object as np.nan. Unknown (non-missing)
# categories still raise a KeyError.
feature_color_dict = {
    long_name: (
        missing_category_color
        if pd.isna(category)
        else category_color_dict[category]
    )
    for long_name, category in zip(
        nuc_feature_desc.loc[:, "long_name"], nuc_feature_desc.loc[:, "category"]
    )
}

In [None]:
def plot_timepoint_cancer_markers_dist(
    data,
    markers,
    marker_labels,
    quantiles=None,
    cut=2,
    plot_type="violin",
    palette=None,
    figsize=[4, 4],
    hue=None,
    hue_order=None,
):
    """Show one figure per marker, grouped by cancer type and split by timepoint.

    Each marker is drawn with ``plot_marker_distribution`` (``cancer`` as the
    label column, ``timepoint`` as hue). Afterwards the violins/bars are
    recolored with a three-step color ramp per cancer type and a combined
    legend with one entry per timepoint is placed above the axes. Every figure
    is shown and then closed.

    Parameters
    ----------
    data
        Data set handed to ``plot_marker_distribution``.
    markers
        Sequence of marker (column) names; one figure is produced per entry.
    marker_labels
        Sequence of y-axis labels, aligned with ``markers``.
    quantiles, cut, figsize
        Forwarded unchanged to ``plot_marker_distribution``.
    plot_type
        ``"violin"`` or ``"bar"``.
    palette, hue, hue_order
        Accepted for backward compatibility but not used: the plot always uses
        the ``"gray"`` base palette and ``timepoint`` as hue in the order
        prior, during, end, because the pairwise comparisons and the
        recoloring below are tied to that layout.

    Raises
    ------
    NotImplementedError
        If ``plot_type`` is neither ``"violin"`` nor ``"bar"``.
    """
    # Fail before any figure is created instead of after the first plot.
    if plot_type not in ("violin", "bar"):
        raise NotImplementedError(
            "plot_type must be 'violin' or 'bar', got {!r}".format(plot_type)
        )

    cancer_order = ["Meningioma", "Glioma", "Head & Neck"]
    # Kept local so the legend labels do not depend on a notebook-level
    # ``tp_order`` variable that is only defined in a later cell.
    timepoint_order = ["prior", "during", "end"]

    # One color per timepoint (prior, during, end) for each cancer type.
    mg_colors = ["lightsteelblue", "royalblue", "midnightblue"]
    gl_colors = ["orange", "gold", "saddlebrown"]
    hn_colors = ["plum", "deeppink", "indigo"]

    # Cancer-major order (all timepoints of one cancer type, then the next).
    # NOTE(review): assumed to match the order in which the violin artists are
    # created by the plotting helper -- confirm if seaborn is upgraded.
    violin_all_colors = mg_colors + gl_colors + hn_colors
    # Timepoint-major order (all cancer types of one timepoint, then the next).
    # NOTE(review): assumed to match the order of ``ax.patches`` for bar plots.
    bar_all_colors = [
        colors[i] for i in range(3) for colors in (mg_colors, gl_colors, hn_colors)
    ]

    # Pairwise timepoint comparisons within each cancer type.
    timepoint_pairs = [("prior", "during"), ("prior", "end"), ("during", "end")]
    box_pairs = [
        ((cancer, first), (cancer, second))
        for cancer in cancer_order
        for first, second in timepoint_pairs
    ]

    for i, marker in enumerate(markers):
        # Fresh list copies per call, as the helper may modify its arguments.
        fig, ax = plot_marker_distribution(
            data,
            figsize=figsize,
            marker=marker,
            label_col="cancer",
            order=list(cancer_order),
            box_pairs=list(box_pairs),
            quantiles=quantiles,
            cut=cut,
            plot_type=plot_type,
            palette="gray",
            hue="timepoint",
            hue_order=list(timepoint_order),
        )
        ax.set_xlabel("Cancer type")
        ax.set_ylabel(marker_labels[i])

        for label in ax.get_xticklabels():
            label.set_fontweight("bold")

        # Legend proxies are always collected in cancer-major order so that
        # the slices handles[k::3] below group the three cancer types of
        # timepoint k.
        handles = []
        if plot_type == "violin":
            for ind, obj in enumerate(ax.findobj(PolyCollection)):
                rgb = to_rgb(violin_all_colors[ind])
                obj.set_facecolor(rgb)
                handles.append(
                    plt.Rectangle((0, 0), 0, 0, facecolor=rgb, edgecolor="black")
                )
        else:  # plot_type == "bar"
            for ind, bar in enumerate(ax.patches):
                bar.set_color(to_rgb(bar_all_colors[ind]))
                handles_color = to_rgb(violin_all_colors[ind])
                handles.append(
                    plt.Rectangle(
                        (0, 0), 0, 0, facecolor=handles_color, edgecolor="black"
                    )
                )

        # One legend entry per timepoint, each showing the colors of all three
        # cancer types side by side.
        ax.legend(
            handles=[tuple(handles[::3]), tuple(handles[1::3]), tuple(handles[2::3])],
            labels=timepoint_order,
            title=None,
            handlelength=4,
            handler_map={tuple: HandlerTuple(ndivide=None, pad=0)},
            loc="lower center",
            borderaxespad=0,
            bbox_to_anchor=(0.5, 1),
            ncol=3,
            frameon=False,
            prop={"size": 20, "weight": "bold"},
        )
        plt.show()
        plt.close()

In [None]:
# Base color per cancer type; passed as `palette` to the plots below that are
# colored by cancer.
color_palette = {
    "Meningioma": "cornflowerblue",
    "Glioma": "orange",
    "Head & Neck": "orchid",
}

In [None]:
# Three-step color ramps per cancer type (Meningioma, Glioma, Head & Neck).
# The same lists are defined locally inside plot_timepoint_cancer_markers_dist.
# NOTE(review): these notebook-level copies are not referenced by any later
# cell visible here -- confirm whether they are still needed.
mg_colors = ["lightsteelblue", "royalblue", "midnightblue"]
gl_colors = ["orange", "gold", "saddlebrown"]
hn_colors = ["plum", "deeppink", "indigo"]

---

## 1. Read in data

In this notebook we assess the differences of the cell states of PBMCs at three different time points of the proton therapy treatment: before, during (~3 weeks in) and at the end of the treatment (final week of treatment). To this end, we obtained PBMCs of 10 Meningioma, 10 Glioma and 10 Head & Neck cancer patients, stained them for DNA, gH2AX and Lamin A/C, and obtained fluorescent images.

First, we read in the required data set that describes each PBMC by a number of hand-crafted features extracted from the fluorescent images of the cells.

In [None]:
# Load the feature table; the first CSV column is used as the row index.
all_data = pd.read_csv("../../data/treated_population_data.csv", index_col=0)

Before we analyze it, we preprocess it to drop features with missing entries or nuclei that do not pass the quality control.

In [None]:
# Constant features are kept at this stage; they are only removed after
# subsampling (see the later preprocess_data call with
# remove_constant_features=True).
all_data = preprocess_data(all_data, remove_constant_features=False)

In [None]:
# Rename feature columns from their short names to the long names of the
# feature description table, then display the number of remaining rows.
all_data = all_data.rename(columns=feature_name_dict)
len(all_data)

In [None]:
# Number of nuclei per (timepoint, sample); absent combinations count as 0.
count_matrix = all_data.groupby(["timepoint", "sample"]).size().unstack(fill_value=0)

# Heatmap rows: samples grouped by cancer type, sorted within each group.
ordered_samples = [
    sample
    for cancer_type in ("Meningioma", "Head & Neck", "Glioma")
    for sample in sorted(
        all_data.loc[all_data.cancer == cancer_type, "sample"].unique()
    )
]
ordered_timepoints = ["prior", "during", "end"]

# Background color for each row label, derived from the sample's cancer type.
sample_to_cancer = all_data.drop_duplicates("sample").set_index("sample")["cancer"]
row_colors = sample_to_cancer.loc[ordered_samples].map(color_palette)

# Annotated heatmap of the counts (samples x timepoints).
plt.figure(figsize=(6, 10))
sns.set(font_scale=1)
ax = sns.heatmap(
    count_matrix.T.loc[ordered_samples, ordered_timepoints],
    cmap="Greys",
    annot=True,
    fmt="d",
    linewidths=0.5,
    linecolor="gray",
    cbar=False,
)

# Mark the cancer type of every patient by coloring its tick label.
for tick_label, color in zip(ax.get_yticklabels(), row_colors):
    tick_label.set_backgroundcolor(color)

plt.title("Old processed data (all duplicates dropped)")
plt.xlabel("Timepoint")
plt.ylabel("Patient")
plt.show()

In [None]:
# Switch back to matplotlib's default style after the sns.set(...) call in the
# heatmap cell above, then re-apply the figure resolution used in this notebook.
# NOTE(review): matplotlib is already imported as `mpl` in the first cell; this
# repeated import is redundant.
import matplotlib as mpl

mpl.style.use("default")
mpl.rcParams["figure.dpi"] = 1200

In [None]:
# Nuclei counts before subsampling: per biological sample split by timepoint
# (left) and per timepoint (right).
fig, ax = plt.subplots(figsize=[12, 4], ncols=2)
tp_order = ["prior", "during", "end"]
sample_order = np.unique(all_data.loc[:, "sample"])
ax = ax.flatten()
ax[0] = sns.countplot(
    x="sample",
    data=all_data,
    ax=ax[0],
    order=sample_order,
    hue_order=tp_order,
    hue="timepoint",
    palette="gray",
)
# Replace the automatic legend of the left panel with an empty one.
ax[0].legend([], [], frameon=False)
ax[0].set_xlabel("ID of the biological sample")
ax[0].set_title("Distribution of biological samples")
for tick in ax[0].get_xticklabels():
    tick.set_rotation(90)

ax[1] = sns.countplot(
    x="timepoint",
    hue="timepoint",
    data=all_data,
    ax=ax[1],
    order=tp_order,
    # Fix the hue order as in the left panel so that every timepoint is drawn
    # in the same gray shade in both panels, independent of row order.
    hue_order=tp_order,
    dodge=False,
    palette="gray",
)
ax[1].set_xlabel("Timepoint with respect to the treatment")
ax[1].set_title("Distribution of the different timepoints")
ax[1].legend(loc="lower right")

plt.show()
plt.close()

___

#### Subsampling

We next subsample the data set such that for each timepoint we have the same number of nuclei for each tumor type. Additionally, we ensure that each tumor type is represented by an equal number of nuclei per patient. However, before that we remove two samples, namely P723 and P421, which have only very few nuclei (38 and 47 respectively).

In [None]:
# Drop all nuclei of the two low-count samples.
# NOTE(review): the filter uses the "id" column with lowercase ids, while the
# rest of the notebook groups by "sample" -- confirm that "id" holds values
# spelled exactly like this, otherwise nothing is removed.
deselected_ids = ["p421", "p723"]
all_data = all_data.loc[~all_data.loc[:, "id"].isin(deselected_ids)]

In [None]:
# Subsample the "prior" nuclei with get_stratified_data, using "cancer" as the
# condition column and "sample" as the id column; fixed seed for reproducibility.
sampled_tp1_data = get_stratified_data(
    all_data.loc[all_data.timepoint == "prior"],
    id_column="sample",
    cond_column="cancer",
    seed=1234,
)

In [None]:
# Number of sampled "prior" nuclei per cancer type.
# NOTE(review): Counter is not imported in the import cell; presumably it is
# provided by one of the star imports from src.utils.notebooks -- confirm.
Counter(sampled_tp1_data.cancer)

In [None]:
# Same subsampling as for the "prior" timepoint, applied to the "during" nuclei.
sampled_tp2_data = get_stratified_data(
    all_data.loc[all_data.timepoint == "during"],
    id_column="sample",
    cond_column="cancer",
    seed=1234,
)

In [None]:
# Number of sampled "during" nuclei per cancer type.
Counter(sampled_tp2_data.cancer)

In [None]:
# Same subsampling as for the "prior" timepoint, applied to the "end" nuclei.
sampled_tp3_data = get_stratified_data(
    all_data.loc[all_data.timepoint == "end"],
    id_column="sample",
    cond_column="cancer",
    seed=1234,
)

In [None]:
# Number of sampled "end" nuclei per cancer type.
Counter(sampled_tp3_data.cancer)

In [None]:
# Stack the three per-timepoint subsamples; the original row index is kept.
sampled_data = pd.concat([sampled_tp1_data, sampled_tp2_data, sampled_tp3_data])

In [None]:
# Nuclei counts after subsampling: per sample, per timepoint and per cancer type.
tp_order = ["prior", "during", "end"]
cancer_order = ["Meningioma", "Glioma", "Head & Neck"]
sample_order = np.unique(all_data.loc[:, "sample"])

fig, ax = plt.subplots(ncols=3, figsize=[18, 4])
ax = ax.flatten()

# Left panel: counts per biological sample, split by timepoint.
sns.countplot(
    data=sampled_data,
    x="sample",
    order=sample_order,
    hue="timepoint",
    hue_order=tp_order,
    palette="gray",
    ax=ax[0],
)
ax[0].legend([], [], frameon=False)
ax[0].set_xlabel("ID of the biological sample")
ax[0].set_title("Distribution of biological samples")
for tick in ax[0].get_xticklabels():
    tick.set_rotation(90)

# Middle panel: counts per timepoint.
sns.countplot(
    data=sampled_data,
    x="timepoint",
    order=tp_order,
    hue="timepoint",
    hue_order=tp_order,
    dodge=False,
    palette="gray",
    ax=ax[1],
)
ax[1].set_xlabel("Timepoint with respect to the treatment")
ax[1].set_title("Distribution of the different timepoints")
ax[1].legend(loc="lower right")

# Right panel: counts per cancer type.
sns.countplot(
    data=sampled_data,
    x="cancer",
    order=cancer_order,
    hue="cancer",
    hue_order=cancer_order,
    dodge=False,
    palette=color_palette,
    ax=ax[2],
)
ax[2].set_xlabel("Cancer type")
ax[2].set_title("Distribution of the different cancer types")
ax[2].legend(loc="lower right")

plt.show()
plt.close()

----

#### Sample and feature selection

We now filter out constant features and nuclei with missing features.

In [None]:
# Second preprocessing pass on the subsample, this time also removing
# constant features.
data = preprocess_data(sampled_data, remove_constant_features=True)

In [None]:
# Column names of the preprocessed subsample.
# NOTE(review): not referenced by any later cell visible here -- confirm
# whether this is still needed.
all_data_columns = set(data.columns)

---

#### Data preparation

After sampling the data, we will now prepare the data for the subsequent analysis, i.e. extracting only chrometric features and corresponding metadata information.

In [None]:
# Chrometric feature matrix extracted from the preprocessed subsample.
all_chrometric_data = get_chrometric_data(
    data, proteins=["gh2ax", "lamin", "cd3"], exclude_dna_int=True
)

# Per-nucleus metadata used for grouping, coloring and classification below.
sample_labels, tp_labels, cancer_labels = (
    data["sample"],
    data["timepoint"],
    data["cancer"],
)

Finally, we remove highly correlated features (Pearson $\rho > 0.8$) from the chrometric features.

In [None]:
# Reduced feature set after correlation filtering with threshold 0.8.
chrometric_data = remove_correlated_features(all_chrometric_data, threshold=0.8)

---

## 3. Panels

Now we generate the individual panels for figure 4 of the paper.

---

### 3b. Parametric analysis captures differences of PBMCs at different timepoints of the proton therapy treatment

The montage already indicates significant changes, in particular between the chrometric phenotype of the PBMCs prior to the treatment and at the end of it. We will now turn to the assessment of the parametric descriptions of the nuclear phenotypes of the PBMCs at those different timepoints. To this end, we first visualize the data set using a tSNE plot to assess potential large-scale differences between the timepoint populations and individual patient samples.

In [None]:
# tSNE embedding of the chrometric features, annotated with the metadata of
# each nucleus. The labels are attached as plain arrays, i.e. by position.
chrometric_embs = get_tsne_embs(chrometric_data).assign(
    timepoint=np.array(tp_labels),
    sample=np.array(sample_labels),
    cancer=np.array(cancer_labels),
)

In [None]:
# tSNE embedding colored by cancer type; the marker style encodes the timepoint.
fig, ax = plt.subplots(figsize=[18, 16])
ax = sns.scatterplot(
    x="tSNE 1",
    y="tSNE 2",
    data=chrometric_embs,
    hue="cancer",
    hue_order=cancer_order,
    palette=color_palette,
    style="timepoint",
    style_order=tp_order,
    s=15,
    ax=ax,
)
# Fixed axis limits; points outside this window are not shown.
ax.set(xlim=(-50, 50), ylim=(-55, 55))
plt.show()

Note that the above plot excludes one outlier for better visualization.

In [None]:
# tSNE embedding colored by patient sample; the marker style encodes the timepoint.
fig, ax = plt.subplots(figsize=[18, 16])
ax = sns.scatterplot(
    x="tSNE 1",
    y="tSNE 2",
    data=chrometric_embs,
    hue="sample",
    hue_order=sample_order,
    palette="tab20",
    style="timepoint",
    style_order=tp_order,
    marker="o",
    s=15,
    ax=ax,
)
# Same viewing window as the cancer-colored plot.
ax.set(xlim=(-50, 50), ylim=(-55, 55))
# Legend placed to the right of the axes.
ax.legend(
    title="sample", loc="center left", bbox_to_anchor=(1.02, 0.5), borderaxespad=0
)
plt.show()

The tSNE plot also shows that especially the chrometric phenotypes of the PBMCs at the intermediate timepoint of the proton therapy look different from the prior and the end of treatment population.
The patient samples are fairly well mixed with probably the small exception of the PBMCs of P29 and P22.

In [None]:
cancer_groups = ["Meningioma", "Glioma", "Head & Neck"]
tps = ["prior", "during", "end"]

# One tSNE panel per (cancer type, timepoint) combination, colored by sample.
for cg in cancer_groups:
    for tp in tps:
        # Nuclei of the current cancer type and timepoint.
        subset = chrometric_embs.loc[
            (chrometric_embs.cancer == cg) & (chrometric_embs.timepoint == tp)
        ]
        fig, ax = plt.subplots(figsize=[8, 6])
        ax = sns.scatterplot(
            x="tSNE 1",
            y="tSNE 2",
            data=subset,
            hue="sample",
            hue_order=np.unique(subset["sample"]),
            palette="tab10",
            marker="o",
            s=12,
            ax=ax,
        )
        # Frameless single-row legend centered above the axes.
        ax.legend(
            title="",
            loc="center",
            bbox_to_anchor=(0.5, 1.05),
            borderaxespad=0,
            ncol=10,
            columnspacing=0.1,
            fancybox=False,
            frameon=False,
        )
        ax.set(xlim=(-60, 60), ylim=(-60, 60), xlabel="", ylabel="")
        plt.show()

---

## Classification of the different cancer types.

In [None]:
# Features and labels of the nuclei imaged prior to the treatment.
# NOTE(review): selection is by index label, which assumes the row index is
# unique across timepoints -- confirm, otherwise rows of other timepoints with
# the same label would be selected as well.
prior_idc = tp_labels.loc[tp_labels == "prior"].index
prior_cancer_labels = cancer_labels.loc[prior_idc]
prior_chrometric_data = chrometric_data.loc[prior_idc]
prior_sample_labels = sample_labels.loc[prior_idc]

In [None]:
# Features and labels of the nuclei imaged during the treatment
# (selected by index label, like the "prior" subset).
during_idc = tp_labels.loc[tp_labels == "during"].index
during_cancer_labels = cancer_labels.loc[during_idc]
during_chrometric_data = chrometric_data.loc[during_idc]
during_sample_labels = sample_labels.loc[during_idc]

In [None]:
# Features and labels of the nuclei imaged at the end of the treatment
# (selected by index label, like the "prior" subset).
end_idc = tp_labels.loc[tp_labels == "end"].index
end_cancer_labels = cancer_labels.loc[end_idc]
end_chrometric_data = chrometric_data.loc[end_idc]
end_sample_labels = sample_labels.loc[end_idc]

### Prior: Leave-one-patient-out

In [None]:
# Classifier shared by all cross-validation and feature-importance cells below.
# Seeded for reproducibility; n_jobs=10 is a hard-coded level of parallelism.
rfc = RandomForestClassifier(
    n_estimators=500, n_jobs=10, random_state=seed, class_weight="balanced"
)

In [None]:
# Observed performance: cancer-type classification of the "prior" nuclei with
# patient samples as cross-validation groups.
lopo_prior_cv_result = summarize_group_cv_results_by_fold(
    model=rfc,
    features=prior_chrometric_data,
    labels=prior_cancer_labels,
    groups=prior_sample_labels,
    balance_train=True,
)

# Null distribution: repeat the evaluation with labels permuted via
# get_permute_group_labels (10 repetitions, separately seeded).
np.random.seed(seed + 1111)
bs = range(10)

lopo_prior_perm_cv_results = []

# NOTE(review): tqdm is not imported in the import cell; presumably it is
# provided by one of the star imports from src.utils.notebooks -- confirm.
for b in tqdm(bs):
    prior_perm_cancer_labels = get_permute_group_labels(
        prior_cancer_labels, prior_sample_labels
    )[0]
    lopo_perm_cv_result = summarize_group_cv_results_by_fold(
        model=rfc,
        features=prior_chrometric_data,
        labels=prior_perm_cancer_labels,
        groups=prior_sample_labels,
        balance_train=True,
    )
    lopo_perm_cv_result["permutation"] = b
    lopo_prior_perm_cv_results.append(lopo_perm_cv_result)

lopo_prior_perm_cv_results = pd.concat(lopo_prior_perm_cv_results)
lopo_prior_perm_cv_results["condition"] = "Permuted"
lopo_prior_cv_result["condition"] = "Observed"
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
all_lopo_prior_results = pd.concat([lopo_prior_cv_result, lopo_prior_perm_cv_results])
all_lopo_prior_results["timepoint"] = "prior"

In [None]:
# Observed vs. permuted scores per cancer type for the "prior" timepoint.
fig, ax = plot_lopo_cv_results_by_class(
    all_lopo_prior_results,
    cancer_order,
    x="majority_class",
    y="score",
    hue="condition",
    figsize=[6, 4],
    test="Mann-Whitney",
    pval_text_format="star",
    alpha=0.5,
)
ax.set_xlabel("Cancer types")
ax.set_ylabel("Classification accuracy by patient")
plt.show()

---

#### During: Leave-one-patient-out

In [None]:
# Observed performance: cancer-type classification of the "during" nuclei with
# patient samples as cross-validation groups.
lopo_during_cv_result = summarize_group_cv_results_by_fold(
    model=rfc,
    features=during_chrometric_data,
    labels=during_cancer_labels,
    groups=during_sample_labels,
    balance_train=True,
)

# Null distribution: repeat the evaluation with labels permuted via
# get_permute_group_labels (10 repetitions, separately seeded).
np.random.seed(seed)
bs = range(10)

lopo_during_perm_cv_results = []

# NOTE(review): tqdm is not imported in the import cell; presumably it is
# provided by one of the star imports from src.utils.notebooks -- confirm.
for b in tqdm(bs):
    during_perm_cancer_labels = get_permute_group_labels(
        during_cancer_labels, during_sample_labels
    )[0]
    lopo_perm_cv_result = summarize_group_cv_results_by_fold(
        model=rfc,
        features=during_chrometric_data,
        labels=during_perm_cancer_labels,
        groups=during_sample_labels,
        balance_train=True,
    )
    lopo_perm_cv_result["permutation"] = b
    lopo_during_perm_cv_results.append(lopo_perm_cv_result)

lopo_during_perm_cv_results = pd.concat(lopo_during_perm_cv_results)
lopo_during_perm_cv_results["condition"] = "Permuted"
lopo_during_cv_result["condition"] = "Observed"
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
all_lopo_during_results = pd.concat(
    [lopo_during_cv_result, lopo_during_perm_cv_results]
)
all_lopo_during_results["timepoint"] = "during"

In [None]:
# Observed vs. permuted scores per cancer type for the "during" timepoint.
fig, ax = plot_lopo_cv_results_by_class(
    all_lopo_during_results,
    cancer_order,
    x="majority_class",
    y="score",
    hue="condition",
    figsize=[6, 4],
    test="Mann-Whitney",
    pval_text_format="star",
    alpha=0.5,
)
ax.set_xlabel("Cancer types")
ax.set_ylabel("Classification accuracy by patient")
plt.show()

---
#### End of treatment: Leave-one-patient-out

In [None]:
# Observed performance: cancer-type classification of the "end" nuclei with
# patient samples as cross-validation groups.
lopo_end_cv_result = summarize_group_cv_results_by_fold(
    model=rfc,
    features=end_chrometric_data,
    labels=end_cancer_labels,
    groups=end_sample_labels,
    balance_train=True,
)

# Null distribution: repeat the evaluation with labels permuted via
# get_permute_group_labels (10 repetitions, separately seeded).
np.random.seed(seed + 2222)
bs = range(10)

lopo_end_perm_cv_results = []

# NOTE(review): tqdm is not imported in the import cell; presumably it is
# provided by one of the star imports from src.utils.notebooks -- confirm.
for b in tqdm(bs):
    end_perm_cancer_labels = get_permute_group_labels(
        end_cancer_labels, end_sample_labels
    )[0]
    lopo_perm_cv_result = summarize_group_cv_results_by_fold(
        model=rfc,
        features=end_chrometric_data,
        labels=end_perm_cancer_labels,
        groups=end_sample_labels,
        balance_train=True,
    )
    lopo_perm_cv_result["permutation"] = b
    lopo_end_perm_cv_results.append(lopo_perm_cv_result)

lopo_end_perm_cv_results = pd.concat(lopo_end_perm_cv_results)
lopo_end_perm_cv_results["condition"] = "Permuted"
lopo_end_cv_result["condition"] = "Observed"
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
all_lopo_end_results = pd.concat([lopo_end_cv_result, lopo_end_perm_cv_results])
all_lopo_end_results["timepoint"] = "end"

In [None]:
# Observed vs. permuted scores per cancer type for the "end" timepoint.
fig, ax = plot_lopo_cv_results_by_class(
    all_lopo_end_results,
    cancer_order,
    x="majority_class",
    y="score",
    hue="condition",
    figsize=[6, 4],
    test="Mann-Whitney",
    pval_text_format="star",
    alpha=0.5,
)
ax.set_xlabel("Cancer types")
ax.set_ylabel("Classification accuracy by patient")
plt.show()

#### Summary: Leave-one-patient-out

In [None]:
# Combine the results of all three timepoints. pd.concat replaces the chained
# DataFrame.append calls, which no longer exist in pandas 2.0.
all_lopo_tp_results = pd.concat(
    [all_lopo_prior_results, all_lopo_during_results, all_lopo_end_results]
)
# "tp" equals the timepoint for observed results and pools all permuted
# results of every timepoint under the single label "permutation".
all_lopo_tp_results.loc[:, "tp"] = all_lopo_tp_results.loc[:, "timepoint"]
all_lopo_tp_results.loc[
    all_lopo_tp_results.condition == "Permuted", "tp"
] = "permutation"

In [None]:
# Observed vs. permuted scores, grouped by treatment timepoint.
fig, ax = plot_lopo_cv_results_by_class(
    all_lopo_tp_results,
    tp_order,
    x="timepoint",
    y="score",
    hue="condition",
    figsize=[6, 4],
    test="Mann-Whitney",
    pval_text_format="star",
    alpha=0.5,
)
ax.set_xlabel("Treatment timepoint")
ax.set_ylabel("Classification accuracy by patient")
plt.show()

In [None]:
# Scores per timepoint with the pooled permutation results as a fourth group.
fig, ax = plot_lopo_cv_results_timepoints(
    all_lopo_tp_results,
    order=["prior", "during", "end", "permutation"],
    class_palette=color_palette,
    box_palette=["lightgray"],
    figsize=[6, 6],
)
ax.set_ylabel("Classification accuracy")
ax.set_xlabel("Timepoint")
plt.show()

In [None]:
# Summary statistics per "tp" group. .iloc[:, :8] keeps the eight describe()
# statistics of the first described column only.
# NOTE(review): which column comes first depends on the column order of the
# result frame -- presumably "score"; confirm, or select the column by name.
all_lopo_tp_results.groupby("tp").describe().loc[
    ["prior", "during", "end", "permutation"]
].iloc[:, :8]

In [None]:
# Bar plot of the scores per "tp" group with pairwise significance annotations.
# NOTE(review): Annotator is not imported in the import cell; presumably it is
# provided by one of the star imports from src.utils.notebooks -- confirm.
tp_group_order = ["prior", "during", "end", "permutation"]

fig, ax = plt.subplots(figsize=[4, 8])
ax = sns.barplot(
    x="tp",
    y="score",
    data=all_lopo_tp_results,
    order=tp_group_order,
    # fliersize=2,
    palette=["gray"],
    ax=ax,
)

# All pairwise comparisons between the four groups.
box_pairs = [
    ("prior", "during"),
    ("during", "end"),
    ("prior", "end"),
    ("prior", "permutation"),
    ("during", "permutation"),
    ("end", "permutation"),
]

annotator = Annotator(
    ax,
    box_pairs,
    x="tp",
    y="score",
    data=all_lopo_tp_results,
    order=tp_group_order,
    plot="barplot",
)
# Mann-Whitney tests with Benjamini-Hochberg correction, shown as stars.
annotator.configure(
    test="Mann-Whitney",
    comparisons_correction="Benjamini-Hochberg",
    text_format="star",
    loc="inside",
)
annotator.apply_test()
annotator.annotate()
ax.set_xlabel("")
ax.set_ylabel("")
# ax.set_ylim([0, 1])
# fig.savefig(
#     os.path.join(output_dir, "cancer_separation_accuracy.png"),
#     dpi=1200,
#     transparent=True,
# )
plt.show()

---

### 3c. Nuclear chromatin biomarkers identifying cancer populations

#### Feature importance

We have validated that there are significant differences between the individual treatment timepoints, in particular when comparing PBMCs at the end of the treatment with those during and prior to the treatment. We next assess the implicit feature importance of a RandomForest classifier trained on the task to distinguish between the timepoint populations in order to get an idea of which features are most indicative of the treatment effect.

In [None]:
# Use long feature names for display; columns not present in the mapping keep
# their current name.
chrometric_data = chrometric_data.rename(columns=feature_name_dict)

In [None]:
# Feature importance (15 features shown) with the timepoint as target label.
fig, ax = plot_feature_importance_for_estimator(
    rfc,
    chrometric_data,
    tp_labels,
    cmap=["gray"],
    figsize=[2, 3],
    feature_color_dict=feature_color_dict,
    n_features=15,
)


In [None]:
# Feature importance (15 features shown) with the cancer type as target label.
fig, ax = plot_feature_importance_for_estimator(
    rfc,
    chrometric_data,
    cancer_labels,
    cmap=["gray"],
    figsize=[2, 3],
    feature_color_dict=feature_color_dict,
    n_features=15,
)

In [None]:
# Marker screens over the chrometric features: one with the timepoint and one
# with the cancer type as grouping label.
marker_screen_results_tp = find_markers(chrometric_data, tp_labels)
marker_screen_results_cancer = find_markers(chrometric_data, cancer_labels)

#### Prior treatment

In [None]:
# First 10 rows of the timepoint marker screen with label "prior".
marker_screen_results_tp.loc[marker_screen_results_tp.label == "prior"].head(10)

---

#### During treatment

Next we look at the features whose mean is significantly different in the during treatment population.

In [None]:
# First 10 rows of the timepoint marker screen with label "during".
marker_screen_results_tp.loc[marker_screen_results_tp.label == "during"].head(10)

---

#### End of treatment

Finally, we also evaluate the chrometric phenotype of PBMCs at the end of the proton therapy treatment.

In [None]:
# First 10 rows of the timepoint marker screen with label "end".
marker_screen_results_tp.loc[marker_screen_results_tp.label == "end"].head(10)

---

#### Meningioma

In [None]:
# First 10 rows of the cancer marker screen with label "Meningioma".
marker_screen_results_cancer.loc[
    marker_screen_results_cancer.label == "Meningioma"
].head(10)

#### Glioma

In [None]:
# First 10 rows of the cancer marker screen with label "Glioma".
marker_screen_results_cancer.loc[marker_screen_results_cancer.label == "Glioma"].head(
    10
)

#### Head & Neck cancers

In [None]:
# First 10 rows of the cancer marker screen with label "Head & Neck".
marker_screen_results_cancer.loc[
    marker_screen_results_cancer.label == "Head & Neck"
].head(10)

In [None]:
# Display the cancer-type color mapping.
# NOTE(review): looks like a leftover inspection cell -- confirm whether it is
# still needed.
color_palette

In [None]:
# Summary statistics of the "std_curvature" column per timepoint and cancer type.
# NOTE(review): this requires a column literally named "std_curvature" in
# `data`, whose columns were already renamed via feature_name_dict on all_data
# earlier -- confirm that the name is unchanged by that mapping.
data.groupby(["timepoint", "cancer"]).std_curvature.describe()

---

In [None]:
# Violin plots (the function's default plot type) of selected chrometric
# markers on the subsampled data. `markers` and `marker_labels` are parallel
# lists: entry i of one belongs to entry i of the other.
# Note: plot_timepoint_cancer_markers_dist accepts `palette` but does not use
# it; it always passes palette="gray" and recolors the artists itself.
data = data.rename(columns=feature_name_dict)
markers = [
    "volume",
    "hetero_to_euchromatin_area_ratio",
    "radial_chromatin_content_p10",
    "std_curvature",
    "glcm_contrast_5px",
    "glcm_correlation_5px",
]
marker_labels = [
    r"Nuclear volume in px$^3$",
    "relative HC/EC ratio",
    "Fraction of the overall DNA intensity \n within the inner 10% of the nuclear volume",
    "Standard deviation of the curvature",
    "Contrast of the GLCM \n with a shift of 5px (2D)",
    "Correlation of the GLCM \n with a shift of 5px (2D)",
]
plot_timepoint_cancer_markers_dist(
    data, markers, marker_labels, cut=0, palette=color_palette, figsize=[6.5, 3.5]
)

In [None]:
# Same markers as in the previous cell, drawn as bar plots. Relies on
# `markers` and `marker_labels` as defined in that cell.
plot_timepoint_cancer_markers_dist(
    data,
    markers,
    marker_labels,
    cut=0,
    plot_type="bar",
    palette=color_palette,
    figsize=[5, 4],
)

---

### 3d. Proteomic differences of PBMCs in cancer

Finally, we also assess the proteomic differences between the different treatment timepoint populations. To this end, we plot the relative Lamin and gH2AX expression measured by the sum of the intensities of the corresponding imaging channels normalized by the nuclear volume. Additionally, we plot the number of identified gH2AX foci which are computed as the local maxima peaks found in the corresponding channel images.

Note that those features are only available for the first data set that was stained for those proteins.

In [None]:
# Bar plots of the Lamin A/C and gH2AX markers on the subsampled data.
# This overwrites the `markers` / `marker_labels` lists of the chrometric cells.
markers = [
    "rel_lamin_3d_int",
    "rel_gh2ax_3d_int",
    "gh2ax_foci_count",
    "gh2ax_sum_foci_area",
    "gh2ax_avg_foci_area",
]
marker_labels = [
    "Volume-normalized nuclear\nLamin A/C intensity",
    "Normalized nuclear\n" r"$\gamma$H2AX intensity",
    r"Number of $\gamma$H2AX foci",
    r"Sum of the $\gamma$H2AX foci area",
    r"Average size of the $\gamma$H2AX foci",
]
plot_timepoint_cancer_markers_dist(
    data,
    markers,
    marker_labels,
    quantiles=None,
    cut=0,
    plot_type="bar",
    palette=color_palette,
    figsize=[6, 4],
)

In [None]:
# Violin-plot version of the previous cell; the marker and label lists are
# identical to the ones defined there.
markers = [
    "rel_lamin_3d_int",
    "rel_gh2ax_3d_int",
    "gh2ax_foci_count",
    "gh2ax_sum_foci_area",
    "gh2ax_avg_foci_area",
]
marker_labels = [
    "Volume-normalized nuclear\nLamin A/C intensity",
    "Normalized nuclear\n" r"$\gamma$H2AX intensity",
    r"Number of $\gamma$H2AX foci",
    r"Sum of the $\gamma$H2AX foci area",
    r"Average size of the $\gamma$H2AX foci",
]
plot_timepoint_cancer_markers_dist(
    data,
    markers,
    marker_labels,
    quantiles=None,
    cut=0,
    plot_type="violin",
    palette=color_palette,
    figsize=[5, 4],
)

---


---

## 4. Supplemental

In [None]:
# Supplemental: same chrometric markers as in section 3c, but plotted on
# `all_data`, i.e. the data before subsampling (the two deselected samples
# have already been removed from it).
markers = [
    "volume",
    "hetero_to_euchromatin_area_ratio",
    "radial_chromatin_content_p10",
    "std_curvature",
    "glcm_contrast_5px",
    "glcm_correlation_5px",
]
marker_labels = [
    r"Nuclear volume in px$^3$",
    "relative HC/EC ratio",
    "Fraction of the overall DNA intensity \n within the inner 10% of the nuclear volume",
    "Standard deviation of the curvature",
    "Contrast of the GLCM \n with a shift of 5px (2D)",
    "Correlation of the GLCM \n with a shift of 5px (2D)",
]
plot_timepoint_cancer_markers_dist(
    all_data, markers, marker_labels, cut=0, palette=color_palette, figsize=[6.5, 3.5]
)

---

### 3d. Proteomic differences of PBMCs in cancer

Finally, we also assess the proteomic differences between the different treatment timepoint populations. To this end, we plot the relative Lamin and gH2AX expression measured by the sum of the intensities of the corresponding imaging channels normalized by the nuclear volume. Additionally, we plot the number of identified gH2AX foci which are computed as the local maxima peaks found in the corresponding channel images.

Note that those features are only available for the first data set that was stained for those proteins.

In [None]:
# Supplemental: same Lamin A/C and gH2AX markers as in section 3d, but plotted
# on `all_data`, i.e. the data before subsampling (the two deselected samples
# have already been removed from it).
markers = [
    "rel_lamin_3d_int",
    "rel_gh2ax_3d_int",
    "gh2ax_foci_count",
    "gh2ax_sum_foci_area",
    "gh2ax_avg_foci_area",
]
marker_labels = [
    "Volume-normalized nuclear\nLamin A/C intensity",
    "Normalized nuclear\n" r"$\gamma$H2AX intensity",
    r"Number of $\gamma$H2AX foci",
    r"Sum of the $\gamma$H2AX foci area",
    r"Average size of the $\gamma$H2AX foci",
]
plot_timepoint_cancer_markers_dist(
    all_data,
    markers,
    marker_labels,
    quantiles=None,
    cut=0,
    plot_type="bar",
    palette=color_palette,
    figsize=[6, 4],
)