# Nuclear chromatin phenotypes of PBMCs distinguish control and cancer populations

---
This notebook summarizes the analysis corresponding to the results presented in figure 2 of the paper. It can be used to rerun the analysis and regenerate the corresponding panels.

---

## 0. Environmental setup

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
import os
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import matplotlib as mpl

mpl.rcParams["figure.dpi"] = 1200

import sys

sys.path.append("../..")
from src.utils.notebooks.eda import *
from src.utils.notebooks.figure3 import *
from src.utils.notebooks.figure2 import *
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score, StratifiedGroupKFold


# SMALL_SIZE = 16
# MEDIUM_SIZE = 18
# BIGGER_SIZE = 20

# mpl.rc("font", size=SMALL_SIZE, weight="bold")  # controls default text sizes
# mpl.rc("axes", titlesize=SMALL_SIZE)  # fontsize of the axes title
# mpl.rc("axes", labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
# mpl.rc("xtick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
# mpl.rc("ytick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
# mpl.rc("legend", fontsize=SMALL_SIZE)  # legend fontsize
# mpl.rc("figure", titlesize=BIGGER_SIZE)  # fontsize of the figure title

# One seed for the whole notebook: it is handed to both global RNGs here and
# reused further down (e.g. as the classifier's random_state).
seed = 1234
random.seed(seed)
np.random.seed(seed)

%reload_ext nb_black

In [None]:
# def plot_feature_importance(
#     importance,
#     names,
#     model_type,
#     figsize=[6, 4],
#     cmap=["gray"],
#     n_features=10,
#     feature_color_dict=None,
#     labelsize=6,
# ):
#     # Create arrays from feature importance and feature names
#     feature_importance = np.array(importance)
#     feature_names = np.array(names)

#     # Create a DataFrame using a Dictionary
#     data = {"feature_names": feature_names, "feature_importance": feature_importance}
#     fi_df = pd.DataFrame(data)

#     # Sort the DataFrame in order decreasing feature importance
#     fi_df.sort_values(by=["feature_importance"], ascending=False, inplace=True)
#     fi_df = fi_df.head(n_features)
#     # Define size of bar plot
#     fig, ax = plt.subplots(figsize=figsize)
#     # Plot Searborn bar chart
#     ax = sns.barplot(
#         y=fi_df["feature_importance"], x=fi_df["feature_names"], palette=cmap, ax=ax
#     )
#     if feature_color_dict is not None:
#         for xticklabel in ax.get_xticklabels():
#             xticklabel.set_color(feature_color_dict[xticklabel.get_text()])
#             xticklabel.set_rotation(90)
#     ax.tick_params(axis="x", labelsize=labelsize)
#     ax.tick_params(axis="y", labelsize=labelsize)

#     # xticklabel.set_ha("right")
#     # Add chart labels
#     ax.set_title(model_type + "FEATURE IMPORTANCE")
#     ax.set_xlabel("")
#     ax.set_ylabel("")
#     return fig, ax

In [None]:
def save_plotted_figure(
    output_dir="/home/paysan_d/Desktop/figures_chromark/fig1/",
    dpi=1200,
    transparent=True,
    figure=None,
):
    """Save a figure as ``<fig_count>.png`` in ``output_dir`` and advance the counter.

    Parameters
    ----------
    output_dir : str
        Directory the PNG is written to; created if it does not exist.
        NOTE(review): the default is an absolute, machine-specific path that is
        kept only for backward compatibility - pass an explicit directory.
    dpi : int
        Resolution forwarded to ``savefig``.
    transparent : bool
        Forwarded to ``savefig``.
    figure : object with a ``savefig`` method, optional
        Figure to save. When omitted, the notebook-global ``fig`` is used,
        which was the only option before this parameter existed.

    Returns
    -------
    str
        Path of the file that was written.

    Raises
    ------
    NameError
        If ``figure`` is omitted and no global ``fig`` exists.
    """
    global fig_count

    if figure is None:
        figure = globals().get("fig")
        if figure is None:
            raise NameError(
                "no figure to save: pass `figure=` or define a global `fig`"
            )

    # The counter is never initialised anywhere in the notebook, so the first
    # call used to fail with a NameError; start counting at 0 instead.
    current_count = globals().get("fig_count", 0)

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "{}.png".format(current_count))
    figure.savefig(output_path, dpi=dpi, transparent=transparent)
    fig_count = current_count + 1
    return output_path

In [None]:
import re


def sorted_nicely(l):
    """Return the strings in ``l`` as a list in natural ("human") order.

    Every string is cut into alternating text and digit runs; digit runs are
    compared as integers, so ``"p2"`` sorts before ``"p10"``.
    """

    def _natural_key(item):
        # The capturing group makes re.split keep the digit runs in the result.
        return [
            int(token) if token.isdigit() else token
            for token in re.split("([0-9]+)", item)
        ]

    return sorted(l, key=_natural_key)

In [None]:
# Feature description table; the columns used below are "feature" (short
# name), "long_name" and "category".
nuc_feature_desc = pd.read_csv(
    "../../data/chrometric_feature_description.csv", index_col=0
)

# Short feature name -> long (display) name; used to rename the data columns.
feature_name_dict = dict(
    zip(
        list(nuc_feature_desc.loc[:, "feature"]),
        list(nuc_feature_desc.loc[:, "long_name"]),
    )
)

# Feature category -> matplotlib colour code.
category_color_dict = {
    "morphology": "b",
    "intensity": "g",
    "boundary": "r",
    "texture": "c",
    "chromatin condensation": "m",
    "moments": "y",
}
# Colour for features without a category. A NaN dictionary key is only found
# when the looked-up value is the very same NaN object, so missing categories
# are tested with pd.isna instead of relying on a `np.nan` key.
missing_category_color = "k"

# Long feature name -> colour of its category.
feature_color_dict = {
    feature: (
        missing_category_color if pd.isna(category) else category_color_dict[category]
    )
    for (feature, category) in zip(
        list(nuc_feature_desc.loc[:, "long_name"]),
        list(nuc_feature_desc.loc[:, "category"]),
    )
}

In [None]:
# Colour used for each condition in every plot of this notebook.
color_palette = dict(Control="mediumseagreen", Cancer="tomato")

---

## 1. Read in data

To assess the differences in the cancer populations compared to the control population, we obtained PBMCs of 10 healthy control samples and 10 cancer patients of various cancer types. For each patient we obtained a number of images of the PBMCs showing their DNA, gH2AX and Lamin A/C content using corresponding fluorescent stains.

First, we read in the required data set, which describes each PBMC by a number of hand-crafted features extracted from the fluorescent images of the cells.

In [None]:
# Load the feature table of the control and cancer samples; the first CSV
# column becomes the index.
all_data = pd.read_csv("../../data/control_pancancer_population_data.csv", index_col=0)

In [None]:
# First preprocessing pass. Constant features are kept at this stage
# (remove_constant_features=False); they are removed in a second pass after
# subsampling. NOTE: this overwrites `all_data` in place of the raw table.
all_data = preprocess_data(all_data, remove_constant_features=False)

In [None]:
# Switch the feature columns to their long display names.
all_data = all_data.rename(columns=feature_name_dict)

# Independent copies of the rows of each condition.
hv_data, pancancer_data = (
    all_data.loc[all_data["condition"] == condition].copy()
    for condition in ("Control", "Cancer")
)

In [None]:
# Number of rows in the full table before subsampling.
len(all_data)

In [None]:
# Two panels: number of rows per biological sample (left) and per condition
# (right) for the full data set.
fig, ax = plt.subplots(figsize=[12, 4], ncols=2)
# `cond_order` and `sample_order` are defined here and reused by later cells.
cond_order = ["Control", "Cancer"]
# Natural sort so that e.g. a sample ending in 2 precedes one ending in 10.
sample_order = sorted_nicely(np.unique(all_data.loc[:, "sample"]))
ax = ax.flatten()
# Left panel: one bar per sample, coloured by condition (dodge=False keeps a
# single bar per x position).
ax[0] = sns.countplot(
    x="sample",
    data=all_data,
    ax=ax[0],
    order=sample_order,
    hue_order=cond_order,
    hue="condition",
    dodge=False,
    palette=color_palette,
)
# Replace the automatic legend of the left panel with an empty one.
ax[0].legend([], [], frameon=False)
ax[0].set_xlabel("ID of the biological sample")
ax[0].set_title("Distribution of biological samples")
for tick in ax[0].get_xticklabels():
    tick.set_rotation(45)

# Right panel: one bar per condition.
ax[1] = sns.countplot(
    x="condition",
    hue="condition",
    data=all_data,
    ax=ax[1],
    order=cond_order,
    dodge=False,
    palette=color_palette,
    hue_order=cond_order,
)
ax[1].set_xlabel("Condition")
ax[1].set_title("Distribution of conditions")

plt.show()
plt.close()

Note that while the data set consists of 10 cancer patients and 10 healthy controls the number of nuclei varies significantly between each biological sample.

___

#### Subsampling

We first subsample the data set such that for each condition (control vs. cancer) we have the same number of nuclei in the data set. Additionally, we ensure that for the cancer population we have approximately equal number of nuclei from each biological sample.

In [None]:
# Subsample the control and cancer tables into one balanced data set.
# NOTE(review): the exact sampling scheme (and whether it is seeded) lives in
# get_stratified_data and is not visible here - confirm it is reproducible.
sampled_data = get_stratified_data(hv_data, pancancer_data)

In [None]:
# Same two panels as for the full data set, now drawn from `sampled_data`.
fig, ax = plt.subplots(figsize=[12, 4], ncols=2)
cond_order = ["Control", "Cancer"]
# The x-axis order is still derived from the full table, so samples that lost
# all rows during subsampling would still get an (empty) slot.
sample_order = sorted_nicely(np.unique(all_data.loc[:, "sample"]))
ax = ax.flatten()
# Left panel: rows per biological sample after subsampling.
ax[0] = sns.countplot(
    x="sample",
    data=sampled_data,
    ax=ax[0],
    order=sample_order,
    hue_order=cond_order,
    hue="condition",
    dodge=False,
    palette=color_palette,
)
# Replace the automatic legend of the left panel with an empty one.
ax[0].legend([], [], frameon=False)
ax[0].set_xlabel("ID of the biological sample")
ax[0].set_title("Distribution of biological samples")
for tick in ax[0].get_xticklabels():
    tick.set_rotation(45)

# Right panel: rows per condition after subsampling.
ax[1] = sns.countplot(
    x="condition",
    hue="condition",
    data=sampled_data,
    ax=ax[1],
    order=cond_order,
    dodge=False,
    palette=color_palette,
    hue_order=cond_order,
)
ax[1].set_xlabel("Condition")
ax[1].set_title("Distribution of conditions")
ax[1].legend(loc="lower right")

plt.show()
plt.close()

The sampled data set consists of 2160 nuclei from the healthy controls as well as from the cancer patients.

----

#### Sample and feature selection

We now filter out constant features and nuclei with missing features. We will do that for both replicates individually. However, we will also combine the two replicates to provide a joint analysis.

In [None]:
# Second preprocessing pass on the subsampled table, this time also dropping
# constant features (remove_constant_features=True).
data = preprocess_data(sampled_data, remove_constant_features=True)

---

#### Data preparation

After sampling the data, we will now prepare the data for the consecutive analysis, i.e. extracting only chrometric features and corresponding metadata information.

In [None]:
# Restrict the table to the chrometric feature columns.
# NOTE(review): presumably `proteins` names the protein-stain feature groups to
# leave out and `exclude_dna_int` drops DNA intensity features - confirm
# against get_chrometric_data.
all_chrometric_data = get_chrometric_data(
    data,
    proteins=["gh2ax", "lamin"],
    exclude_dna_int=True,
)

# Metadata taken from the same frame as the features; later cells rely on
# these sharing the feature table's index.
sample_labels = data.loc[:, "sample"]
cond_labels = data.loc[:, "condition"]

Finally, we remove highly correlated features (Pearson $\rho > 0.8$) from the chrometric features.

In [None]:
# Drop highly correlated features using a threshold of 0.8; `chrometric_data`
# is the feature matrix used by all analyses below.
chrometric_data = remove_correlated_features(all_chrometric_data, threshold=0.8)

---

## 3. Panels

Now we generate the individual panels for figure 2 of the paper.


### 3a. Visualization of the global nuclear phenotypes of control vs cancer population

In [None]:
# sns.heatmap(
#     StandardScaler().fit_transform(chrometric_data)[:10],
#     vmin=-2,
#     vmax=2,
#     cmap="seismic",
# )

First, we provide a visual representation of the different nuclear phenotypes in health and cancer. To this end, we will randomly sample 36 nuclei from each condition and plot a corresponding montage of the max-z projected DNA images. To visualize size differences, each nucleus is padded to a size of 150x150 pixels.

In [None]:
image_file_path = "preprocessed/full_pipeline/segmentation/nuclei_images"

# Both conditions are sampled with exactly the same settings.
random_image_kwargs = dict(
    data_dir_col="data_dir",
    n_images=36,
    seed=1234,
    file_ending=".tif",
)

sampled_ctrl_images = get_random_images(
    data.loc[data.condition == "Control"], image_file_path, **random_image_kwargs
)

sampled_cancer_images = get_random_images(
    data.loc[data.condition == "Cancer"], image_file_path, **random_image_kwargs
)

#### Control population

In [None]:
# Montage of the sampled control nuclei; the figure background takes the
# control colour so the panel is recognisable at a glance.
fig, ax_ctrl = plot_montage(
    sampled_ctrl_images, pad_size=150, cmap="inferno", channel_first=False
)
fig.set_facecolor(color_palette["Control"])

#### Cancer population

In [None]:
# Montage of the sampled cancer nuclei, with the cancer colour as background.
fig, ax_cancer = plot_montage(
    sampled_cancer_images, pad_size=150, cmap="inferno", channel_first=False
)
fig.set_facecolor(color_palette["Cancer"])

The above montages show that the nuclei of the cancer samples often feature a significantly different chrometric phenotype compared to those of PBMCs from the healthy controls. For instance, we observe a frequent fragmentation of the nucleus, which leads to a highly concave nuclear shape.

---

### 3b. Parametric analysis captures large-scale differences between control and cancer population

We will now use the chrometric features to describe the nuclear phenotype of the PBMCs in the two conditions and assess their differences. To this end, we first visualize the data set using a tSNE plot to visualize the high-dimensional data set in 2D and show the large-scale differences of the chrometric phenotypes of PBMCs in healthy and cancer subjects respectively.

In [None]:
# 2D tSNE embedding of the chrometric features (the plots below read the
# columns "tSNE 1" and "tSNE 2").
chrometric_embs = get_tsne_embs(chrometric_data)
# Labels are attached positionally (np.array drops the index), so this assumes
# the embedding rows are in the same order as `chrometric_data` - TODO confirm.
chrometric_embs["condition"] = np.array(cond_labels)
chrometric_embs["sample"] = np.array(sample_labels)

In [None]:
# PCA embedding plus the fitted PCA object (its explained_variance_ratio_ is
# used for the axis labels below).
pca_embs, pca = get_pca_embs(chrometric_data)
# Labels are attached positionally, so the embedding rows are assumed to be in
# the same order as `chrometric_data` - TODO confirm.
pca_embs["condition"] = np.array(cond_labels)
pca_embs["sample"] = np.array(sample_labels)

In [None]:
# tSNE scatter of all nuclei coloured by condition; the legend is suppressed.
fig, ax = plt.subplots(figsize=[9, 6])
ax = sns.scatterplot(
    data=chrometric_embs,
    x="tSNE 1",
    y="tSNE 2",
    hue="condition",
    hue_order=cond_order,
    ax=ax,
    s=18,
    marker="o",
    palette=color_palette,
    legend=False,
)
plt.show()

#### Healthy controls

In [None]:
# Index labels of all control nuclei, shared by the feature and label subsets.
control_index = cond_labels.loc[cond_labels == "Control"].index
control_chrometric_feats = chrometric_data.loc[control_index]
control_sample_labels = sample_labels.loc[control_index]

Visually, we do not see any strong batch/patient-specific effects. We can further quantify that by clustering the nuclei of the healthy control population using a Gaussian Mixture clustering approach. We then validate that for each sample the conditional distribution over the identified clusters is approximately the same. Thereby, we identify the optimal number of clusters by the choice that minimizes the BIC criterion when using a full covariance structure and upper bounding the number of clusters by the number of samples/patients.

The row-normalized co-occurrence matrix that visualizes the conditional distributions for each sample/patient is shown below.

In [None]:
# tSNE scatter of the control nuclei only, coloured by biological sample.
# NOTE(review): `sample_order[:10]` presumably selects the ten control samples
# (their IDs sorting before the cancer IDs) - confirm against the sample names.
fig, ax = plt.subplots(figsize=[9, 6])
ax = sns.scatterplot(
    data=chrometric_embs.loc[chrometric_embs.condition == "Control"],
    x="tSNE 1",
    y="tSNE 2",
    hue="sample",
    hue_order=sample_order[:10],
    ax=ax,
    s=12,
    marker="o",
    palette=sns.color_palette("tab20", 20)[:10],
)
# Single-row legend centred just above the axes.
plt.legend(
    bbox_to_anchor=(0.5, 1.05),
    loc="center",
    borderaxespad=0,
    title="Sample",
    ncol=10,
    fancybox=False,
    frameon=False,
    columnspacing=0.4,
)

plt.show()

In [None]:
# PCA scatter of the control nuclei, coloured by biological sample.
fig, ax = plt.subplots(figsize=[6, 6])
ax = sns.scatterplot(
    # Select the rows with pca_embs' own condition column; the mask used to be
    # taken from the tSNE frame, which only works if both frames happen to
    # share the same index.
    data=pca_embs.loc[pca_embs.condition == "Control"],
    x="PC 1",
    y="PC 2",
    hue="sample",
    hue_order=sample_order[:10],
    ax=ax,
    s=5,
    marker="o",
    palette=sns.color_palette("tab20", 20)[:10],
)
plt.legend(
    bbox_to_anchor=(1.02, 0.5), loc="center left", borderaxespad=0, title="sample"
)
# Format the explained variance as a percentage with two decimals; rounding
# first and multiplying by 100 afterwards can print artefacts such as
# 23.449999999999996%.
ax.set_xlabel("PC 1 ({:.2f}%)".format(pca.explained_variance_ratio_[0] * 100))
ax.set_ylabel("PC 2 ({:.2f}%)".format(pca.explained_variance_ratio_[1] * 100))
# Fixed limits; nuclei outside this window are not drawn.
ax.set_xlim([-6, 10])
ax.set_ylim([-10, 10])

plt.show()

In [None]:
# Count the control nuclei that fall outside the fixed axis limits of the PCA
# plot (x in [-6, 10], y in [-10, 10]).
tmp = pca_embs.loc[pca_embs.condition == "Control"]
# Combine the four conditions with OR so that a nucleus lying outside both
# axis ranges is counted once, not twice.
outside_limits = (
    (tmp.loc[:, "PC 1"] > 10)
    | (tmp.loc[:, "PC 1"] < -6)
    | (tmp.loc[:, "PC 2"] > 10)
    | (tmp.loc[:, "PC 2"] < -10)
)
not_plotted = int(outside_limits.sum())
# The limits belong to the PCA scatter plot, so the message names that plot.
print(
    "{} nuclei of a total {} are not shown in the PCA plot.".format(
        not_plotted, len(tmp)
    )
)

In [None]:
# Heatmap of the per-sample cluster matrix of the control nuclei, rows ordered
# like the first ten entries of `sample_order`.
# NOTE(review): per the accompanying text the helper fits a Gaussian mixture
# and returns row-normalised cluster frequencies - confirm in
# get_batch_gmm_cluster_count_mtx.
fig, ax = plt.subplots(figsize=[9, 6])
sample_cluster_count_mtx = get_batch_gmm_cluster_count_mtx(
    control_chrometric_feats, control_sample_labels
).loc[sample_order[:10]]
ax = sns.heatmap(sample_cluster_count_mtx, annot=True, fmt=".2f", ax=ax, cmap="viridis")
plt.show()

#### Cancer patients

In [None]:
# Index labels of every nucleus that is not a control, shared by the feature
# and label subsets.
non_control_index = cond_labels.loc[cond_labels != "Control"].index
cancer_chrometric_feats = chrometric_data.loc[non_control_index]
cancer_sample_labels = sample_labels.loc[non_control_index]

In [None]:
# tSNE scatter of the cancer nuclei only, coloured by biological sample.
# NOTE(review): `sample_order[10:]` presumably selects the cancer samples -
# confirm against the sample names.
fig, ax = plt.subplots(figsize=[9, 6])
ax = sns.scatterplot(
    data=chrometric_embs.loc[chrometric_embs.condition == "Cancer"],
    x="tSNE 1",
    y="tSNE 2",
    hue="sample",
    hue_order=sample_order[10:],
    ax=ax,
    s=12,
    marker="o",
    palette=sns.color_palette("tab20", 20)[10:],
)
# Single-row legend centred just above the axes.
plt.legend(
    bbox_to_anchor=(0.5, 1.05),
    loc="center",
    borderaxespad=0,
    title="Sample",
    ncol=10,
    fancybox=False,
    frameon=False,
    columnspacing=0.4,
)
plt.show()

In [None]:
# PCA scatter of the cancer nuclei, coloured by biological sample.
fig, ax = plt.subplots(figsize=[6, 6])
ax = sns.scatterplot(
    data=pca_embs.loc[pca_embs.condition == "Cancer"],
    x="PC 1",
    y="PC 2",
    hue="sample",
    hue_order=sample_order[10:],
    ax=ax,
    s=5,
    marker="o",
    palette=sns.color_palette("tab20", 20)[10:],
)
plt.legend(
    bbox_to_anchor=(1.02, 0.5), loc="center left", borderaxespad=0, title="sample"
)
# Format the explained variance as a percentage with two decimals; rounding
# first and multiplying by 100 afterwards can print artefacts such as
# 23.449999999999996%.
ax.set_xlabel("PC 1 ({:.2f}%)".format(pca.explained_variance_ratio_[0] * 100))
ax.set_ylabel("PC 2 ({:.2f}%)".format(pca.explained_variance_ratio_[1] * 100))
# Fixed limits; nuclei outside this window are not drawn.
ax.set_xlim([-6, 10])
ax.set_ylim([-10, 10])
plt.show()

In [None]:
# Count the cancer nuclei that fall outside the fixed axis limits of the PCA
# plot (x in [-6, 10], y in [-10, 10]).
tmp = pca_embs.loc[pca_embs.condition == "Cancer"]
# Combine the four conditions with OR so that a nucleus lying outside both
# axis ranges is counted once, not twice.
outside_limits = (
    (tmp.loc[:, "PC 1"] > 10)
    | (tmp.loc[:, "PC 1"] < -6)
    | (tmp.loc[:, "PC 2"] > 10)
    | (tmp.loc[:, "PC 2"] < -10)
)
not_plotted = int(outside_limits.sum())
# The limits belong to the PCA scatter plot, so the message names that plot.
print(
    "{} nuclei of a total {} are not shown in the PCA plot.".format(
        not_plotted, len(tmp)
    )
)

In [None]:
# Heatmap of the per-sample cluster matrix of the cancer nuclei, rows ordered
# like the entries of `sample_order` from position 10 onwards.
# NOTE(review): this overwrites the `sample_cluster_count_mtx` computed for the
# controls in an earlier cell.
fig, ax = plt.subplots(figsize=[9, 6])
sample_cluster_count_mtx = get_batch_gmm_cluster_count_mtx(
    cancer_chrometric_feats, cancer_sample_labels
).loc[sample_order[10:]]

ax = sns.heatmap(sample_cluster_count_mtx, annot=True, fmt=".2f", ax=ax, cmap="viridis")

plt.show()

---

#### Classification of the control resp. cancer PBMCs

To quantify the separability of the two conditions we perform a 10-fold stratified cross-validation analysis using a RandomForest classifier. The classifier provides a simple non-linear classification model which also yields an importance measure for the individual chrometric features indicating which ones are most different between the two populations.

In [None]:
# Random forest used for all classification analyses below: 500 trees, seeded
# with the notebook seed, class weights balanced, trained on 10 parallel jobs.
rfc = RandomForestClassifier(
    n_estimators=500, n_jobs=10, random_state=seed, class_weight="balanced"
)

##### Nuclei split

At first we will split the data randomly on a nuclei-basis, i.e. nuclei of the same biological sample will be likely included in both the training and the test sets.

In [None]:
# Cross-validated confusion matrix of the random forest with a nuclei-level
# split (no `groups` passed), 10 folds, unscaled features.
cond_cv_conf_mtx_nuclei = get_cv_conf_mtx(
    estimator=rfc,
    features=chrometric_data,
    labels=cond_labels,
    scale_features=False,
    n_folds=10,
    order=cond_order,
)
# Divide each row by its row sum.
# NOTE(review): rows presumably hold the true condition (the heatmap below
# labels the y axis that way) - confirm the orientation in get_cv_conf_mtx.
normalized_cv_conf_mtx_nuclei = cond_cv_conf_mtx_nuclei.divide(
    cond_cv_conf_mtx_nuclei.sum(axis=1), axis=0
)

In [None]:
# Heatmap of the row-normalised confusion matrix (nuclei split, random
# forest); the colour scale is fixed to [0, 1].
fig, ax = plt.subplots(figsize=[5, 4])
ax = sns.heatmap(
    normalized_cv_conf_mtx_nuclei,
    annot=True,
    fmt=".4f",
    cmap="viridis",
    vmin=0,
    vmax=1,
    # cbar=False,
    annot_kws={"size": 16, "fontweight": "bold"},
)
ax.set_xlabel("Predicted condition")
ax.set_ylabel("True condition")
plt.show()

The classifier is able to accurately distinguish nuclei from the control and cancer population. This suggests that there are large differences with respect to the chrometric phenotypes of PBMCs in the absence compared to in the presence of cancer. While the tSNE plot already suggested that this is the case, a linear discriminant analysis shows that using a linear combination of the chrometric features the two classes can be accurately distinguished.

In [None]:
# Linear discriminant analysis with a single discriminant axis.
lda = LinearDiscriminantAnalysis(n_components=1)

In [None]:
# Same nuclei-level 10-fold evaluation as for the random forest, here with the
# LDA model and scale_features=True.
lda_cond_cv_conf_mtx_nuclei = get_cv_conf_mtx(
    estimator=lda,
    features=chrometric_data,
    labels=cond_labels,
    scale_features=True,
    n_folds=10,
    order=cond_order,
)
# Divide each row by its row sum.
lda_normalized_cv_conf_mtx_nuclei = lda_cond_cv_conf_mtx_nuclei.divide(
    lda_cond_cv_conf_mtx_nuclei.sum(axis=1), axis=0
)

In [None]:
# Heatmap of the row-normalised confusion matrix (nuclei split, LDA).
fig, ax = plt.subplots(figsize=[5, 4])
ax = sns.heatmap(
    lda_normalized_cv_conf_mtx_nuclei,
    annot=True,
    fmt=".4f",
    cmap="viridis",
    vmin=0,
    vmax=1,
    # cbar=False,
)
ax.set_xlabel("Predicted condition")
ax.set_ylabel("True condition")

plt.show()

In [None]:
# Fit the LDA on the complete data set and project every nucleus onto the
# single discriminant axis. NOTE: this fits the shared `lda` object on all
# data (no hold-out) and, unlike the cross-validation above, on unscaled
# features.
lda_transformed = pd.DataFrame(
    lda.fit(chrometric_data, cond_labels).transform(chrometric_data),
    columns=["LD 1"],
    index=chrometric_data.index,
)
lda_transformed["condition"] = np.array(cond_labels)
lda_transformed["sample"] = np.array(sample_labels)
# Histogram with a KDE overlay of the LD 1 scores for each condition.
fig, ax = plt.subplots(figsize=[9, 6])
ax = sns.histplot(
    data=lda_transformed,
    x="LD 1",
    hue="condition",
    hue_order=cond_order,
    palette=color_palette,
    kde=True,
    legend=False,
)

---
#### Patient split


While the previous analysis assesses the level of differences of the chrometric phenotypes of the PBMCs between the control and cancer population, the classifier can make use of patient-specific characteristics during the classification. In a diagnostic use case such information would not be available. To evaluate how well a classifier would be able to predict the condition of an unseen patient simply based on the chrometric phenotypes of the PBMCs, we also assess the class separability using a stratified 10-fold patient-cross-validation approach. Thereby at each iteration 2 of the patients of each condition (healthy control and cancer) are held out for the test set.

In [None]:
# Patient-level evaluation: the biological sample is passed as `groups` so
# that the split is made by sample rather than by nucleus.
# NOTE(review): n_folds=20 presumably equals the number of biological samples,
# which would hold out a single patient per fold; the accompanying text
# describes a 10-fold scheme - confirm which one is intended.
cond_cv_conf_mtx_patient = get_cv_conf_mtx(
    estimator=rfc,
    features=chrometric_data,
    labels=cond_labels,
    groups=sample_labels,
    scale_features=False,
    n_folds=20,
    order=cond_order,
    balance_train=True,
)
# Divide each row by its row sum.
normalized_cv_conf_mtx_patient = cond_cv_conf_mtx_patient.divide(
    cond_cv_conf_mtx_patient.sum(axis=1), axis=0
)

In [None]:
# Heatmap of the row-normalised confusion matrix (patient split, random
# forest).
fig, ax = plt.subplots(figsize=[5, 4])
ax = sns.heatmap(
    normalized_cv_conf_mtx_patient,
    annot=True,
    fmt=".4f",
    cmap="viridis",
    vmin=0,
    vmax=1,
    # cbar=False,
    annot_kws={"size": 20, "weight": "bold"},
)
ax.set_xlabel("Predicted condition")
ax.set_ylabel("True condition")
plt.show()

In [None]:
# Balanced accuracy per fold of a 20-split stratified group k-fold, grouped by
# biological sample; mean and standard deviation over the folds are printed.
# `groupkfold` is reused by the sensitivity/specificity cells below.
groupkfold = StratifiedGroupKFold(n_splits=20)
cv_bacs = cross_val_score(
    rfc,
    cv=groupkfold,
    X=chrometric_data,
    y=cond_labels,
    groups=sample_labels,
    scoring="balanced_accuracy",
    n_jobs=10,
)
print("Balanced accuracy: {} (+/- {})".format(np.mean(cv_bacs), np.std(cv_bacs)))

In [None]:
from sklearn.metrics import make_scorer, confusion_matrix


def sensitivity_scorer(y_true, y_pred, pos_label="Cancer"):
    """Return the sensitivity (true-positive rate) with respect to ``pos_label``.

    The positive class is named explicitly. Unpacking
    ``confusion_matrix(...).ravel()`` relies on the sorted label order, which
    for the string labels "Cancer"/"Control" makes "Control" the positive
    class, and it fails when only one class is present in a fold.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels of equal length.
    pos_label : label, default "Cancer"
        Label regarded as the positive class.

    Returns
    -------
    float
        TP / (TP + FN), or 0 when ``y_true`` contains no positive sample.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    is_positive = y_true == pos_label
    n_positive = int(np.sum(is_positive))
    if n_positive == 0:
        # Undefined for this fold; 0 is kept as the sentinel callers expect.
        return 0
    tp = int(np.sum(is_positive & (y_pred == pos_label)))
    return tp / n_positive


def specificity_scorer(y_true, y_pred, pos_label="Cancer"):
    """Return the specificity (true-negative rate) with respect to ``pos_label``.

    Every label other than ``pos_label`` counts as negative.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels of equal length.
    pos_label : label, default "Cancer"
        Label regarded as the positive class.

    Returns
    -------
    float
        TN / (TN + FP), or 0 when ``y_true`` contains no negative sample.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    is_negative = y_true != pos_label
    n_negative = int(np.sum(is_negative))
    if n_negative == 0:
        # Undefined for this fold; 0 is kept as the sentinel callers expect.
        return 0
    tn = int(np.sum(is_negative & (y_pred != pos_label)))
    return tn / n_negative


# Wrap the metric functions with make_scorer so they can be passed as
# `scoring=` to cross_val_score.
sensitivity = make_scorer(sensitivity_scorer)
specificity = make_scorer(specificity_scorer)

In [None]:
# Per-fold value of the `sensitivity` scorer under the same grouped splitter.
cv_sens = cross_val_score(
    rfc,
    cv=groupkfold,
    X=chrometric_data,
    y=cond_labels,
    groups=sample_labels,
    scoring=sensitivity,
    n_jobs=10,
)
# NOTE(review): only folds with a score > 0 enter the mean/std - presumably to
# skip folds in which the metric is undefined (the scorer returns 0 there),
# but a fold with a genuine score of 0 is discarded as well - confirm.
print("Sensitivity:", np.mean(cv_sens[cv_sens > 0]), np.std(cv_sens[cv_sens > 0]))

In [None]:
# Per-fold value of the `specificity` scorer under the same grouped splitter.
cv_specs = cross_val_score(
    rfc,
    cv=groupkfold,
    X=chrometric_data,
    y=cond_labels,
    groups=sample_labels,
    scoring=specificity,
    n_jobs=10,
)
# NOTE(review): only folds with a score > 0 enter the mean/std - presumably to
# skip folds in which the metric is undefined (the scorer returns 0 there),
# but a fold with a genuine score of 0 is discarded as well - confirm.
print("Specificity:", np.mean(cv_specs[cv_specs > 0]), np.std(cv_specs[cv_specs > 0]))

---

#### Ablation study

In [None]:
# Nuclei ablation study with one fold per distinct sample label and 10
# repeats. The result is plotted below via its "frac_nuclei" and
# "lopo_accuracy" columns.
# NOTE(review): presumably the fraction of training nuclei is varied - confirm
# in run_nuclei_ablation_study_cv.
nc_abl_results = run_nuclei_ablation_study_cv(
    estimator=rfc,
    features=chrometric_data,
    labels=cond_labels,
    groups=sample_labels,
    n_repeats=10,
    balance_train=True,
    scale_features=True,
    n_folds=len(set(sample_labels)),
    random_state=1234,
)

In [None]:
# Round the fractions to two decimals so they form clean categorical x ticks
# (this overwrites the column in the result frame).
nc_abl_results.frac_nuclei = np.round(nc_abl_results.frac_nuclei, 2)
# Point plot of the accuracy per fraction with standard-error bars.
g = sns.catplot(
    data=nc_abl_results,
    x="frac_nuclei",
    y="lopo_accuracy",
    kind="point",
    errorbar="se",
    capsize=0.2,
    height=4,
    aspect=1.5,
)
# Axis labels are blanked and the y range is fixed to 0.68-0.78.
g.set_xlabels("")
g.set_ylabels("")
g.set(ylim=(0.68, 0.78))
# g.set_xlabels("Fraction of nuclei\n(training set)")
# g.set_ylabels("Average accuracy\n(leave one patient out)")
# g.set_titles("Control vs. Cancer")

In [None]:
# Patient ablation study with the same settings as the nuclei ablation. The
# result is plotted below via its "n_train_patients" and "lopo_accuracy"
# columns.
# NOTE(review): presumably the number of training patients is varied - confirm
# in run_patient_ablation_study_cv.
pt_abl_results = run_patient_ablation_study_cv(
    estimator=rfc,
    features=chrometric_data,
    labels=cond_labels,
    groups=sample_labels,
    n_repeats=10,
    balance_train=True,
    scale_features=True,
    n_folds=len(set(sample_labels)),
    random_state=1234,
)

In [None]:
# Summary statistics of the ablation results per number of training patients.
pt_abl_results.groupby("n_train_patients").describe()

In [None]:
# Point plot of the accuracy per number of training patients with
# standard-error bars; same fixed y range as the nuclei ablation plot.
g = sns.catplot(
    data=pt_abl_results,
    x="n_train_patients",
    y="lopo_accuracy",
    kind="point",
    errorbar="se",
    capsize=0.2,
    height=4,
    aspect=1.5,
)
g.set_xlabels("")
g.set_ylabels("")
g.set(ylim=(0.68, 0.78))
# g.set_xlabels("Number of patients\n(training set)")
# g.set_ylabels("Average accuracy\n(leave one patient out)")
# g.set_titles("Control vs. Cancer")

---

In [None]:
# Patient-level evaluation of the LDA model with the same arguments as the
# random-forest run above.
# NOTE(review): features are not scaled here (scale_features=False), whereas
# the nuclei-level LDA run used scale_features=True - confirm this is intended.
cond_cv_conf_mtx_patient_lda = get_cv_conf_mtx(
    estimator=lda,
    features=chrometric_data,
    labels=cond_labels,
    groups=sample_labels,
    scale_features=False,
    n_folds=20,
    order=cond_order,
    balance_train=True,
)
# Divide each row by its row sum.
normalized_cv_conf_mtx_patient_lda = cond_cv_conf_mtx_patient_lda.divide(
    cond_cv_conf_mtx_patient_lda.sum(axis=1), axis=0
)

In [None]:
# Heatmap of the row-normalised confusion matrix (patient split, LDA).
fig, ax = plt.subplots(figsize=[5, 4])
ax = sns.heatmap(
    normalized_cv_conf_mtx_patient_lda,
    annot=True,
    fmt=".4f",
    cmap="viridis",
    vmin=0,
    vmax=1,
    # cbar=False,
    annot_kws={"size": 20, "weight": "bold"},
)
ax.set_xlabel("Predicted condition")
ax.set_ylabel("True condition")
plt.show()

In [None]:
# Balanced accuracy per fold for the LDA model, using a fresh 20-split
# stratified group k-fold grouped by biological sample.
groupkfold = StratifiedGroupKFold(n_splits=20)
cv_bacs_lda = cross_val_score(
    lda,
    cv=groupkfold,
    X=chrometric_data,
    y=cond_labels,
    groups=sample_labels,
    scoring="balanced_accuracy",
    n_jobs=10,
)
print(
    "Balanced accuracy: {} (+/- {})".format(np.mean(cv_bacs_lda), np.std(cv_bacs_lda))
)

---

##### Leave-one-patient out cross-validation

In addition to the previous study, we run a leave-one-patient-out cross-validation in order to characterize how the individual patients contribute to the separability of the cancer and the control population. In particular, we are interested in patients whose representative PBMC population is classified particularly accurately or inaccurately when the classifier is trained on the data of all other patients. Note that to avoid class imbalance, at each iteration where we leave out a patient with a specific cancer type, we take a balanced random subsample among the PBMC population of all other patients for training such that each cancer type is equally represented.

In [None]:
# Per-group (patient) cross-validation summary for the random forest. Later
# cells read the columns "group", "score", "majority_class" and
# "majority_predicted_class" from the result.
lopo_cv_result = summarize_group_cv_results_by_fold(
    model=rfc,
    features=chrometric_data,
    labels=cond_labels,
    groups=sample_labels,
    balance_train=True,
)

In [None]:
# Patient-level confusion matrix: rows are the true majority class of a
# held-out patient, columns the majority class predicted for that patient.
tumor_types = ["Control", "Cancer"]
lopo_patient_cv_mtx = pd.DataFrame(
    np.zeros((2, 2)), index=tumor_types, columns=tumor_types
)
for c in tumor_types:
    has_true_class = lopo_cv_result.majority_class == c
    for p in tumor_types:
        has_predicted_class = lopo_cv_result.majority_predicted_class == p
        # Number of patients with true class `c` and predicted class `p`.
        lopo_patient_cv_mtx.loc[c, p] = (has_true_class & has_predicted_class).sum()

# Row-normalise so that every row sums to one.
row_totals = lopo_patient_cv_mtx.sum(axis=1)
normalized_lopo_patient_cv_mtx = lopo_patient_cv_mtx.divide(row_totals, axis=0)

In [None]:
# Relabel the "Cancer" row/column as "Tumor" for display (this modifies the
# matrix in place), then draw it as a heatmap.
normalized_lopo_patient_cv_mtx.index = ["Control", "Tumor"]
normalized_lopo_patient_cv_mtx.columns = ["Control", "Tumor"]
fig, ax = plt.subplots(figsize=[5, 4])
ax = sns.heatmap(
    normalized_lopo_patient_cv_mtx,
    annot=True,
    fmt=".2f",
    cmap="viridis",
    vmin=0,
    vmax=1,
    annot_kws={"size": 16, "weight": "bold"},
    # cbar=False,
)
ax.set_xlabel("Predicted cancer type")
ax.set_ylabel("True cancer type")
plt.show()

In [None]:
# Summary statistics of the per-patient score, split by the true majority class.
lopo_cv_result.groupby("majority_class").score.describe()

In [None]:
# Condition of every held-out group, derived from the group ID prefix:
# IDs starting with "p" are cancer patients, everything else is a control.
lopo_cv_result["label"] = np.where(
    lopo_cv_result.group.str.startswith("p"), "Cancer", "Control"
)

# `score` is kept as is, except for groups whose ID starts with "h", where it
# is replaced by its complement 1 - score.
is_h_group = lopo_cv_result.group.str.startswith("h")
lopo_cv_result["prop_predict_cancer"] = lopo_cv_result.score.where(
    ~is_h_group, 1 - lopo_cv_result.score
)
lopo_cv_result.describe()

In [None]:
# Patient-level accuracy as a function of the cutoff on `prop_predict_cancer`:
# a patient is called "Cancer" when the value exceeds the cutoff and "Control"
# otherwise. All 10000 cutoffs (0, 0.0001, ..., 0.9999) are evaluated in one
# vectorised comparison instead of a nested Python loop with row-wise .iloc
# lookups (whose inner loop also shadowed the outer loop variable).
prop_cutoffs = np.arange(0, 10000) / 10000
patient_props = lopo_cv_result["prop_predict_cancer"].to_numpy()
patient_is_cancer = (lopo_cv_result["label"] == "Cancer").to_numpy()

# Boolean matrix with one row per cutoff and one column per patient.
predicted_cancer = patient_props[np.newaxis, :] > prop_cutoffs[:, np.newaxis]
n_correct = (predicted_cancer == patient_is_cancer[np.newaxis, :]).sum(axis=1)

patient_level_cv_results = pd.DataFrame(
    {"prop_cancer_cutoff": prop_cutoffs, "n_correct": n_correct}
)
patient_level_cv_results["accuracy"] = np.array(
    patient_level_cv_results.n_correct
) / len(lopo_cv_result)

In [None]:
# One point per held-out patient: `prop_predict_cancer` by true condition.
ax = sns.stripplot(data=lopo_cv_result, x="label", y="prop_predict_cancer")

In [None]:
# Patient-level accuracy as a function of the cutoff.
fig, ax = plt.subplots(figsize=[6, 4])
ax = sns.lineplot(data=patient_level_cv_results, x="prop_cancer_cutoff", y="accuracy")
ax.set_ylim([0.4, 1.05])
ax.set_xlim([0, 1])
ax.set_ylabel("Classification accuracy")
ax.set_xlabel('Threshold for the fraction of "tumor-like" PBMCs')

# Cutoffs that reach the maximal accuracy; the shaded band spans from the
# smallest to the largest of them.
best_cutoffs = patient_level_cv_results.loc[
    patient_level_cv_results.accuracy == np.max(patient_level_cv_results.accuracy),
    "prop_cancer_cutoff",
]
plt.axvspan(
    np.min(best_cutoffs),
    np.max(best_cutoffs),
    color="gold",
    alpha=0.5,
    label="Perfect classification\n accuracy",
)

In [None]:
# NOTE(review): this cell draws the same threshold-vs-accuracy figure as the
# preceding cell - presumably an accidental duplicate; confirm before removing.
fig, ax = plt.subplots(figsize=[6, 4])
ax = sns.lineplot(data=patient_level_cv_results, x="prop_cancer_cutoff", y="accuracy")
ax.set_ylim([0.4, 1.05])
ax.set_xlim([0, 1])
ax.set_ylabel("Classification accuracy")
ax.set_xlabel('Threshold for the fraction of "tumor-like" PBMCs')
# Shade the range between the smallest and the largest cutoff that reach the
# maximal accuracy.
plt.axvspan(
    np.min(
        patient_level_cv_results.loc[
            patient_level_cv_results.accuracy
            == np.max(patient_level_cv_results.accuracy),
            "prop_cancer_cutoff",
        ]
    ),
    np.max(
        patient_level_cv_results.loc[
            patient_level_cv_results.accuracy
            == np.max(patient_level_cv_results.accuracy),
            "prop_cancer_cutoff",
        ]
    ),
    color="gold",
    alpha=0.5,
    label="Perfect classification\n accuracy",
)

In [None]:
# Same per-group cross-validation summary as for the random forest, computed
# with the LDA model.
lopo_cv_result_lda = summarize_group_cv_results_by_fold(
    model=lda,
    features=chrometric_data,
    labels=cond_labels,
    groups=sample_labels,
    balance_train=True,
)

In [None]:
# Summary statistics of the numeric columns of the LDA result.
lopo_cv_result_lda.describe()

To compare the performance to a random baseline, and thus assess whether the classification performance is significantly better than random chance, we repeat the procedure 10 times, each time randomly permuting the condition labels of the individual patients beforehand.

In [None]:
# Permutation baseline: repeat the per-group cross-validation 10 times with
# permuted condition labels. The global NumPy RNG is re-seeded first.
# NOTE(review): `tqdm` is not imported explicitly in this notebook; it
# presumably arrives through one of the star imports - confirm.
np.random.seed(seed)
bs = range(10)

lopo_perm_cv_results = []

for b in tqdm(bs):
    # Only the first element returned by the helper is used.
    # NOTE(review): presumably labels are permuted per sample (group) rather
    # than per nucleus - confirm in get_permute_group_labels.
    perm_cond_labels = get_permute_group_labels(cond_labels, sample_labels)[0]
    lopo_perm_cv_result = summarize_group_cv_results_by_fold(
        model=rfc,
        features=chrometric_data,
        labels=perm_cond_labels,
        groups=sample_labels,
        balance_train=True,
    )
    # Tag every row with the index of its permutation run.
    lopo_perm_cv_result["permutation"] = b
    lopo_perm_cv_results.append(lopo_perm_cv_result)
# Stack the per-run frames into a single frame.
lopo_perm_cv_results = pd.concat(lopo_perm_cv_results)

In [None]:
# Tag the permuted and the observed results, then stack them into one frame.
lopo_perm_cv_results["condition"] = "Permuted"
lopo_cv_result["condition"] = "Observed"
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps the original row indices, as append did by default.
all_lopo_results = pd.concat([lopo_cv_result, lopo_perm_cv_results])

We will now jointly plot the performance measured by the (balanced) accuracy score for each sample and thereby distinguish between the scores obtained with and without permuting the condition labels.

In [None]:
# Per-patient score by true class, observed vs. permuted labels; the helper is
# asked for a Mann-Whitney test with star-style p-value annotation.
fig, ax = plot_lopo_cv_results_by_class(
    all_lopo_results,
    cond_order,
    x="majority_class",
    y="score",
    hue="condition",
    figsize=[6, 4],
    test="Mann-Whitney",
    pval_text_format="star",
    alpha=0.5,
)
# ax.set_xlabel("Condition")
# ax.set_ylabel("Classification accuracy by patient")
# Axis labels are blanked.
ax.set_xlabel("")
ax.set_ylabel("")
plt.show()

The above plot validates that the performance in both conditions is significantly higher than what we expect by random chance. However, we also notice that there is one healthy control and one cancer patient for which just roughly 50% of all PBMCs are correctly classified by the Random Forest model. This is also shown in the bar plot below.

In [None]:
# Bar plot of the score of every held-out patient, sorted by ascending score
# and coloured by the patient's true majority class.
fig, ax = plt.subplots(figsize=[6, 4])
# One colour per row of the result, looked up from the condition palette.
sample_colors = [
    color_palette[k] for k in list(lopo_cv_result.loc[:, "majority_class"])
]
# Group ID -> colour.
sample_palette = dict(zip(list(lopo_cv_result.loc[:, "group"]), sample_colors))
ax = sns.barplot(
    data=lopo_cv_result,
    x="group",
    y="score",
    palette=sample_palette,
    order=list(lopo_cv_result.sort_values("score").loc[:, "group"]),
)
plt.xticks(rotation=90)
ax.set_xlabel("Patient sample")
ax.set_ylabel("Classification accuracy")
plt.show()

In [None]:
# Mean per-patient score.
lopo_cv_result.score.mean()

In [None]:
# Standard deviation of the per-patient score (pandas default, ddof=1).
lopo_cv_result.score.std()

The two patients in question are healthy control HV2 and cancer patient P19. Importantly, for those samples the prediction probability of the true condition is fairly close to 50%, emphasizing the difficulty of the classifier to identify the condition of those two samples.

In [None]:
# The two samples with the lowest score.
lopo_cv_result.sort_values(by="score").head(n=2)

Finally, we will plot the overall performance of the leave one out cross-validation approach against the random background which we obtained by permuting the condition labels. Note that we color individual points corresponding to individual samples based on the average prediction performance of the actual condition.

In [None]:
# Overall observed vs. permuted performance of the leave-one-patient-out CV.
fig, ax = plot_lopo_cv_results(
    data=all_lopo_results,
    cbar_label="Prediction probability \n of the true condition",
    alpha=0.7,
)
ax.set(xlabel="", ylabel="Accuracy by LoPo CV fold")
plt.show()

---

#### Effect of chemotherapy

In the previous analyses all 10 cancer patients were used. However, five of those patients, namely P11, P13, P14, P16 and P17, underwent chemotherapy prior to the proton therapy treatment or, in the case of P16, started chemotherapy in addition to the proton therapy treatment.

We will now check if there exist strong differences between these two cancer populations, which could confound the analyses.

In [None]:
# Samples of patients that (also) received chemotherapy; every other nucleus
# is labelled "No treatment". The label series shares the index of cond_labels.
ct_patients = ["p11", "p13", "p14", "p16", "p17"]
additional_therapy_labels = pd.Series(
    np.where(sample_labels.isin(ct_patients), "Chemotherapy", "No treatment"),
    index=cond_labels.index,
)

In [None]:
# Restrict the features and both label series to the nuclei of cancer patients.
is_cancer = cond_labels == "Cancer"
cancer_chrometric_data = chrometric_data.loc[is_cancer]
cancer_index = cancer_chrometric_data.index
therapy_labels = additional_therapy_labels.loc[cancer_index]
cancer_sample_labels = sample_labels.loc[cancer_index]

In [None]:
# Cross-validated confusion matrix for the therapy labels, with the patient
# sample passed as the grouping variable.
cond_cv_conf_mtx_patient_ct = get_cv_conf_mtx(
    estimator=rfc,
    features=cancer_chrometric_data,
    labels=therapy_labels,
    groups=cancer_sample_labels,
    n_folds=5,
    order=["No treatment", "Chemotherapy"],
    scale_features=False,
)
# Divide every row by its total so that each row sums to one.
row_totals = cond_cv_conf_mtx_patient_ct.sum(axis=1)
normalized_cv_conf_mtx_patient_ct = cond_cv_conf_mtx_patient_ct.div(
    row_totals, axis=0
)

In [None]:
# Heatmap of the row-normalized confusion matrix on a fixed [0, 1] color scale.
fig, ax = plt.subplots(figsize=[5, 4])
sns.heatmap(
    normalized_cv_conf_mtx_patient_ct,
    ax=ax,
    cmap="viridis",
    vmin=0,
    vmax=1,
    annot=True,
    fmt=".4f",
    annot_kws={"size": 20, "weight": "bold"},
)
ax.set(xlabel="Predicted condition", ylabel="True condition")

plt.show()

In [None]:
# Feature importance of the classifier for the therapy labels within the
# cancer population.
# NOTE(review): `n_features=15` presumably limits the plot to the 15 most
# important features — confirm in the helper's definition.
fig, ax = plot_feature_importance_for_estimator(
    rfc,
    features=cancer_chrometric_data,
    labels=therapy_labels,
    scale_features=False,
    cmap=["gray"],
    figsize=[2, 1],
    feature_color_dict=feature_color_dict,
    n_features=15,
)

In [None]:
# Per-fold CV summary for the therapy labels, grouped by patient sample.
# NOTE(review): this rebinds `lopo_cv_result`, which previously held the
# control-vs-cancer results; re-running earlier cells after this one will pick
# up the therapy results instead. Consider a dedicated name.
lopo_cv_result = summarize_group_cv_results_by_fold(
    model=rfc,
    features=cancer_chrometric_data,
    labels=therapy_labels,
    groups=cancer_sample_labels,
    balance_train=True,
)

In [None]:
# Random background: repeat the grouped CV with permuted therapy labels.
np.random.seed(seed)  # re-seed so that the permutations are reproducible
bs = range(10)

perm_frames = []
for b in tqdm(bs):
    # The helper returns a sequence; only its first element is used here.
    perm_therapy_labels = get_permute_group_labels(
        therapy_labels, cancer_sample_labels
    )[0]
    lopo_perm_cv_result = summarize_group_cv_results_by_fold(
        model=rfc,
        features=cancer_chrometric_data,
        labels=perm_therapy_labels,
        groups=cancer_sample_labels,
        balance_train=True,
    ).assign(permutation=b)
    perm_frames.append(lopo_perm_cv_result)

lopo_perm_cv_results = pd.concat(perm_frames)

In [None]:
# Tag each result table with its origin so that observed and permuted scores
# can still be told apart once they are stacked into a single frame.
lopo_perm_cv_results["condition"] = "Permuted"
lopo_cv_result["condition"] = "Observed"
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the rows identically (original indices kept, columns aligned by name).
all_lopo_results = pd.concat([lopo_cv_result, lopo_perm_cv_results])

We will now jointly plot the performance, measured by the (balanced) accuracy score, for each sample and thereby distinguish between the scores obtained with and without permuting the therapy labels.

In [None]:
# Observed vs. permuted per-sample scores, split by therapy group.
therapy_order = ["No treatment", "Chemotherapy"]
fig, ax = plot_lopo_cv_results_by_class(
    all_lopo_results,
    therapy_order,
    hue="condition",
    x="majority_class",
    y="score",
    test="Mann-Whitney",
    pval_text_format="star",
    alpha=0.5,
    figsize=[6, 4],
)
# Axis labels are deliberately blanked for the figure panel.
ax.set(xlabel="", ylabel="")
plt.show()

In [None]:
# 5-fold stratified CV in which all nuclei of one patient sample stay in the
# same fold; performance is reported as balanced accuracy.
groupkfold = StratifiedGroupKFold(n_splits=5)
cv_bacs = cross_val_score(
    estimator=rfc,
    X=cancer_chrometric_data,
    y=therapy_labels,
    groups=cancer_sample_labels,
    cv=groupkfold,
    scoring="balanced_accuracy",
    n_jobs=5,
)
mean_bac, std_bac = np.mean(cv_bacs), np.std(cv_bacs)
print("Balanced accuracy: {} (+/- {})".format(mean_bac, std_bac))

---

### 3c. Nuclear chromatin biomarkers identifying cancer populations


#### Feature importance

After having validated that there are significant differences between the control and the cancer population with respect to the chrometric phenotypes of the PBMCs, we next assess the implicit feature importance of a Random Forest classifier trained to distinguish between control and cancer samples, in order to get an idea of the features which are most indicative of the cancer condition.

In [None]:
# Feature importance of the classifier for the control-vs-cancer labels on the
# full chrometric feature set.
# NOTE(review): `n_features=15` presumably limits the plot to the 15 most
# important features — confirm in the helper's definition.
fig, ax = plot_feature_importance_for_estimator(
    rfc,
    chrometric_data,
    cond_labels,
    scale_features=False,
    cmap=["gray"],
    figsize=[2, 1],
    feature_color_dict=feature_color_dict,
    n_features=15,
)

The analysis suggests that the heterochromatin content, the size of the nucleus, its shape as well as the intensity distribution of the DNA inside the nucleus are altered in the cancer population.

---

#### Marker screen

While the previously shown feature importance plots already suggest a number of candidate chrometric biomarkers that capture the differences of the nuclear phenotypes of the PBMCs in the control and cancer populations, we additionally run a marker screen by testing for differential distributions of the individual chrometric features between the control and cancer populations. To this end, we apply a t-test to test for a difference in the means and adjust for multiple testing using the Benjamini-Hochberg procedure.

In [None]:
# Screen the chrometric features for differences between the conditions.
# NOTE(review): per the accompanying notes this is a t-test per feature with
# Benjamini-Hochberg correction — confirm in `find_markers`.
marker_screen_results = find_markers(chrometric_data, cond_labels)

In [None]:
# First ten screen results reported for the "Cancer" label.
is_cancer_marker = marker_screen_results["label"] == "Cancer"
marker_screen_results.loc[is_cancer_marker].head(n=10)

We find that the most significantly different features are a number of size features, such as the maximum caliper and the median radius, as well as features related to the curvature of the nucleus, the intranuclear chromatin compaction (rel_hc_volume and kurtosis) and the nuclear shape (extent).

---

As a joint proxy to study the alterations in size, we focus on the nuclear volume; the variation in shape is captured by the concavity as well as the aspect ratio of the nucleus, and the change in chromatin compaction by the relative heterochromatin-to-euchromatin ratio.

In [None]:
# Chrometric markers to plot and their display labels; the two lists are
# matched by position and must therefore have the same length.
markers = [
    "volume",
    "hetero_to_euchromatin_volume_ratio",
    "concavity_3d",
    "aspect_ratio",
    "std_curvature",
]
# NOTE(review): a sixth label ("Kurtosis of the DNA\n intensity distribution
# (2D)") had no matching entry in `markers` and was dropped; add the kurtosis
# feature to both lists if it is meant to be plotted.
marker_labels = [
    r"Nuclear volume in px$^3$",
    "Relative HC/EC ratio",
    "Concavity (3D)",
    "Aspect ratio (2D)",
    "Standard deviation of the curvature",
]
plot_ctrl_cancer_markers_dist(
    data, markers, marker_labels, cut=0, palette=color_palette
)

In [None]:
# Same markers as a bar plot; the marker and label lists are matched by
# position and must therefore have the same length.
markers = [
    "volume",
    "hetero_to_euchromatin_volume_ratio",
    "concavity_3d",
    "aspect_ratio",
    "std_curvature",
]
# NOTE(review): a sixth label ("Kurtosis of the DNA\n intensity distribution
# (2D)") had no matching entry in `markers` and was dropped; add the kurtosis
# feature to both lists if it is meant to be plotted.
marker_labels = [
    r"Nuclear volume in px$^3$",
    "Relative HC/EC ratio",
    "Concavity (3D)",
    "Aspect ratio (2D)",
    "Standard deviation of the curvature",
]
plot_ctrl_cancer_markers_dist(
    data, markers, marker_labels, cut=0, palette=color_palette, plot_type="bar"
)

In [None]:
# Joint plot of the volume, HC/EC ratio and concavity markers for both
# conditions.
markers = ["volume", "hetero_to_euchromatin_volume_ratio", "concavity_3d"]
marker_labels = ["Nuclear volume\n" + r"in px$^3$", "HC/EC ratio", "Concavity (3D)"]

fig, ax = plot_joint_markers_ctrl_cancer(
    data,
    markers,
    marker_labels,
    palette=color_palette,
    cut=0,
    figsize=[5, 3],
)
ax.set(ylabel="Normalized marker value", xlabel="Chrometric Marker")
# Put the legend above the axes, in one frameless row without a title.
sns.move_legend(
    ax,
    "lower center",
    bbox_to_anchor=(0.5, 1),
    ncol=2,
    frameon=False,
    title=None,
)
plt.show()

In [None]:
# Joint plot of the two shape-related markers for both conditions.
markers = ["aspect_ratio", "std_curvature"]
marker_labels = ["Aspect ratio (2D)", "Standard deviation of the curvature"]

fig, ax = plot_joint_markers_ctrl_cancer(
    data,
    markers,
    marker_labels,
    palette=color_palette,
    cut=0,
    figsize=[6, 4],
)
ax.set(ylabel="Normalized marker value", xlabel="Chrometric Marker")
# Put the legend above the axes, in one frameless row without a title.
sns.move_legend(
    ax,
    "lower center",
    bbox_to_anchor=(0.5, 1),
    ncol=2,
    frameon=False,
    title=None,
)
plt.show()

---

### 3d. Proteomic differences of PBMCs in cancer

Finally, we also assess the proteomic differences between the control and cancer populations. To this end, we plot the relative Lamin and gH2AX expression measured by the sum of the intensities of the corresponding imaging channels normalized by the nuclear volume. Additionally, we plot the number of identified gH2AX foci which are computed as the local maxima peaks found in the corresponding channel images.

Note that those features are only available for the first data set that was stained for those proteins.

In [None]:
# Proteomic (Lamin / gH2AX) features and their display labels, matched by
# position.
markers = [
    "rel_lamin_3d_int",
    "rel_gh2ax_3d_int",
    "gh2ax_foci_count",
    "gh2ax_sum_foci_area",
    "gh2ax_avg_foci_area",
]
marker_labels = [
    "Volume-normalized nuclear\nLamin A/C intensity",
    # Adjacent literals are concatenated; the raw part keeps the LaTeX backslash.
    "Normalized nuclear\n" r"$\gamma$H2AX intensity",
    r"Number of $\gamma$H2AX foci",
    r"Sum of the $\gamma$H2AX foci area",
    r"Average size of the $\gamma$H2AX foci",
]
# NOTE(review): `quantiles=None` presumably disables quantile-based clipping
# in the helper — confirm in its definition.
plot_ctrl_cancer_markers_dist(
    data,
    markers,
    marker_labels,
    quantiles=None,
    cut=0,
    plot_type="bar",
    palette=color_palette,
)


In [None]:
# Joint bar plot of the Lamin A/C intensity and the gH2AX foci count.
markers = ["rel_lamin_3d_int", "gh2ax_foci_count"]
marker_labels = [
    "Volume-normalized nuclear\nLamin A/C intensity",
    r"Number of $\gamma$H2AX foci",
]

fig, ax = plot_joint_markers_ctrl_cancer(
    data,
    markers,
    marker_labels,
    plot_type="bar",
    scale_to_control=True,
    palette=color_palette,
    cut=0,
    figsize=[5, 3],
)
ax.set(ylabel="Normalized marker value", xlabel="Chrometric Marker")
ax.set_ylim([0, 2])
# Put the legend above the axes, in one frameless row without a title.
sns.move_legend(
    ax,
    "lower center",
    bbox_to_anchor=(0.5, 1),
    ncol=2,
    frameon=False,
    title=None,
)
plt.show()

---

## 4. Supplemental

In [None]:
# Age per healthy-volunteer sample id.
# NOTE(review): the literal previously listed "hv5" twice (60, then 26); in a
# dict literal the later entry silently wins, so the effective value 26 is
# kept here. Confirm the correct age of hv5 against the study metadata.
age_dict = {
    "hv1": 24,
    "hv2": 33,
    "hv3": 42,
    "hv4": 42,
    "hv5": 26,
    "hv6": 60,
    "hv7": 32,
    "hv8": 28,
    "hv9": 45,
    "hv10": 35,
}
# Attach the age to every row; samples missing from age_dict become NaN.
data["age"] = data["sample"].map(age_dict)

In [None]:
# Gender per healthy-volunteer sample id.
gender_dict = dict(
    hv1="male",
    hv2="male",
    hv3="female",
    hv4="male",
    hv5="male",
    hv6="male",
    hv7="male",
    hv8="female",
    hv9="female",
    hv10="male",
)
# Attach the gender to every row; samples missing from gender_dict become NaN.
data["gender"] = data["sample"].map(gender_dict)

In [None]:
# Per-sample mean of the gH2AX foci features (rows without an age are dropped
# by the groupby). The columns are selected *before* aggregating: reducing the
# whole frame would also hit non-numeric columns such as "gender", which raises
# a TypeError on pandas >= 2.0, and would waste work on unused columns.
data.groupby(["sample", "age"])[
    ["gh2ax_foci_count", "gh2ax_sum_foci_area", "gh2ax_avg_foci_area"]
].mean()

In [None]:
# Per-sample standard deviation of the gH2AX foci features (rows without an
# age are dropped by the groupby). The columns are selected *before*
# aggregating: reducing the whole frame would also hit non-numeric columns such
# as "gender", which raises a TypeError on pandas >= 2.0.
data.groupby(["sample", "age"])[
    ["gh2ax_foci_count", "gh2ax_sum_foci_area", "gh2ax_avg_foci_area"]
].std()

In [None]:
# Two-level age group per row.
# NOTE(review): the labels mention 35 but the cut-off applied below is 44 —
# confirm which one is intended. Rows whose age is NaN (any sample not listed
# in age_dict) keep the default "younger than 35" label, because NaN > 44
# evaluates to False.
data["age_group"] = "younger than 35"
data.loc[data.age > 44, "age_group"] = "older than 35"

# gH2AX foci count as a function of age.
sns.lineplot(data=data, x="age", y="gh2ax_foci_count")

In [None]:
# Summed gH2AX foci area as a function of age.
sns.lineplot(x="age", y="gh2ax_sum_foci_area", data=data)

In [None]:
# Average gH2AX foci area as a function of age.
sns.lineplot(x="age", y="gh2ax_avg_foci_area", data=data)

In [None]:
# gH2AX foci count by gender, annotated with a Mann-Whitney test.
# The plot and the annotator must describe the same data, so the shared
# arguments are defined once and passed to both.
plot_args = dict(data=data, x="gender", y="gh2ax_foci_count")

ax = sns.violinplot(**plot_args)
annotator = Annotator(ax, [("male", "female")], plot="violinplot", **plot_args)
annotator.configure(
    test="Mann-Whitney",
    comparisons_correction="Benjamini-Hochberg",
    text_format="star",
    loc="inside",
)
annotator.apply_test()
annotator.annotate()