# Changes in cell type abundances in PBMCs reflect the cancer and the treatment effect of proton therapy (all cancers)

---
This notebook summarizes an essential part of the analysis corresponding to the results presented in figure 5 of the paper for all cancer patients. It can be used to rerun the analysis and regenerate the corresponding panels.

---

## 0. Environment setup

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
import os
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import matplotlib as mpl
from matplotlib.collections import PolyCollection
from matplotlib.legend_handler import HandlerTuple
from matplotlib.colors import to_rgb
from collections import Counter

mpl.rcParams["figure.dpi"] = 1200

import sys

sys.path.append("../..")
from src.utils.notebooks.eda import *
from src.utils.notebooks.figure3 import *
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Single source of truth for all random number generators used in this notebook.
seed = 1234
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(seed)

%reload_ext nb_black

In [None]:
def get_ct_abundance_data(data, normalize=True):
    """Summarize the cell type composition of every ``id`` in ``data``.

    Args:
        data: DataFrame with one row per cell and the columns ``id``,
            ``sample``, ``timepoint``, ``condition`` and ``celltype_2``.
        normalize: If truthy, store each cell type as a fraction of the
            number of rows of the respective ``id``; otherwise store the
            raw row counts.

    Returns:
        A DataFrame with one row per unique ``id`` (in ``np.unique`` order).
        Its columns are ``id``, ``sample``, ``timepoint``, ``condition``
        (each taken from the first row of that ``id``), ``n_pbmc`` (number
        of rows of that ``id``) and one column per unique ``celltype_2``
        value.
    """
    celltype_names = np.unique(data.loc[:, "celltype_2"])
    meta_columns = ("sample", "timepoint", "condition")

    records = {
        key: [] for key in ("id", "sample", "timepoint", "condition", "n_pbmc")
    }

    for current_id in np.unique(data.loc[:, "id"]):
        subset = data.loc[data.id == current_id]
        n_cells = len(subset)

        records["id"].append(current_id)
        # The metadata is taken from the first row belonging to this id.
        for column in meta_columns:
            records[column].append(np.array(subset.loc[:, column])[0])
        records["n_pbmc"].append(n_cells)

        labels = subset.loc[:, "celltype_2"]
        for name in celltype_names:
            n_of_type = int((labels == name).sum())
            value = n_of_type / n_cells if normalize else n_of_type
            records.setdefault(name, []).append(value)

    return pd.DataFrame(records)

In [None]:
def plot_celltype_abundancies_with_control(
    data,
    celltypes,
    celltype_labels=None,
    cut=2,
    figsize=[4, 4],
    plot_type="violin",
    quantiles=None,
    test="Mann-Whitney",
):
    """Plot one abundance figure per cell type, including healthy controls.

    For every entry of ``celltypes`` the project helper
    ``plot_marker_distribution`` is called with the conditions on the x-axis
    and the timepoints as hue. The drawn artists are then recolored with a
    condition/timepoint specific palette, a grouped legend is attached and
    the figure is shown and closed.

    Args:
        data: Table passed on to ``plot_marker_distribution``; it is expected
            to provide the columns ``condition`` and ``timepoint`` and one
            column per entry of ``celltypes``.
        celltypes: Sequence of column names to plot (one figure each).
        celltype_labels: Optional sequence of y-axis labels, aligned with
            ``celltypes``.
        cut: Forwarded to ``plot_marker_distribution``.
        figsize: Forwarded to ``plot_marker_distribution``.
        plot_type: Either ``"violin"`` or ``"bar"``.
        quantiles: Forwarded to ``plot_marker_distribution``.
        test: Name of the statistical test, forwarded to
            ``plot_marker_distribution``.

    Raises:
        NotImplementedError: If ``plot_type`` is neither ``"violin"`` nor
            ``"bar"``. This is checked up front so that no figure is created
            for an unsupported plot type.
    """
    if plot_type not in ("violin", "bar"):
        raise NotImplementedError(
            "Unsupported plot_type {!r}; expected 'violin' or 'bar'.".format(
                plot_type
            )
        )

    conditions = ["Control", "Meningioma", "Glioma", "Head & Neck"]
    cancer_conditions = conditions[1:]
    # The legend labels are taken from this list (instead of a notebook
    # global) so that they always match the hue levels that are plotted.
    hue_order = ["control", "prior", "during", "end"]
    treatment_timepoints = hue_order[1:]

    # One color per (condition, hue level); index 0 is the "control" hue slot.
    ctrl_colors = ["mediumseagreen"] * 4
    mg_colors = ["lightsteelblue", "lightsteelblue", "royalblue", "midnightblue"]
    gl_colors = ["orange", "orange", "gold", "saddlebrown"]
    hn_colors = ["plum", "plum", "deeppink", "indigo"]
    palettes = (ctrl_colors, mg_colors, gl_colors, hn_colors)

    # Condition-major order (used for the violins and the legend handles).
    violin_all_colors = [color for palette in palettes for color in palette]
    # Hue-major order: assumes the bars are listed hue level by hue level,
    # each covering all conditions — TODO confirm for the plotting backend.
    bar_all_colors = [palette[k] for k in range(4) for palette in palettes]

    # Every treated group vs. the controls, then all timepoint pairs within
    # each cancer type.
    timepoint_pairs = [
        (first, second)
        for k, first in enumerate(treatment_timepoints)
        for second in treatment_timepoints[k + 1 :]
    ]
    box_pairs = [
        (("Control", "control"), (condition, timepoint))
        for condition in cancer_conditions
        for timepoint in treatment_timepoints
    ]
    box_pairs += [
        ((condition, first), (condition, second))
        for condition in cancer_conditions
        for first, second in timepoint_pairs
    ]

    for i, celltype in enumerate(celltypes):
        # Copies are passed so that the helper cannot alter the lists that
        # are reused for the next cell type and for the legend.
        _, ax = plot_marker_distribution(
            data,
            figsize=figsize,
            marker=celltype,
            label_col="condition",
            order=list(conditions),
            box_pairs=list(box_pairs),
            quantiles=quantiles,
            cut=cut,
            plot_type=plot_type,
            palette="gray",
            hue="timepoint",
            hue_order=list(hue_order),
            test=test,
        )
        ax.set_xlabel("Condition")
        if celltype_labels is not None:
            ax.set_ylabel(celltype_labels[i])

        # Recolor the drawn artists and build one legend handle per artist.
        # The handles always follow the condition-major color order.
        handles = []
        if plot_type == "violin":
            for ind, obj in enumerate(ax.findobj(PolyCollection)):
                rgb = to_rgb(violin_all_colors[ind])
                obj.set_facecolor(rgb)
                handles.append(
                    plt.Rectangle((0, 0), 0, 0, facecolor=rgb, edgecolor="black")
                )
        else:
            for ind, bar in enumerate(ax.patches):
                bar.set_color(to_rgb(bar_all_colors[ind]))
                handles.append(
                    plt.Rectangle(
                        (0, 0),
                        0,
                        0,
                        facecolor=to_rgb(violin_all_colors[ind]),
                        edgecolor="black",
                    )
                )

        # The slices below assume 16 handles (4 conditions x 4 hue slots):
        # the control shades first, then one shade per cancer type for each
        # treatment timepoint.
        ax.legend(
            handles=[
                tuple(handles[1:4]),
                tuple(handles[5::4]),
                tuple(handles[6::4]),
                tuple(handles[7::4]),
            ],
            labels=hue_order,
            title="Timepoint",
            handlelength=4,
            handler_map={tuple: HandlerTuple(ndivide=None, pad=0)},
            bbox_to_anchor=(1.02, 0.5),
            loc="center left",
            borderaxespad=0,
        )

        plt.show()
        plt.close()

In [None]:
def plot_celltype_abundancies(
    data,
    celltypes,
    celltype_labels=None,
    cut=2,
    figsize=[4, 4],
    plot_type="violin",
    quantiles=None,
    test="Mann-Whitney",
):
    """Plot one abundance figure per cell type for the cancer patients only.

    For every entry of ``celltypes`` the project helper
    ``plot_marker_distribution`` is called with the three cancer conditions
    on the x-axis and the treatment timepoints as hue. The drawn artists are
    then recolored with a condition/timepoint specific palette, a grouped
    legend is attached and the figure is shown and closed.

    Args:
        data: Table passed on to ``plot_marker_distribution``; it is expected
            to provide the columns ``condition`` and ``timepoint`` and one
            column per entry of ``celltypes``.
        celltypes: Sequence of column names to plot (one figure each).
        celltype_labels: Optional sequence of y-axis labels, aligned with
            ``celltypes``.
        cut: Forwarded to ``plot_marker_distribution``.
        figsize: Forwarded to ``plot_marker_distribution``.
        plot_type: Either ``"violin"`` or ``"bar"``.
        quantiles: Forwarded to ``plot_marker_distribution``.
        test: Name of the statistical test, forwarded to
            ``plot_marker_distribution``.

    Raises:
        NotImplementedError: If ``plot_type`` is neither ``"violin"`` nor
            ``"bar"``. This is checked up front so that no figure is created
            for an unsupported plot type.
    """
    if plot_type not in ("violin", "bar"):
        raise NotImplementedError(
            "Unsupported plot_type {!r}; expected 'violin' or 'bar'.".format(
                plot_type
            )
        )

    conditions = ["Meningioma", "Glioma", "Head & Neck"]
    # The legend labels must be exactly the plotted hue levels. A four-entry
    # list that also contains "control" would shift every label by one.
    hue_order = ["prior", "during", "end"]

    # One shade per timepoint (prior, during, end) for each cancer type.
    mg_colors = ["lightsteelblue", "royalblue", "midnightblue"]
    gl_colors = ["orange", "gold", "saddlebrown"]
    hn_colors = ["plum", "deeppink", "indigo"]
    palettes = (mg_colors, gl_colors, hn_colors)

    # Condition-major order (used for the violins and the legend handles).
    violin_all_colors = [color for palette in palettes for color in palette]
    # Hue-major order: assumes the bars are listed hue level by hue level,
    # each covering all conditions — TODO confirm for the plotting backend.
    bar_all_colors = [palette[k] for k in range(3) for palette in palettes]

    # All pairs of timepoints within each cancer type.
    timepoint_pairs = [
        (first, second)
        for k, first in enumerate(hue_order)
        for second in hue_order[k + 1 :]
    ]
    box_pairs = [
        ((condition, first), (condition, second))
        for condition in conditions
        for first, second in timepoint_pairs
    ]

    for i, celltype in enumerate(celltypes):
        # Copies are passed so that the helper cannot alter the lists that
        # are reused for the next cell type and for the legend.
        _, ax = plot_marker_distribution(
            data,
            figsize=figsize,
            marker=celltype,
            label_col="condition",
            order=list(conditions),
            box_pairs=list(box_pairs),
            stat_annot="star",
            quantiles=quantiles,
            cut=cut,
            plot_type=plot_type,
            palette="gray",
            hue="timepoint",
            hue_order=list(hue_order),
            test=test,
        )
        ax.set_xlabel("Condition")
        if celltype_labels is not None:
            ax.set_ylabel(celltype_labels[i])

        # Recolor the drawn artists and build one legend handle per artist.
        # The handles always follow the condition-major color order.
        handles = []
        if plot_type == "violin":
            for ind, obj in enumerate(ax.findobj(PolyCollection)):
                rgb = to_rgb(violin_all_colors[ind])
                obj.set_facecolor(rgb)
                handles.append(
                    plt.Rectangle((0, 0), 0, 0, facecolor=rgb, edgecolor="black")
                )
        else:
            for ind, bar in enumerate(ax.patches):
                bar.set_color(to_rgb(bar_all_colors[ind]))
                handles.append(
                    plt.Rectangle(
                        (0, 0),
                        0,
                        0,
                        facecolor=to_rgb(violin_all_colors[ind]),
                        edgecolor="black",
                    )
                )

        # The slices below assume 9 handles (3 conditions x 3 timepoints):
        # every third handle belongs to the same timepoint.
        ax.legend(
            handles=[
                tuple(handles[0::3]),
                tuple(handles[1::3]),
                tuple(handles[2::3]),
            ],
            labels=hue_order,
            title="Timepoint",
            handlelength=4,
            handler_map={tuple: HandlerTuple(ndivide=None, pad=0)},
            bbox_to_anchor=(1.02, 0.5),
            loc="center left",
            borderaxespad=0,
        )

        plt.show()
        plt.close()

In [None]:
# Base color of every condition (used to mark the samples in the overview heatmap).
cond_palette = dict(
    zip(
        ("Control", "Meningioma", "Glioma", "Head & Neck"),
        ("mediumseagreen", "cornflowerblue", "orange", "orchid"),
    )
)

In [None]:
# Three shades per cancer type (Meningioma, Glioma, Head & Neck).
mg_colors, gl_colors, hn_colors = (
    ["lightsteelblue", "royalblue", "midnightblue"],
    ["orange", "gold", "saddlebrown"],
    ["plum", "deeppink", "indigo"],
)



---

## 1. Read in data

In this notebook we assess the differences in the abundances of the different subsets of PBMCs at three different time points of the proton therapy treatment: before, during (~3 weeks in) and at the end of the treatment (final week of treatment). To this end, we obtained PBMCs of 8 Meningioma, 8 Glioma and 6 Head & Neck cancer patients, stained them for DNA, CD16, CD4 and CD8, and obtained fluorescent images. Additionally, we will use a set of 8 healthy controls, for which we have stained the PBMCs for CD4 and CD8, to compare the abundances of the corresponding PBMC subtypes in our treatment population with those of healthy controls.

First, we read in the required data set that describes each PBMC by a number of hand-crafted features extracted from the fluorescent images of the cells.

In [None]:
# Load the table of per-cell features; the first CSV column is used as the index.
all_data = pd.read_csv("../../data/treated_population_marker_data.csv", index_col=0)

In [None]:
# Run the project's preprocessing helper (brought in by the star imports above; its
# implementation is not visible in this notebook).
# NOTE(review): remove_constant_features=False presumably keeps constant-valued columns — confirm.
all_data = preprocess_data(all_data, remove_constant_features=False)

In [None]:
# Display order of the timepoints and of the biological samples.
tp_order = ["control", "prior", "during", "end"]
sample_order = np.unique(all_data.loc[:, "sample"])

fig, ax = plt.subplots(figsize=[14, 4], ncols=2)

# Left panel: number of cells per biological sample, colored by timepoint.
sns.countplot(
    x="sample",
    data=all_data,
    ax=ax[0],
    order=sample_order,
    hue_order=tp_order,
    hue="timepoint",
    palette="gray",
)
# Hide the legend here; the right panel shows the same timepoint colors.
ax[0].legend([], [], frameon=False)
ax[0].set_xlabel("ID of the biological sample")
ax[0].set_title("Distribution of biological samples")
for tick in ax[0].get_xticklabels():
    tick.set_rotation(45)

# Right panel: number of cells per timepoint. ``hue_order`` is passed
# explicitly so that the gray shades (and the legend order) match the left
# panel instead of following the order in which timepoints occur in the data.
sns.countplot(
    x="timepoint",
    hue="timepoint",
    data=all_data,
    ax=ax[1],
    order=tp_order,
    hue_order=tp_order,
    dodge=False,
    palette="gray",
)
ax[1].set_xlabel("Timepoint with respect to the treatment")
ax[1].set_title("Distribution of the different timepoints")
ax[1].legend(loc="lower right")

plt.show()
plt.close()

In [None]:
# Number of cells for every (timepoint, sample) combination; missing
# combinations are filled with 0.
count_matrix = all_data.groupby(["timepoint", "sample"]).size().unstack(fill_value=0)

# Row order of the heatmap: samples grouped by condition and sorted within
# each group.
ordered_samples = [
    sample
    for condition in ["Control", "Meningioma", "Head & Neck", "Glioma"]
    for sample in sorted(
        all_data.loc[all_data.condition == condition, "sample"].unique()
    )
]
ordered_timepoints = ["control", "prior", "during", "end"]

# Color of each sample (in row order), looked up from its 'condition' value.
row_colors = (
    all_data.drop_duplicates("sample")
    .set_index("sample")
    .loc[ordered_samples]["condition"]
    .map(cond_palette)
)


# Plotting the heatmap with annotations
plt.figure(figsize=(6, 10))
sns.set(font_scale=1)
ax = sns.heatmap(
    count_matrix.transpose().loc[ordered_samples, ordered_timepoints],
    annot=True,
    fmt="d",
    cmap="Greys",
    linewidths=0.5,
    linecolor="gray",
    cbar=False,
    # Label every row: the coloring below pairs tick labels and colors by
    # position, which breaks if labels are thinned out automatically.
    yticklabels=True,
)

# Mark the condition of each sample via the background color of its row label
for tick_label, color in zip(ax.get_yticklabels(), row_colors):
    tick_label.set_backgroundcolor(color)

plt.title("Processed data (w. duplicates)")
plt.xlabel("Timepoint")
plt.ylabel("Patient")
plt.show()
plt.close()

### Add cell type labels

We will now add combined cell type labels for all nuclei.

In [None]:
# Switch back to matplotlib's default style (the heatmap cell above called sns.set(...),
# which changes the global plot settings) ...
mpl.style.use("default")
# ... and set the high figure resolution again.
# NOTE(review): presumably needed because switching the style resets figure.dpi — confirm.
mpl.rcParams["figure.dpi"] = 1200

In [None]:
# Work on a copy so that ``all_data`` keeps the original marker encoding.
data = all_data.copy()
celltype_cols = ["cd4", "cd8"]
# Marker status 0/1 is turned into the usual "-"/"+" suffix.
coding_dict = {0: "-", 1: "+"}
for celltype_col in celltype_cols:
    # Replace the whole column instead of writing the strings into the
    # existing column via ``.loc[:, col]``: that form tries to keep the old
    # (presumably numeric) dtype, which recent pandas versions deprecate.
    data[celltype_col] = data[celltype_col].map(coding_dict)

# Combined label, e.g. "CD4+CD8-".
data["celltype_2"] = "CD4" + data["cd4"] + "CD8" + data["cd8"]
celltypes_2 = np.unique(data.loc[:, "celltype_2"])

In [None]:
# One row per id; with normalize=True every cell type column holds the fraction of
# that id's cells carrying the respective label.
ct_abundance_data = get_ct_abundance_data(data, normalize=True)

As a first step, let us look at the cell type distribution of each sample individually.

---

## 3. Panels

Now we generate the individual panels for figure 5 of the paper.

---

### 3b. Cell type abundance changes during proton therapy treatment

In [None]:
# One y-axis label per cell type, in the same order as ``celltypes_2``.
celltype2_labels = [
    "Fraction of {} among all PBMCs".format(celltype) for celltype in celltypes_2
]

# Bar plot of every cell type fraction, including the healthy controls.
plot_celltype_abundancies_with_control(
    data=ct_abundance_data,
    celltypes=celltypes_2,
    celltype_labels=celltype2_labels,
    figsize=[4, 4.5],
    plot_type="bar",
)