In [None]:
import anndata
import pandas as pd
import scanpy as scp
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from matplotlib.patches import Rectangle

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

from matplotlib import colors

%load_ext jupyter_black

# Protein/gene lookup table: one row per CODEX antigen with its gene name.
protein_map = pd.read_csv(
    "2022_11_16_Protein-Gene mapping_markers for data integration.csv"
)

# Remove non-relevant antigens (rows that carry a value in "Comment").
# `.copy()` detaches the filtered frame from the one returned by read_csv, so
# the column assignments below act on an independent frame and do not trigger
# pandas' SettingWithCopyWarning.
protein_map = protein_map[protein_map["Comment"].isnull()].copy()
# Remove leading/trailing whitespace from marker and gene names.
protein_map["Antigen (CODEX)"] = protein_map["Antigen (CODEX)"].str.strip()
protein_map["Gene"] = protein_map["Gene"].str.strip()

In [None]:
# Load the full table that every later cell filters from.
# Presumably one row per segmented cell — TODO confirm against the data source.
df = pd.read_csv("HCC_nojunk.csv")

In [None]:
# Preview the first rows to check the columns that were read.
df.head()

In [None]:
# Restrict the table to rows annotated as "CD8 T cells" in the "Class" column.
is_cd8_class = df["Class"] == "CD8 T cells"
df_cd8 = df[is_cd8_class]

In [None]:
# Scatter of the lumap1/lumap2 columns (presumably precomputed UMAP
# coordinates — confirm), coloured by the Class0 label.
ax = sns.scatterplot(x="lumap1", y="lumap2", hue="Class0", data=df_cd8)
# Place the legend outside the axes, anchored at its top-right corner.
sns.move_legend(ax, loc="upper left", bbox_to_anchor=(1, 1))

In [None]:
# Same lumap1/lumap2 scatter, coloured by the Tissue column.
ax = sns.scatterplot(x="lumap1", y="lumap2", hue="Tissue", data=df_cd8)
# Place the legend outside the axes, anchored at its top-right corner.
sns.move_legend(ax, loc="upper left", bbox_to_anchor=(1, 1))

In [None]:
# Same lumap1/lumap2 scatter, coloured by the Patient column; marker size is
# set to 10 for this plot.
ax = sns.scatterplot(x="lumap1", y="lumap2", hue="Patient", s=10, data=df_cd8)
# Place the legend outside the axes, anchored at its top-right corner.
sns.move_legend(ax, loc="upper left", bbox_to_anchor=(1, 1))

In [None]:
# Count cells per (Tissue, Class0) pair and reshape to one row per tissue and
# one column per Class0 label; pairs that never occur become 0.
count_df_ = (
    df_cd8.value_counts(["Tissue", "Class0"])
    # Name the counts column explicitly: the implicit name produced by
    # value_counts().reset_index() differs between pandas versions, while the
    # pivot below needs it to be exactly "count".
    .reset_index(name="count")
    .pivot(index="Tissue", columns="Class0", values="count")
    .fillna(0)
)
count_ax = count_df_.plot.barh(stacked=True)
count_ax.set_xlabel("Number of cells")
plt.legend(bbox_to_anchor=(1.75, 0), loc="lower right")
plt.show()

# Divide every row by its total so each tissue's bar sums to 1 (composition
# instead of absolute counts).
count_df__norm = count_df_.div(count_df_.sum(axis=1), axis=0)
norm_ax = count_df__norm.plot.barh(stacked=True)
norm_ax.set_xlabel("Fraction of cells")
plt.legend(bbox_to_anchor=(1.75, 0), loc="lower right")
plt.show()

In [None]:
# List the distinct Class0 labels present among the CD8 T cell rows.
df_cd8["Class0"].unique()

In [None]:
# Drop the rows labelled "T cells CD57" in Class0; everything else is kept.
not_cd57 = df_cd8["Class0"] != "T cells CD57"
df_cd8_actual = df_cd8[not_cd57]

In [None]:
# (rows, columns) remaining after the CD57 rows were removed.
df_cd8_actual.shape

In [None]:
# Select, from the full table `df` (not from df_cd8), every row whose Class0
# is one of the three CD8 PD-1 / PD-L1 labels, and show the resulting shape.
cd8_pd_labels = [
    "T cells CD8 PD-1low",
    "T cells CD8 PD-L1+",
    "T cells CD8 PD-1high",
]
df_cd8_actual_actual = df[df["Class0"].isin(cd8_pd_labels)]
df_cd8_actual_actual.shape

In [None]:
# Persist the filtered CD8 table so it can be reloaded without redoing the
# filtering above.
# NOTE(review): `to_csv` writes the DataFrame index as an extra unnamed first
# column by default (index=True) — confirm this is intended before reading the
# file back, since it adds a column on reload.
df_cd8_actual.to_csv("HCC_nojunk_CD8_only.csv")

In [None]:
# Optional shortcut: uncomment to reload the filtered table saved earlier
# instead of recomputing it in this session.
# df_cd8_actual = pd.read_csv("HCC_nojunk_CD8_only.csv")

# Columns copied into `adata.obs` (per-row annotations). The bounding-box
# columns XMin/XMax/YMin/YMax are later used to derive cell centre coordinates.
obs_columns = [
    "Unnamed: 0",
    "X",
    "index",
    "Object.Id",
    "XMin",
    "XMax",
    "YMin",
    "YMax",
    "Tissue",
    "Patient",
    "Celltype",
    "Class",
    "Class0",
]

# Intensity columns that make up the expression matrix. Names follow the
# pattern "<marker>.<compartment>.Intensity"; the marker part may itself
# contain dots (e.g. "PD.L1", "LYVE.1").
marker_columns = [
    "CD56.Cytoplasm.Intensity",
    "CD161.Cytoplasm.Intensity",
    "TCRValpha.Cytoplasm.Intensity",
    "CD39.Cytoplasm.Intensity",
    "CD25.Cytoplasm.Intensity",
    "CD57.Cytoplasm.Intensity",
    "CD40.Cytoplasm.Intensity",
    "ICOS.Cytoplasm.Intensity",
    "CD3.Cytoplasm.Intensity",
    "CD62L.Cytoplasm.Intensity",
    "LYVE.1.Cytoplasm.Intensity",
    "CD45RO.Cytoplasm.Intensity",
    "IL18Ra.Cytoplasm.Intensity",
    "PD.L1.Cytoplasm.Intensity",
    "CD45.Cytoplasm.Intensity",
    "CD34.Cytoplasm.Intensity",
    "CD163.Cytoplasm.Intensity",
    "Ki67.Nucleus.Intensity",
    "CD19.Cytoplasm.Intensity",
    "CD38.Cytoplasm.Intensity",
    "CD279.Cytoplasm.Intensity",
    "CD11c.Cytoplasm.Intensity",
    "CD8.Cytoplasm.Intensity",
    "CD11b.Cytoplasm.Intensity",
    "CD16.Cytoplasm.Intensity",
    "FoxP3.Nucleus.Intensity",
    "CD69.Cytoplasm.Intensity",
    "CD15.Cytoplasm.Intensity",
    "HNFalpha.Nucleus.Intensity",
    "pancytokeratin.Cytoplasm.Intensity",
    "HLADR.Cytoplasm.Intensity",
    "CD45RA.Cytoplasm.Intensity",
    "aSMA.Cytoplasm.Intensity",
    "CD4.Cytoplasm.Intensity",
    "CD66b.Cytoplasm.Intensity",
    "CD68.Cytoplasm.Intensity",
    "EPCAM.Cytoplasm.Intensity",
]

# Short marker names, in the same order as marker_columns: strip the trailing
# "<compartment>.Intensity" pieces and join what is left with "-"
# ("PD.L1.Cytoplasm.Intensity" -> "PD-L1").
markers = [
    "-".join(column_name.split(".")[:-2])
    for column_name in marker_columns
    if "Intensity" in column_name
]


# Build the AnnData object:
#   X   - intensity matrix, one column per entry of marker_columns
#   obs - the annotation columns listed in obs_columns
#   var - indexed by the short marker names, so that var_names carry real
#         marker names (also in the file written at the end of the notebook)
#         instead of the default positional labels "0", "1", ...
# `markers` is derived from marker_columns in the same order, so var rows line
# up with the columns of X; AnnData raises if the lengths disagree.
X = df_cd8_actual.loc[:, marker_columns].values
adata = anndata.AnnData(
    X=X,
    obs=df_cd8_actual.loc[:, obs_columns],
    var=pd.DataFrame(index=markers),
)
adata.obsm

In [None]:
# PCA on the intensity matrix using all variables (no highly-variable filter).
scp.pp.pca(adata, use_highly_variable=False)

# k-nearest-neighbour graph with k=30; consumed by the UMAP and Leiden steps.
# NOTE(review): with fewer than 50 variables, scanpy's default `use_rep=None`
# appears to build the graph on `.X` rather than on `X_pca` — confirm which
# representation is intended and pass `use_rep` explicitly if it matters.
scp.pp.neighbors(adata, n_neighbors=30)

In [None]:
# Compute the UMAP embedding from the neighbour graph built above.
scp.tl.umap(adata)

In [None]:
# Plot the UMAP once per annotation: Tissue, Patient and Class0.
scp.pl.umap(adata, color=["Tissue", "Patient", "Class0"])

In [None]:
# Leiden clustering at resolution 1; the labels are read back from
# adata.obs["leiden"] by the following cells.
scp.tl.leiden(adata, resolution=1)

In [None]:
# Same UMAP panels as before, plus one coloured by the Leiden cluster label.
scp.pl.umap(adata, color=["Tissue", "Patient", "Class0", "leiden"])

In [None]:
# Mean marker intensity per Leiden cluster, shown as a marker x cluster heatmap.
#
# Iterate over the ordered category labels rather than `.unique()`: `.unique()`
# yields clusters in order of first appearance in the data, which scrambles the
# heatmap columns. `astype("category")` is a no-op when the column is already
# categorical and otherwise gives a sorted label set.
clusters = list(adata.obs["leiden"].astype("category").cat.categories)
mean_expressions = []
for leiden_cluster in clusters:
    cells = adata[adata.obs["leiden"] == leiden_cluster]
    # Cluster size, printed for reference next to the heatmap.
    print(leiden_cluster, ":", cells.shape[0])
    mean_expressions.append(cells.X.mean(axis=0))
# Shape (n_clusters, n_markers); transposed below so markers run down the rows.
mean_expressions = np.array(mean_expressions)

plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(
    mean_expressions.T,
    xticklabels=clusters,
    yticklabels=markers,
    linewidth=0.1,
    linecolor="k",
)
heatmap_ax.set_xlabel("Leiden cluster")
heatmap_ax.set_ylabel("Marker")
plt.show()

In [None]:
# Show the cleaned CODEX antigen names from the protein/gene mapping table.
protein_map["Antigen (CODEX)"].values

In [None]:
# Cell centre along each axis = midpoint of the bounding-box extent.
for axis_name in ("X", "Y"):
    upper = adata.obs[f"{axis_name}Max"]
    lower = adata.obs[f"{axis_name}Min"]
    adata.obs[f"{axis_name}Mean"] = (upper + lower) / 2

# Store the centres as an (n_obs, 2) array under obsm["X_spatial"] so they can
# be plotted as an embedding (column 0 = x centre, column 1 = y centre).
adata.obsm["X_spatial"] = np.column_stack(
    (adata.obs["XMean"].values, adata.obs["YMean"].values)
)

In [None]:
# One two-panel figure per patient: cell centres coloured by Tissue and by
# Class0. The patient identifier goes into each panel title because otherwise
# the figures emitted by this loop cannot be told apart.
color_keys = ["Tissue", "Class0"]
for patient in adata.obs["Patient"].unique():
    scp.pl.embedding(
        adata[adata.obs["Patient"] == patient],
        basis="X_spatial",
        color=color_keys,
        # One title per panel, in the same order as `color`.
        title=[f"Patient {patient}: {key}" for key in color_keys],
        size=10,
        ncols=2,
    )

In [None]:
# Save the annotated object (matrix, obs annotations, embeddings, clustering)
# to disk for reuse outside this notebook.
adata.write("CODEX_cd8.h5ad")