# Options & Data

In [39]:
%load_ext autoreload
%autoreload 2

import os
import re
import functools
import numpy as np
import pandas as pd
import corescpy as cr

# Count Threshold for Cell Quantification
# (passed as `count_threshold` to `self.print_markers` in later cells)
count_threshold = 1

# File Paths
panel = "TUQ97N"  # panel ID; also names the sub-directory under outputs/
direc = "/mnt/cho_lab/bbdata2/"
dir_data = os.path.join(direc, f"outputs/{panel}")
dir_writeable = "/mnt/cho_lab/disk2/elizabeth/data/shared-xenium-library"
out_dir = os.path.join(dir_writeable, f"outputs/{panel}/nebraska")
mdf = os.path.join(dir_writeable, "samples.csv")  # metadata
path_ann = "~/corescpy/examples/markers_lineages.csv"

# Constants (column names defined by corescpy)
cso, csid, col_condition = (cr.pp.COL_SAMPLE_ID_O, cr.pp.COL_SAMPLE_ID,
                            cr.pp.COL_CONDITION)
col_inflamed, col_stricture = (cr.pp.COL_INFLAMED, cr.pp.COL_STRICTURE)

# Clustering Version (name of the cluster column to analyze)
c_t = "leiden_res1pt5_dist0_npc30"  # high resolution
# c_t = "leiden_res0pt75_dist0pt3_npc30"  # medium resolution
# c_t = "leiden_res0pt5_dist0pt5_npc30"  # low resolution

# ToppGene Sources (passed as `sources` to `self.annotate_clusters`)
srcs = ["Cells of the human intestinal tract mapped across space and time",
        "Human Ileal Epithelial cells from Crohn’s Disease",
        "Human Ileal Immune cells from Crohn’s Disease"]

# Display
pd.options.display.max_colwidth = 1000
pd.options.display.max_columns = 100
pd.options.display.max_rows = 500

# Annotation Guide File (marker genes, indexed by gene)
# NOTE(review): `col_assignment` is not defined in any cell visible in this
# notebook; unless it already exists in the session the next statement
# raises NameError -- confirm which annotation column it should name.
anf = pd.read_csv(path_ann)
assign = anf.dropna(subset=col_assignment).set_index(
    "gene").rename_axis("Gene")  # markers

# Metadata (reader chosen by file extension)
read_metadata = pd.read_excel if mdf.endswith("xlsx") else pd.read_csv
metadata = read_metadata(mdf)

# Inflammation/stricture condition: "Stricture" when the stricture column
# says "stricture"/"yes" (case-insensitive), else the capitalized
# inflammation status; vectorized so missing values do not crash
is_stricture = metadata[col_stricture].str.lower().isin(["stricture", "yes"])
metadata[col_condition] = metadata[col_inflamed].str.capitalize().mask(
    is_stricture, "Stricture")
metadata[csid] = metadata[col_condition] + "-" + metadata[cso]
samp_ids = dict(zip(metadata[cso], metadata[csid]))  # libid -> condition-ID

# List of Existing Xenium Data Directories, as "<run>/<entry>" paths relative
# to dir_data (stray files next to the run folders are skipped; an empty
# dir_data yields an empty list)
files = [os.path.join(run, entry)
         for run in os.listdir(dir_data)
         if os.path.isdir(os.path.join(dir_data, run))
         for entry in os.listdir(os.path.join(dir_data, run))]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Data

In [None]:
# Load Spatial Data
libid = "Stricture-50403C1"
samp_o = "-".join(libid.split("-")[1:])  # ID without the condition prefix

# Find the data directory for the sample: the third "__"-separated field of
# the directory name starts with the sample ID (followed by "-")
file_path = None
for x in files:
    parts = os.path.basename(x).split("__")
    if len(parts) > 2 and parts[2].split("-")[0] == samp_o:
        file_path = x  # first match wins
        break
if file_path is None:
    raise FileNotFoundError(
        f"No data directory for sample {libid} found under {dir_data}")

# Create the object, restore saved results from the sample's .h5ad file,
# and switch to the "counts" layer
self = cr.Spatial(os.path.join(dir_data, file_path), library_id=libid)
self.update_from_h5ad(os.path.join(out_dir, libid + ".h5ad"))
self.get_layer("counts", inplace=True)

# Write Cluster Files
# self.write_clusters(out_dir, col_cell_type=c, overwrite=True,
#                     file_prefix=f"{self._library_id}__", n_top=True)

# ToppGene

In [None]:
def _drop_repeated_suffix(name):
    """Remove the last "|"-separated field of `name` when it is redundant.

    The field is dropped only if its text before the first "-" occurs in
    the text of `name` before the first "-" and `name` mentions one of
    "Child", "Adult", "Pediatric" or "Trim"; otherwise `name` is returned
    unchanged.
    """
    fields = name.split("|")
    suffix_head = fields[-1].split("-")[0]
    if suffix_head in name.split("-")[0] and any(
            tag in name for tag in ["Child", "Adult", "Pediatric", "Trim"]):
        return "|".join(fields[:-1])
    return name


# Query ToppGene with the marker genes of each cluster
self.get_layer("counts", inplace=True)
tgdf, mks = self.annotate_clusters(
    None, sources=srcs, col_cell_type=c_t, max_results=10000,
    name_pattern={srcs[0]: "SmallIntestine"}, p_threshold=1e-15,
    lfc_threshold=1, n_top_genes=20, n_top_annotations=40)

# Shorten the annotation names: (pattern, replacement) pairs first, then
# drop redundant trailing fields
longs = [(" / Per Region, Age_group, Lineage, cell class, cell type", ""),
         ("SmallIntestine", "SmInt")]
for pattern, short in longs:
    tgdf.loc[:, "Name"] = tgdf.Name.map(
        functools.partial(re.sub, pattern, short))
tgdf.loc[:, "Name"] = tgdf.Name.map(_drop_repeated_suffix)

# Add a "Marker Matches" column ("<genes in term in query> / <genes in
# query>") directly after the first column
matches = [f"{hits} / {total}" for hits, total in zip(
    tgdf["GenesInTermInQuery"], tgdf["GenesInQuery"])]
columns_old = list(tgdf.columns)
tgdf = tgdf.assign(**{"Marker Matches": matches})
tgdf = tgdf[columns_old[:1] + ["Marker Matches"] + columns_old[1:]]
tgdf = tgdf.rename_axis([c_t, "Rank"])
tgdf

# Quantification

## By Markers

In [None]:
%%time

outs, failed = {}, []  # hold results
clusters = self.rna.obs[c_t].unique()
n_clusters = len(clusters)
# Clusters that have ToppGene results (none if ToppGene was not run)
annotated = tgdf.reset_index(1).index.values if tgdf is not None else ()
rule = "=" * 80
for i, k in enumerate(clusters):
    if k in annotated:  # show the first ToppGene columns as a header
        print(f"{rule}\n{k}\n ({i + 1}/{n_clusters})"
              f"\n{rule}\n\n{tgdf.loc[k].iloc[:, :3]}\n\n")
    try:
        outs[k] = self.print_markers(
            k, assign, col_cell_type=c_t, lfc_threshold=2,
            count_threshold=count_threshold, p_threshold=1e-15,
            print_threshold=15, n_top_genes=20)
    except Exception as err:  # keep going; report failures at the end
        failed.append((k, err))
if failed:
    print(f"Failed: {pd.Series(dict(failed))}")

## By Specific Genes

In [None]:
%%time

# Genes of interest, indexed by "Gene" with a single "Annotation" column
ggg = pd.DataFrame({"Annotation": "Senescence"}, index=pd.Index(
    ["CDKN1A", "CDKN2A", "TP53", "PLAUR", "IL6ST"], name="Gene"))
outs_bg, failed = {}, []  # hold results
for k in self.rna.obs[c_t].unique():
    try:  # thresholds disabled so that all listed genes are quantified
        outs_bg[k] = self.print_markers(
            k, ggg, col_cell_type=c_t, lfc_threshold=None,
            print_threshold=0, count_threshold=count_threshold,
            p_threshold=None, n_top_genes=None)
    except Exception as err:  # as in the by-markers cell: collect & report
        failed.append((k, err))
if failed:
    print(f"Failed: {pd.Series(dict(failed))}")

## Write

In [None]:
# Write one Excel file per result set: index 0 = by markers (`outs`),
# index 1 = by specific genes (`outs_bg`)
for i, res in enumerate([outs, outs_bg]):
    quant = {}
    for k in res:  # clusters present in *this* result set
        percs_exp, n_exp = res[k][1].copy(), res[k][2].copy()
        # NOTE(review): the stacked percentages are labeled "n_exp" here --
        # confirm that column name is intended.
        percs_long = percs_exp.rename_axis("Cell Types", axis=1).stack(
            ).replace("", np.nan).dropna().to_frame("n_exp")
        n_long = n_exp.set_index("Cell Types", append=True).rename(
            {k: "Cluster"}, axis=1)
        quant[k] = pd.concat(
            [part.rename_axis("Measure", axis=1).stack()
             for part in [percs_long, n_long]],
            keys=["quantification", "representation"]).unstack(
                0).unstack(-1).dropna(how="all", axis=1)
    if not quant:  # nothing was quantified; pd.concat would raise
        print(f"\n\nWarning: No results to write for result set {i}.\n\n")
        continue
    quant = pd.concat(quant, names=["Cluster"])
    if out_dir:
        suf = "" if i == 0 else "_" + "_".join(
            list(ggg.index.unique()))  # file suffix if by gene
        dir_quant = os.path.join(out_dir, "quantification")
        os.makedirs(dir_quant, exist_ok=True)  # to_excel needs the folder
        quant.to_excel(os.path.join(
            dir_quant,
            f"{self._library_id}__{c_t}_quantification_annotation{suf}.xlsx"))

# View

## ToppGene/Quantifications

In [None]:
key_cluster = "32"
output = outs  # see "by markers" version
# output = outs_bg  # see "by specific genes" version

# ToppGene results for the cluster (if available)
if tgdf is not None and key_cluster in tgdf.reset_index(1).index.values:
    print(f"{'=' * 80}\n\n{key_cluster}\n\n"
          f"{tgdf.loc[key_cluster].iloc[:, :-3]}")

# Quantification results for the cluster, from the selected result set
_, percs_exp, n_exp, genes, msg = output[key_cluster]
n_exp = n_exp.copy().set_index("Cell Types", append=True)

# Top 20 percentages, ranked numerically before formatting (ranking the
# formatted strings would sort lexicographically, e.g., "9%" above "85%")
top = percs_exp.stack().replace("", np.nan).dropna().astype(
    float).sort_values(ascending=False).head(20)
print(top.astype(int).astype(str) + "%")
print(f"{genes}\n\n{n_exp.applymap(int)}\n\n{msg}")
print(f"\n\nN = {sum(self.rna.obs[c_t] == key_cluster)}\n\n")
percs_exp.applymap(lambda x: x if x == "" else str(int(x)) + "%")

## Plotting

In [None]:
# Two side-by-side panels: cluster "32" in the clustering `c_t` (left) and
# the item plotted under the given key (right)
# NOTE(review): `plt` is not imported in any visible cell (presumably
# `import matplotlib.pyplot as plt` was run in the session) -- confirm.
# NOTE(review): the key below names sample 50336A, whereas the object loaded
# in the data cell is "Stricture-50403C1" -- confirm that this is intended.
fig, axis = plt.subplots(nrows=1, ncols=2)
ax_left, ax_right = axis
self.plot_spatial(color=c_t, groups=["32"], ax=ax_left)
self.plot_spatial("Stricture-50336A___morphology_focus_scale4", ax=ax_right)

## Specific to Subsets of Cell Types

In [None]:
# Clusters to compare against (passed as `key_compare`)
# comp = ["0", "5", "8", "10", "18", "20", "34"]
comp = ["2", "7", "11", "21", "28"]
count_threshold = 1
key_cluster = "2"

# Genes to quantify: markers whose "Lump" is "Epithelial" and that are in
# the data, plus two extra genes
genes_epithelial = assign[assign.Lump == "Epithelial"].index.intersection(
    self.rna.var_names)
genes_query = [*genes_epithelial, "TP53", "PLAUR"]

result = self.print_markers(
    key_cluster, assign, col_cell_type=c_t, lfc_threshold=None,
    key_compare=comp, count_threshold=count_threshold, p_threshold=1,
    n_top_genes=genes_query)
_, percs_exp, n_exp, genes, msg = result
n_exp = n_exp.copy().set_index("Cell Types", append=True)
print(f"{genes}\n\n{n_exp.applymap(int)}\n\n{msg}")
percs_exp.applymap(lambda x: x if x == "" else str(int(x)) + "%")

# Write All Clusters

In [30]:
# Display the mapping of original sample IDs to "Condition-ID" labels
samp_ids

{'50336C': 'Uninflamed-50336C',
 '50336B': 'Inflamed-50336B',
 '50336A': 'Stricture-50336A',
 '50403C2': 'Stricture-50403C2',
 '50403C1': 'Stricture-50403C1',
 '50403B': 'Inflamed-50403B',
 '50403A1': 'Uninflamed-50403A1',
 '50403A2': 'Uninflamed-50403A2',
 '50217C': 'Stricture-50217C',
 '50217B': 'Uninflamed-50217B',
 '50217A': 'Inflamed-50217A',
 '50006C': 'Stricture-50006C',
 '50006B': 'Uninflamed-50006B',
 '50006A': 'Inflamed-50006A',
 '50445A3': 'Stricture-50445A3',
 '50007B2': 'Stricture-50007B2',
 '50115A2': 'Stricture-50115A2',
 '49696A4': 'Stricture-49696A4',
 '49559A5': 'Stricture-49559A5',
 '49464A4': 'Stricture-49464A4',
 '49471A4': 'Stricture-49471A4',
 '49377A2': 'Stricture-49377A2',
 '50618B5': 'Stricture-50618B5',
 '50564A4': 'Stricture-50564A4',
 '50452C': 'Stricture-50452C',
 '50452B': 'Inflamed-50452B',
 '50452A': 'Uninflamed-50452A'}

In [44]:
# Display the last component of the data directory path (the panel ID)
os.path.basename(
    dir_data)

'TUQ97N'

In [49]:
col_fff = "out_file"  # metadata column holding each sample's data path
# NOTE(review): assumes `construct_file` returns the list of data directory
# paths for the panel -- confirm against corescpy.
fff = np.array(cr.pp.construct_file(directory=direc, panel_id=panel))
bff = np.array([os.path.basename(i) for i in fff])  # base path names
# Sample ID = start of the third "__"-separated field of the directory name
# (parsed from the base name so that "__" in parent folders cannot interfere)
samps = np.array([i.split("__")[2].split("-")[0] for i in bff])
for x in metadata[cso]:
    rows = metadata[cso] == x
    m_f = metadata.loc[rows, col_fff].iloc[
        0]  # ...use to find unconventionally-named files
    # Match by sample ID unless the metadata names a file explicitly (either
    # as a bare directory name or as a path ending in it)
    locx = np.where(samps == x)[0] if pd.isnull(m_f) else np.where(
        bff == os.path.basename(str(m_f)))[0]
    metadata.loc[rows, col_fff] = fff[locx[0]] if (
        len(locx) > 0) else np.nan  # assign output file to metadata row
# Keep only samples with data, indexed by original sample ID
metadata = metadata.dropna(subset=[col_fff]).drop_duplicates().set_index(cso)

'/mnt/cho_lab/bbdata2/outputs/TUQ97N/CHO-001/output-XETG00189__0010700__50452A-TUQ97N-EA__20240126__205019'

In [None]:
# Load Spatial Data & Write Cluster Files for Every Sample
for libid in samp_ids:
    file_h5ad = os.path.join(out_dir, samp_ids[libid] + ".h5ad")
    if not os.path.exists(file_h5ad):
        print(f"\n\nWarning: Critical file(s) for {libid} not found.\n\n")
        continue
    if libid not in metadata.index:  # dropped for lack of an output file
        print(f"\n\nWarning: Output file for {libid} not in metadata.\n\n")
        continue
    self = cr.Spatial(metadata.loc[libid]["out_file"],
                      library_id=samp_ids[libid])
    self.update_from_h5ad(file_h5ad)
    if c_t not in self.rna.obs.columns:
        print(f"\n\nWarning: {c_t} column for {libid} not found.\n\n")
        continue
    self.write_clusters(out_dir, col_cell_type=c_t, overwrite=True,
                        file_prefix=f"{self._library_id}__",
                        n_top="find_markers")

In [103]:
c_t = "leiden_res1pt5_dist0_npc30"
# Faster way for spatial data

# NOTE(review): `sc` (scanpy) must already be imported in the session; the
# import below is commented out in this cell.
# import scanpy as sc

# c_t = "leiden_res1pt5_dist0_npc30"
clusterings = ["res1pt5_dist0_npc30"]

# Manual annotations for all samples, read once; the first index level holds
# the dictionary file names checked below
annots = pd.read_excel(os.path.join(
    out_dir, "annotation_dictionaries/annotations_all.xlsx"),
                       index_col=[0, 1]).iloc[:, :3]
annotated_files = set(annots.index.get_level_values(0))

for libid in samp_ids:
    file_h5ad = os.path.join(out_dir, samp_ids[libid] + ".h5ad")
    if not os.path.exists(file_h5ad):
        print(f"\n\nWarning: Critical file(s) for {libid} not found.\n\n")
        continue
    adata = sc.read(file_h5ad)
    for r in clusterings:
        col_leiden, col_manual = f"leiden_{r}", f"manual_{r}"
        if col_leiden not in adata.obs.columns:
            print(f"\n\nWarning: {col_leiden} column for {libid} "
                  "not found.\n\n")
            continue

        # Leiden cluster of each cell -> <sample>__leiden_<r>.csv
        path_csv = os.path.join(
            out_dir, f"{samp_ids[libid]}__{col_leiden}.csv")
        adata.obs.set_index("cell_id")[col_leiden].to_frame(
            "group").to_csv(path_csv)

        # Manual annotations for this sample & clustering (if any)
        key = f"{samp_ids[libid]}___leiden_{r}_dictionary.xlsx"
        if key not in annotated_files:
            print(f"{libid} not in annotation dictionary.")
            continue
        print(f"{libid} annotated.")
        # Drop missing annotations *before* converting to str, so that
        # clusters without one keep their Leiden label, not "nan"; keys are
        # cast to str to match the string cluster labels being replaced
        labels = annots.loc[key]["annotation"].dropna().astype(str)
        mans = {str(cluster): label for cluster, label in labels.items()}
        leiden = adata.obs[col_leiden].astype(int).astype(str)
        # Plain column assignment so that the categorical dtype is kept
        adata.obs[col_manual] = leiden.replace(mans).astype("category")

        # Manual annotation of each cell -> <sample>__manual_<r>.csv
        path_csv = os.path.join(
            out_dir, f"{samp_ids[libid]}__{col_manual}.csv")
        adata.obs.set_index("cell_id")[col_manual].to_frame(
            "group").to_csv(path_csv)

50336C annotated.
50336B annotated.
50336A annotated.
50403C2 not in annotation dictionary.
50403C1 annotated.
50403B annotated.
50403A1 annotated.
50403A2 annotated.
50217C annotated.
50217B annotated.
50217A annotated.
50006C annotated.
50006B annotated.
50006A annotated.
50445A3 not in annotation dictionary.
50007B2 not in annotation dictionary.
50115A2 not in annotation dictionary.




















50618B5 not in annotation dictionary.
50564A4 annotated.
50452C annotated.
50452B annotated.
50452A annotated.
