# Setup

## Imports & Settings

In [1]:
%load_ext autoreload
%autoreload 2

import os
import re
import itertools
import scipy
import scanpy as sc
import seaborn as sb
import pandas as pd
import numpy as np
import corescpy as cr

# Computing Resources
gpu = False  # also selects the UMAP backend and `use_gpu` in clustering options
sc.settings.n_jobs = 8
# sc.settings.max_memory = 150

# Display
pd.options.display.max_colwidth = 1000
pd.options.display.max_columns = 100
pd.options.display.max_rows = 500
sc.settings.set_figure_params(dpi=100, frameon=False, figsize=(20, 20))

# Panel & Column Names (from Metadata & To Be Created)
# panel = "XR4UZH"
panel = "TUQ97N"
suffix = ""  # no file suffix for object h5ad file (main object)
capitalize_sample = panel == "TUQ97N"  # only this panel capitalizes samples
# suffix = "_new"  # suffix for object h5ad file (to avoid overwrite)

# Samples/Runs
use_prior_clustering = False
run = None  # just look for samples in all Xenium runs for the panel
# run = "CHO-001"  # run all from this run; so don't have to specify samples
# samples = "all"  # use samples = "all" with run = something for all from run
# or run = None for all available samples
# samples = ["49471A4", "49696A4"]
samples = ["49559A5A", "49559A5B"]

# Main Directories
# Each location is chosen by probing which candidate exists on this machine
usr_write_rel_path = f"{os.getlogin()}/data/shared-xenium-library"

# HPC entry point: use the first candidate if present, else the second
d_hpc = "/mnt/cho_lab"
if not os.path.exists(d_hpc):
    d_hpc = "/sc/arion/projects/untreatedIBD"

# Data directory under the HPC entry point
d_nfs = os.path.join(d_hpc, "bb-xenium-registry")
if not os.path.exists(d_nfs):
    d_nfs = os.path.join(d_hpc, "chobiolab-core/shared-xenium-library")

# User-writeable directory (placed under "disk2" when that directory exists)
if os.path.exists(os.path.join(d_hpc, "disk2")):
    d_usr = os.path.join(d_hpc, "disk2", usr_write_rel_path)
else:
    d_usr = os.path.join(d_hpc, usr_write_rel_path)

# Image directory (layout depends on whether the entry point is on "arion")
if "arion" in d_hpc:
    d_img = os.path.join(d_hpc, f"cache/tissue-registry/xenium/{panel}")
else:
    d_img = os.path.join(
        d_hpc, f"bb-nfs-data-registries/tissue-registry/xenium/{panel}")

# Construct Directories (Less Likely to Need Changes)
# Mirror my file/directory tree in the `d_usr` directory
# out_dir = None  # don't write any outputs
out_subdir_markers = "find_markers"  # sub-directory under out_dir for markers
out_subdir_cluster = "explorer_files"  # sub-directory for cluster-cell ID csv
out_dir = os.path.join(d_usr, f"outputs/{panel}/nebraska")  # to save objects
out_dir_plot = None if out_dir is None else os.path.join(
    out_dir, "plots")  # plot output directory
# Guard the markers path the same way as the plot path: with `out_dir = None`
# an unguarded `os.path.join(None, ...)` raises a TypeError
out_dir_markers = None if out_dir is None else os.path.join(
    out_dir, out_subdir_markers)  # markers output directory (display only)
file_mdf = os.path.join(d_usr, f"samples_{panel}.csv")  # metadata file path
print(f"\n\n\n{'=' * 80}\nDirectories\n{'=' * 80}\n\nHPC Entry Point (Cho): "
      f"{d_hpc}\nData: {d_nfs}\nMetadata: {file_mdf}\nImages: {d_img}\n"
      f"Object/Outputs:\n\t{out_dir} (objects)\n\t{out_dir_plot} (plots)\n\t"
      f"{out_dir_markers} (markers)\n\n\n")

# Automated Annotation Options
file_ann = None  # to skip marker-based annotation
# file_ann = os.path.join("~/corescpy/examples/markers_lineages.csv")
col_assignment = None  # column in annotation file whose labels to use
# col_assignment = "Bin"  # all clustering versions use same annotation column
# col_assignment = ["group", "Bin", "Bin"]  # (order corresponds to res_list)

# Preprocessing Options
# NOTE: `outlier_mads` is only referenced by the commented-out `kws_pp`
# alternative below; the active `kws_pp` does not pass it
outlier_mads = {"n_counts": [1.25, None]}
# outlier_mads = None
# kws_pp = dict(cell_filter_ngene=[3, None], gene_filter_ncell=[3, None],
#               gene_filter_ncounts=[3, None], custom_thresholds=None,
#               kws_scale=dict(max_value=10, zero_center=True),
#               outlier_mads=outlier_mads, method_norm="log")  # preprocessing
kws_pp = {  # preprocessing old segmentation TUQ97N
    "cell_filter_pmt": None,
    "cell_filter_ncounts": [15, None],
    "cell_filter_ngene": [3, None],
    "gene_filter_ncell": [3, None],
    "gene_filter_ncounts": [3, None],
    "custom_thresholds": None,
    "kws_scale": {"max_value": 10, "zero_center": True},
    "method_norm": "log",
}
# kws_pp = None   # if loading object already preprocessed

# Clustering Options
genes_subset = None  # use all genes in clustering
# genes_subset = list(pd.read_csv(file_ann).iloc[:, 0])  # only cell markers
kws_cluster = {  # shared by every clustering version
    "kws_umap": {"method": "rapids" if gpu else "umap"},
    "genes_subset": genes_subset,  # use only markers
    "use_gpu": gpu,
    "use_highly_variable": False,
}
# The three lists below are parallel: entry k of each defines version k
# res_list = [1.5, 0.75, 0.5]  # resolutions (iterate clustering runs)
# min_dist_list = [0, 0.3, 0.5]  # distances (order corresponds to res_list)
# n_comps_list = [30, 30, 30]  # PCA components (order same as res_list)
res_list = [1.5]  # resolutions (iterate different clustering runs)
min_dist_list = [0]  # distances (order corresponds to res_list)
n_comps_list = [30]  # PCA components (order corresponds to res_list)
kws_clustering_spatial = None  # specify to perform spatial clustering
suffix_clustering_spatial = None  # column key for spatial clustering results
# ^ should parallel the parameters, like normal clustering does
# e.g., res0pt75_dist0pt3_npc30

Downloading data from `https://omnipathdb.org/queries/enzsub?format=json`
Downloading data from `https://omnipathdb.org/queries/interactions?format=json`
Downloading data from `https://omnipathdb.org/queries/complexes?format=json`
Downloading data from `https://omnipathdb.org/queries/annotations?format=json`
Downloading data from `https://omnipathdb.org/queries/intercell?format=json`
Downloading data from `https://omnipathdb.org/about?format=text`





Directories

HPC Entry Point (Cho): /mnt/cho_lab
Data: /mnt/cho_lab/bb-xenium-registry
Metadata: /mnt/cho_lab/disk2/elizabeth/data/shared-xenium-library/samples_TUQ97N.csv
Images: /mnt/cho_lab/bb-nfs-data-registries/tissue-registry/xenium/TUQ97N
Object/Outputs:
	/mnt/cho_lab/disk2/elizabeth/data/shared-xenium-library/outputs/TUQ97N/nebraska (objects)
	/mnt/cho_lab/disk2/elizabeth/data/shared-xenium-library/outputs/TUQ97N/nebraska/plots (plots)
	/mnt/cho_lab/disk2/elizabeth/data/shared-xenium-library/outputs/TUQ97N/nebraska/find_markers (markers)





## Setup

Get constants (e.g., metadata column names), read the metadata, and build a dictionary of clustering parameters from `res_list`, `min_dist_list`, and `n_comps_list` (so that multiple clustering versions, e.g., at different resolutions, can be run in a loop). Then create any output directories that don't exist yet (e.g., for processed objects, plots, find-markers results, and Xenium Explorer cluster files) and load the data into objects.

In [10]:
# Scratch value: one of the sample IDs listed in `samples`
x = "49559A5A"

In [16]:
# NOTE(review): scratch inspection cell; `m_f` is not assigned in any visible
# cell, so this fails on a fresh kernel - confirm its origin or remove
m_f

'/mnt/cho_lab/bb-xenium-registry/outputs/TUQ97N/CHO-004/output-XETG00189__0011167__Region_1__20240229__215113'

In [15]:
# NOTE(review): scratch inspection cell; `bff` is not assigned in any visible
# cell, so this fails on a fresh kernel - confirm its origin or remove
bff

array(['output-XETG00189__0022234__50618B5-TUQ97N-EA__20240130__212525',
       'output-XETG00189__0022235__50564A4-TUQ97N-EA__20240130__212525',
       'output-XETG00189__0011170__Region_1__20240229__215114',
       'output-XETG00189__0011167__Region_1__20240229__215113',
       'output-XETG00189__0011167__Region_2__20240229__215114',
       'output-XETG00189__0011167__Region_4__20240229__215114',
       'output-XETG00189__0010700__50452B-TUQ97N-EA__20240126__205019',
       'output-XETG00189__0010700__50452A-TUQ97N-EA__20240126__205019',
       'output-XETG00189__0010663__50452C-TUQ97N-EA__20240126__205019',
       'output-XETG00189__0021979__50403C2-TUQ97N-EA__20240516__190239',
       'output-XETG00189__0021978__50403C1-TUQ97N-EA__20240516__190239',
       'output-XETG00189__0010490__49471A4-TUQ97N-EA__20240314__211504',
       'output-XETG00189__0010923__50445A3-TUQ97N-EA__20240314__211504',
       'output-XETG00189__0015521__50217A-TUQ97N-EA__20240415__212906',
       'output-XET

In [17]:
# Preview the metadata for this panel (no `samples`/`run` arguments passed)
cr.pp.get_metadata_cho(d_nfs, file_mdf, panel_id=panel,
                       capitalize_sample=capitalize_sample)

Unnamed: 0_level_0,sample_id,block_id,subject_id,run_id,panel_id,slide_id,grid,project,clinical_block,description,date_hybridization,date_sectioned,age,sex,race,hispanic,diagnosis,location,inflammation,stricture,Condition,disease_status
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Stricture-49559A5C,49559A5C,,49559,CHO-004,TUQ97N,11167,,,YES,/mnt/cho_lab/bb-xenium-registry/outputs/TUQ97N/CHO-004/output-XETG00189__0011167__Region_4__20240229__215114,2024-02-29 00:00:00.000,2024-01-30 00:00:00.000,,,,,,Terminal Ileum,inflamed,yes,stricture,
Stricture-49559A5B,49559A5B,,49559,CHO-004,TUQ97N,11167,,,YES,/mnt/cho_lab/bb-xenium-registry/outputs/TUQ97N/CHO-004/output-XETG00189__0011167__Region_2__20240229__215114,2024-02-29 00:00:00.000,2024-01-30 00:00:00.000,,,,,,Terminal Ileum,inflamed,yes,stricture,
Stricture-49559A5A,49559A5A,,49559,CHO-004,TUQ97N,11167,,,YES,/mnt/cho_lab/bb-xenium-registry/outputs/TUQ97N/CHO-004/output-XETG00189__0011167__Region_1__20240229__215113,2024-02-29 00:00:00.000,2024-01-30 00:00:00.000,,,,,,Terminal Ileum,inflamed,yes,stricture,
Stricture-49559A5C-,49559A5C-,,49559,,,11167,,,YES,/mnt/cho_lab/bb-xenium-registry/outputs/TUQ97N/CHO-004/output-XETG00189__0011167__Region_4__20240229__215114,2024-02-29 14:53:30.000,2024-01-30 14:53:18.000,,,,,,Terminal Ileum,inflamed,yes,stricture,
Stricture-49559A5B-,49559A5B-,,49559,,,11167,,,YES,/mnt/cho_lab/bb-xenium-registry/outputs/TUQ97N/CHO-004/output-XETG00189__0011167__Region_2__20240229__215114,2024-02-29 14:53:26.000,2024-01-30 14:53:13.000,,,,,,Terminal Ileum,inflamed,yes,stricture,
Uninflamed-50403A2,50403A2,,50403,CHO-011,TUQ97N,10589,,,NO,/mnt/cho_lab/bb-xenium-registry/outputs/TUQ97N/CHO-011/output-XETG00189__0010589__50403A2-TUQ97N-EA__20240513__201050,2024-05-07 00:00:00.000,2024-03-21 00:00:00.000,,,,,,Terminal Ileum,uninflamed,no,uninflamed,
Uninflamed-50403A1,50403A1,,50403,CHO-011,TUQ97N,10589,,,NO,/mnt/cho_lab/bb-xenium-registry/outputs/TUQ97N/CHO-011/output-XETG00189__0010589__50403A1-TUQ97N-EA__20240513__201050,2024-05-07 00:00:00.000,2024-03-21 00:00:00.000,,,,,,Terminal Ileum,uninflamed,no,uninflamed,
Uninflamed-50336C,50336C,,50336,CHO-010,TUQ97N,11047,,,NO,/mnt/cho_lab/bb-xenium-registry/outputs/TUQ97N/CHO-010/output-XETG00189__0011047__50336C-TUQ97N-EA__20240422__175051,,2024-03-21 00:00:00.000,,,,,,Terminal Ileum,uninflamed,no,uninflamed,
Inflamed-50336B,50336B,,50336,CHO-010,TUQ97N,11044,,,NO,/mnt/cho_lab/bb-xenium-registry/outputs/TUQ97N/CHO-010/output-XETG00189__0011044__50336B-TUQ97N-EA__20240422__175051,,2024-03-21 00:00:00.000,,,,,,Terminal Ileum,inflamed,no,inflamed,
Stricture-50336A,50336A,,50336,CHO-009,TUQ97N,10496,,,NO,/mnt/cho_lab/bb-xenium-registry/outputs/TUQ97N/CHO-009/output-XETG00189__0010496__50336A-TUQ97N-EA__20240418__170624,2024-04-15 00:00:00.000,2024-03-21 00:00:00.000,,,,,,Terminal Ileum,inflamed,yes,stricture,


In [None]:
# Get/Set Constants
# Look up each panel constant once; a key absent from the panel's constants
# yields None rather than raising
constants_dict = cr.get_panel_constants(panel_id=panel)
constant_keys = [
    "col_sample_id_o", "col_sample_id", "col_condition", "col_inflamed",
    "col_subject", "col_stricture", "key_stricture", "col_data_dir",
    "col_tangram", "col_segment", "col_object"]
constants_found = {
    k: (constants_dict[k] if k in constants_dict else None)
    for k in constant_keys}
col_sample_id_o = constants_found["col_sample_id_o"]
col_sample_id = constants_found["col_sample_id"]
col_condition = constants_found["col_condition"]
col_inflamed = constants_found["col_inflamed"]
col_subject = constants_found["col_subject"]
col_stricture = constants_found["col_stricture"]
key_stricture = constants_found["key_stricture"]
col_f = constants_found["col_data_dir"]  # note: variable name differs from key
col_tangram = constants_found["col_tangram"]
col_segment = constants_found["col_segment"]
col_object = constants_found["col_object"]

# Construct Clustering Keyword Dictionary
# One entry per clustering version, keyed by a suffix that encodes its
# parameters (e.g., "res1pt5_dist0_npc30"); each value is `kws_cluster` plus
# that version's resolution, number of PCs, and UMAP minimum distance.
# The three lists are parallel, so fail loudly on a length mismatch instead
# of letting `zip` silently drop the unmatched entries.
if not len(res_list) == len(min_dist_list) == len(n_comps_list):
    raise ValueError(
        "`res_list`, `min_dist_list`, and `n_comps_list` must have the same "
        f"length (got {len(res_list)}, {len(min_dist_list)}, and "
        f"{len(n_comps_list)}).")
kws_clustering = {}
for res, min_dist, n_comps in zip(res_list, min_dist_list, n_comps_list):
    kws = {**kws_cluster, "resolution": res, "n_comps": n_comps,
           "kws_umap": {**kws_cluster["kws_umap"], "min_dist": min_dist}}
    # Decimal points become "pt" so the suffix is safe in file/column names
    suff = (f"res{str(res).replace('.', 'pt')}_dist"
            f"{str(min_dist).replace('.', 'pt')}_npc{n_comps}")
    kws_clustering[suff] = kws

# Read Metadata
metadata = cr.pp.get_metadata_cho(
    d_nfs, file_mdf, panel_id=panel, samples=samples, run=run,
    capitalize_sample=capitalize_sample)  # get metadata
# Print the key columns that exist in the metadata. Selecting through a list
# (de-duplicated with `dict.fromkeys`) instead of a set keeps the column order
# fixed; constants that are None or absent from the metadata are skipped.
cols_show = list(dict.fromkeys(
    c for c in [col_sample_id_o, col_subject, col_condition, col_inflamed,
                col_stricture, col_segment]
    if c is not None and c in metadata.columns))
print("\n\n", metadata[cols_show])

# Annotation File
if file_ann is None:
    assign = None  # no marker file given
else:
    # Keep rows with a value in the assignment column(s); index by gene
    assign = pd.read_csv(file_ann)
    assign = assign.dropna(subset=col_assignment)
    assign = assign.set_index("gene").rename_axis("Gene")
# assign = assign[~assign.Quality.isin([-1])]  # drop low-quality markers
if isinstance(col_assignment, str):
    # A single column name applies to every clustering version
    col_assignment = [col_assignment] * len(res_list)

# Create Objects
# Make output directories. The sub-directories are only built when `out_dir`
# is set, so `out_dir = None` (write nothing) does not hit
# `os.path.join(None, ...)`.
dirs_out = [out_dir, out_dir_plot]
if out_dir is not None:
    dirs_out += [os.path.join(out_dir, out_subdir_markers),
                 os.path.join(out_dir, out_subdir_cluster)]
for d in dirs_out:
    if d:
        os.makedirs(d, exist_ok=True)
kws_init = dict(col_sample_id=col_sample_id, col_subject=col_subject,
                col_cell_type=f"leiden_{list(kws_clustering.keys())[0]}")
cols_meta = list(metadata.dropna(how="all", axis=1))  # skip all-NA columns
selves = [None] * metadata.shape[0]  # to hold different samples
for i, x in enumerate(metadata.index.values):
    # Object output path (None if no outputs are to be written)
    out = None if out_dir is None else os.path.join(out_dir, x + suffix)
    has_processed = out is not None and os.path.exists(out + ".h5ad")

    # Ensure No Overwrite of Prior Preprocessing or Skipping Preprocessing
    # without Loading Prior Preprocessed Object
    if has_processed:  # if processed object file exists...
        if kws_pp is not None:  # don't overwrite with new preprocessing
            raise ValueError(f"\n\nProcessed object already exists!\n{out}"
                             ".h5ad.\nSpecify different file suffix, or set "
                             "`kws_pp` to None to reload processed object.")
    elif kws_pp is None:  # doesn't exist & no preprocessing parameters given
        raise ValueError(f"\n\nProcessed object doesn't exist!\n{out}.\n"
                         "Specify `kws_pp` to perform new preprocessing "
                         "or ensure processed object paths are correct.")

    # Load Data into Object (Update with Prior Preprocessed Object if Exists)
    row = metadata.loc[x]  # this sample's metadata
    selves[i] = cr.Spatial(row[col_f], library_id=x, **kws_init)
    if has_processed:  # `kws_pp` is None here (otherwise raised above)
        selves[i].update_from_h5ad(out)  # update with prior preprocessing

    # Add metadata to object (values are stored as strings)
    for j in cols_meta:  # add metadata to .obs
        selves[i].rna.obs.loc[:, j] = str(row[j])
    if out is not None:
        # Path for processed object; later cells read it back from .obs
        selves[i].rna.obs.loc[:, col_object] = out

# Clustering

## Processing, Leiden, Annotation

In [None]:
%%time

for i, s in enumerate(selves):  # one object per sample
    # Path of the processed object, read back from .obs (None: write nothing)
    if out_dir is None:
        f_o = None
    else:
        f_o = str(s.rna.obs[col_object].iloc[0])

    # Preprocessing (or report that a previously-processed object is in use)
    if kws_pp is None:
        print(f"\n\n***** Using Prior Preprocessing\n\n{s.rna.obs.iloc[[0]]}")
    else:
        print("\n\n", kws_pp, "\n\n")
        _ = s.preprocess(**kws_pp, figsize=(15, 15))

    # One pass per clustering version (resolution / min. distance / # of PCs)
    for j, x in enumerate(kws_clustering):
        print(f"\n\n{'=' * 80}\n{x}\n{'=' * 80}\n\n")
        cct = f"leiden_{x}"  # Leiden cluster column
        cca = f"label_{x}"  # annotation column

        # Cluster unless prior results are requested and already present;
        # markers are (re)computed either way
        reuse_prior = use_prior_clustering is True and cct in s.rna.obs
        if reuse_prior:
            print("Using prior clustering results...")
        else:
            _ = s.cluster(**kws_clustering[x], key_added=cct, out_file=f_o)
        _ = s.find_markers(col_cell_type=cct, kws_plot=False)

        # Marker-based annotation (only when an annotation table was loaded)
        if assign is not None:
            _ = s.annotate_clusters(
                assign[[col_assignment[j]]], col_cell_type=cct,
                col_annotation=cca)

        # Everything below writes files; skip when outputs are disabled
        if out_dir is None:
            continue

        # Xenium Explorer cluster files for whichever columns now exist
        # NOTE(review): this writes to `out_dir` itself, whereas the
        # spatial-clustering cell writes to `out_dir`/`out_subdir_cluster`
        # - confirm which location is intended
        cols_written = [k for k in [cct, cca] if k in s.rna.obs]
        for c in cols_written:
            s.write_clusters(out_dir, col_cell_type=c, overwrite=True,
                             file_prefix=f"{s._library_id}__",
                             n_top=out_subdir_markers)

        # Write the object after every clustering version
        s.write(f_o)

# Plot Clusters

## Plot Clusters Individually (Save in Same PDF if `out_dir` is not None)

In [None]:
# Plot each available cluster/annotation column per sample & clustering
# version; plots are only made when `out_dir_plot` is set
for s in selves:
    for x in kws_clustering:
        print(f"\n\n{'=' * 80}\n{x}\n{'=' * 80}\n\n")
        for c in [f"leiden_{x}", f"label_{x}", f"manual_{x}"]:
            if c not in s.rna.obs:
                print(f"\n\n{c} not in {s.rna.obs.columns}.\n\n")
                continue  # don't try to plot a column that doesn't exist
            if out_dir_plot is not None:
                pfp = os.path.join(out_dir_plot, s._library_id, f"{c}.pdf")
                s.plot_clusters(col_cell_type=c, out_dir=pfp, multi_pdf=True)

## Plot Clusters (Overall; No Save)

In [None]:
# for s in selves:
#     for j, x in enumerate(kws_clustering):
#         _ = s.plot_spatial(color=list(set([
#             f"leiden_{x}", f"label_{x}"]).intersection(s.rna.obs.columns)))

# Spatially-Informed Clustering (Optional)

In [None]:
# Optional spatially-informed clustering; runs only if parameters were given
if kws_clustering_spatial is not None:
    for s in selves:
        # Path of the processed object, read back from .obs (None: no write)
        f_o = None if out_dir is None else str(s.rna.obs[col_object].iloc[0])
        cct = f"leiden_spatial_{suffix_clustering_spatial}"  # cluster column
        cca = f"annotation_{cct}"  # annotation column
        _ = s.cluster_spatial(key_added=cct, **kws_clustering_spatial)
        _ = s.find_markers(col_cell_type=cct, kws_plot=False)
        # `assign` is None when no annotation file is specified, so guard the
        # annotation step as the main clustering cell does
        if assign is not None:
            _ = s.annotate_clusters(
                assign[[col_assignment[-1]]], col_cell_type=cct,
                col_annotation=cca)
        # Only plot/write columns that exist (annotation may be skipped)
        for c in [k for k in [cct, cca] if k in s.rna.obs]:
            s.plot_spatial(c)
            if out_dir is not None:
                s.write_clusters(os.path.join(out_dir, out_subdir_cluster),
                                 col_cell_type=c, overwrite=True,
                                 file_prefix=f"{s._library_id}__",
                                 n_top=out_subdir_markers)
        if out_dir is not None:
            s.write(f_o)

# Analyze

The first clustering version (first specified in `res_list`) is the cell type column used by default in downstream analyses (because it was specified in `kws_init["col_cell_type"]` when creating the object and thus is stored in `self._columns["col_cell_type"]`). Specify `col_cell_type` as an argument in the following functions to use a different column.

## Centrality Scores

In [None]:
%%time

# Centrality scores for every sample, using the job count set in Scanpy
n_jobs_centrality = sc.settings.n_jobs
for s in selves:
    s.calculate_centrality(n_jobs=n_jobs_centrality)

## Neighborhood Enrichment Analysis

In [None]:
%%time

# Neighborhood enrichment for every sample (return value discarded)
kws_neighborhood = {"figsize": (60, 30)}
for s in selves:
    _ = s.calculate_neighborhood(**kws_neighborhood)

## Cell Type Co-Occurrence

In [None]:
%%time

# Cell type co-occurrence for every sample (return value discarded)
kws_cooccurrence = {"figsize": (60, 20), "kws_plot": {"wspace": 3}}
for s in selves:
    _ = s.find_cooccurrence(**kws_cooccurrence)

## Spatially-Variable Genes

In [None]:
%%time

# Spatially-variable genes for every sample; plot options are shared
kws = {"kws_plot": {"legend_fontsize": "large"}, "figsize": (15, 15)}
for s in selves:
    _ = s.find_svgs(genes=15, method="moran", n_perms=10, **kws)

## GEX

In [None]:
# for s in selves:
#     s.plot_spatial(color=["TNF", "IL23", col_cell_type])