# Setup

## Imports & Settings

In [1]:
%load_ext autoreload
%autoreload 2

import os
import re
import scanpy as sc
from anndata import AnnData
import pandas as pd
import numpy as np
import corescpy as cr

# Computing Resources
gpu = False  # read later when the clustering/UMAP options are assembled
sc.settings.n_jobs = 4
sc.settings.max_memory = 150

# Display
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 500)
sc.settings.set_figure_params(dpi=100, frameon=False, figsize=(20, 20))

# Column Names (from Metadata & To Be Created)
col_sample_id_o = "Sample ID"  # sample ID column as found in metadata file
col_sample_id = "Sample"  # new sample ID column (constructed below)
col_subject = "Patient"  # in metadata file
col_inflamed = "Inflamed"  # in metadata file
col_stricture = "Stricture"  # in metadata file
col_condition = "Condition"  # constructed from col_inflamed & col_stricture
col_fff = "file_path"  # column in metadata in which to store data file path
col_cell_type = "Annotation"  # for eventual cluster annotation column
col_tangram = "tangram_prediction"  # for future Tangram imputation annotation

Downloading data from `https://omnipathdb.org/queries/enzsub?format=json`
Downloading data from `https://omnipathdb.org/queries/interactions?format=json`
Downloading data from `https://omnipathdb.org/queries/complexes?format=json`
Downloading data from `https://omnipathdb.org/queries/annotations?format=json`
Downloading data from `https://omnipathdb.org/queries/intercell?format=json`
Downloading data from `https://omnipathdb.org/about?format=text`


## Options & Data

In [2]:
# Runs & Samples
run = ["CHO-001", "CHO-002", "CHO-005", "CHO-006"]
# run = "CHO-006"
samples = ["50007B2"]
# Alternative sample selections ("all" or None = keep every metadata row):
# samples = "all"
# samples = ["50452A"]
# samples = ["50452B"]
# samples = ["50452A", "50452B", "50452C"]
# samples = ["50564A4", "50618B5"]

# Main Directories
# Replace manually or mirror my file/directory tree in your home (`ddu`)
ddu = os.path.expanduser("~")
if os.path.exists("/mnt/cho_lab"):
    ddm = "/mnt/cho_lab"
else:
    ddm = "/mnt"  # Spark?
user = os.getlogin()  # login name used to build per-user data directories
on_cho = "cho" in ddm  # layout of the data tree differs on the cho_lab mount
if on_cho:
    ddl = f"{ddm}/disk2/{user}/data/shared-xenium-library"
else:
    ddl = os.path.join(ddu, "shared-xenium-library")
ddx = f"{ddm}/bbdata0/xenium"  # mounted drive Xenium folder
out_dir = os.path.join(ddl, "outputs", "TUQ97N", "nebraska")  # None = no save
d_path = os.path.join(
    ddm, "disk2" if on_cho else "", user, "data")  # other, e.g., Tangram data
file_ann = os.path.join(
    ddu, "corescpy/examples/annotation_guide.xlsx")  # annotation guide (AG)
annot_df = pd.read_excel(file_ann)
file_mdf = os.path.join(ddl, "Xenium_Samples_03152024.xlsx")  # metadata

# Annotation & Tangram Imputation
# NOTE(review): `col_assignment` is re-bound to a list where the clustering
# options are built further down this cell, so this string is never read.
col_assignment = "Bin"  # which column from annotation file to use
# col_cell_type_sc, file_sc = "ClusterAnnotation", str(
#     f"{d_path}/2023-05-12_CombinedCD-v2_ileal_new.h5ad")
col_cell_type_sc = "cell_type"
file_sc = f"{d_path}/elmentaite_ileal.h5ad"

# Processing & Clustering Options
# Keyword arguments later unpacked into each sample's `.preprocess()` call.
kws_pp = {
    "cell_filter_pmt": None,
    "cell_filter_ncounts": [50, None],
    "cell_filter_ngene": [10, None],
    "gene_filter_ncell": [3, None],
    "gene_filter_ncounts": [3, None],
    "custom_thresholds": None,
    "kws_scale": {"max_value": 10, "zero_center": True},
    "method_norm": "log",
}
# Clustering arguments shared by every resolution/min_dist combination.
kws_cluster = {
    "kws_umap": {"method": "rapids" if gpu else "umap"},
    "genes_subset": list(annot_df.iloc[:, 0]),  # use only markers
    "use_gpu": gpu,
    "use_highly_variable": False,
}
# One entry per (resolution, min_dist, n_comps) combination; the key doubles
# as a file/column suffix, and `col_assignment` holds (in the same order) the
# annotation-guide column to use for that combination.
kws_clustering, col_assignment = {}, []
for res, dist, npc in zip([0.5, 0.75, 1.5], [0.5, 0.3, 0], [30, 30, 30]):
    kws = dict(kws_cluster)
    kws["resolution"] = res
    kws["kws_umap"] = {**kws_cluster["kws_umap"], "min_dist": dist}
    kws["n_comps"] = npc
    res_tag = str(res).replace(".", "pt")  # e.g., 0.75 -> "0pt75"
    dist_tag = str(dist).replace(".", "pt")
    suff = f"res{res_tag}_dist{dist_tag}_npc{npc}"  # file path suffix
    kws_clustering[suff] = kws
    if res >= 1:  # finer resolution -> finer annotation level
        col_assignment.append("Type")
    elif res >= 0.7:
        col_assignment.append("Bin")
    else:
        col_assignment.append("Bucket")

# After this point, no more options to specify
# Just code to infer the data file path from your specifications
# and construct argument dictionaries and manipulate metadata and such.

# Read Metadata & Other Information
metadata = pd.read_excel(file_mdf, dtype={"Slide ID": str})
metadata = metadata.rename(
    {"Name": col_subject, "Inflammation Status": col_inflamed}, axis=1)
keep_all_samples = samples in ["all", None]
if not keep_all_samples:  # subset by sample ID?
    metadata = metadata.set_index(col_sample_id_o)
    metadata = metadata.loc[samples]  # rows in the order given in `samples`
    metadata = metadata.reset_index()

# Revise Metadata & Construct Variables from Options
def _label_condition(row):
    """Return "Stricture" for stricture rows, else the capitalized
    inflammation status of the row."""
    if row[col_stricture].lower() in ["stricture", "yes"]:
        return "Stricture"
    return row[col_inflamed].capitalize()


# Inflammation/stricture condition
metadata.loc[:, col_condition] = metadata.apply(_label_condition, axis=1)

# New sample ID = "<condition>-<original sample ID>", used as the index
sample_labels = metadata[[col_condition, col_sample_id_o]].apply(
    "-".join, axis=1)
metadata.loc[:, col_sample_id] = sample_labels
metadata = metadata.set_index(col_sample_id)

# Candidate data paths & the sample ID parsed from each of them
# (third "__"-separated field, up to its first "-")
fff = np.array(cr.pp.construct_file(run=run, directory=ddx))
samps = np.array([path.split("__")[2].split("-")[0] for path in fff])

# Store the first matching path per sample (NaN if no path matches)
for sid in metadata[col_sample_id_o]:
    hits = np.where(samps == sid)[0]
    matched = fff[hits[0]] if len(hits) > 0 else np.nan
    metadata.loc[metadata[col_sample_id_o] == sid, col_fff] = matched
metadata = metadata.dropna(subset=[col_fff])  # drop samples without data

# Annotation File
assign = pd.read_excel(file_ann, index_col=0)  # markers (first column = index)
assign = assign.dropna(subset=col_assignment).rename_axis("Gene")
is_low_quality = assign["Quality"].isin([-1])
assign = assign[~is_low_quality]  # drop low-quality markers

# {bucket label: unique genes annotated with that label}
bucket_by_gene = assign["Bucket"].reset_index()
marker_genes_dict = {
    bucket: list(pd.unique(grp.Gene))
    for bucket, grp in bucket_by_gene.groupby("Bucket")}  # to dictionary

# Print Metadata & Make Output Directory (If Not Present)
print(metadata)
# `out_dir` may be set to None to disable saving; `os.path.exists(None)`
# raises a TypeError, so only touch the file system when a path is given.
# `exist_ok=True` avoids the check-then-create race of a separate exists().
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Load Data
# Creates one `cr.Spatial` object per metadata row, copies every metadata
# column into the object's `.rna.obs` (as strings), and records the path
# under which that sample's object is to be saved.
kws_init = dict(col_sample_id=col_sample_id, col_subject=col_subject,
                col_cell_type=col_cell_type)  # object creation arguments
selves = [None] * metadata.shape[0]  # to hold different samples
for i, x in enumerate(metadata.index.values):
    row = metadata.loc[x]  # metadata for this sample
    selves[i] = cr.Spatial(row[col_fff], library_id=x, **kws_init)
    for j in metadata:  # iterate metadata columns
        selves[i].rna.obs.loc[:, j] = str(row[j])  # add to object
    # Output path is per-sample: use this object's own library ID (index `i`,
    # not 0), otherwise every sample would share the first sample's path.
    selves[i].rna.obs.loc[:, "out_file"] = os.path.join(
        out_dir, selves[i]._library_id)  # output path (to save object)
    # selves[i].update_from_h5ad(str(selves[i].rna.obs.out_file.iloc[0]))

                  Sample ID  Patient     Status          Project  \
Sample                                                             
Stricture-50007B2   50007B2    50007  Available  scRNA & Spatial   

                         Location Stricture  Inflamed  GRID ID Procedure Date  \
Sample                                                                          
Stricture-50007B2  Terminal Ileum       yes  Inflamed  1011314        2023-11   

                   Age   Sex   Race Hispanic        Diagnosis  \
Sample                                                          
Stricture-50007B2   58  male  white       NO  Crohn's Disease   

                             Project.1  Procedure Disease_Status  \
Sample                                                             
Stricture-50007B2  Helmsley-Senescence  Resection         Active   

                  Date Collected Date Sectioned Date Hybridization Storage 4c  \
Sample                                                               

# Pre-Process, Cluster, Annotate

In [None]:
%%time

for i, s in enumerate(selves):
    f_o = str(selves[0].rna.obs.out_file.iloc[0])

    # Preprocessing
    print("\n\n", kws_pp, "\n\n")
    _ = s.preprocess(**kws_pp, figsize=(15, 15))  # preprocess

    # Clustering at Different Resolutions & Minimum Distances
    for x in kws_clustering:

        # Variables & Output Files
        print(kws_clustering[x])
        cct, cca, f_i = f"leiden_{x}", f"label_{x}", f"{s._library_id}_{x}"
        f_m = f"{f_o}__{cct}_markers.csv" if out_dir else None
        annot = assign[[col_assignment[j]]]

        # Clustering
        _ = s.cluster(**kws_clustering[x], key_added=cct, out_file=f_o)
        _ = s.plot_spatial(color=cct)
        _ = s.find_markers(col_cell_type=cct, kws_plot=False)
        _ = s.annotate_clusters(annot, col_cell_type=cct, col_annotation=cca)
        _ = s.plot_spatial(color=cca)
        for c in [cct, cca]:  # Leiden & annotations to Xenium Explorer files
            s.write_clusters(out_dir, col_cell_type=c, overwrite=True,
                             file_prefix=f"{s._library_id}__", n_top=True)

    # Write Final Object
    s.write(f_o)

In [36]:
%%time

for i, s in enumerate(selves):
    f_o = str(selves[0].rna.obs.out_file.iloc[0])

    # Preprocessing

    # Clustering at Different Resolutions & Minimum Distances
    for j, x in enumerate(kws_clustering):

        # Variables & Output Files
        cct, cca, f_i = f"leiden_{x}", f"label_{x}", f"{s._library_id}_{x}"
        f_m = f"{f_o}__{cct}_markers.csv" if out_dir else None
        annot = assign[[col_assignment[j]]]

        # Clustering
        _ = s.annotate_clusters(annot, col_cell_type=cct, col_annotation=cca)
        for c in [cct, cca]:  # Leiden & annotations to Xenium Explorer files
            s.write_clusters(out_dir, col_cell_type=c, overwrite=True,
                             file_prefix=f"{s._library_id}__", n_top=True)

    # Write Final Object
    s.write(f_o)

