In [61]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import re
import math
import functools
import traceback
import seaborn as sb
import matplotlib.pyplot as plt
import anndata
import scanpy as sc
import spatialdata_plot
import numpy as np
import pandas as pd
import corescpy as cr

# Main
write_object = True  # save objects? (currently enabled; set False to skip)
overwrite = False  # overwrite if already exists?
col_cell_type = "leiden_res1pt5_dist0_npc30"
col_ann = "Bucket"

# Process Options
panel = "TUQ97N"  # Xenium panel ID
constants_dict = cr.get_panel_constants(panel_id=panel)
libs = [  # sample IDs from patients for whom we have all conditions
    "50452A", "50452B", "50452C",  # old segmentation
    "50006A", "50006B", "50006C",  # rest are new segmentation
    "50217A", "50217B", "50217C",
    "50336B", "50336C", "50336A",
    "50403A2", "50403B", "50403C1"
]  # excludes low-quality sample/condition replicates 50403A1 & 50403C2
# libs = None  # to run all available samples
cols = [  # per-cell QC columns summarised in later cells
    "transcript_counts", "cell_area", "nucleus_area", "nucleus_count",
    "control_probe_counts", "control_codeword_counts",
    "unassigned_codeword_counts", "deprecated_codeword_counts"
]
input_suffix = ""  # in case want to load objects with some suffix

# Files & Directories
direc = "/mnt/cho_lab/bbdata2/"  # mounted NFS with data
dir_entry = "/mnt/cho_lab/disk2"  # Spark writeable data directory
mdf = ("/mnt/cho_lab/disk2/elizabeth/data/shared-xenium-library/samples_"
       f"{panel}.csv")  # metadata file path (for now; will soon be on NFS)
dir_writeable = os.path.join(
    dir_entry, "elizabeth/data/shared-xenium-library")  # where objects are
out_dir = os.path.join(
    dir_writeable, f"outputs/{panel}/nebraska")  # object output directory

#  Your Folders
# NOTE(review): os.getlogin() needs a controlling terminal and can raise
# OSError under some kernel launch setups; getpass.getuser() may be a safer
# choice -- confirm it resolves to the same user name before switching.
out_new = os.path.join(
    dir_entry,
    f"{os.getlogin()}/data/shared-xenium-library/outputs/{panel}/nebraska")

# Constants (Shouldn't Need Edits Unless Extreme Process Changes)
# Missing keys resolve to None rather than raising.
cso, col_sample, col_condition, col_inflamed, col_subject = [
    constants_dict.get(x) for x in [
        "col_sample_id_o", "col_sample_id", "col_condition",
        "col_inflamed", "col_subject"]]
dir_data = os.path.join(direc, f"outputs/{panel}")
# All data paths, relative to dir_data, as "<run>/<entry>". Stray files
# sitting next to the run folders are skipped instead of crashing listdir,
# and an empty dir_data yields an empty list.
files = [
    os.path.join(run, entry)
    for run in os.listdir(dir_data)
    if os.path.isdir(os.path.join(dir_data, run))
    for entry in os.listdir(os.path.join(dir_data, run))
]
os.makedirs(out_dir, exist_ok=True)  # make output directory if needed
metadata = cr.pp.get_metadata_cho(direc, mdf, panel_id=panel, samples=libs)
metadata[col_subject]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Sample
Uninflamed-50452A     50452
Inflamed-50452B       50452
Stricture-50452C      50452
Inflamed-50006A       50006
Uninflamed-50006B     50006
Stricture-50006C      50006
Inflamed-50217A       50217
Uninflamed-50217B     50217
Stricture-50217C      50217
Inflamed-50336B       50336
Uninflamed-50336C     50336
Stricture-50336A      50336
Uninflamed-50403A2    50403
Inflamed-50403B       50403
Stricture-50403C1     50403
Name: subject_id, dtype: int64

In [None]:
# Per-sample QC: count cells before/after processing and summarise the QC
# columns per cluster, then write both tables to out_new/quantification.
meta_by_id = metadata.reset_index().set_index(cso)  # loop-invariant lookup
dir_quant = os.path.join(out_new, "quantification")
os.makedirs(dir_quant, exist_ok=True)  # to_csv does not create directories

n_origin, n_proc, obs_by_sample = {}, {}, {}
for s in libs:  # iterate samples
    print(f"\n\n{'=' * 80}\n{s}\n{'=' * 80}\n\n")
    matches = [x for x in files
               if s == os.path.basename(x).split("__")[2].split("-")[0]]
    if not matches:  # fail with the sample named instead of a bare IndexError
        raise FileNotFoundError(
            f"No Xenium data directory found for sample {s} in {dir_data}")
    fff = os.path.join(dir_data, matches[0])  # sample's Xenium data directory
    lib = meta_by_id.loc[s, col_sample]
    file_obj_proc = os.path.join(out_dir, f"{lib}{input_suffix}.h5ad")
    self = cr.Spatial(fff, library_id=lib)  # load original data
    adata = sc.read_h5ad(file_obj_proc)  # processed adata
    n_origin[s], n_proc[s] = self.rna.obs.shape[0], adata.obs.shape[0]
    # describe() per cluster gives (variable, statistic) column pairs
    obs_by_sample[s] = adata.obs[adata.obs.columns.intersection(
        cols + [col_cell_type])].groupby(col_cell_type).describe().copy()
    # Title each pairplot so the per-sample figures can be told apart
    sb.pairplot(adata.obs[adata.obs.columns.intersection(
        cols)]).figure.suptitle(str(lib), y=1.02)

n_cells = pd.concat([pd.Series(x, index=pd.Index(libs, name=cso))
                     for x in [n_origin, n_proc]],
                    keys=["Original", "Processed"], names=["Source"])
obs_wide = pd.concat(obs_by_sample, keys=libs, names=[cso])
n_cells.unstack("Source").to_csv(os.path.join(
    dir_quant, f"xenium_n_cells{input_suffix}.csv"))
obs_wide.to_csv(os.path.join(dir_quant, f"xenium_qc{input_suffix}.csv"))

# Downstream cells select obs[["25%", "50%", "75%"]], i.e. they expect the
# statistics as plain columns with the variable moved into the row index --
# the same layout the "Reload" cell builds from the saved CSV.
obs = obs_wide.rename_axis(["Variable", "Metric"], axis=1).stack(0)

In [27]:
# Reload previously saved QC tables (uncomment to skip re-running the QC loop)

# n_cells = pd.read_csv(os.path.join(
#     out_new, f"quantification/xenium_n_cells{input_suffix}.csv"),
#                       index_col=0)
# obs = pd.read_csv(os.path.join(
#     out_new, f"quantification/xenium_qc{input_suffix}.csv"),
#                   header=[0, 1], index_col=[0, 1]).rename_axis([
#                       "Variable", "Metric"], axis=1).stack(0)

In [43]:
# Quartile columns of the per-cluster QC summary
obs.loc[:, ["25%", "50%", "75%"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Metric,25%,50%,75%
sample_id,leiden_res1pt5_dist0_npc30,Variable,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
50452A,0,cell_area,78.797659,129.214614,206.488250
50452A,0,control_codeword_counts,0.000000,0.000000,0.000000
50452A,0,control_probe_counts,0.000000,0.000000,0.000000
50452A,0,deprecated_codeword_counts,0.000000,0.000000,0.000000
50452A,0,nucleus_area,19.868751,29.170939,40.324533
...,...,...,...,...,...
50403C1,34,nucleus_area,14.303243,19.642969,27.838829
50403C1,34,nucleus_count,1.000000,1.000000,1.000000
50403C1,34,total_counts,20.000000,27.000000,36.000000
50403C1,34,transcript_counts,20.000000,27.000000,36.000000


In [47]:
# Quartiles in long format (one row per statistic), annotated with the
# sample-level metadata matched on the original sample ID.
(
    obs[["25%", "50%", "75%"]]
    .stack()
    .to_frame("Value")
    .join(metadata.reset_index().set_index(cso))
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Value,Sample,block_id,subject_id,run_id,panel_id,slide_id,grid,project,clinical_block,...,age,sex,race,hispanic,diagnosis,location,inflammation,stricture,Condition,disease_status
sample_id,leiden_res1pt5_dist0_npc30,Variable,Metric,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
50452A,0,cell_area,25%,78.797659,Uninflamed-50452A,,50454,CHO-001,TUQ97N,10700,,,NO,...,,,,,,Terminal Ileum,uninflamed,no,uninflamed,
50452A,0,cell_area,50%,129.214614,Uninflamed-50452A,,50454,CHO-001,TUQ97N,10700,,,NO,...,,,,,,Terminal Ileum,uninflamed,no,uninflamed,
50452A,0,cell_area,75%,206.488250,Uninflamed-50452A,,50454,CHO-001,TUQ97N,10700,,,NO,...,,,,,,Terminal Ileum,uninflamed,no,uninflamed,
50452A,0,control_codeword_counts,25%,0.000000,Uninflamed-50452A,,50454,CHO-001,TUQ97N,10700,,,NO,...,,,,,,Terminal Ileum,uninflamed,no,uninflamed,
50452A,0,control_codeword_counts,50%,0.000000,Uninflamed-50452A,,50454,CHO-001,TUQ97N,10700,,,NO,...,,,,,,Terminal Ileum,uninflamed,no,uninflamed,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50403C1,34,transcript_counts,50%,27.000000,Stricture-50403C1,,50403,CHO-012,TUQ97N,21978,,,NO,...,,,,,,Terminal Ileum,inflamed,yes,stricture,
50403C1,34,transcript_counts,75%,36.000000,Stricture-50403C1,,50403,CHO-012,TUQ97N,21978,,,NO,...,,,,,,Terminal Ileum,inflamed,yes,stricture,
50403C1,34,unassigned_codeword_counts,25%,0.000000,Stricture-50403C1,,50403,CHO-012,TUQ97N,21978,,,NO,...,,,,,,Terminal Ileum,inflamed,yes,stricture,
50403C1,34,unassigned_codeword_counts,50%,0.000000,Stricture-50403C1,,50403,CHO-012,TUQ97N,21978,,,NO,...,,,,,,Terminal Ileum,inflamed,yes,stricture,


In [None]:
palette = "tab20"
# palette = ["r", "b", "y"]

# Bar plots of the QC quartiles: one row of panels per variable, one column
# per quartile, subjects on x, coloured by condition. The index levels
# ("Variable", "Metric", ...) are turned into ordinary columns first so that
# faceting does not rely on seaborn resolving index levels by name.
fig = sb.catplot(
    data=obs[["25%", "50%", "75%"]].stack().to_frame("Value").join(
        metadata.reset_index().set_index(cso)).reset_index(),
    x=col_subject, y="Value", row="Variable", col="Metric", kind="bar",
    hue=col_condition, sharex=False, sharey=False, palette=palette)
fig.set_xticklabels(rotation=45, fontsize=10)
# Adjust spacing on the grid's own figure rather than pyplot's current one
fig.figure.subplots_adjust(hspace=0.3)

In [64]:
regions = ["mucosa", "serosa", "myenteric_plexus",
           "submucosa", "smc_circular"]

# Collect per-cell QC columns and per-cluster cell counts from the cropped
# (region-specific) objects, indexed by region and original sample ID.
meta_by_id = metadata.reset_index().set_index(cso)  # loop-invariant lookup
obs_reg, n_cells_reg = {}, {}
for r in regions:
    obs_r, n_cells_r = {}, {}
    for s in libs:  # iterate samples
        print(f"\n\n{'=' * 80}\n{s}\n{'=' * 80}\n\n")
        lib = meta_by_id.loc[s, col_sample]
        file_obj_proc = os.path.join(out_dir, "objects_cropped",
                                     r, f"{lib}_{r}{input_suffix}.h5ad")
        if not os.path.exists(file_obj_proc):
            print(f"{file_obj_proc} doesn't exist")
            continue
        adata = sc.read_h5ad(file_obj_proc)  # processed adata
        obs_r[s] = adata.obs[adata.obs.columns.intersection(
            cols + [col_cell_type])].assign(n_cells=adata.obs.shape[0])
        n_cells_r[s] = adata.obs[col_cell_type].value_counts()
    if not obs_r:  # pd.concat raises on an empty mapping
        print(f"No cropped objects found for region {r}; skipping")
        continue
    n_cells_reg[r] = pd.concat(n_cells_r, names=[cso])
    obs_reg[r] = pd.concat(obs_r, names=[cso])
# If no region had any object at all, these still raise pandas' ValueError.
obs_reg = pd.concat(obs_reg, names=["Region"])
n_cells_reg = pd.concat(n_cells_reg, names=["Region"])
# obs_reg.to_excel(os.path.join(
#     out_new, f"quantification/xenium_qc_{'_'.join(regions)}{input_suffix}"
#     ".xlsx"))
# n_cells_reg.to_excel(os.path.join(
#     out_new, f"quantification/xenium_n_cells_{'_'.join(regions)}"
#     f"{input_suffix}.xlsx"))



50452A


/mnt/cho_lab/disk2/elizabeth/data/shared-xenium-library/outputs/TUQ97N/nebraska/objects_cropped/mucosa/Uninflamed-50452A_mucosa.h5ad doesn't exist


50452B


/mnt/cho_lab/disk2/elizabeth/data/shared-xenium-library/outputs/TUQ97N/nebraska/objects_cropped/mucosa/Inflamed-50452B_mucosa.h5ad doesn't exist


50452C


/mnt/cho_lab/disk2/elizabeth/data/shared-xenium-library/outputs/TUQ97N/nebraska/objects_cropped/mucosa/Stricture-50452C_mucosa.h5ad doesn't exist


50006A






50006B




50006C


/mnt/cho_lab/disk2/elizabeth/data/shared-xenium-library/outputs/TUQ97N/nebraska/objects_cropped/mucosa/Stricture-50006C_mucosa.h5ad doesn't exist


50217A




50217B




50217C


/mnt/cho_lab/disk2/elizabeth/data/shared-xenium-library/outputs/TUQ97N/nebraska/objects_cropped/mucosa/Stricture-50217C_mucosa.h5ad doesn't exist


50336B




50336C




50336A


/mnt/cho_lab/disk2/elizabeth/data/shared-xenium-library/outputs/TUQ97N/nebraska/objects_cropped/mucosa/Stricture-50336A_mucosa.h5ad doesn't exist


50403A2




50403B




50403C1


/mnt/cho_lab/disk2/elizabeth/data/shared-xenium-library/outputs/TUQ97N/nebraska/objects_cropped/mucosa/Stricture-50403C1_mucosa.h5ad doesn't exist


50452A


/mnt/cho_lab/disk2/elizabeth/data/shared-xenium-library/outputs/TUQ97N/nebraska/objects_cropped/serosa/Uninflamed-50452A_serosa.h5ad doesn't exist


50452B


/mnt/cho_lab/disk2/elizabeth/data/shared-xenium-library/outputs/TUQ97N/nebraska/objects_cropped/serosa/Inflamed-50452B_sero