In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import functools
import getpass
import math
import os
import re
import traceback

import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sb
import spatialdata_plot

import corescpy as cr

# Notebook configuration: run options, sample selection, input/output paths,
# and panel-specific column names. Everything a reader may want to tune lives
# in this cell; the last expression displays the subject ID of each sample.

# Main
write_object = True  # True => save objects; set to False for dry runs
overwrite = False  # overwrite if already exists?
col_cell_type = "leiden_res1pt5_dist0_npc30"  # obs column used to group QC
col_ann = "Bucket"

# Process Options
panel = "TUQ97N"  # Xenium panel ID
constants_dict = cr.get_panel_constants(panel)
libs = [  # sample IDs from patients for whom we have all conditions
    "50452A", "50452B", "50452C",  # old segmentation
    "50006A", "50006B", "50006C",  # rest are new segmentation
    "50217A", "50217B", "50217C",
    "50336B", "50336C", "50336A",
    "50403A2", "50403B", "50403C1"
]  # excludes low-quality sample/condition replicates 50403A1 & 50403C2
# libs = None  # to run all available samples
# NOTE(review): the per-sample QC cell iterates `libs` directly, so
# `libs = None` would fail there unless it is expanded to a list first.
cols = [  # per-cell QC columns summarized and plotted downstream
    "transcript_counts", "control_probe_counts", "control_codeword_counts",
    "unassigned_codeword_counts", "deprecated_codeword_counts",
    "total_counts", "cell_area", "nucleus_area", "nucleus_count"
]
input_suffix = ""  # in case want to load objects with some suffix

# Files & Directories
direc = "/mnt/cho_lab/bbdata2/"  # mounted NFS with data
dir_entry = "/mnt/cho_lab/disk2"  # Spark writeable data directory
mdf = ("/mnt/cho_lab/disk2/elizabeth/data/shared-xenium-library/samples_"
       f"{panel}.csv")  # metadata file path (for now; will soon be on NFS)
dir_writeable = os.path.join(
    dir_entry, "elizabeth/data/shared-xenium-library")  # where objects are
out_dir = os.path.join(
    dir_writeable, f"outputs/{panel}/nebraska")  # object output directory

#  Your Folders
try:
    user_name = os.getlogin()
except OSError:
    # os.getlogin() needs a controlling terminal, which a kernel launched by
    # a service or hub may not have; fall back to environment/passwd lookup.
    user_name = getpass.getuser()
out_new = os.path.join(
    dir_entry,
    f"{user_name}/data/shared-xenium-library/outputs/{panel}/nebraska")

# Constants (Shouldn't Need Edits Unless Extreme Process Changes)
# Column names for this panel; a name missing from the constants is None.
cso, col_sample, col_condition, col_inflamed, col_subject = [
    constants_dict[x] if x in constants_dict else None for x in [
        "col_sample_id_o", "col_sample_id", "col_condition",
        "col_inflamed", "col_subject"]]
dir_data = os.path.join(direc, f"outputs/{panel}")
# All data paths, relative to `dir_data`, as "<run>/<entry>". Stray files in
# `dir_data` are skipped and an empty directory yields an empty list, rather
# than raising as the previous reduce-based version did.
files = [
    os.path.join(run, entry)
    for run in os.listdir(dir_data)
    if os.path.isdir(os.path.join(dir_data, run))
    for entry in os.listdir(os.path.join(dir_data, run))
]
os.makedirs(out_dir, exist_ok=True)  # make output directory if needed
metadata = cr.pp.get_metadata_cho(direc, mdf, panel_id=panel, samples=libs)
# NOTE(review): in the saved output of this cell, samples 50452A/B/C show
# subject IDs 50454/50453/50452, whereas the three samples of every other
# patient share a single ID -- verify the subject column in the metadata file.
metadata[col_subject]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Sample
Uninflamed-50452A     50454
Inflamed-50452B       50453
Stricture-50452C      50452
Inflamed-50006A       50006
Uninflamed-50006B     50006
Stricture-50006C      50006
Inflamed-50217A       50217
Uninflamed-50217B     50217
Stricture-50217C      50217
Inflamed-50336B       50336
Uninflamed-50336C     50336
Stricture-50336A      50336
Uninflamed-50403A2    50403
Inflamed-50403B       50403
Stricture-50403C1     50403
Name: subject_id, dtype: int64

In [4]:
# NOTE(review): no earlier cell in this view assigns `adata`; the only visible
# assignment is inside the per-sample loop of a later cell, and the execution
# counts skip from 2 to 4. Presumably this display relies on leftover kernel
# state and would raise NameError after Restart & Run All -- confirm.
adata

AnnData object with n_obs × n_vars = 312629 × 469
    obs: 'cell_id', 'transcript_counts', 'control_probe_counts', 'control_codeword_counts', 'unassigned_codeword_counts', 'deprecated_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'region', 'cell_labels', 'Sample', 'Sample ID', 'Patient', 'Status', 'Slide Id', 'Project', 'Location', 'Stricture', 'GRID ID', 'Inflamed', 'Procedure Date', 'Age', 'Sex', 'Race', 'Hispanic', 'Diagnosis', 'Project.1', 'Procedure', 'Disease_Status', 'Date Collected', 'Date Sectioned', 'Date Hybridization', 'Storage 4c', 'Created By', 'Created', 'Storage Status', 'Location.1', 'Storage Row', 'Storage Col', 'Checked Out By', 'out_file', 'Condition', 'file_path', 'n_counts', 'log_counts', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'assay_protein', 'col_gene_symbols', 'col_cell_type', 'col_sample_id', 'col_batch', 'col_subject', 'col_condition', 'col_num_umis', 'col_segment', 'cell_filter_pmt', 'cell_filter_nco

In [5]:
# Per-sample QC summary. For every sample in `libs`: load the original Xenium
# data and the processed AnnData, record the number of cells before and after
# processing, describe the QC columns per `col_cell_type` group, and draw a
# pair plot of the QC columns. Cell counts and QC summaries are then written
# as CSV files under `<out_new>/quantification`.


def _sample_id_from_path(path):
    """Return the sample ID encoded in a data directory name, else None.

    The ID is the text before the first "-" in the third "__"-separated
    field of the basename; e.g., a basename of
    "output-XETG00189__0010700__50452A-TUQ97N-EA__20240126__205019"
    yields "50452A". Names with fewer than three fields yield None instead
    of raising, so unrelated entries in the listing are simply ignored.
    """
    fields = os.path.basename(path).split("__")
    return fields[2].split("-")[0] if len(fields) > 2 else None


# Create the CSV folder first: `to_csv` does not create parent directories,
# and failing here is cheaper than failing after every sample has been loaded.
dir_quant = os.path.join(out_new, "quantification")
os.makedirs(dir_quant, exist_ok=True)

# Sample ID -> first matching path in `files` (built once, not per sample).
dir_by_sample = {}
for path in files:
    dir_by_sample.setdefault(_sample_id_from_path(path), path)

meta_by_id = metadata.reset_index().set_index(cso)  # loop-invariant lookup
cols_qc = cols + [col_cell_type]  # QC columns plus the grouping column

n_origin, n_proc, obs_by_sample = {}, {}, {}
for s in libs:  # iterate samples
    print(f"\n\n{'=' * 80}\n{s}\n{'=' * 80}\n\n")
    if s not in dir_by_sample:
        raise FileNotFoundError(
            f"No data directory found for sample {s} under {dir_data}")
    fff = os.path.join(dir_data, dir_by_sample[s])  # sample's Xenium data path
    lib = meta_by_id.loc[s][col_sample]
    file_obj_proc = os.path.join(out_dir, f"{lib}{input_suffix}.h5ad")
    # `self` and `adata` keep their original names in case later cells read
    # the objects left over from the final iteration.
    self = cr.Spatial(fff, library_id=lib)  # load original data
    adata = sc.read_h5ad(file_obj_proc)  # processed adata
    n_origin[s], n_proc[s] = self.rna.obs.shape[0], adata.obs.shape[0]
    # Only columns actually present in `adata.obs` are summarized/plotted.
    obs_by_sample[s] = adata.obs[
        adata.obs.columns.intersection(cols_qc)
    ].groupby(col_cell_type).describe()
    sb.pairplot(adata.obs[adata.obs.columns.intersection(cols)])

# Cell counts indexed by (Source, sample ID): original vs. processed.
n_cells = pd.concat([pd.Series(x, index=pd.Index(libs, name=cso))
                     for x in [n_origin, n_proc]],
                    keys=["Original", "Processed"], names=["Source"])
# Stack the per-sample summaries, keyed by sample ID on the outer level.
obs = pd.concat(obs_by_sample, keys=libs, names=[cso])
n_cells.to_csv(os.path.join(
    dir_quant, f"xenium_n_cells{input_suffix}.csv"))
obs.to_csv(os.path.join(
    dir_quant, f"xenium_qc{input_suffix}.csv"))



50452A




<<< INITIALIZING SPATIAL CLASS OBJECT >>>

[34mINFO    [0m reading                                                                                                   
         [35m/mnt/cho_lab/bbdata2/outputs/TUQ97N/CHO-001/output-XETG00189__0010700__50452A-TUQ97N-EA__20240126__205019/[0m
         [95mcell_feature_matrix.h5[0m                                                                                    




Counts: Initial


	Observations: 333825

	Genes: 469







 AnnData object with n_obs × n_vars = 333825 × 469
    obs: 'cell_id', 'transcript_counts', 'control_probe_counts', 'control_codeword_counts', 'unassigned_codeword_counts', 'deprecated_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'region', 'cell_labels', 'Sample'
    var: 'gene_ids', 'feature_types', 'genome'
    uns: 'spatialdata_attrs', 'spatial', 'original_ix'
    obsm: 'spatial'
    layers: 'counts' 

                      gene_ids    feature_types   genome
gene_symbols                                           
ABCA7         ENSG00000064687  Gene Expression  Unknown
ACTA2         ENSG00000107796  Gene Expression  Unknown
ACTG2         ENSG00000163017  Gene Expression  Unknown
ADAM15        ENSG00000143537  Gene Expression  Unknown
ADAM28        ENSG00000042980  Gene Expression  Unknown 

 



col_gene_symbols="gene_symbols"
col_cell_type="leiden"
col_sample_id="Sample"
col_batch="Sample"
col_subject=No