### Merge BTC DPT Visium Cohorts

In [1]:
import tarfile
import warnings
from glob import glob
import os
import anndata as ad
import pandas as pd
import scanpy as sc
import squidpy as sq
import re

sc.set_figure_params(figsize=(4, 4))
sc.settings.verbosity = 2  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
print(f"squidpy=={sq.__version__}")



scanpy==1.9.7 anndata==0.10.6 umap==0.5.6 numpy==1.26.4 scipy==1.13.0 pandas==2.2.1 scikit-learn==1.4.1.post1 statsmodels==0.14.1 igraph==0.11.4 pynndescent==0.5.12
squidpy==1.6.5


In [2]:
# Load in h5ad files for both standard and HD Visium (based on Cirro directory stcuture)
def load_h5ad_files(base_dir):
    sample_dict = {}
    for root, dirs, files in os.walk(base_dir):
        for dir_name in dirs:
            sample_dir = os.path.join(root, dir_name)
            h5ad_files = [f for f in os.listdir(sample_dir) if f.endswith('.h5ad')]
            if h5ad_files:
                if len(h5ad_files) > 1: # should have only one h5ad file per directory but check for multiple
                    raise ValueError(f"Multiple h5ad files found in {sample_dir}, expected one.")
                file_path = os.path.join(sample_dir, h5ad_files[0])
                print(f"Loading: {file_path}")
                sample_dict[dir_name] = sc.read_h5ad(file_path)
    return sample_dict

In [3]:
rctd_samples = load_h5ad_files("../../rctd_testing/rctd")
rctd_hd_samples = load_h5ad_files("../../rctd_testing/rctd_hd")
# list out the sample names in each
print("Standard samples:", list(rctd_samples.keys()))
print("HD samples:", list(rctd_hd_samples.keys()))

Loading: ../../rctd_testing/rctd/BTC_ST2/rctd.h5ad
Loading: ../../rctd_testing/rctd/BTC_ST1/rctd.h5ad
Loading: ../../rctd_testing/rctd_hd/BTC_HD1/rctd.h5ad
Loading: ../../rctd_testing/rctd_hd/BTC_HD2/rctd.h5ad
Loading: ../../rctd_testing/rctd_hd/BTC_HD3/rctd.h5ad
Loading: ../../rctd_testing/rctd_hd/BTC_HD4/rctd.h5ad
Standard samples: ['BTC_ST2', 'BTC_ST1']
HD samples: ['BTC_HD1', 'BTC_HD2', 'BTC_HD3', 'BTC_HD4']


In [4]:
# add metadata to each object for merging (sample name and type)
for name, adata in rctd_samples.items():
    print(name)
    adata.obs["sample_name"] = name
    adata.obs["sample_type"] = "standard"

for name, adata in rctd_hd_samples.items():
    print(name)
    adata.obs["sample_name"] = name
    adata.obs["sample_type"] = "HD"

# merge all objects into one
all_adatas = list(rctd_samples.values()) + list(rctd_hd_samples.values()) # list all objects together
# combined_adata = ad.concat(all_adatas,uns_merge="unique",index_unique="-")
#keys = [adata.obs["sample_name"][0] for adata in all_adatas]  # assuming obs is not empty

combined_adata = sc.concat(
    all_adatas,
    index_unique="-",
    uns_merge="unique"  # keeps each spatial entry under its own key
)

BTC_ST2
BTC_ST1
BTC_HD1
BTC_HD2
BTC_HD3
BTC_HD4


In [5]:
combined_adata

AnnData object with n_obs × n_vars = 686634 × 18082
    obs: 'in_tissue', 'cell_type', 'sample_name', 'sample_type'
    uns: 'layout', 'spatial', 'spatialdata_attrs'
    obsm: 'spatial'

In [6]:
combined_adata.obs

Unnamed: 0,in_tissue,cell_type,sample_name,sample_type
AACACCTACTATCGAA-1-0,1,PERICYTES,BTC_ST2,standard
AACACGTGCATCGCAC-1-0,1,FIBROBLASTS,BTC_ST2,standard
AACACTTGGCAAGGAA-1-0,1,DUCTAL,BTC_ST2,standard
AACAGGAAGAGCATAG-1-0,1,DUCTAL,BTC_ST2,standard
AACAGGATTCATAGTT-1-0,1,FIBROBLASTS,BTC_ST2,standard
...,...,...,...,...
s_016um_00375_00231-1-5,1,PERICYTES,BTC_HD4,HD
s_016um_00109_00223-1-5,1,FIBROBLASTS,BTC_HD4,HD
s_016um_00039_00175-1-5,1,DUCTAL,BTC_HD4,HD
s_016um_00037_00193-1-5,1,ENDOTHELIAL,BTC_HD4,HD


In [7]:
combined_adata.obs["sample_name"].value_counts()

sample_name
BTC_HD3    174601
BTC_HD1    169550
BTC_HD4    168236
BTC_HD2    164263
BTC_ST2      4992
BTC_ST1      4992
Name: count, dtype: int64

In [8]:
combined_adata.obs["cell_type"].value_counts()

cell_type
FIBROBLASTS         197042
DUCTAL               45909
ACINAR               41596
CYCLING DUCTAL       32457
MYELOID               7168
ENDOCRINE             6853
ENDOTHELIAL           6795
PERICYTES             4777
PLASMA                1331
TNK                    962
CYCLING. MYELOID       711
B CELLS                379
MAST                   218
CYCLING TNK             67
Name: count, dtype: int64

In [9]:
# define cell types of interest
ductal_cells = ["DUCTAL","CYCLING DUCTAL"]
fibro_ductal_cells = ["FIBROBLASTS","DUCTAL","CYCLING DUCTAL"]

In [10]:
# susbet combined object to cell types of interest
ductal_adata = combined_adata[combined_adata.obs["cell_type"].isin(ductal_cells)].copy()
fibro_ductal_adata = combined_adata[combined_adata.obs["cell_type"].isin(fibro_ductal_cells)].copy()

In [11]:
ductal_adata

AnnData object with n_obs × n_vars = 78366 × 18082
    obs: 'in_tissue', 'cell_type', 'sample_name', 'sample_type'
    uns: 'layout', 'spatial', 'spatialdata_attrs'
    obsm: 'spatial'

In [12]:
ductal_adata.obs

Unnamed: 0,in_tissue,cell_type,sample_name,sample_type
AACACTTGGCAAGGAA-1-0,1,DUCTAL,BTC_ST2,standard
AACAGGAAGAGCATAG-1-0,1,DUCTAL,BTC_ST2,standard
AACAGGTTATTGCACC-1-0,1,DUCTAL,BTC_ST2,standard
AACAGGTTCACCGAAG-1-0,1,DUCTAL,BTC_ST2,standard
AACATCTAATGACCGG-1-0,1,DUCTAL,BTC_ST2,standard
...,...,...,...,...
s_016um_00046_00250-1-5,1,DUCTAL,BTC_HD4,HD
s_016um_00367_00225-1-5,1,CYCLING DUCTAL,BTC_HD4,HD
s_016um_00267_00377-1-5,1,CYCLING DUCTAL,BTC_HD4,HD
s_016um_00193_00227-1-5,1,DUCTAL,BTC_HD4,HD


In [13]:
ductal_adata.obs["sample_name"].value_counts()

sample_name
BTC_HD1    43343
BTC_HD4    20245
BTC_HD2    11599
BTC_ST2     1259
BTC_ST1     1229
BTC_HD3      691
Name: count, dtype: int64

In [14]:
ductal_adata.obs["cell_type"].value_counts()

cell_type
DUCTAL            45909
CYCLING DUCTAL    32457
Name: count, dtype: int64

In [15]:
fibro_ductal_adata

AnnData object with n_obs × n_vars = 275408 × 18082
    obs: 'in_tissue', 'cell_type', 'sample_name', 'sample_type'
    uns: 'layout', 'spatial', 'spatialdata_attrs'
    obsm: 'spatial'

In [16]:
fibro_ductal_adata.obs

Unnamed: 0,in_tissue,cell_type,sample_name,sample_type
AACACGTGCATCGCAC-1-0,1,FIBROBLASTS,BTC_ST2,standard
AACACTTGGCAAGGAA-1-0,1,DUCTAL,BTC_ST2,standard
AACAGGAAGAGCATAG-1-0,1,DUCTAL,BTC_ST2,standard
AACAGGATTCATAGTT-1-0,1,FIBROBLASTS,BTC_ST2,standard
AACAGGCCAACGATTA-1-0,1,FIBROBLASTS,BTC_ST2,standard
...,...,...,...,...
s_016um_00212_00331-1-5,1,FIBROBLASTS,BTC_HD4,HD
s_016um_00288_00288-1-5,1,FIBROBLASTS,BTC_HD4,HD
s_016um_00193_00227-1-5,1,DUCTAL,BTC_HD4,HD
s_016um_00109_00223-1-5,1,FIBROBLASTS,BTC_HD4,HD


In [17]:
fibro_ductal_adata.obs["sample_name"].value_counts()

sample_name
BTC_HD4    128259
BTC_HD1    116110
BTC_HD2     20711
BTC_ST1      4786
BTC_ST2      4785
BTC_HD3       757
Name: count, dtype: int64

In [18]:
fibro_ductal_adata.obs["cell_type"].value_counts()

cell_type
FIBROBLASTS       197042
DUCTAL             45909
CYCLING DUCTAL     32457
Name: count, dtype: int64