### Merge BTC DPT Visium Cohorts

In [1]:
import tarfile
import warnings
from glob import glob
import os
import anndata as ad
import pandas as pd
import scanpy as sc
import squidpy as sq
import re
from cirro import DataPortal

# set up cirro data portal access
portal = DataPortal()

sc.set_figure_params(figsize=(4, 4))
sc.settings.verbosity = 2  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
print(f"squidpy=={sq.__version__}")



To sign in, use a web browser to open the page https://breakthroughcancer.cirro.bio/authorize?user_code=DG455JVT
scanpy==1.9.7 anndata==0.10.6 umap==0.5.6 numpy==1.26.4 scipy==1.13.0 pandas==2.2.1 scikit-learn==1.4.1.post1 statsmodels==0.14.1 igraph==0.11.4 pynndescent==0.5.12
squidpy==1.6.5


In [None]:
# adapt cirro code into function to pull in h5ad files
def get_cirro_h5ad_files(project_name, dataset_name):
    # access the cirro project
    project = portal.get_project_by_name(project_name)
    all_datasets = project.list_datasets()
    #print(f"The project {project.name} contains {len(all_datasets):,} datasets")
    # access the dataset of interest within the cirro project
    dataset = all_datasets.get_by_name(dataset_name)
    # get the complete list of files in that dataset
    files = dataset.list_files()
    #print(f"Dataset {dataset.name} contains {len(files):,} files")
    # parse out the rctd.h5ad files
    rctds = files.filter_by_pattern("*/rctd.h5ad")
    print(f"Selected the file: {rctds.description()}")
    return rctds # cirro data portal object with all rctd.h5ad files

In [3]:
visST = get_cirro_h5ad_files('BTC-DPT-Development', 'Standard Visium - 15000 genes')
visHD = get_cirro_h5ad_files('BTC-DPT-Development', 'Test Visium HD - 15000 genes')

Selected the file: data/rctd/BTC01_visium/rctd.h5ad (173.35 MB)

---

data/rctd/BTC03_visium/rctd.h5ad (193.22 MB)

---

data/rctd/BTC09_visium/rctd.h5ad (241.61 MB)

---

data/rctd/BTC13_visium/rctd.h5ad (274.06 MB)
Selected the file: data/rctd/HC01BTC_visiumHD/rctd.h5ad (380.89 MB)

---

data/rctd/HC02BTC_visiumHD/rctd.h5ad (66.38 MB)

---

data/rctd/HC03BTC_visiumHD/rctd.h5ad (136.53 MB)

---

data/rctd/HC04BTC_visiumHD/rctd.h5ad (267.20 MB)

---

data/rctd/HC05BTC_visiumHD/rctd.h5ad (156.17 MB)

---

data/rctd/HC07BTC_visiumHD/rctd.h5ad (267.56 MB)

---

data/rctd/HC08BTC_visiumHD/rctd.h5ad (131.96 MB)


In [None]:
def load_cirro_h5ad_files(file_list):
    # file list is a cirro dataset object (output of get_cirro_h5ad_files)
    sample_dict = {}
    for dataset in file_list:
        #dataset_name = dataset.name # whole path
        name = dataset.name.split("/")[-2] # just dataset name
        print(f"Loading dataset: {name}")
        adata = dataset.read_h5ad()
        sample_dict[name] = adata
    return sample_dict

In [5]:
rctd_samples = load_cirro_h5ad_files(visST)
rctd_hd_samples = load_cirro_h5ad_files(visHD)
print("Standard samples:", list(rctd_samples.keys()))
print("HD samples:", list(rctd_hd_samples.keys()))

Loading dataset: BTC01_visium
Loading dataset: BTC03_visium
Loading dataset: BTC09_visium
Loading dataset: BTC13_visium
Loading dataset: HC01BTC_visiumHD
Loading dataset: HC02BTC_visiumHD
Loading dataset: HC03BTC_visiumHD
Loading dataset: HC04BTC_visiumHD
Loading dataset: HC05BTC_visiumHD
Loading dataset: HC07BTC_visiumHD
Loading dataset: HC08BTC_visiumHD
Standard samples: ['BTC01_visium', 'BTC03_visium', 'BTC09_visium', 'BTC13_visium']
HD samples: ['HC01BTC_visiumHD', 'HC02BTC_visiumHD', 'HC03BTC_visiumHD', 'HC04BTC_visiumHD', 'HC05BTC_visiumHD', 'HC07BTC_visiumHD', 'HC08BTC_visiumHD']


In [6]:
# add metadata to each object for merging (sample name and type)
for name, adata in rctd_samples.items():
    print(name)
    adata.obs["sample_name"] = name
    adata.obs["sample_type"] = "standard"

for name, adata in rctd_hd_samples.items():
    print(name)
    adata.obs["sample_name"] = name
    adata.obs["sample_type"] = "HD"

# merge all objects into one
all_adatas = list(rctd_samples.values()) + list(rctd_hd_samples.values()) # list all objects together
# combined_adata = ad.concat(all_adatas,uns_merge="unique",index_unique="-")
#keys = [adata.obs["sample_name"][0] for adata in all_adatas]  # assuming obs is not empty

combined_adata = sc.concat(
    all_adatas,
    index_unique="-",
    uns_merge="unique"  # keeps each spatial entry under its own key
)

BTC01_visium
BTC03_visium
BTC09_visium
BTC13_visium
HC01BTC_visiumHD
HC02BTC_visiumHD
HC03BTC_visiumHD
HC04BTC_visiumHD
HC05BTC_visiumHD
HC07BTC_visiumHD
HC08BTC_visiumHD


In [7]:
combined_adata

AnnData object with n_obs × n_vars = 1161405 × 18082
    obs: 'in_tissue', 'cell_type', 'sample_name', 'sample_type'
    uns: 'layout', 'spatial', 'spatialdata_attrs'
    obsm: 'spatial'

In [8]:
combined_adata.obs

Unnamed: 0,in_tissue,cell_type,sample_name,sample_type
AACACCTACTATCGAA-1-0,1,FIBROBLASTS,BTC01_visium,standard
AACACGTGCATCGCAC-1-0,1,FIBROBLASTS,BTC01_visium,standard
AACACTTGGCAAGGAA-1-0,1,FIBROBLASTS,BTC01_visium,standard
AACAGGAAGAGCATAG-1-0,1,FIBROBLASTS,BTC01_visium,standard
AACAGGATTCATAGTT-1-0,1,FIBROBLASTS,BTC01_visium,standard
...,...,...,...,...
s_016um_00193_00227-1-10,1,,HC08BTC_visiumHD,HD
s_016um_00109_00223-1-10,1,,HC08BTC_visiumHD,HD
s_016um_00039_00175-1-10,1,,HC08BTC_visiumHD,HD
s_016um_00037_00193-1-10,1,,HC08BTC_visiumHD,HD


In [9]:
combined_adata.obs["sample_name"].value_counts()

sample_name
HC02BTC_visiumHD    174601
HC01BTC_visiumHD    169550
HC07BTC_visiumHD    168236
HC05BTC_visiumHD    164263
HC04BTC_visiumHD    163383
HC03BTC_visiumHD    158415
HC08BTC_visiumHD    142989
BTC01_visium          4992
BTC03_visium          4992
BTC09_visium          4992
BTC13_visium          4992
Name: count, dtype: int64

In [10]:
combined_adata.obs["cell_type"].value_counts()

cell_type
FIBROBLASTS         334251
DUCTAL               61136
ACINAR               42261
CYCLING DUCTAL       39938
MYELOID              26009
PERICYTES            25638
ENDOTHELIAL          13668
ENDOCRINE            13090
TNK                   8269
PLASMA                5540
CYCLING. MYELOID      2857
B CELLS               2137
MAST                   509
CYCLING TNK            266
Name: count, dtype: int64

In [11]:
# define cell types of interest
ductal_cells = ["DUCTAL","CYCLING DUCTAL"]
fibro_ductal_cells = ["FIBROBLASTS","DUCTAL","CYCLING DUCTAL"]

In [12]:
# susbet combined object to cell types of interest
ductal_adata = combined_adata[combined_adata.obs["cell_type"].isin(ductal_cells)].copy()
fibro_ductal_adata = combined_adata[combined_adata.obs["cell_type"].isin(fibro_ductal_cells)].copy()

In [13]:
ductal_adata

AnnData object with n_obs × n_vars = 101074 × 18082
    obs: 'in_tissue', 'cell_type', 'sample_name', 'sample_type'
    uns: 'layout', 'spatial', 'spatialdata_attrs'
    obsm: 'spatial'

In [14]:
ductal_adata.obs

Unnamed: 0,in_tissue,cell_type,sample_name,sample_type
AACATACTCATATGCG-1-0,1,DUCTAL,BTC01_visium,standard
AACATCGCGTGACCAC-1-0,0,DUCTAL,BTC01_visium,standard
AACCAAGGTATCAGGC-1-0,1,DUCTAL,BTC01_visium,standard
AACCACTGCCATAGCC-1-0,1,DUCTAL,BTC01_visium,standard
AACCGCATGTATGTTA-1-0,1,DUCTAL,BTC01_visium,standard
...,...,...,...,...
s_016um_00193_00258-1-10,1,CYCLING DUCTAL,HC08BTC_visiumHD,HD
s_016um_00247_00325-1-10,1,CYCLING DUCTAL,HC08BTC_visiumHD,HD
s_016um_00225_00261-1-10,1,DUCTAL,HC08BTC_visiumHD,HD
s_016um_00220_00218-1-10,1,DUCTAL,HC08BTC_visiumHD,HD


In [15]:
ductal_adata.obs["sample_name"].value_counts()

sample_name
HC01BTC_visiumHD    43343
HC07BTC_visiumHD    20245
HC04BTC_visiumHD    17190
HC05BTC_visiumHD    11599
HC03BTC_visiumHD     3156
BTC01_visium         1608
BTC03_visium         1230
BTC13_visium          754
HC02BTC_visiumHD      691
BTC09_visium          634
HC08BTC_visiumHD      624
Name: count, dtype: int64

In [16]:
ductal_adata.obs["cell_type"].value_counts()

cell_type
DUCTAL            61136
CYCLING DUCTAL    39938
Name: count, dtype: int64

In [17]:
fibro_ductal_adata

AnnData object with n_obs × n_vars = 435325 × 18082
    obs: 'in_tissue', 'cell_type', 'sample_name', 'sample_type'
    uns: 'layout', 'spatial', 'spatialdata_attrs'
    obsm: 'spatial'

In [18]:
fibro_ductal_adata.obs

Unnamed: 0,in_tissue,cell_type,sample_name,sample_type
AACACCTACTATCGAA-1-0,1,FIBROBLASTS,BTC01_visium,standard
AACACGTGCATCGCAC-1-0,1,FIBROBLASTS,BTC01_visium,standard
AACACTTGGCAAGGAA-1-0,1,FIBROBLASTS,BTC01_visium,standard
AACAGGAAGAGCATAG-1-0,1,FIBROBLASTS,BTC01_visium,standard
AACAGGATTCATAGTT-1-0,1,FIBROBLASTS,BTC01_visium,standard
...,...,...,...,...
s_016um_00217_00238-1-10,1,FIBROBLASTS,HC08BTC_visiumHD,HD
s_016um_00122_00358-1-10,1,FIBROBLASTS,HC08BTC_visiumHD,HD
s_016um_00197_00110-1-10,1,FIBROBLASTS,HC08BTC_visiumHD,HD
s_016um_00224_00113-1-10,1,FIBROBLASTS,HC08BTC_visiumHD,HD


In [19]:
fibro_ductal_adata.obs["sample_name"].value_counts()

sample_name
HC07BTC_visiumHD    128259
HC01BTC_visiumHD    116110
HC04BTC_visiumHD     89395
HC03BTC_visiumHD     39259
HC08BTC_visiumHD     22317
HC05BTC_visiumHD     20711
BTC03_visium          4784
BTC09_visium          4710
BTC13_visium          4596
BTC01_visium          4427
HC02BTC_visiumHD       757
Name: count, dtype: int64

In [20]:
fibro_ductal_adata.obs["cell_type"].value_counts()

cell_type
FIBROBLASTS       334251
DUCTAL             61136
CYCLING DUCTAL     39938
Name: count, dtype: int64