# Goal

* Summarize data in Cell X Gene by using the census python package

In [11]:
from pathlib import Path
import pandas as pd
import cellxgene_census
# set number of rows to display to 1000
pd.set_option('display.max_rows', 4)

In [12]:
# get root of git repo
base_dir = !git rev-parse --show-toplevel
base_dir = Path(base_dir[0])
base_dir

PosixPath('/home/nickyoungblut/dev/python/scBaseCount_analysis')

# Schema

In [2]:
with cellxgene_census.open_soma(census_version="2025-01-30") as census:
    print(census["census_data"]["homo_sapiens"].obs.schema)

soma_joinid: int64 not null
dataset_id: dictionary<values=string, indices=int16, ordered=0>
assay: dictionary<values=string, indices=int8, ordered=0>
assay_ontology_term_id: dictionary<values=string, indices=int8, ordered=0>
cell_type: dictionary<values=string, indices=int16, ordered=0>
cell_type_ontology_term_id: dictionary<values=string, indices=int16, ordered=0>
development_stage: dictionary<values=string, indices=int16, ordered=0>
development_stage_ontology_term_id: dictionary<values=string, indices=int16, ordered=0>
disease: dictionary<values=string, indices=int16, ordered=0>
disease_ontology_term_id: dictionary<values=string, indices=int16, ordered=0>
donor_id: dictionary<values=string, indices=int16, ordered=0>
is_primary_data: bool
observation_joinid: large_string
self_reported_ethnicity: dictionary<values=string, indices=int8, ordered=0>
self_reported_ethnicity_ontology_term_id: dictionary<values=string, indices=int8, ordered=0>
sex: dictionary<values=string, indices=int8, ord

# List datasets

In [4]:
# Get collection-dataset table
with cellxgene_census.open_soma(census_version="2025-01-30") as census:
    datasets = (
        census["census_info"]["datasets"]
        .read()
        .concat()
        .to_pandas()
    )
datasets

Unnamed: 0,soma_joinid,citation,collection_id,collection_name,collection_doi,collection_doi_label,dataset_id,dataset_version_id,dataset_title,dataset_h5ad_path,dataset_total_cell_count
0,0,Publication: https://doi.org/10.1016/j.isci.20...,8e880741-bf9a-4c8e-9227-934204631d2a,High Resolution Slide-seqV2 Spatial Transcript...,10.1016/j.isci.2022.104097,Marshall et al. (2022) iScience,4eb29386-de81-452f-b3c0-e00844e8c7fd,f76861bb-becb-4eb7-82fc-782dc96ccc7f,Spatial transcriptomics in mouse: Puck_191112_05,4eb29386-de81-452f-b3c0-e00844e8c7fd.h5ad,10888
1,1,Publication: https://doi.org/10.1016/j.isci.20...,8e880741-bf9a-4c8e-9227-934204631d2a,High Resolution Slide-seqV2 Spatial Transcript...,10.1016/j.isci.2022.104097,Marshall et al. (2022) iScience,78d59e4a-82eb-4a61-a1dc-da974d7ea54b,7d7ec1b6-6e3f-4aaa-9442-4b22f3424396,Spatial transcriptomics in mouse: Puck_191112_08,78d59e4a-82eb-4a61-a1dc-da974d7ea54b.h5ad,10250
...,...,...,...,...,...,...,...,...,...,...,...
1571,1571,Publication: https://doi.org/10.1038/s41593-02...,1ca90a2d-2943-483d-b678-b809bf464c30,SEA-AD: Seattle Alzheimer’s Disease Brain Cell...,10.1038/s41593-024-01774-5,Gabitto et al. (2024) Nat Neurosci,c2876b1b-06d8-4d96-a56b-5304f815b99a,c32964d2-3339-441f-8e56-7177234c7876,Whole Taxonomy - MTG: Seattle Alzheimer's Dise...,c2876b1b-06d8-4d96-a56b-5304f815b99a.h5ad,1226855
1572,1572,Publication: https://doi.org/10.1038/s41593-02...,1ca90a2d-2943-483d-b678-b809bf464c30,SEA-AD: Seattle Alzheimer’s Disease Brain Cell...,10.1038/s41593-024-01774-5,Gabitto et al. (2024) Nat Neurosci,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3,d3427e8c-c55d-4d4e-b15b-1a8774cd3a4b,Whole Taxonomy - DLPFC: Seattle Alzheimer's Di...,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3.h5ad,1309414


# Get all tissues

### Cell x Gene

In [20]:
# filter human datasets to targets
cxg_tissues = []
organisms = ["homo_sapiens", "mus_musculus"]
with cellxgene_census.open_soma(census_version="2025-01-30") as census:
    for organism in organisms:
        df = (
            census["census_data"][organism]
            .obs.read(column_names = ["tissue", "tissue_general", "tissue_ontology_term_id"])
            .concat()
            .group_by(["tissue", "tissue_general", "tissue_ontology_term_id"])
            .aggregate([])
            .to_pandas() 
        )
        df["organism"] = organism
        cxg_tissues.append(df)
cxg_tissues = pd.concat(cxg_tissues)
cxg_tissues.head()

Unnamed: 0,tissue,tissue_general,tissue_ontology_term_id,organism
0,liver,liver,UBERON:0002107,homo_sapiens
1,caudate lobe of liver,liver,UBERON:0001117,homo_sapiens
...,...,...,...,...
3,tonsil,immune system,UBERON:0002372,homo_sapiens
4,superior frontal gyrus,brain,UBERON:0002661,homo_sapiens


In [21]:
# number of unique tissue categories
len(cxg_tissues['tissue_general'].unique())

70

### scBaseCount

In [14]:
scbc_tissues_file = base_dir / "data" / "tissues" / "2025-03-11_tissue_categories.csv.gz"
scbc_tissues = pd.read_csv(scbc_tissues_file)
scbc_tissues.head()


Unnamed: 0,tissue,category
0,2 layer spheroid,other
1,"3 layer spheroid channel 1,epithelial",other
...,...,...
3,3D healthy skin model,skin of body
4,A549 cells,lung


In [22]:
# number of unique tissue categories
len(scbc_tissues["category"].unique())

76

# Get unique onotology terms

In [7]:
# filter human datasets to targets
with cellxgene_census.open_soma(census_version="2025-01-30") as census:
    tissue_ontology = (
        census["census_data"]["homo_sapiens"]
        .obs.read(column_names = ["tissue_ontology_term_id"])
        .concat()
        .group_by(["tissue", "tissue_ontology_term_id"])
        .aggregate([])
        .to_pandas() 
    )
tissue_ontology.head()

Unnamed: 0,tissue,tissue_ontology_term_id,organism
0,breast,UBERON:0000310,homo_sapiens
1,cerebral cortex,UBERON:0000956,homo_sapiens
...,...,...,...
3,liver,UBERON:0002107,homo_sapiens
4,skin epidermis,UBERON:0001003,homo_sapiens


In [None]:
# filter human datasets to targets
with cellxgene_census.open_soma(census_version="2025-01-30") as census:
    human_tissues = (
        census["census_data"]["homo_sapiens"]
        .obs.read(column_names = ["tissue_ontology_term_id"])
        .concat()
        .group_by(["tissue", "tissue_ontology_term_id"])
        .aggregate([])
        .to_pandas() 
    )
human_tissues["organism"] = "homo_sapiens"
human_tissues

In [9]:
# any non-Uberon terms?
tissue_ontology[~tissue_ontology["tissue_ontology_term_id"].str.startswith("UBERON")]

Unnamed: 0,tissue_ontology_term_id


In [10]:
# filter human datasets to targets
with cellxgene_census.open_soma(census_version="2025-01-30") as census:
    tissue_ontology = (
        census["census_data"]["homo_sapiens"]
        .obs.read(column_names = ["tissue", "tissue_ontology_term_id"])
        .concat()
        .group_by(["tissue", "tissue_ontology_term_id"])
        .aggregate([])
        .to_pandas() 
    )
tissue_ontology.head()

Unnamed: 0,tissue,tissue_ontology_term_id
0,liver,UBERON:0002107
1,caudate lobe of liver,UBERON:0001117
2,thymus,UBERON:0002370
3,tonsil,UBERON:0002372
4,superior frontal gyrus,UBERON:0002661


In [15]:
# tumor tissues?
tumor_tissues = tissue_ontology[tissue_ontology["tissue"].str.contains("tumor", case=False, regex=True)]
tumor_tissues

Unnamed: 0,tissue,tissue_ontology_term_id
