# Goal

* Summarize the data available in CELLxGENE using the `cellxgene_census` Python package

In [1]:
import pandas as pd
import cellxgene_census

In [2]:
# Show which per-cell (obs) metadata columns exist for the human data, and
# their Arrow types, before querying any of them.
with cellxgene_census.open_soma(census_version="2025-01-30") as census:
    human_experiment = census["census_data"]["homo_sapiens"]
    print(human_experiment.obs.schema)

soma_joinid: int64 not null
dataset_id: dictionary<values=string, indices=int16, ordered=0>
assay: dictionary<values=string, indices=int8, ordered=0>
assay_ontology_term_id: dictionary<values=string, indices=int8, ordered=0>
cell_type: dictionary<values=string, indices=int16, ordered=0>
cell_type_ontology_term_id: dictionary<values=string, indices=int16, ordered=0>
development_stage: dictionary<values=string, indices=int16, ordered=0>
development_stage_ontology_term_id: dictionary<values=string, indices=int16, ordered=0>
disease: dictionary<values=string, indices=int16, ordered=0>
disease_ontology_term_id: dictionary<values=string, indices=int16, ordered=0>
donor_id: dictionary<values=string, indices=int16, ordered=0>
is_primary_data: bool
observation_joinid: large_string
self_reported_ethnicity: dictionary<values=string, indices=int8, ordered=0>
self_reported_ethnicity_ontology_term_id: dictionary<values=string, indices=int8, ordered=0>
sex: dictionary<values=string, indices=int8, ord

# List datasets

In [3]:
# Load the Census dataset catalogue into pandas: each row describes one
# dataset together with the collection it belongs to (ID, name, DOI).
with cellxgene_census.open_soma(census_version="2025-01-30") as census:
    datasets_soma = census["census_info"]["datasets"]
    # read() yields Arrow batches; concat() merges them into a single table.
    datasets = datasets_soma.read().concat().to_pandas()
datasets

Unnamed: 0,soma_joinid,citation,collection_id,collection_name,collection_doi,collection_doi_label,dataset_id,dataset_version_id,dataset_title,dataset_h5ad_path,dataset_total_cell_count
0,0,Publication: https://doi.org/10.1016/j.isci.20...,8e880741-bf9a-4c8e-9227-934204631d2a,High Resolution Slide-seqV2 Spatial Transcript...,10.1016/j.isci.2022.104097,Marshall et al. (2022) iScience,4eb29386-de81-452f-b3c0-e00844e8c7fd,f76861bb-becb-4eb7-82fc-782dc96ccc7f,Spatial transcriptomics in mouse: Puck_191112_05,4eb29386-de81-452f-b3c0-e00844e8c7fd.h5ad,10888
1,1,Publication: https://doi.org/10.1016/j.isci.20...,8e880741-bf9a-4c8e-9227-934204631d2a,High Resolution Slide-seqV2 Spatial Transcript...,10.1016/j.isci.2022.104097,Marshall et al. (2022) iScience,78d59e4a-82eb-4a61-a1dc-da974d7ea54b,7d7ec1b6-6e3f-4aaa-9442-4b22f3424396,Spatial transcriptomics in mouse: Puck_191112_08,78d59e4a-82eb-4a61-a1dc-da974d7ea54b.h5ad,10250
2,2,Publication: https://doi.org/10.1016/j.isci.20...,8e880741-bf9a-4c8e-9227-934204631d2a,High Resolution Slide-seqV2 Spatial Transcript...,10.1016/j.isci.2022.104097,Marshall et al. (2022) iScience,add5eb84-5fc9-4f01-982e-a346dd42ee82,de54aed8-4f73-48f6-9229-418a840e2d82,Spatial transcriptomics in mouse: Puck_191109_20,add5eb84-5fc9-4f01-982e-a346dd42ee82.h5ad,12906
3,3,Publication: https://doi.org/10.1016/j.isci.20...,8e880741-bf9a-4c8e-9227-934204631d2a,High Resolution Slide-seqV2 Spatial Transcript...,10.1016/j.isci.2022.104097,Marshall et al. (2022) iScience,b020294c-ab82-4547-b5a7-63d8ffa575ed,abe4fce1-0859-4a56-ad1e-734d79f0e6c8,Spatial transcriptomics in mouse: Puck_191112_13,b020294c-ab82-4547-b5a7-63d8ffa575ed.h5ad,15161
4,4,Publication: https://doi.org/10.1038/s41591-02...,a96133de-e951-4e2d-ace6-59db8b3bfb1d,HTAN/HTAPP Broad - Spatio-molecular dissection...,10.1038/s41591-024-03215-z,Klughammer et al. (2024) Nat Med,d7476ae2-e320-4703-8304-da5c42627e71,863fc5e4-bd4a-4681-9c3d-0ee7ef54e327,HTAPP-330-SMP-1082 scRNA-seq,d7476ae2-e320-4703-8304-da5c42627e71.h5ad,565
...,...,...,...,...,...,...,...,...,...,...,...
1568,1568,Publication: https://doi.org/10.1038/s41586-02...,45d5d2c3-bc28-4814-aed6-0bb6f0e11c82,A single-cell transcriptional timelapse of mou...,10.1038/s41586-024-07069-w,Qiu et al. (2024) Nature,dcfa2614-7ca7-4d82-814c-350626eccb26,3002a659-a1a9-4406-9976-99e658e1fbb5,Major cell cluster: Mesoderm,dcfa2614-7ca7-4d82-814c-350626eccb26.h5ad,3267338
1569,1569,Publication: https://doi.org/10.1126/science.a...,e5f58829-1a66-40b5-a624-9046778e74f5,Tabula Sapiens,10.1126/science.abl4896,The Tabula Sapiens Consortium* et al. (2022) S...,53d208b0-2cfd-4366-9866-c3c6114081bc,10df7690-6d10-4029-a47e-0f071bb2df83,Tabula Sapiens - All Cells,53d208b0-2cfd-4366-9866-c3c6114081bc.h5ad,1136218
1570,1570,Publication: https://doi.org/10.1038/s41586-02...,45d5d2c3-bc28-4814-aed6-0bb6f0e11c82,A single-cell transcriptional timelapse of mou...,10.1038/s41586-024-07069-w,Qiu et al. (2024) Nature,dcfd4feb-18a3-4b30-81d7-1b0c544a8ab3,3817734b-0f82-433b-8c38-55b214200fff,Whole dataset: Raw counts only,dcfd4feb-18a3-4b30-81d7-1b0c544a8ab3.h5ad,11441407
1571,1571,Publication: https://doi.org/10.1038/s41593-02...,1ca90a2d-2943-483d-b678-b809bf464c30,SEA-AD: Seattle Alzheimer’s Disease Brain Cell...,10.1038/s41593-024-01774-5,Gabitto et al. (2024) Nat Neurosci,c2876b1b-06d8-4d96-a56b-5304f815b99a,c32964d2-3339-441f-8e56-7177234c7876,Whole Taxonomy - MTG: Seattle Alzheimer's Dise...,c2876b1b-06d8-4d96-a56b-5304f815b99a.h5ad,1226855


# Get unique ontology terms

In [5]:
# Collect the distinct tissue ontology term IDs recorded for human cells.
key_columns = ["tissue_ontology_term_id"]
with cellxgene_census.open_soma(census_version="2025-01-30") as census:
    human_obs = census["census_data"]["homo_sapiens"].obs
    tissue_ontology = (
        # Read only the column needed and merge the batches into one Arrow table.
        human_obs.read(column_names=key_columns)
        .concat()
        # Grouping with an empty aggregation list keeps just the unique keys.
        .group_by(key_columns)
        .aggregate([])
        .to_pandas()
    )
tissue_ontology.head()

Unnamed: 0,tissue_ontology_term_id
0,UBERON:0002113
1,UBERON:0000451
2,UBERON:0001872
3,UBERON:0001871
4,UBERON:0002436


In [9]:
# Sanity check: are there any tissue term IDs that are not UBERON terms?
# na=False makes a missing ID evaluate to "does not start with UBERON", so it
# is listed here rather than producing an NA that cannot be negated with `~`
# or used as a boolean mask.
tissue_ontology[
    ~tissue_ontology["tissue_ontology_term_id"].str.startswith("UBERON", na=False)
]

Unnamed: 0,tissue_ontology_term_id


In [10]:
# Collect the distinct (tissue label, tissue ontology term ID) pairs recorded
# for human cells, so each term ID can be read alongside its name.
key_columns = ["tissue", "tissue_ontology_term_id"]
with cellxgene_census.open_soma(census_version="2025-01-30") as census:
    human_obs = census["census_data"]["homo_sapiens"].obs
    tissue_ontology = (
        # Read only the two columns needed and merge the batches into one Arrow table.
        human_obs.read(column_names=key_columns)
        .concat()
        # Grouping with an empty aggregation list keeps just the unique key pairs.
        .group_by(key_columns)
        .aggregate([])
        .to_pandas()
    )
tissue_ontology.head()

Unnamed: 0,tissue,tissue_ontology_term_id
0,liver,UBERON:0002107
1,caudate lobe of liver,UBERON:0001117
2,thymus,UBERON:0002370
3,tonsil,UBERON:0002372
4,superior frontal gyrus,UBERON:0002661


In [15]:
# Are any tissues labelled as tumors? Requires `tissue_ontology` to carry the
# `tissue` label column.
# "tumor" is a plain substring, so match it literally (regex=False) and
# case-insensitively; na=False treats a missing label as "no match" so the
# result is always a clean boolean mask.
tumor_tissues = tissue_ontology[
    tissue_ontology["tissue"].str.contains("tumor", case=False, regex=False, na=False)
]
tumor_tissues

Unnamed: 0,tissue,tissue_ontology_term_id
