# Goals

* Calculate the tissue ontology distance between scBaseCount records and their CellxGene (CxG) counterparts

In [14]:
import os
from pathlib import Path
import pandas as pd
import plotnine as pn
from pypika import Query, Table, functions as fn
from SRAgent.db.connect import db_connect
import cellxgene_census

# Notebook display defaults: show every column, but truncate long frames to 4 rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 4
# Use plotnine's black-and-white theme for all figures
pn.theme_set(pn.theme_bw())

<plotnine.themes.theme_bw.theme_bw at 0x7fe8c8c192b0>

In [3]:
# get base of github repo
base_dir = !git rev-parse --show-toplevel
base_dir = Path(base_dir[0])

In [5]:
# set tenant
# Sets the DYNACONF environment variable to "prod" for this kernel process.
# NOTE(review): presumably this is what points db_connect() at the production
# database — confirm against SRAgent's configuration handling
os.environ["DYNACONF"] = "prod"

In [31]:
def get_screcounter_star_results(
    conn, 
    feature="GeneFull_Ex50pAS"
):
    """
    Get STAR results joined to their SRX metadata for CZI-linked 10x Genomics records.

    Joins `screcounter_star_results` to `srx_metadata` on
    `sample == srx_accession` and keeps only the rows where:
      * the STAR `feature` equals `feature` (case-insensitive),
      * `lib_prep` is "10x_Genomics",
      * `czi_collection_id` is not an empty string.

    Args:
        conn: A database connection object (e.g., sqlite3, psycopg2, etc.)
        feature: String of the STAR feature to filter on (compared case-insensitively)
    Returns:
        A pandas DataFrame with the columns srx_accession,
        estimated_number_of_cells, number_of_reads, tissue,
        tissue_ontology_term_id and czi_collection_id
    """
    # Define tables
    meta_table = Table('srx_metadata')
    star_table = Table('screcounter_star_results')

    # Join STAR results to the metadata; the chained .where() calls are combined with AND
    main_query = (
        Query.from_(star_table)
        .join(meta_table)
        .on(star_table.sample == meta_table.srx_accession)
        .select(
            star_table.sample.as_('srx_accession'),
            star_table.estimated_number_of_cells,
            star_table.number_of_reads,
            meta_table.tissue,
            meta_table.tissue_ontology_term_id,
            meta_table.czi_collection_id,
        )
        .where(
            # lower-case both sides so the feature match ignores case
            fn.Lower(star_table.feature) == feature.lower()
        )
        .where(
            meta_table.lib_prep == "10x_Genomics"
        )
        .where(
            meta_table.czi_collection_id != ""
        )
    )

    # Render the query to SQL text, execute it, and return the rows as a DataFrame
    return pd.read_sql_query(str(main_query), conn)

# Query the database for the CZI-linked 10x Genomics records, then display them
with db_connect() as db_conn:
    df_cells = get_screcounter_star_results(
        db_conn,
        feature="GeneFull_Ex50pAS",
    )
df_cells

Unnamed: 0,srx_accession,estimated_number_of_cells,number_of_reads,tissue,tissue_ontology_term_id,czi_collection_id
0,SRX17412852,2245,177612277,Whole Lung Dissociate,UBERON:0002048,f86d6317-7215-409e-bfda-3f4ded3dadaa
1,NRX0000001,1119,19601351,blood,UBERON:0000178,ced320a1-29f3-47c1-a735-513c7084d508
...,...,...,...,...,...,...
639,NRX0000599,9006,87731895,brain,UBERON:0000955,0fd39ad7-5d2d-41c2-bda0-c55bde614bd
640,NRX0000600,8522,84178767,brain,UBERON:0000955,0fd39ad7-5d2d-41c2-bda0-c55bde614bd


In [32]:
# number of distinct CZI collections among the scBaseCount records
collection_ids = df_cells["czi_collection_id"]
collection_ids.nunique()

3

In [18]:
# Build the CxG dataset -> collection lookup table from Census release 2025-01-30
with cellxgene_census.open_soma(census_version="2025-01-30") as census:
    datasets_reader = census["census_info"]["datasets"].read(
        column_names=["collection_id", "dataset_id"]
    )
    # concatenate the read batches, then convert to pandas
    df_datasets = datasets_reader.concat().to_pandas()
df_datasets

Unnamed: 0,collection_id,dataset_id
0,8e880741-bf9a-4c8e-9227-934204631d2a,4eb29386-de81-452f-b3c0-e00844e8c7fd
1,8e880741-bf9a-4c8e-9227-934204631d2a,78d59e4a-82eb-4a61-a1dc-da974d7ea54b
...,...,...
1571,1ca90a2d-2943-483d-b678-b809bf464c30,c2876b1b-06d8-4d96-a56b-5304f815b99a
1572,1ca90a2d-2943-483d-b678-b809bf464c30,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3


In [33]:
# get cxg tissues data for each dataset
target_cols = ["dataset_id", "tissue", "tissue_general", "tissue_ontology_term_id"]
organisms = ["homo_sapiens", "mus_musculus"]
tissue_frames = []  # one frame per organism; kept separate from the final `cxg_tissues` table
with cellxgene_census.open_soma(census_version="2025-01-30") as census:
    for organism in organisms:
        tissue_frames.append(
            census["census_data"][organism]
            .obs.read(column_names=target_cols)
            .concat()
            # grouping on every column with no aggregations leaves one row
            # per distinct combination of the target columns
            .group_by(target_cols)
            .aggregate([])
            .to_pandas()
            # tag the rows with their organism without mutating a frame in place
            .assign(organism=organism)
        )
# stack the per-organism frames and attach each dataset's collection_id
cxg_tissues = pd.concat(tissue_frames).merge(df_datasets, on="dataset_id", how="inner")
cxg_tissues

Unnamed: 0,dataset_id,tissue,tissue_general,tissue_ontology_term_id,organism,collection_id
0,d7476ae2-e320-4703-8304-da5c42627e71,liver,liver,UBERON:0002107,homo_sapiens,a96133de-e951-4e2d-ace6-59db8b3bfb1d
1,0895c838-e550-48a3-a777-dbcd35d30272,caudate lobe of liver,liver,UBERON:0001117,homo_sapiens,44531dd9-1388-4416-a117-af0a99de2294
...,...,...,...,...,...,...
3283,dcfa2614-7ca7-4d82-814c-350626eccb26,embryo,embryo,UBERON:0000922,mus_musculus,45d5d2c3-bc28-4814-aed6-0bb6f0e11c82
3284,dcfd4feb-18a3-4b30-81d7-1b0c544a8ab3,embryo,embryo,UBERON:0000922,mus_musculus,45d5d2c3-bc28-4814-aed6-0bb6f0e11c82


In [23]:
# merge data sets
# NOTE(review): the join key is the collection, so every scBaseCount record is
# paired with each row of cxg_tissues that belongs to the same collection
df_cells_j = df_cells.merge(
    cxg_tissues,
    how="inner",
    left_on="czi_collection_id",
    right_on="collection_id",
    suffixes=("_scbasecount", "_cxg"),
)
df_cells_j

Unnamed: 0,srx_accession,estimated_number_of_cells,number_of_reads,tissue_scbasecount,tissue_ontology_term_id_scbasecount,czi_collection_id,dataset_id,tissue_cxg,tissue_general,tissue_ontology_term_id_cxg,organism,collection_id
0,NRX0000044,2342,88420005,blood,UBERON:0000178,ced320a1-29f3-47c1-a735-513c7084d508,b0e547f0-462b-4f81-b31b-5b0a5d96f537,blood,blood,UBERON:0000178,homo_sapiens,ced320a1-29f3-47c1-a735-513c7084d508
1,NRX0000045,1564,57677756,blood,UBERON:0000178,ced320a1-29f3-47c1-a735-513c7084d508,b0e547f0-462b-4f81-b31b-5b0a5d96f537,blood,blood,UBERON:0000178,homo_sapiens,ced320a1-29f3-47c1-a735-513c7084d508
...,...,...,...,...,...,...,...,...,...,...,...,...
375,NRX0000138,2440,102135552,blood,UBERON:0000178,ced320a1-29f3-47c1-a735-513c7084d508,b0e547f0-462b-4f81-b31b-5b0a5d96f537,blood,blood,UBERON:0000178,homo_sapiens,ced320a1-29f3-47c1-a735-513c7084d508
376,NRX0000139,1158,39675709,blood,UBERON:0000178,ced320a1-29f3-47c1-a735-513c7084d508,b0e547f0-462b-4f81-b31b-5b0a5d96f537,blood,blood,UBERON:0000178,homo_sapiens,ced320a1-29f3-47c1-a735-513c7084d508


In [26]:
# joined records per scBaseCount tissue ontology term
scbasecount_terms = df_cells_j["tissue_ontology_term_id_scbasecount"]
scbasecount_terms.value_counts()

tissue_ontology_term_id_scbasecount
UBERON:0000178    376
UBERON:0002048      1
Name: count, dtype: int64

In [27]:
# joined records per CxG tissue ontology term
cxg_terms = df_cells_j["tissue_ontology_term_id_cxg"]
cxg_terms.value_counts()

tissue_ontology_term_id_cxg
UBERON:0000178    376
UBERON:0002048      1
Name: count, dtype: int64

In [28]:
# differences in tissue ontology terms
# (rows where the scBaseCount term ID is not equal to the CxG term ID)
scbasecount_terms = df_cells_j["tissue_ontology_term_id_scbasecount"]
cxg_terms = df_cells_j["tissue_ontology_term_id_cxg"]
term_mismatch = scbasecount_terms != cxg_terms
df_cells_j.loc[term_mismatch]

Unnamed: 0,srx_accession,estimated_number_of_cells,number_of_reads,tissue_scbasecount,tissue_ontology_term_id_scbasecount,czi_collection_id,dataset_id,tissue_cxg,tissue_general,tissue_ontology_term_id_cxg,organism,collection_id


# session info

In [29]:
# list the packages (and versions) installed in the conda environment
!conda list

# packages in environment at /home/nickyoungblut/miniforge3/envs/SRAgent_nb:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       2_gnu    conda-forge
_python_abi3_support      1.0                  hd8ed1ab_2    conda-forge
aiobotocore               2.23.0             pyhd8ed1ab_0    conda-forge
aiohappyeyeballs          2.6.1              pyhd8ed1ab_0    conda-forge
aiohttp                   3.11.14                  pypi_0    pypi
aioitertools              0.12.0             pyhd8ed1ab_1    conda-forge
aiosignal                 1.3.2              pyhd8ed1ab_0    conda-forge
anndata                   0.11.4             pyhd8ed1ab_0    conda-forge
annotated-types           0.7.0                    pypi_0    pypi
anthropic                 0.57.1                   pypi_0    pypi
anyio                     4.9.0                    pypi_0    pypi
appdir