# Goal

* Obtain the overlap between the CELLxGENE datasets and scBaseCount

In [1]:
import os
from pathlib import Path
import pandas as pd
import plotnine as pn
from pypika import Query, Table, functions as fn
import cellxgene_census
from SRAgent.db.connect import db_connect

In [2]:
# Display settings: show every column, but truncate long frames to 4 rows
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 4)

# Use the black-and-white plotnine theme for all plots
pn.theme_set(pn.theme_bw())

In [3]:
# set tenant
# NOTE(review): presumably DYNACONF selects the settings environment ("prod")
# that db_connect() uses when opening the connection — confirm against the
# SRAgent configuration.
os.environ["DYNACONF"] = "prod"

In [19]:
def get_screcounter_star_results(conn, feature="GeneFull_Ex50pAS"):
    """
    Fetch per-sample rows of `screcounter_star_results` joined to `srx_metadata`.

    The two tables are joined on `screcounter_star_results.sample ==
    srx_metadata.srx_accession`. Rows are kept only when the feature matches
    `feature` (compared case-insensitively) and `lib_prep` is "10x_Genomics".

    Args:
        conn: Database connection object handed to `pd.read_sql_query`.
        feature: Feature name to keep (case-insensitive match).
    Returns:
        A pandas DataFrame with the selected result and metadata columns;
        the `sample` column is returned as `srx_accession`.
    """
    star = Table('screcounter_star_results')
    meta = Table('srx_metadata')

    # Columns taken from the results table
    star_columns = [
        star.sample.as_('srx_accession'),
        star.feature,
        star.estimated_number_of_cells,
        star.median_reads_per_cell,
        star.median_umi_per_cell,
        star.median_feature_per_cell,
        star.number_of_reads,
    ]

    # Columns taken from the sample metadata table
    meta_columns = [
        meta.lib_prep,
        meta.tech_10x,
        meta.organism,
        meta.tissue,
        meta.tissue_ontology_term_id,
        meta.cell_prep,
        meta.czi_collection_id,
        meta.czi_collection_name,
    ]

    # Row filters
    feature_matches = fn.Lower(star.feature) == feature.lower()
    is_10x = meta.lib_prep == "10x_Genomics"

    # Assemble the statement: join first, then projection, then filters
    joined = Query.from_(star).join(meta).on(star.sample == meta.srx_accession)
    statement = (
        joined
        .select(*star_columns, *meta_columns)
        .where(feature_matches)
        .where(is_10x)
    )

    # Render to SQL text and load the result set into a DataFrame
    sql = str(statement)
    return pd.read_sql_query(sql, conn)

# Run the query; the connection is only needed while the query executes
with db_connect() as conn:
    df_cells = get_screcounter_star_results(
        conn,
        feature="GeneFull_Ex50pAS",
    )

df_cells

Unnamed: 0,srx_accession,feature,estimated_number_of_cells,median_reads_per_cell,median_umi_per_cell,median_feature_per_cell,number_of_reads,lib_prep,tech_10x,organism,tissue,tissue_ontology_term_id,cell_prep,czi_collection_id,czi_collection_name
0,SRX20732890,GeneFull_Ex50pAS,11833,15253.0,8551.0,3025.0,429254174,10x_Genomics,3_prime_gex,Homo sapiens,breast,UBERON:0000310,single_cell,,
1,ERX10398458,GeneFull_Ex50pAS,7471,9093.0,4582.0,2276.0,197317627,10x_Genomics,3_prime_gex,Homo sapiens,Retina,UBERON:0000966,single_cell,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61379,NRX0000599,GeneFull_Ex50pAS,9006,1632.0,1178.0,878.0,87731895,10x_Genomics,3_prime_gex,Callithrix jacchus,brain,UBERON:0000955,single_nucleus,0fd39ad7-5d2d-41c2-bda0-c55bde614bd,A marmoset brain cell census reveals influence...
61380,NRX0000600,GeneFull_Ex50pAS,8522,2000.0,1516.0,1102.0,84178767,10x_Genomics,3_prime_gex,Callithrix jacchus,brain,UBERON:0000955,single_nucleus,0fd39ad7-5d2d-41c2-bda0-c55bde614bd,A marmoset brain cell census reveals influence...


In [None]:
# write out all data
#df_cells.to_csv("20250909_scBaseCount_metadata.csv", index=False)

In [None]:
# just cellxgene collections
# Keep only rows that carry a CELLxGENE collection id. All columns are retained:
# the CSV export further down selects srx_accession and the czi_* columns, and a
# trailing `[[]]` here would select zero columns and make that export fail.
df_cells_cxg = df_cells[df_cells["czi_collection_id"].notna()]
df_cells_cxg

Unnamed: 0,srx_accession,feature,estimated_number_of_cells,median_reads_per_cell,median_umi_per_cell,median_feature_per_cell,number_of_reads,lib_prep,tech_10x,organism,tissue,tissue_ontology_term_id,cell_prep,czi_collection_id,czi_collection_name
24328,SRX17412852,GeneFull_Ex50pAS,2245,33209.0,4471.0,1627.0,177612277,10x_Genomics,3_prime_gex,Homo sapiens,Whole Lung Dissociate,UBERON:0002048,single_cell,f86d6317-7215-409e-bfda-3f4ded3dadaa,Human CellCards Multi-Study CellRef 1.0 Atlas
60498,NRX0000005,GeneFull_Ex50pAS,2419,19742.0,5116.0,2004.0,85746470,10x_Genomics,5_prime_gex,Homo sapiens,blood,UBERON:0000178,single_cell,ced320a1-29f3-47c1-a735-513c7084d508,Asian immune diversity atlas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61304,NRX0000461,GeneFull_Ex50pAS,13987,3812.0,2617.0,1812.0,285323199,10x_Genomics,3_prime_gex,Callithrix jacchus,brain,UBERON:0000955,single_nucleus,0fd39ad7-5d2d-41c2-bda0-c55bde614bd,A marmoset brain cell census reveals influence...
61305,NRX0000462,GeneFull_Ex50pAS,14795,3487.0,2788.0,1737.0,262750933,10x_Genomics,3_prime_gex,Callithrix jacchus,brain,UBERON:0000955,single_nucleus,0fd39ad7-5d2d-41c2-bda0-c55bde614bd,A marmoset brain cell census reveals influence...


In [None]:
# dump to csv: one row per unique (accession, collection id, collection name)
(
    df_cells_cxg[["srx_accession", "czi_collection_id", "czi_collection_name"]]
    .drop_duplicates()
    .to_csv("cxg_srx_accessions.csv", index=False)
)

# session info 

In [21]:
# Record the packages installed in the active conda environment
!conda list

# packages in environment at /home/nickyoungblut/miniforge3/envs/SRAgent_nb:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       2_gnu    conda-forge
_python_abi3_support      1.0                  hd8ed1ab_2    conda-forge
aiobotocore               2.23.0             pyhd8ed1ab_0    conda-forge
aiohappyeyeballs          2.6.1              pyhd8ed1ab_0    conda-forge
aiohttp                   3.11.14                  pypi_0    pypi
aioitertools              0.12.0             pyhd8ed1ab_1    conda-forge
aiosignal                 1.3.2              pyhd8ed1ab_0    conda-forge
anndata                   0.11.4             pyhd8ed1ab_0    conda-forge
annotated-types           0.7.0                    pypi_0    pypi
anthropic                 0.57.1                   pypi_0    pypi
anyio                     4.9.0                    pypi_0    pypi
appdir