# Goals

* Summarize statistics on how much data has been processed by SRAgent

In [1]:
import os
from pathlib import Path
import pandas as pd
import plotnine as pn
from pypika import Query, Table, functions as fn

from SRAgent.db.connect import db_connect

In [2]:
# pandas display defaults: show every column, truncate displayed frames to 4 rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 4
# make the black-and-white theme the default for plotnine figures
pn.theme_set(pn.theme_bw())

In [None]:
# get base of github repo
base_dir = !git rev-parse --show-toplevel
base_dir = Path(base_dir[0])
base_dir

PosixPath('/home/nickyoungblut/dev/python/scBaseCount_analysis')

In [4]:
# select the "prod" configuration environment
# NOTE(review): presumably read by SRAgent's settings when db_connect() runs — confirm
os.environ.update(DYNACONF="prod")

# Datasets processed

In [5]:
# Load the classification columns of srx_metadata and keep only the records
# that have no czi_collection_id.
meta_table = Table('srx_metadata')
meta_columns = [
    "srx_accession",
    "is_illumina",
    "is_single_cell",
    "is_paired_end",
    "lib_prep",
    "tech_10x",
    "organism",
    "czi_collection_id",
]
query = Query.from_(meta_table).select(
    *(getattr(meta_table, column) for column in meta_columns)
)

# the czi_collection_id filter is applied in pandas (missing values only),
# not in the SQL query
with db_connect() as conn:
    df_meta = pd.read_sql_query(str(query), conn).loc[
        lambda frame: frame["czi_collection_id"].isna()
    ]
df_meta

Unnamed: 0,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,organism,czi_collection_id
0,SRX22482811,yes,yes,yes,10x_Genomics,5_prime_gex,Homo sapiens,
1,ERX14208438,yes,no,yes,not_applicable,not_applicable,Homo sapiens,
...,...,...,...,...,...,...,...,...
214387,NRX0000735,yes,yes,yes,10x_Genomics,3_prime_gex,Homo sapiens,
214388,NRX0000736,yes,yes,yes,10x_Genomics,3_prime_gex,Homo sapiens,


In [None]:
# count the records that carry a (non-missing) srx_accession
n_datasets = len(df_meta.dropna(subset=["srx_accession"]))
n_datasets

208939

In [None]:
# count the records whose lib_prep is "10x_Genomics"
int(df_meta["lib_prep"].eq("10x_Genomics").sum())

105343

In [8]:
# manual processing time, assuming a human needs `min_per_dataset` minutes per dataset
min_per_dataset = 1
total_minutes = n_datasets * min_per_dataset
time_hours = total_minutes / 60
time_days = time_hours / 24
print(f"Time to process: {time_hours:.1f} hours")
print(f"Time to process: {time_days:.1f} days")

Time to process: 3482.3 hours
Time to process: 145.1 days


In [9]:
# estimated total spend at a flat $0.08 per row of df_meta
# NOTE(review): this counts every row of df_meta, whereas the time and token
# estimates use n_datasets (rows with an srx_accession) — confirm which count is intended
cost_per_dataset = 0.08
total_cost = len(df_meta) * cost_per_dataset
print(f"Total cost: ${total_cost:.2f}")

Total cost: $16909.28


In [10]:
# estimated token usage, assuming 63k tokens are consumed per dataset
tokens_per_dataset = 63_000
total_tokens = n_datasets * tokens_per_dataset
print(f"Total tokens: {total_tokens:,}")

Total tokens: 13,163,157,000


In [None]:
# keep only 10x Genomics records flagged as Illumina, single-cell and paired-end
x = (
    df_meta["lib_prep"].eq("10x_Genomics")
    & df_meta["is_illumina"].eq('yes')
    & df_meta["is_single_cell"].eq('yes')
    & df_meta["is_paired_end"].eq('yes')
)
df_meta_f = df_meta.loc[x]
df_meta_f

Unnamed: 0,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,organism,czi_collection_id
0,SRX22482811,yes,yes,yes,10x_Genomics,5_prime_gex,Homo sapiens,
8,SRX13824065,yes,yes,yes,10x_Genomics,3_prime_gex,Mus musculus,
...,...,...,...,...,...,...,...,...
214387,NRX0000735,yes,yes,yes,10x_Genomics,3_prime_gex,Homo sapiens,
214388,NRX0000736,yes,yes,yes,10x_Genomics,3_prime_gex,Homo sapiens,


In [12]:
# Load metadata for 10x Genomics records that have at least one STAR result
# row for the "GeneFull_Ex50pAS" feature, then drop records with a czi_collection_id.
meta_table = Table('srx_metadata')
star_table = Table('screcounter_star_results')
meta_columns = [
    "srx_accession",
    "is_illumina",
    "is_single_cell",
    "is_paired_end",
    "lib_prep",
    "tech_10x",
    "organism",
    "czi_collection_id",
]
same_sample = meta_table.srx_accession == star_table.sample
is_gene_full = star_table.feature == "GeneFull_Ex50pAS"
is_10x = meta_table.lib_prep == "10x_Genomics"

# DISTINCT collapses the repeated metadata rows the join can produce
query = (
    Query.from_(meta_table)
    .inner_join(star_table)
    .on(same_sample)
    .select(*(getattr(meta_table, column) for column in meta_columns))
    .where(is_gene_full)
    .where(is_10x)
    .distinct()
)

with db_connect() as conn:
    df_meta_star = pd.read_sql_query(str(query), conn).loc[
        lambda frame: frame["czi_collection_id"].isna()
    ]
df_meta_star

Unnamed: 0,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,organism,czi_collection_id
0,SRX9724312,yes,yes,yes,10x_Genomics,5_prime_gex,Mus musculus,
1,SRX24117407,yes,yes,yes,10x_Genomics,3_prime_gex,Mus musculus,
...,...,...,...,...,...,...,...,...
61379,SRX21641468,yes,yes,yes,10x_Genomics,3_prime_gex,Homo sapiens,
61380,SRX16217060,yes,yes,yes,10x_Genomics,3_prime_gex,Homo sapiens,


In [8]:
# Number of CPU hours saved by using SRAgent classifications.
# Computed as: (CPU minutes per run per organism) x (number of organisms)
# x (datasets that were not processed), converted from minutes to hours.
cpu_min_per_run_org = 8  # NOTE(review): presumably CPU minutes per dataset per organism — confirm
num_orgs = 27
# hard-coded snapshot; equals the df_meta row count implied by the cost output
# ($16909.28 / $0.08 per dataset)
total_datasets = 211366
# NOTE(review): hard-coded; presumably the number of datasets already processed
# at the time of writing — confirm against df_meta_star
total_proc_datasets = 60740

# stored under a descriptive name so the boolean mask `x` built by the
# 10x filter cell is not overwritten with a float
cpu_hours_saved = (cpu_min_per_run_org * num_orgs) * (total_datasets - total_proc_datasets) / 60
print(f"Number of CPU hours saved by using SRAgent classifications: {cpu_hours_saved:.1f} hours")

Number of CPU hours saved by using SRAgent classifications: 542253.6 hours


# Session info

In [5]:
# record the packages installed in the conda environment, for reproducibility
!conda list

# packages in environment at /home/nickyoungblut/miniforge3/envs/SRAgent:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       2_gnu    conda-forge
aiohappyeyeballs          2.6.1                    pypi_0    pypi
aiohttp                   3.12.14                  pypi_0    pypi
aiosignal                 1.4.0                    pypi_0    pypi
annotated-types           0.7.0                    pypi_0    pypi
anthropic                 0.57.1                   pypi_0    pypi
anyio                     4.9.0                    pypi_0    pypi
appdirs                   1.4.4                    pypi_0    pypi
asgiref                   3.9.1                    pypi_0    pypi
asttokens                 3.0.0              pyhd8ed1ab_1    conda-forge
attrs                     25.3.0                   pypi_0    pypi
backoff                   2.2.1             