# Goals

* Summarize how much data has been processed by SRAgent

In [1]:
import os
from pathlib import Path
import pandas as pd
import plotnine as pn
from pypika import Query, Table, functions as fn

from SRAgent.db.connect import db_connect

In [2]:
# Notebook display settings: show every column, but truncate long frames to 4 rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 4

# Use the black-and-white plotnine theme for all subsequent plots
pn.theme_set(pn.theme_bw())

In [3]:
# get base of github repo
# subprocess is used instead of the IPython `!` shell escape so this cell is
# plain Python, and check=True makes it raise CalledProcessError when git
# fails (e.g. when not run from inside a git checkout).
import subprocess

base_dir = Path(
    subprocess.run(
        ["git", "rev-parse", "--show-toplevel"],
        capture_output=True,
        text=True,
        check=True,
    ).stdout.strip()
)
base_dir

PosixPath('/home/nickyoungblut/dev/python/scBaseCount_analysis')

In [5]:
# Set the DYNACONF environment variable to "prod" for this session
# NOTE(review): presumably this selects the production settings used by db_connect() -- confirm
os.environ.update({"DYNACONF": "prod"})

# Datasets processed

In [11]:
# Load the per-SRX metadata columns from the `srx_metadata` table
meta_table = Table('srx_metadata')
meta_columns = [
    meta_table.srx_accession,
    meta_table.is_illumina,
    meta_table.is_single_cell,
    meta_table.is_paired_end,
    meta_table.lib_prep,
    meta_table.tech_10x,
    meta_table.organism,
    meta_table.czi_collection_id,
]
# No WHERE clause: the czi_collection_id filter is applied in pandas below
query = Query.from_(meta_table).select(*meta_columns)

with db_connect() as conn:
    df_meta = pd.read_sql_query(str(query), conn)

# Keep only the records whose czi_collection_id is missing
lacks_collection_id = df_meta["czi_collection_id"].isna()
df_meta = df_meta.loc[lacks_collection_id]
df_meta

Unnamed: 0,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,organism,czi_collection_id
0,SRX22482811,yes,yes,yes,10x_Genomics,5_prime_gex,Homo sapiens,
1,ERX14208438,yes,no,yes,not_applicable,not_applicable,Homo sapiens,
...,...,...,...,...,...,...,...,...
208016,ERX9138738,yes,yes,no,10x_Genomics,5_prime_gex,Homo sapiens,
208017,SRX5679920,yes,no,yes,other,not_applicable,Homo sapiens,


In [27]:
# time to process, assuming a human takes `minutes_per_dataset` minutes per dataset
# (1 minute here; change the constant to model a different pace)
minutes_per_dataset = 1

# datasets * minutes each -> hours -> days, rounded to one decimal place
time_days = round(df_meta.shape[0] * minutes_per_dataset / 60 / 24, 1)
print(f"Time to process: {time_days} days")

Time to process: 142.8 days


In [28]:
# Estimated total spend at a flat $0.08 per dataset
cost_per_dataset = 0.08
total_cost = len(df_meta) * cost_per_dataset
print(f"Total cost: ${total_cost:.2f}")

Total cost: $16450.80


In [29]:
# Estimated total token usage at 63k tokens per dataset
tokens_per_dataset = 63_000
total_tokens = len(df_meta) * tokens_per_dataset
print(f"Total tokens: {total_tokens:,}")

Total tokens: 12,955,005,000


In [16]:
# Keep only 10x Genomics libraries that are Illumina, single-cell and paired-end
x = (
    df_meta["lib_prep"].eq("10x_Genomics")
    & df_meta["is_illumina"].eq('yes')
    & df_meta["is_single_cell"].eq('yes')
    & df_meta["is_paired_end"].eq('yes')
)
df_meta_f = df_meta.loc[x]
df_meta_f

Unnamed: 0,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,organism,czi_collection_id
0,SRX22482811,yes,yes,yes,10x_Genomics,5_prime_gex,Homo sapiens,
8,SRX13824065,yes,yes,yes,10x_Genomics,3_prime_gex,Mus musculus,
...,...,...,...,...,...,...,...,...
208006,SRX10579573,yes,yes,yes,10x_Genomics,3_prime_gex,Homo sapiens,
208015,SRX10614824,yes,yes,yes,10x_Genomics,3_prime_gex,Mus musculus,


In [None]:
# Load metadata only for SRX accessions that also appear in the STAR results table
meta_table = Table('srx_metadata')
star_table = Table('screcounter_star_results')

star_meta_columns = [
    meta_table.srx_accession,
    meta_table.is_illumina,
    meta_table.is_single_cell,
    meta_table.is_paired_end,
    meta_table.lib_prep,
    meta_table.tech_10x,
    meta_table.organism,
    meta_table.czi_collection_id,
]

# Inner join on accession == STAR sample; DISTINCT removes duplicate result rows
# NOTE(review): presumably an accession can match several STAR rows -- confirm
joined = (
    Query.from_(meta_table)
    .inner_join(star_table)
    .on(meta_table.srx_accession == star_table.sample)
)
query = joined.select(*star_meta_columns).distinct()

with db_connect() as conn:
    df_meta_star = pd.read_sql_query(str(query), conn)

# Keep only the records whose czi_collection_id is missing
lacks_collection_id = df_meta_star["czi_collection_id"].isna()
df_meta_star = df_meta_star.loc[lacks_collection_id]
df_meta_star

Unnamed: 0,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,organism,czi_collection_id
0,ERX10016429,yes,yes,yes,10x_Genomics,3_prime_gex,Mus musculus,
1,ERX10016430,yes,yes,yes,10x_Genomics,3_prime_gex,Mus musculus,
...,...,...,...,...,...,...,...,...
59553,SRX9995797,yes,yes,yes,10x_Genomics,3_prime_gex,Mus musculus,
59554,SRX9995798,yes,yes,yes,10x_Genomics,3_prime_gex,Mus musculus,
