# Goals

* Check on the test and/or production databases

In [1]:
# Load environment variables from a .env file before any later cell reads them.
# override=True: values from the .env file presumably replace variables already set
# in the process environment — confirm against the python-dotenv documentation.
# The call is the cell's last expression, so its return value is the cell output.
from dotenv import load_dotenv
load_dotenv(override=True)

True

In [2]:
import os
import pandas as pd
from pypika import Query, Table, Field, Column, Criterion

In [11]:
from SRAgent.db.connect import db_connect
from SRAgent.db.update import db_update
from SRAgent.db.utils import db_list_tables, db_glimpse_tables, db_get_table, execute_query

In [12]:
# Select the production configuration for this session
# (presumably read by the project's settings layer when connecting — confirm)
os.environ["DYNACONF"] = "prod"

# Summary

In [14]:
# Show the name of every table in the connected (prod) database, one per line
with db_connect() as conn:
    print(*db_list_tables(conn), sep="\n")

screcounter_star_results
eval
screcounter_trace
srx_srr
srx_metadata
screcounter_log
screcounter_star_params


# Database updates

## CZI datasets

### Organism

In [29]:
# Load every distinct srx_metadata row that has a CZI collection id
# (i.e. czi_collection_id is not NULL) into a pandas DataFrame.
tbl = Table("srx_metadata")
stmt = (
    Query.from_(tbl)
    .select(tbl.star)
    .distinct()
    .where(tbl.czi_collection_id.isnull().negate())
)

# The query builder object is rendered to a SQL string before being handed to pandas
with db_connect() as conn:
    srx_metadata_cxg = pd.read_sql(str(stmt), conn)
srx_metadata_cxg

Unnamed: 0,database,entrez_id,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism,tissue,disease,purturbation,cell_line,czi_collection_id,czi_collection_name,notes,created_at,updated_at
0,sra,5980061,SRX4401543,yes,no,no,other,not_applicable,not_applicable,human,Human prostate basal epithelia,unsure,unsure,"D4PrF_BE, CD326+/CD271+/CD26-",e2a4a67f-6a18-431a-ab9c-6e77dd31cc80,A Cellular Anatomy of the Normal Adult Human P...,Metadata obtained by SRAgent,2025-01-03 18:39:40.221883,2025-02-18 01:17:18.338743
1,sra,19008386,SRX13671170,yes,yes,yes,other,not_applicable,single_nucleus,human,Dorsolateral Prefrontal cortex (Brodmann area 46),unsure,unsure,unsure,91c8e321-566f-4f9d-b89e-3a164be654d5,Neuron type-specific effects of human aging an...,Metadata obtained by SRAgent,2025-01-03 18:39:40.221883,2025-02-18 01:48:15.592726
2,sra,21270522,ERX8792046,yes,yes,no,10x_Genomics,5_prime_gex,single_cell,human,Lung-draining lymph node,Prostate cancer,Saquinavir-NO treatment,PC-3,62ef75e4-cbea-454e-a0ce-998ec40223d3,Cross-tissue immune cell analysis reveals tiss...,Metadata obtained by SRAgent,2025-01-03 18:39:40.221883,2025-02-16 03:08:04.360701
3,sra,33748102,ERX12098959,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,human,intercostal muscle,intracranial haemorrhage,unsure,nuclei from human skeletal muscle biopsies,2d40e6a7-f2fd-49ba-9db9-6b97e4c6dad5,Human skeletal muscle ageing atlas,Metadata obtained by SRAgent,2025-01-03 18:39:40.221883,2025-02-15 15:20:42.842344
4,sra,30534579,SRX22540304,yes,yes,yes,10x_Genomics,5_prime_gex,single_cell,human,Peripheral Blood Mononuclear Cells (PBMCs),COVID-19,COVID-19 mRNA vaccine (BNT162b2),PBMCs,ecb739c5-fe0d-4b48-81c6-217c4d64eec4,COVID-19 mRNA vaccine elicits a potent adaptiv...,Metadata obtained by SRAgent,2025-01-03 18:39:40.221883,2025-02-15 23:28:22.014208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6055,sra,30533995,SRX22539720,yes,yes,yes,10x_Genomics,5_prime_gex,single_cell,human,PBMCs,COVID-19,COVID-19 vaccination,PBMCs,ecb739c5-fe0d-4b48-81c6-217c4d64eec4,COVID-19 mRNA vaccine elicits a potent adaptiv...,Metadata obtained by SRAgent,2025-01-03 18:39:40.221883,2025-02-17 01:40:39.811736
6056,sra,19008361,SRX13671145,yes,no,yes,other,not_applicable,single_nucleus,human,Dorsolateral Prefrontal Cortex (Brodmann area 46),Alzheimer's Disease,unsure,unsure,91c8e321-566f-4f9d-b89e-3a164be654d5,Neuron type-specific effects of human aging an...,Metadata obtained by SRAgent,2025-01-03 18:39:40.221883,2025-02-17 03:59:44.582294
6057,sra,5987222,SRX4405260,yes,yes,no,MARS-seq,not_applicable,single_cell,human,Bone Marrow,Multiple myeloma (MM),Untreated,Control donor (AB2835),2a0b02c0-fea6-47bd-92b9-9b03f5d2580c,Single cell dissection of plasma cell heteroge...,Metadata obtained by SRAgent,2025-01-03 18:39:40.221883,2025-02-15 14:20:46.277606
6058,sra,9278658,SRX7059587,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,human,brain,"glioblastoma, stage IV",unsure,unsure,999f2a15-3d7e-440b-96ae-2c806799c08c,"Harmonized single-cell landscape, intercellula...",Metadata obtained by SRAgent,2025-01-03 18:39:40.221883,2025-02-15 03:37:30.793431


In [26]:
# Summarize record counts per organism, most common first.
# dropna=False keeps records with a missing organism as their own (NaN) group;
# by default groupby drops null keys, which would hide exactly the records
# that the following cells go looking for.
srx_metadata_cxg.groupby("organism", dropna=False).size().sort_values(ascending=False)

organism
human                5000
mouse                1009
Macaca mulatta         36
Rattus norvegicus       5
Sus scrofa              4
Danio rerio             3
Gallus gallus           2
other                   1
dtype: int64

In [9]:
# Records whose organism is the catch-all `other`, shown as accession + organism only
is_other = srx_metadata_cxg["organism"].eq("other")
srx_metadata_cxg.loc[is_other, ["srx_accession", "organism"]]

Unnamed: 0,srx_accession,organism
3640,SRX13549222,other


In [10]:
# # update "other" organisms to correct
# idx = [
#     ['SRX9556667', 'human'], 
#     ['ERX12060278', 'human'], 
#     ['ERX12060274', 'human'], 
#     ['ERX12060276', 'human'], 
#     ['ERX12060273', 'human'], 
#     ['ERX12060297', 'human'], 
#     ['ERX12060299', 'human'], 
#     ['ERX12060275', 'human'], 
#     ['SRX13549222', 'other'],
#     ['SRX9556651', 'human'], 
#     ['ERX12060277', 'human'], 
# ]
# idx = pd.DataFrame(idx, columns=["srx_accession", "organism"])
# idx

In [27]:
# Filter to records whose organism is missing.
# A value counts as missing if it is a real null or the literal string "NaN";
# comparing with == "NaN" alone never matches real nulls.
organism_missing = srx_metadata_cxg["organism"].isna() | srx_metadata_cxg["organism"].eq("NaN")
srx_metadata_cxg.loc[organism_missing, ["srx_accession", "organism"]]

Unnamed: 0,srx_accession,organism


In [28]:
# Corrected organism for each record whose organism was missing ("NaN").
# This cell must stay active: the merge cell below reads `idx`, so leaving the
# definition commented out raises NameError on a fresh kernel.
# Columns: srx_accession (join key) and organism (the value to write back).
idx = pd.DataFrame(
    [
        ['SRX22385964', 'human'],
        ['SRX22385973', 'human'],
        ['SRX22385959', 'human'],
        ['SRX22385965', 'human'],
        ['SRX18945635', 'mouse'],
        ['SRX22385974', 'human'],
        ['SRX22385969', 'human'],
        ['SRX18945636', 'mouse'],
        ['SRX22385976', 'human'],
        ['SRX22385960', 'human'],
        ['SRX18945634', 'mouse'],
        ['SRX22385962', 'human'],
        ['SRX22385972', 'human'],
    ],
    columns=["srx_accession", "organism"],
)
idx

In [22]:
# Attach the corrected organism labels to the CZI records by accession.
# `idx` (srx_accession -> organism) must already be defined when this cell runs.
# The stale organism column is dropped first so the merged frame carries only the
# corrected one; the inner join keeps just the accessions present in both frames.
srx_metadata_cxg_f = (
    srx_metadata_cxg
    .drop(columns=["organism"])
    .merge(idx, on="srx_accession", how="inner")
    .loc[:, ["database", "entrez_id", "srx_accession", "organism"]]
)
srx_metadata_cxg_f

Unnamed: 0,database,entrez_id,srx_accession,organism
0,sra,30368187,SRX22385964,human
1,sra,30368196,SRX22385973,human
2,sra,30368182,SRX22385959,human
3,sra,30368188,SRX22385965,human
4,sra,26077148,SRX18945635,mouse
5,sra,30368197,SRX22385974,human
6,sra,30368192,SRX22385969,human
7,sra,26077149,SRX18945636,mouse
8,sra,30368199,SRX22385976,human
9,sra,30368183,SRX22385960,human


In [23]:
# Push the corrected organism values in srx_metadata_cxg_f to the "srx_metadata" table.
# NOTE(review): DYNACONF was set to 'prod' earlier in this notebook, so this
# presumably writes to the production database — confirm before re-running.
# NOTE(review): how db_update matches rows (which columns act as keys) is not
# visible here — verify against SRAgent.db.update.
with db_connect() as conn:
    db_update(srx_metadata_cxg_f, "srx_metadata", conn)

### NaN

In [39]:
# Select the records whose is_illumina value is missing and keep only their identifiers.
# A value counts as missing if it is a real null or the literal string "NaN";
# comparing with == "NaN" alone never matches real nulls.
illumina_flag = srx_metadata_cxg["is_illumina"]
df = srx_metadata_cxg.loc[
    illumina_flag.isna() | illumina_flag.eq("NaN"),
    ["entrez_id", "srx_accession"],
]

# write to czi_annotate/nan.csv
# (create the directory first: to_csv does not create missing parent directories)
os.makedirs("czi_annotate", exist_ok=True)
df.to_csv("czi_annotate/nan.csv", index=False)

In [None]:
# Run the SRAgent `metadata` command on the accession list written to czi_annotate/nan.csv.
# NOTE(review): presumably re-derives metadata for those records and, with
# --use-database, records the results in the configured database — confirm flag semantics.
!SRAgent metadata --no-summaries --use-database --no-srr czi_annotate/nan.csv

### Tech 10x

In [33]:
# Flag inconsistent annotations: lib_prep is not "10x_Genomics", yet tech_10x is
# something other than "not_applicable".
# Rows where these fields are null also pass both tests (null != anything is True),
# so records with no metadata at all show up here too.
lib_not_10x = srx_metadata_cxg["lib_prep"].ne("10x_Genomics")
tech_is_set = srx_metadata_cxg["tech_10x"].ne("not_applicable")
srx_metadata_cxg[lib_not_10x & tech_is_set]

Unnamed: 0,database,entrez_id,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism,tissue,disease,purturbation,cell_line,czi_collection_id,czi_collection_name,notes,created_at,updated_at
1042,sra,19066359,SRX13707644,yes,no,yes,other,other,single_nucleus,human,Dorsolateral Prefrontal Cortex (Brodmann area 46),unsure,unsure,unsure,91c8e321-566f-4f9d-b89e-3a164be654d5,Neuron type-specific effects of human aging an...,Metadata obtained by SRAgent,2025-01-03 18:39:40.221883,2025-02-15 02:12:50.913645
1446,sra,29553044,ERX10811424,yes,yes,no,other,multiome,single_nucleus,human,sinoatrial node,normal,unsure,unsure,3116d060-0a8e-4767-99bb-e866badea1ed,Spatially resolved multiomics of human cardiac...,Metadata obtained by SRAgent,2025-01-03 18:39:40.221883,2025-02-15 23:05:43.609785
1648,sra,30368185,SRX22385962,,,,,,,human,,,,,0c8a364b-97b5-4cc8-a593-23c38c6f0ac5,Single-cell and spatial transcriptomics charac...,New dataset found by Find-Datasets agent,2025-01-03 18:39:40.221883,2025-02-19 04:03:50.873291
1760,sra,30368192,SRX22385969,,,,,,,human,,,,,0c8a364b-97b5-4cc8-a593-23c38c6f0ac5,Single-cell and spatial transcriptomics charac...,New dataset found by Find-Datasets agent,2025-01-03 18:39:40.221883,2025-02-19 04:03:50.873291
2171,sra,29561204,ERX10875792,yes,yes,no,other,other,single_cell,human,heart left ventricle,normal,none,none,3116d060-0a8e-4767-99bb-e866badea1ed,Spatially resolved multiomics of human cardiac...,Metadata obtained by SRAgent,2025-01-03 18:39:40.221883,2025-02-17 09:33:18.347793
2605,sra,30368195,SRX22385972,,,,,,,human,,,,,0c8a364b-97b5-4cc8-a593-23c38c6f0ac5,Single-cell and spatial transcriptomics charac...,New dataset found by Find-Datasets agent,2025-01-03 18:39:40.221883,2025-02-19 04:03:50.873291
2841,sra,30368196,SRX22385973,,,,,,,human,,,,,0c8a364b-97b5-4cc8-a593-23c38c6f0ac5,Single-cell and spatial transcriptomics charac...,New dataset found by Find-Datasets agent,2025-01-03 18:39:40.221883,2025-02-19 04:03:50.873291
2975,sra,31085102,ERX11760589,yes,yes,no,other,other,single_cell,human,skin,normal (healthy skin),bestatin (an inhibitor of leukotriene A4 hydro...,skin cell,34f12de7-c5e5-4813-a136-832677f98ac8,Multi-scale spatial mapping of cell population...,Metadata obtained by SRAgent,2025-01-03 18:39:40.221883,2025-02-14 06:15:41.255389
3014,sra,11539510,SRX8874821,yes,yes,yes,Smart-seq2,3_prime_gex,single_cell,mouse,Dorsal root ganglion neuron,neuropathic pain,SNI 14d,not applicable,03608e22-227a-4492-910b-3cb3f16f952e,iPain atlas,Metadata obtained by SRAgent,2025-01-03 18:39:40.221883,2025-02-15 18:34:42.489433
3227,sra,30368187,SRX22385964,,,,,,,human,,,,,0c8a364b-97b5-4cc8-a593-23c38c6f0ac5,Single-cell and spatial transcriptomics charac...,New dataset found by Find-Datasets agent,2025-01-03 18:39:40.221883,2025-02-19 04:03:50.873291


# sessionInfo

In [24]:
!pip list

Package                       Version         Editable project location
----------------------------- --------------- --------------------------------------
aiohappyeyeballs              2.4.3
aiohttp                       3.10.10
aiosignal                     1.3.1
annotated-types               0.7.0
anyio                         4.6.2.post1
asttokens                     2.4.1
attrs                         24.2.0
beautifulsoup4                4.12.3
biopython                     1.84
cachetools                    5.5.0
certifi                       2024.8.30
charset-normalizer            3.4.0
comm                          0.2.2
dataclasses-json              0.6.7
db-dtypes                     1.3.1
debugpy                       1.8.7
decorator                     5.1.1
distro                        1.9.0
dynaconf                      3.2.6
exceptiongroup                1.2.2
executing                     2.1.0
frozenlist                    1.5.0
google-api-core               2.23.0
g