# Goal

* Check whether any of the SRA experiments from the Replogle et al. dataset are already present in the SRAgent SQL database

In [6]:
import os
import subprocess

import pandas as pd
from pypika import Query, Table, Field, Column, Criterion

from SRAgent.db.connect import db_connect

In [8]:
# Select the production SQL database for the queries in this notebook.
# NOTE(review): this is set after `SRAgent.db.connect` has been imported —
# confirm the setting is read when connecting rather than at import time.
os.environ.update(DYNACONF='prod')

In [4]:
# Resolve the repository's top-level directory so that data paths do not
# depend on where the notebook was launched from.
# `check=True` makes a failing git call (e.g. not inside a repository) raise
# CalledProcessError instead of leaving git's error text in `repo_base_dir`.
repo_base_dir = subprocess.run(
    ["git", "rev-parse", "--show-toplevel"],
    capture_output=True,
    text=True,
    check=True,
).stdout.strip()

In [7]:
# Load the Replogle SRX/SRR accession table (one row per experiment/run pair)
# from the repository's data directory.
replogle_path_parts = ('data', 'Replogle2022', 'PRJNA83156_SRX-SRR.csv')
infile = os.path.join(repo_base_dir, *replogle_path_parts)
PRJNA83156 = pd.read_csv(infile)
PRJNA83156

Unnamed: 0,exp_name,srx,srr
0,KD6_seq1_essential_mRNA_lane_1_S49_L004,SRX15390082,SRR19330645
1,KD6_seq1_essential_mRNA_lane_1_S49_L001,SRX15390083,SRR19330644
2,KD6_seq1_essential_sgRNA_lane_1_S1_L002,SRX15390084,SRR19330643
3,KD6_seq1_essential_mRNA_lane_9_S33_L004,SRX15390085,SRR19330642
4,KD6_seq1_essential_mRNA_lane_9_S33_L003,SRX15390086,SRR19330641
...,...,...,...
7878,KD6_6_essential,SRX15703621,SRR19653804
7879,KD6_7_essential,SRX15703622,SRR19653803
7880,KD6_8_essential,SRX15703623,SRR19653802
7881,KD6_9_essential,SRX15703624,SRR19653801


In [9]:
# Load the per-SRX metadata records from the SRAgent database.
# No WHERE clause is applied, so every row of `srx_metadata` is fetched.
tbl = Table("srx_metadata")

# columns to retrieve for each SRX record
selected_fields = (
    tbl.srx_accession,
    tbl.organism,
    tbl.tissue,
    tbl.lib_prep,
    tbl.tech_10x,
    tbl.czi_collection_id,
)
stmt = Query.from_(tbl).select(*selected_fields)

# run the rendered SQL and collect the result as a DataFrame
with db_connect() as conn:
    df_srx_meta = pd.read_sql(str(stmt), conn)
df_srx_meta

Unnamed: 0,srx_accession,organism,tissue,lib_prep,tech_10x,czi_collection_id
0,SRX3809288,Mus musculus,cerebellum,10x_Genomics,3_prime_gex,d86517f0-fa7e-4266-b82e-a521350d6d36
1,SRX13549220,Gallus gallus,liver,10x_Genomics,3_prime_gex,74e10dc4-cbb2-4605-a189-8a1cd8e44d8c
2,SRX10417554,Macaca mulatta,fetal lungs,10x_Genomics,3_prime_gex,6e067060-f7e4-466c-86f3-ec3dd33c0381
3,SRX2003956,Homo sapiens,pancreas,CEL-seq2,not_applicable,6e8c5415-302c-492a-a5f9-f29c57ff18fb
4,ERX2814788,Homo sapiens,skin,Smart-seq2,not_applicable,
...,...,...,...,...,...,...
80249,SRX27713793,Mus musculus,kidney,other,not_applicable,
80250,SRX27713794,Mus musculus,kidney,other,not_applicable,
80251,SRX27713795,Mus musculus,kidney,other,not_applicable,
80252,SRX27713796,Mus musculus,kidney,other,not_applicable,


In [12]:
# Inner-join the Replogle accessions to the database metadata on the SRX
# accession; only experiments present in both tables survive the join, so an
# empty result means there is no overlap.
PRJNA83156_j = PRJNA83156.merge(
    df_srx_meta,
    how='inner',
    left_on='srx',
    right_on='srx_accession',
)
PRJNA83156_j

Unnamed: 0,exp_name,srx,srr,srx_accession,organism,tissue,lib_prep,tech_10x,czi_collection_id
