# Goals

* Assess the accuracy of the SRAgent for obtaining the correct SRA metadata

In [42]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import plotnine as pn
from pypika import Query, Table, functions as fn

from SRAgent.db.connect import db_connect
from SRAgent.db.update import db_update

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 4)
pn.theme_set(pn.theme_bw())

In [4]:
# get base of github repo
base_dir = !git rev-parse --show-toplevel
base_dir = Path(base_dir[0])
base_dir

PosixPath('/home/nickyoungblut/dev/python/scBaseCount_analysis')

In [5]:
# set tenant
os.environ["DYNACONF"] = "prod"

# Running evaluation

In [14]:
exe = base_dir / "scripts" / "sragent_eval.py"
# run script
!{exe} --tenant prod --list-datasets

Using database tenant: prod
  dataset_id  record_count
0      eval2            19
1      eval1            21
2      eval3            13


In [32]:
outfile = base_dir / "data" / "SRAgent_profiling" / "eval1-3.tsv"
outfile.parent.mkdir(parents=True, exist_ok=True)
!{exe} --tenant prod --eval-datasets eval1 eval2 eval3 --outfile {outfile}

Using database tenant: prod

#-- is_illumina --#
# Total mismatches: 0 (0.00%)

#-- is_single_cell --#
# Total mismatches: 0 (0.00%)

#-- is_paired_end --#
# Total mismatches: 0 (0.00%)

#-- lib_prep --#
# Total mismatches: 3 (7.89%)

# Mismatches
| lib_prep     | lib_prep_pred   |   count |
|--------------|-----------------|---------|
| 10x_Genomics | MARS-seq        |       1 |
| Smart-seq2   | other           |       1 |
| other        | 10x_Genomics    |       1 |

#-- tech_10x --#
# Total mismatches: 2 (5.26%)

# Mismatches
| tech_10x       | tech_10x_pred   |   count |
|----------------|-----------------|---------|
| 3_prime_gex    | not_applicable  |       1 |
| not_applicable | 3_prime_gex     |       1 |

#-- cell_prep --#
# Total mismatches: 0 (0.00%)

#-- organism --#
# Total mismatches: 0 (0.00%)

#-- Accuracy Table --#
| dataset_id     |   database |   entrez_id |
|----------------|------------|-------------|
| is_illumina    |     100    |          38 |
| is_single_cell |

# New eval datasets

* randomly sampling from X datasets

In [39]:
meta_tbl = Table("srx_metadata")
eval_tbl = Table("eval")
query = (
    Query.from_(meta_tbl)
    .left_join(eval_tbl)
    .on(meta_tbl.srx_accession == eval_tbl.srx_accession)
    .select(
        meta_tbl.database,
        meta_tbl.srx_accession,
        meta_tbl.is_illumina,
        meta_tbl.is_single_cell,
        meta_tbl.is_paired_end,
        meta_tbl.lib_prep,
        meta_tbl.tech_10x,
        meta_tbl.cell_prep,
        meta_tbl.organism,
    )
    .where(~meta_tbl.srx_accession.isnull())
)

with db_connect() as conn:
    df_to_eval = pd.read_sql_query(str(query), conn)
df_to_eval

Unnamed: 0,database,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism
0,sra,SRX22482811,yes,yes,yes,10x_Genomics,5_prime_gex,single_cell,Homo sapiens
1,sra,ERX14208438,yes,no,yes,not_applicable,not_applicable,not_applicable,Homo sapiens
...,...,...,...,...,...,...,...,...,...
206008,sra,ERX11148787,yes,yes,no,10x_Genomics,3_prime_gex,single_cell,Homo sapiens
206009,sra,SRX10955672,yes,yes,yes,10x_Genomics,5_prime_gex,single_cell,Mus musculus


In [44]:
# creating a random vector for sampling
batches = []
for i in range(10):
    batches.append(np.random.choice(df_to_eval.index, size=20, replace=False))
len(batches)

10

In [48]:
# randomly sample 20 rows from df_to_eval
outfile = base_dir / "data" / "SRAgent_profiling" / "to-eval_n20-1.tsv"
df_to_eval.loc[batches[0]].to_csv(outfile, sep="\t", index=False)

# Sandbox

In [31]:
meta_tbl = Table("srx_metadata")
eval_tbl = Table("eval")
query = (
    Query.from_(eval_tbl)
    .inner_join(meta_tbl)
    .on(eval_tbl.srx_accession == meta_tbl.srx_accession)
    .select(
        eval_tbl.dataset_id,
        eval_tbl.database,
        eval_tbl.srx_accession,
        eval_tbl.entrez_id,
        eval_tbl.is_illumina,
        eval_tbl.is_single_cell,
        eval_tbl.is_paired_end,
        eval_tbl.lib_prep,
        eval_tbl.tech_10x,
        eval_tbl.cell_prep,
        eval_tbl.organism,
    )
)

with db_connect() as conn:
    df_eval = pd.read_sql_query(str(query), conn).sort_values(by="dataset_id")
df_eval

Unnamed: 0,dataset_id,database,srx_accession,entrez_id,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism
0,eval1,sra,ERX9692805,34439895,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Gallus gallus
15,eval1,sra,ERX11887200,36178506,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Mus musculus
...,...,...,...,...,...,...,...,...,...,...,...
29,eval3,sra,SRX19392127,26659536,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Mus musculus
28,eval3,sra,SRX24387553,32702534,yes,yes,yes,10x_Genomics,5_prime_gex,single_cell,Homo sapiens


In [28]:
# update organism
mapping = {
    "human": "Homo sapiens",
    "mouse": "Mus musculus",
    "chicken": "Gallus gallus",
    "zebrafish": "Danio rerio",
    "macaque": "Macaca mulatta",

}
df_eval["organism"] = df_eval["organism"].map(mapping)
df_eval

Unnamed: 0,dataset_id,database,srx_accession,entrez_id,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism
0,eval1,sra,ERX9692805,34439895,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Gallus gallus
1,eval1,sra,ERX11157721,29115023,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Danio rerio
...,...,...,...,...,...,...,...,...,...,...,...
23,eval3,sra,SRX21638105,29273462,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Mus musculus
37,eval3,sra,SRX25852474,34914388,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Homo sapiens


In [29]:
df_eval_f = df_eval[["dataset_id", "database", "srx_accession", "entrez_id", "organism"]]
df_eval_f

Unnamed: 0,dataset_id,database,srx_accession,entrez_id,organism
0,eval1,sra,ERX9692805,34439895,Gallus gallus
1,eval1,sra,ERX11157721,29115023,Danio rerio
...,...,...,...,...,...
23,eval3,sra,SRX21638105,29273462,Mus musculus
37,eval3,sra,SRX25852474,34914388,Homo sapiens


In [30]:
with db_connect() as conn:
    db_update(df_eval_f, "eval", conn)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
