# Goals

* Add already-processed SRX accessions to the database
  * Datasets already processed by Chris

In [37]:
import os
import warnings
from dotenv import load_dotenv
import psycopg2
from Bio import Entrez
from psycopg2.extensions import connection
import pandas as pd
from dynaconf import Dynaconf
from pypika import Query, Table, Field, Column, Table

In [38]:
from SRAgent.db.connect import db_connect
from SRAgent.db.upsert import db_upsert

In [39]:
# setup
# Call load_dotenv() first, then read the NCBI contact email and API key
# from the process environment and hand them to Bio.Entrez.
load_dotenv()
Entrez.email = os.environ.get("EMAIL")
Entrez.api_key = os.environ.get("NCBI_API_KEY1")

# Load SRX values

In [47]:
# Read the headerless, tab-separated accession list and label its single column.
infile = "../data/counted_SRX.txt"
counted = pd.read_csv(infile, sep="\t", header=None)
counted = counted.set_axis(["srx_accession"], axis="columns")
counted

Unnamed: 0,srx_accession
0,SRX13670569
1,SRX19828711
2,SRX19829151
3,SRX9556597
4,SRX19826284
...,...
16708,SRX9770752
16709,SRX11523521
16710,SRX19830191
16711,SRX19825307


### Convert SRX accessions to Entrez IDs

In [48]:
from Bio import Entrez
from time import sleep

# Function to fetch Entrez UIDs for a list of accessions
def fetch_entrez_uids(accessions, db="sra", delay=0.34):
    """
    Converts a list of SRX/ERX accessions to their corresponding Entrez UIDs.

    One `Entrez.esearch` request is issued per accession. The function relies
    on `Entrez.email` / `Entrez.api_key` having been configured by the caller
    and deliberately does NOT overwrite them.

    Args:
        accessions: Iterable of SRX or ERX accessions
        db: Entrez database to query (default: "sra" for Sequence Read Archive)
        delay: Seconds to pause after each query (default: 0.34, which keeps
            the loop under three requests per second)
    Returns:
        pandas DataFrame with columns "srx_accession" and "entrez_id", one row
        per input accession in input order. "entrez_id" is the first UID
        returned by the search, or None when the search returned no IDs or
        the query raised an error.
    """
    results = []
    for acc in accessions:
        print(f"Querying {acc}...")
        uid = None  # stays None on "no hit" and on any query error
        try:
            # Use esearch to get the UID for the accession
            handle = Entrez.esearch(db=db, term=acc)
            try:
                record = Entrez.read(handle)
            finally:
                # Release the handle even when parsing the response fails
                handle.close()

            if record["IdList"]:
                uid = record["IdList"][0]
        except Exception as e:
            # Best effort: report the failure and keep going with the next accession
            print(f"Error querying {acc}: {e}")
        results.append([acc, uid])
        sleep(delay)

    # convert to dataframe
    return pd.DataFrame(results, columns=["srx_accession", "entrez_id"])

# Fetch Entrez UIDs
# Quick check of fetch_entrez_uids on a single accession before running it
# on the full list; the returned DataFrame is displayed as the cell output.
accessions = ["SRX9770752"]
fetch_entrez_uids(accessions)

Querying SRX9770752...


Unnamed: 0,srx_accession,entrez_id
0,SRX9770752,12773490


In [None]:
# Fetch Entrez UIDs for all accessions in the dataframe
# NOTE: fetch_entrez_uids issues one request per accession and sleeps 0.34 s
# after each one, so with the ~16.7k rows shown for `counted` the pauses alone
# add up to roughly an hour and a half. The result is not cached, so
# re-running this cell repeats every query.
results = fetch_entrez_uids(counted["srx_accession"])
results

Querying SRX13670569...
Querying SRX19828711...
Querying SRX19829151...
Querying SRX9556597...
Querying SRX19826284...
Querying SRX19824411...
Querying SRX8362275...
Querying SRX19829128...
Querying SRX1128902...
Querying ERX8791959...
Querying SRX19828920...
Querying SRX19825834...
Querying SRX19825585...
Querying ERX12558784...
Querying SRX19831332...
Querying SRX19829159...
Querying SRX19831592...
Querying SRX13549220...
Querying SRX5900984...
Querying SRX13300000...
Querying SRX19825205...
Querying ERX9138857...
Querying SRX19830089...
Querying SRX19828076...
Querying SRX19824064...
Querying SRX1128122...
Querying SRX3809288...
Querying SRX1128908...
Querying SRX4507858...
Querying SRX19823315...
Querying ERX11148781...
Querying SRX6640138...
Querying SRX19828517...
Querying SRX19829209...
Querying SRX19823607...
Querying SRX19824563...
Querying SRX19823843...
Querying SRX5901153...
Querying SRX1128984...
Querying SRX7063667...
Querying SRX11169657...
Querying SRX19827876...
Queryi

In [None]:
# join the results to the original dataframe
# - Drop any `entrez_id` column left by a previous run of this cell, so that
#   re-running it does not create `entrez_id_x` / `entrez_id_y` columns.
# - Keep one lookup row per accession so the left join cannot multiply rows
#   when the same accession appears more than once; `validate` enforces this.
counted = pd.merge(
    counted.drop(columns=["entrez_id"], errors="ignore"),
    results.drop_duplicates(subset="srx_accession"),
    on="srx_accession",
    how="left",
    validate="many_to_one",
)
counted

0

In [None]:
# any missing UIDs?
# Count the rows whose entrez_id is null and report the total.
num_missing = int(counted["entrez_id"].isnull().sum())
print(f"Number of missing UIDs: {num_missing}")

### Format

In [44]:
# add columns
# Tag every row with the same constant database name and provenance note.
counted = counted.assign(
    database="sra",
    notes="Processed by Chris Carpenter",
)
counted

Unnamed: 0,srx_accession,entrez_id,database,notes
0,SRX13670569,19007785,sra,Processed by Chris Carpenter
1,SRX19828711,27175908,sra,Processed by Chris Carpenter
2,SRX19829151,27176348,sra,Processed by Chris Carpenter
3,SRX9556597,12488012,sra,Processed by Chris Carpenter
4,SRX19826284,27173481,sra,Processed by Chris Carpenter


### Upsert

In [45]:
# Open a connection with the project's db_connect() helper and pass the whole
# `counted` frame to db_upsert together with the name "srx_metadata"
# (presumably the target table, since the cells below query a table of that name).
# NOTE(review): rows whose entrez_id is missing are passed through unchanged;
# confirm db_upsert handles them as intended.
with db_connect() as conn:
    db_upsert(counted, "srx_metadata", conn)

# Delete records

In [33]:
# Delete records in which notes = "Processed by Chris Carpenter"
# WARNING: destructive. This cell comes after the Upsert cell in notebook
# order, so a Restart & Run All removes the rows that cell has just written.
with db_connect() as conn:
    cur = conn.cursor()
    try:
        cur.execute("DELETE FROM srx_metadata WHERE notes = 'Processed by Chris Carpenter'")
        conn.commit()
    finally:
        # Always release the cursor, even if the DELETE or the commit fails
        cur.close()

In [46]:
# Show every record currently in srx_metadata (read-only check of the table contents)
with db_connect() as conn:
    cur = conn.cursor()
    try:
        cur.execute("SELECT * FROM srx_metadata")
        rows = cur.fetchall()
    finally:
        # Always release the cursor, even if the query fails
        cur.close()
    for row in rows:
        print(row)

('sra', 35087715, 'SRX25994842', 'yes', 'yes', 'yes', '10x_Genomics', '3_prime_gex', 'single_cell', 'human', 'subcutaneous adipose tissue', 'breast cancer-related lymphedema', 'injection of adipose-derived regenerative cells (ADRCs)', 'adipose derived regenerative cells/stromal vascular fraction', None)
('sra', 36178506, 'ERX11887200', 'yes', 'yes', 'yes', '10x_Genomics', '3_prime_gex', 'single_cell', 'mouse', 'lung tumor', 'murine lung cancer', 'murine recombinant coronavirus vector (mCOV) treatment to boost anti-tumor T cell response', 'Lewis lung carcinoma (LLC) expressing LCMV glycoprotein peptide (LLC-gp33)', None)
('sra', 30749595, 'SRX22716300', 'yes', 'yes', 'yes', '10x_Genomics', '3_prime_gex', 'single_cell', 'human', 'bone marrow', 'no', 'no treatment', 'primary cells (CD34+ cells)', None)
('sra', 18060880, 'SRX13201194', 'yes', 'no', 'no', 'not_applicable', 'not_applicable', 'not_applicable', 'mouse', 'bone marrow', 'not specified', 'Notch1+/- genotype', 'Common Lymphoid Pro