# NCBI nucleotide sequences

## Functions

In [11]:
#define the function for retrieval
from io import StringIO
import requests
from Bio import SeqIO

#find nucleotide sequence from ID (if no entry in protein database)
def getncbi_nuc(pid, session=None, timeout=30):
    """Fetch a GenBank record from the NCBI nucleotide database via efetch.

    Parameters
    ----------
    pid : str or list/tuple/set/frozenset
        Identifier to look up. A collection is joined into a single
        comma-separated ``id`` value before the request is sent.
    session : optional
        Object with a ``requests``-compatible ``get`` method (for example a
        ``requests.Session``). The ``requests`` module itself is used when
        this is ``None``.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        a stalled connection would block the whole retrieval loop.

    Returns
    -------
    The record parsed by ``SeqIO.read`` from the GenBank-format response.

    Raises
    ------
    Whatever ``raise_for_status`` raises for an HTTP error response, and
    whatever ``SeqIO.read`` raises when the body does not parse as a single
    GenBank record (e.g. "No records found in handle").
    """
    if isinstance(pid, (list, tuple, set, frozenset)):
        pid = ",".join(str(s) for s in pid)

    client = requests if session is None else session
    resp = client.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        params=dict(id=pid, db="nucleotide", rettype="gb", retmode="text"),
        timeout=timeout,
    )
    # Surface HTTP failures (rate limiting, server errors) as such instead of
    # letting the error body reach the GenBank parser.
    resp.raise_for_status()

    return SeqIO.read(StringIO(resp.text), format="gb")


#UPDATE
#update the nucleotide sequences in the database
def updatedb_nucl(id_paper1, rec):
    """Store a retrieved nucleotide sequence for one paper id.

    Writes the upper-cased text of ``rec.seq`` into ``seq_nucl`` for the rows
    of ``gfp_input_2019`` whose ``id_paper1`` matches and whose ``seq_nucl``
    is still NULL, so rows that already hold a sequence are left alone.

    Returns the ``rowcount`` reported for the executed UPDATE.
    """
    table = gfp_input_2019
    stmt = (
        table.update()
        .values({table.c.seq_nucl: str(rec.seq).upper()})
        .where(table.c.id_paper1 == id_paper1)
        # ``== None`` is deliberate: SQLAlchemy renders it as ``IS NULL``.
        .where(table.c.seq_nucl == None)  # noqa: E711
    )
    return engine.execute(stmt).rowcount


def updateerr(id_paper, num):
    """Record an error code for one paper id.

    Sets ``errcol`` to ``num`` on every row of ``gfp_input_2019`` whose
    ``id_paper1`` column equals ``id_paper``.

    Returns the ``rowcount`` reported for the executed UPDATE.
    """
    u = gfp_input_2019.update()
    u = u.values({gfp_input_2019.c.errcol: num})
    # Filter on the function argument. The body previously used the name
    # ``id_paper1``, which is not a parameter here, so it only worked when a
    # global of that name happened to exist.
    u = u.where(gfp_input_2019.c.id_paper1 == id_paper)

    proxy = engine.execute(u)
    return proxy.rowcount




### connect to database

In [1]:
# Database setup: open the engine, check it responds, reflect the schema.
from sqlalchemy import MetaData, create_engine, select, text

# NOTE(review): the connection URL (including credentials) is hard-coded.
engine = create_engine('mysql+pymysql://root:Password@localhost/database')

# Connectivity check: ask the server for its version-related variables.
engine.execute(text("show variables like \"ver%\"")).fetchall()

# Reflect every table visible through the engine into the metadata object.
meta = MetaData(bind=engine)
meta.reflect()
meta.tables.keys()

# Table that the update and query helpers in this notebook operate on.
gfp_input_2019 = meta.tables['A_msms_input_auto1']

### define variables

In [5]:
### run configuration and error codes
import time

# Batch size and pacing between requests.
chunk = 10000  # maximum number of ids selected for one run
retrieval_delay = 2  # pause after a successful retrieval, in seconds
error_delay = 5  # pause after a failed retrieval, in seconds

# Failure budget: the retrieval loop stops once max_fail reaches zero.
max_fail = 500
fail_num = 500

# Error codes, one per data source. ncbi_nuc_err is the code the NCBI
# nucleotide loop stores in errcol; the others are presumably used the same
# way by the matching retrieval steps (not visible in this section).
uniprot_err = 2
ncbi_prot_err = 3
ensembl_err = 4
uniparc_err = 5
ncbi_nuc_err = 10
rap_err = 20

err_status = 10  # where to start sending the sequence?


In [7]:
#FORM QUERY for id_paper1
# Select the distinct id_paper1 values that still need a sequence: rows with
# no id_paper2, no protein sequence and no nucleotide sequence.
q0 = select([gfp_input_2019.c.id_paper1.distinct()])
q2 = q0.where(gfp_input_2019.c.id_paper2.is_(None))
q21 = q2.where(gfp_input_2019.c.seq_prot.is_(None))
q22 = q21.where(gfp_input_2019.c.seq_nucl.is_(None))
# Optional extra filter on errcol, currently disabled. If it is re-enabled,
# build qn from q20 instead of q22.
#q20 = q22.where(gfp_input_2019.c.errcol!=5) #0,1 for uniprot 0,1,2 for NCBI 0,1,2,3 Gramene
# Build the limited query from q22: q20 only exists when the line above is
# uncommented, so referencing it here raised NameError on a fresh kernel.
qn = q22.limit(chunk)

In [8]:
# Run the limited query and keep only the id_paper1 value of each row.
res = [row.id_paper1 for row in engine.execute(qn).fetchall()]
# Number of ids selected for this run (displayed as the cell output).
len(res)

657

In [9]:
# Preview the first ten selected ids.
res[:10]

['BGLC_M',
 'Q4178',
 'Q6WF',
 'Q5XTZ',
 'NP_234364',
 'G400012702',
 'G400001528',
 'G400019437',
 'G400009947',
 'G400011716']

In [12]:
# Announce how many ids are about to be looked up.
print("Sending",len(res)," ids to NCBI nucl")

#Searching the NCBI Nucleotide database
for id_paper1 in res:
    try:
        # Fetch the record, then write its sequence to the database.
        # NOTE(review): an exception from either step (including a database
        # error in updatedb_nucl) is handled below as an NCBI failure.
        rec = getncbi_nuc(id_paper1)
        found = updatedb_nucl(id_paper1, rec)
        print(id_paper1, 'updated', found, 'rows')
        # Pause between successful requests.
        time.sleep(retrieval_delay)
    except Exception as e:
        print('failed for:',id_paper1, e)
        # Flag this id with the NCBI-nucleotide error code.
        updateerr(id_paper1, ncbi_nuc_err)
        # NOTE(review): this decrements the module-level max_fail in place,
        # so re-running the cell continues with whatever budget is left
        # rather than the configured value - confirm this is intended.
        max_fail -= 1
        if max_fail <=0:
            # Failure budget exhausted: stop processing the remaining ids.
            break
        # Longer pause after a failure before trying the next id.
        time.sleep(error_delay)
# Printed whether the loop finished normally or stopped on the failure budget.
print('Done NCBI Nucleotide database search!')


Sending 657  ids to NCBI nucl
failed for: BGLC_M No records found in handle
failed for: Q4178 No records found in handle
failed for: Q6WF No records found in handle
failed for: Q5XTZ No records found in handle
failed for: NP_234364 No records found in handle
failed for: G400012702 No records found in handle
failed for: G400001528 No records found in handle
failed for: G400019437 No records found in handle
failed for: G400009947 No records found in handle
failed for: G400011716 No records found in handle
failed for: A7QDV5 No records found in handle
failed for: A7QF07 No records found in handle
failed for: A7PPM9 No records found in handle
failed for: A7P4I0 No records found in handle
failed for: A7PEM9 No records found in handle
failed for: A7P2K0 No records found in handle
failed for: A7QT90 No records found in handle
failed for: A7PYV3 No records found in handle
failed for: A7PJQ1 No records found in handle
failed for: A7P6B1 No records found in handle
failed for: A7Q204 No records f

failed for: Solyc06g073670.2.1 No records found in handle
failed for: Solyc07g055320.2.1 No records found in handle
failed for: Solyc08g006930.2.1 No records found in handle
failed for: Solyc08g079170.2.1 No records found in handle
failed for: Solyc09g018010.2.1 No records found in handle
failed for: ADD09611 No records found in handle
failed for: ACF06631 No records found in handle
failed for: EFA01361.1 No records found in handle
failed for: ABM9219 No records found in handle
failed for: GM0119X00246 No records found in handle
failed for: ABK92585.1 No records found in handle
failed for: Q17YU No records found in handle
T29205 updated 2 rows
failed for: CAO21229.1 No records found in handle
failed for: BAB47119 No records found in handle
failed for: CAB1742 No records found in handle
failed for: CH603 No records found in handle
failed for: S5146781 No records found in handle
failed for: PGKH No records found in handle
failed for: CYSKP No records found in handle
failed for: FTSH5 No 

failed for: CBC02989.1 No records found in handle
failed for: ACN33229.1 No records found in handle
failed for: DAA49573.1 No records found in handle
failed for: ACG32582.1 No records found in handle
failed for: DAA41374.1 No records found in handle
failed for: DAA36086.1 No records found in handle
failed for: DAA54661.1 No records found in handle
failed for: DAA56304.1 No records found in handle
failed for: DAA54144.1 No records found in handle
failed for: ACG43573.1 No records found in handle
failed for: DAA51009.1 No records found in handle
failed for: ACN33542.1 No records found in handle
failed for: DAA41608.1 No records found in handle
failed for: DAA63904.1 No records found in handle
failed for: DAA39209.1 No records found in handle
failed for: ACG35255.1 No records found in handle
failed for: DAA42974.1 No records found in handle
failed for: DAA46459.1 No records found in handle
failed for: ACN31157.1 No records found in handle
failed for: DAA63363.1 No records found in handle


failed for: DAA37623.1 No records found in handle
failed for: DAA57680.1 No records found in handle
failed for: DAA52519.1 No records found in handle
failed for: DAA52464.1 No records found in handle
failed for: DAA58655.1 No records found in handle
failed for: DAA42624.1 No records found in handle
failed for: DAA58070.1 No records found in handle
failed for: CAY56310.1 No records found in handle
failed for: DAA55401.1 No records found in handle
failed for: DAA52892.1 No records found in handle
failed for: DAA63498.1 No records found in handle
failed for: DAA47482.1 No records found in handle
failed for: DAA55122.1 No records found in handle
failed for: DAA49888.1 No records found in handle
failed for: DAA63472.1 No records found in handle
failed for: DAA56342.1 No records found in handle
failed for: ACG35387.1 No records found in handle
failed for: DAA57296.1 No records found in handle
failed for: DAA52985.1 No records found in handle
failed for: ACN31600.1 No records found in handle
