# NCBI protein search

## Functions

In [1]:
from io import StringIO
import requests
from Bio import SeqIO

In [24]:
#find protein sequence from ID
def getncbi(pid, session=None, timeout=30):
    """Fetch a protein record from the NCBI protein database via E-utilities efetch.

    Args:
        pid: a single ID, or a list/tuple/set/frozenset of IDs, which are
            comma-joined into one request. The response is parsed with
            ``SeqIO.read``, which expects exactly one record in the reply.
        session: optional object providing a ``get`` method (e.g. a
            ``requests.Session``); the ``requests`` module is used when omitted.
        timeout: seconds to wait for the server before giving up, so a stalled
            connection cannot block the run indefinitely.

    Returns:
        The record parsed by ``SeqIO.read`` from the GenBank-format text reply.

    Raises:
        requests.HTTPError: when the server answers with an HTTP error status.
    """
    if isinstance(pid, (list, tuple, set, frozenset)):
        pid = ",".join(str(s) for s in pid)

    client = session or requests
    resp = client.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        params=dict(id=pid, db="protein", rettype="gb", retmode="text"),
        timeout=timeout,
    )
    # Surface HTTP failures directly instead of letting an error page reach the parser.
    resp.raise_for_status()

    return SeqIO.read(StringIO(resp.text), format="gb")


In [25]:
#update the protein sequences in the database
def updatedb_prot(id_paper1, rec):
    """Store a fetched protein sequence for one paper id.

    Writes the upper-cased sequence of ``rec`` into ``seq_prot`` for every row of
    ``gfp_input_2019`` whose ``id_paper1`` matches and whose ``seq_prot`` is still
    unset, so rows that already hold a sequence are not overwritten.

    Args:
        id_paper1: value matched against the ``id_paper1`` column.
        rec: object exposing a ``seq`` attribute.

    Returns:
        The ``rowcount`` reported for the executed UPDATE.
    """
    table = gfp_input_2019
    sequence = str(rec.seq).upper()
    stmt = (
        table.update()
        .values({table.c.seq_prot: sequence})
        .where(table.c.id_paper1 == id_paper1)
        .where(table.c.seq_prot == None)  # noqa: E711 - column comparison, not a Python identity test
    )
    return engine.execute(stmt).rowcount

def updateerr(id_paper, num):
    """Record an error code for every row belonging to one paper id.

    Args:
        id_paper: value matched against the ``id_paper1`` column.
        num: error code written to the ``errcol`` column.

    Returns:
        The ``rowcount`` reported for the executed UPDATE.
    """
    stmt = (
        gfp_input_2019.update()
        .values({gfp_input_2019.c.errcol: num})
        # Filter on the argument itself; relying on a same-named notebook global
        # raises NameError on a fresh kernel or can flag the wrong rows.
        .where(gfp_input_2019.c.id_paper1 == id_paper)
    )
    return engine.execute(stmt).rowcount


## Connect to database

In [28]:
#Set up connection to database
import os
from sqlalchemy import create_engine, text
# The connection URL is read from the GFP_DB_URL environment variable when it is set,
# so real credentials do not have to be saved inside the notebook. Without it the
# placeholder URL below is used, exactly as before.
db_url = os.environ.get('GFP_DB_URL', 'mysql+pymysql://root:PASSWORD@localhost/DATABASE')#Enter your own password and database
engine = create_engine(db_url)
# test connection (raises here if the server cannot be reached)
engine.execute(text("show variables like \"ver%\"")).fetchall()

#extract the table information
from sqlalchemy import select, MetaData
meta = MetaData(bind=engine)
meta.reflect()


#define the table
gfp_input_2019 = meta.tables['A_msms_input_auto1'] #use more generic table definition

## Define Run variables

In [29]:
### Run settings: batch size, pacing, failure budget and error codes
import time

# Upper limit on the number of ids handled in one run.
chunk = 100000

# Pauses in seconds: after a successful retrieval and after an error.
retrieval_delay = 2
error_delay = 2

# Failure budget; max_fail is counted down during a run and reset from fail_num.
fail_num = 200
max_fail = fail_num

# Numeric codes, one per lookup source; ncbi_prot_err is the one this notebook
# writes to errcol when a protein lookup fails.
ncbi_nuc_err, uniprot_err, ncbi_prot_err, ensembl_err = 1, 2, 3, 4

err_status = 2  # where to start sending the sequence?


## Define query

In [49]:
###Define the query: distinct ids that have neither a nucleotide nor a protein
###sequence yet and are not already flagged with the NCBI protein error code
q0 = select([gfp_input_2019.c.id_paper1.distinct()])
q1 = q0.where(gfp_input_2019.c.id_paper1!=None)
q2 = q1.where(gfp_input_2019.c.seq_nucl==None)
q3 = q2.where(gfp_input_2019.c.seq_prot==None)
# Use the named code instead of a literal 3 so the filter stays in step with the
# value written by the search loop.
# NOTE(review): in SQL a NULL errcol does not satisfy "!=", so rows that never
# received any error code are skipped by this filter - confirm that is intended.
q4 = q3.where(gfp_input_2019.c.errcol!=ncbi_prot_err)
qp = q4.limit(chunk)


In [50]:
# Run the limited query and keep only the id from each returned row.
res = [row.id_paper1 for row in engine.execute(qp).fetchall()]
len(res)

66

In [51]:
# Preview the first ten ids that will be sent to NCBI.
res[:10]

['AAM10743',
 'AAM21644',
 'AAU09271',
 'AAD22975',
 'AAA74393',
 'AAU89223.1',
 'AAB38499.1',
 'AAG24873',
 'AAB61672',
 'AAC97495']

In [52]:
print("Sending",len(res)," ids to NCBI")

# Failure budget for this run; reset from fail_num so the cell can be re-run.
max_fail = fail_num
#Searching the NCBI Protein database
# One HTTP session is shared by all requests so connections to NCBI are reused.
with requests.Session() as session:
    for id_paper1 in res:
        try:
            rec = getncbi(id_paper1, session=session)
            found = updatedb_prot(id_paper1, rec)
        except Exception as e:
            # Any fetch/parse/update failure is recorded against the id so the
            # query skips it on the next run.
            print('failed for:',id_paper1, e)
            updateerr(id_paper1, ncbi_prot_err)
            max_fail -= 1
            if max_fail <=0:
                print('Stopped early: failure limit of', fail_num, 'reached')
                break
            time.sleep(error_delay)
        else:
            # Reporting and pacing stay outside the try block so they can never
            # be mistaken for a failed lookup.
            print(id_paper1, 'updated', found, 'rows')
            time.sleep(retrieval_delay)
    else:
        # Reached only when the loop was not aborted by the failure limit.
        print('Done NCBI Protein database search!')


Sending 66  ids to NCBI
AAM10743 updated 1 rows
AAM21644 updated 2 rows
AAU09271 updated 1 rows
AAD22975 updated 1 rows
AAA74393 updated 1 rows
AAU89223.1 updated 1 rows
AAB38499.1 updated 1 rows
AAG24873 updated 2 rows
AAB61672 updated 2 rows
AAC97495 updated 2 rows
AAA03726 updated 6 rows
AAQ74612 updated 1 rows
AAW83328.1 updated 1 rows
AAK07827.1 updated 1 rows
AAC24855 updated 2 rows
AAD56659 updated 1 rows
AAA80688 updated 5 rows
AAA34016.1 updated 2 rows
AAD49719 updated 5 rows
AAL27029.1 updated 2 rows
AAK43833 updated 1 rows
AAU10526 updated 1 rows
AAC26053 updated 1 rows
AAO23069 updated 1 rows
AAG24884 updated 1 rows
AAA33941 updated 5 rows
AAQ08403 updated 12 rows
AAM95226 updated 1 rows
AAV98051 updated 1 rows
AAQ13492 updated 1 rows
AAG32959 updated 1 rows
AAC72337 updated 1 rows
AAA70268 updated 1 rows
AAA26326 updated 1 rows
AAC49294.1 updated 2 rows
AAQ08403.1 updated 4 rows
AAA86903 updated 1 rows
AAK55326 updated 1 rows
AAK55325 updated 1 rows
AAK55323 updated 1 rows