In [1]:
import time
from urllib.error import HTTPError

from Bio import Entrez
# Contact address set on the Entrez module before any query is issued.
# NOTE(review): presumably this is the address NCBI uses to identify the caller --
# confirm against the Biopython Entrez docs. It is a hardcoded personal address;
# consider loading it from configuration before sharing the notebook.
Entrez.email = "garbanyo@gmail.com"
#from numba import jit

In [2]:
def get_sequences(search_results, out="out.fasta", batch_size = 1000):
    """Download every protein record of a history-backed search as FASTA.

    Records are fetched from the ``protein`` database in batches of
    ``batch_size`` and appended, in order, to the file ``out`` (which is
    created or truncated). A progress line is printed per batch and
    ``"Done"`` is printed at the end.

    Parameters
    ----------
    search_results : mapping
        Must provide the keys ``"WebEnv"``, ``"QueryKey"`` and ``"Count"``
        (``Count`` is converted with ``int``).
    out : str
        Path of the FASTA file to write.
    batch_size : int
        Maximum number of records requested per ``efetch`` call.

    Raises
    ------
    urllib.error.HTTPError
        Immediately for non-5xx responses, or after three consecutive 5xx
        responses for the same batch.
    """
    webenv = search_results["WebEnv"]
    query_key = search_results["QueryKey"]
    count = int(search_results["Count"])
    max_attempts = 3

    # The context manager guarantees the output file is closed even when a
    # batch ultimately fails.
    with open(out, "w") as out_handle:
        for start in range(0, count, batch_size):
            end = min(count, start + batch_size)
            print("Going to download record %i to %i of %i" % (start+1, end, count))

            for attempt in range(1, max_attempts + 1):
                try:
                    fetch_handle = Entrez.efetch(db="protein",
                                                 rettype="fasta", retmode="text",
                                                 retstart=start, retmax=batch_size,
                                                 webenv=webenv, query_key=query_key,
                                                 idtype="acc")
                except HTTPError as err:
                    # Only server-side (5xx) errors are worth retrying.
                    if not 500 <= err.code <= 599:
                        raise
                    print("Received error from server %s" % err)
                    print("Attempt %i of %i" % (attempt, max_attempts))
                    if attempt == max_attempts:
                        # Give up instead of falling through to a stale or
                        # undefined handle.
                        raise
                    time.sleep(15)
                else:
                    # Success: stop retrying so the batch is fetched only once.
                    break

            try:
                data = fetch_handle.read()
            finally:
                fetch_handle.close()
            out_handle.write(data)
    print("Done")

In [3]:
def get_search(term, extra=''):
    """Run a title-restricted phage protein search and return the parsed result.

    ``term`` is wrapped as ``(<term>[Title])`` and combined with a fixed set
    of filters: title must contain "phage", several uncertain-annotation
    words are excluded, sequence length is 50 to 1000000, and the organism
    is restricted to viruses (txid10239). ``extra`` is appended verbatim at
    the end of the query.

    The search is issued with ``usehistory="y"`` and ``idtype="acc"``.

    Parameters
    ----------
    term : str
        Text to look for in the record title.
    extra : str
        Additional query clauses appended after the fixed filters.

    Returns
    -------
    The object produced by ``Entrez.read`` for the search response.
    """
    # "putitive" is kept exactly as written; presumably it excludes titles
    # carrying that misspelling -- TODO confirm with the author.
    filters = (' AND phage[Title] NOT hypothetical[Title] '
               'NOT putative[Title] AND 50:1000000[SLEN] NOT putitive[Title] '
               'NOT probable[Title] NOT possible[Title] AND "viruses"[porgn:__txid10239] ')
    query = '(' + term + '[Title])' + filters + extra

    handle = Entrez.esearch(db="protein", term=query,
                            idtype="acc", usehistory="y")
    try:
        # Close the handle even if parsing the response fails.
        return Entrez.read(handle)
    finally:
        handle.close()

In [4]:
def get_full_search(term, extra=''):
    """Run a phage protein search with a caller-supplied leading query.

    ``term`` is placed at the start of the query exactly as given; no
    parentheses or ``[Title]`` qualifier are added, so the caller supplies
    any field tags. It is followed by a fixed set of filters: title must
    contain "phage", several uncertain-annotation words are excluded,
    sequence length is 50 to 1000000, and the organism is restricted to
    viruses (txid10239). ``extra`` is appended verbatim at the end.

    The search is issued with ``usehistory="y"`` and ``idtype="acc"``.

    Parameters
    ----------
    term : str
        Leading query expression, used verbatim.
    extra : str
        Additional query clauses appended after the fixed filters.

    Returns
    -------
    The object produced by ``Entrez.read`` for the search response.
    """
    # "putitive" is kept exactly as written; presumably it excludes titles
    # carrying that misspelling -- TODO confirm with the author.
    filters = (' AND phage[Title] NOT hypothetical[Title] '
               'NOT putative[Title] AND 50:1000000[SLEN] NOT putitive[Title] '
               'NOT probable[Title] NOT possible[Title] AND "viruses"[porgn:__txid10239] ')
    query = term + filters + extra

    handle = Entrez.esearch(db="protein", term=query,
                            idtype="acc", usehistory="y")
    try:
        # Close the handle even if parsing the response fails.
        return Entrez.read(handle)
    finally:
        handle.close()

In [5]:
# Search phage protein titles for "major capsid" and save the hits to major_capsid.fasta.
search_results = get_search('major capsid')
get_sequences(search_results,out='major_capsid.fasta')

Going to download record 1 to 1000 of 4853
Going to download record 1001 to 2000 of 4853
Going to download record 2001 to 3000 of 4853
Going to download record 3001 to 4000 of 4853
Going to download record 4001 to 4853 of 4853
Done


In [6]:
# Search phage protein titles for "minor capsid" and save the hits to minor_capsid.fasta.
search_results = get_search('minor capsid')
get_sequences(search_results,out='minor_capsid.fasta')

Going to download record 1 to 683 of 683
Done


In [7]:
#search_results = get_search('capsid',extra='NOT minor[Title] NOT major[Title]')
#print(int(search_results["Count"]))
#get_sequences(search_results,out='capsid.fasta')

In [8]:
# Search phage protein titles for "baseplate" and save the hits to baseplate.fasta.
search_results = get_search('baseplate')
get_sequences(search_results,out='baseplate.fasta')

Going to download record 1 to 1000 of 7745
Going to download record 1001 to 2000 of 7745
Going to download record 2001 to 3000 of 7745
Going to download record 3001 to 4000 of 7745
Going to download record 4001 to 5000 of 7745
Going to download record 5001 to 6000 of 7745
Going to download record 6001 to 7000 of 7745
Going to download record 7001 to 7745 of 7745
Done


In [9]:
# Search phage protein titles for "major tail" and save the hits to major_tail.fasta.
search_results = get_search('major tail')
get_sequences(search_results,out='major_tail.fasta')

Going to download record 1 to 1000 of 3717
Going to download record 1001 to 2000 of 3717
Going to download record 2001 to 3000 of 3717
Going to download record 3001 to 3717 of 3717
Done


In [10]:
# Search phage protein titles for "minor tail" and save the hits to minor_tail.fasta.
search_results = get_search('minor tail')
get_sequences(search_results,out='minor_tail.fasta')

Going to download record 1 to 1000 of 11307
Going to download record 1001 to 2000 of 11307
Going to download record 2001 to 3000 of 11307
Going to download record 3001 to 4000 of 11307
Going to download record 4001 to 5000 of 11307
Going to download record 5001 to 6000 of 11307
Going to download record 6001 to 7000 of 11307
Going to download record 7001 to 8000 of 11307
Going to download record 8001 to 9000 of 11307
Going to download record 9001 to 10000 of 11307
Going to download record 10001 to 11000 of 11307
Going to download record 11001 to 11307 of 11307
Done


In [11]:
# Search phage protein titles for "portal" and save the hits to portal.fasta.
search_results = get_search('portal')
get_sequences(search_results,out='portal.fasta')

Going to download record 1 to 1000 of 5623
Going to download record 1001 to 2000 of 5623
Going to download record 2001 to 3000 of 5623
Going to download record 3001 to 4000 of 5623
Going to download record 4001 to 5000 of 5623
Going to download record 5001 to 5623 of 5623
Done


In [12]:
# Search phage protein titles for "tail fiber" and save the hits to tail_fiber.fasta.
search_results = get_search('tail fiber')
get_sequences(search_results,out='tail_fiber.fasta')

Going to download record 1 to 1000 of 4574
Going to download record 1001 to 2000 of 4574
Going to download record 2001 to 3000 of 4574
Going to download record 3001 to 4000 of 4574
Going to download record 4001 to 4574 of 4574
Done


In [13]:
# Search phage protein titles for "collar" and save the hits to collar.fasta.
search_results = get_search('collar')
get_sequences(search_results,out='collar.fasta')

Going to download record 1 to 403 of 403
Done
