In [None]:
import time
from urllib.error import HTTPError

from Bio import Entrez
Entrez.email = "garbanyo@gmail.com"
#from numba import jit

In [None]:
def get_sequences(search_results, out="out.fasta", batch_size = 25000):
    """Download every protein record of an Entrez history search as FASTA.

    Parameters
    ----------
    search_results : mapping
        Parsed ESearch result obtained with ``usehistory="y"``. The
        ``"WebEnv"``, ``"QueryKey"`` and ``"Count"`` entries are read.
    out : str
        Path of the FASTA file to create (an existing file is overwritten).
    batch_size : int
        Number of records requested per EFetch call. The NCBI E-utilities
        documentation caps ``retmax`` for EFetch at 10,000 records, so larger
        values are reduced to 10,000 to keep ``retstart`` in step with what
        the server actually returns.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    urllib.error.HTTPError
        Immediately for non-5xx responses, or after the third consecutive
        5xx response for the same batch.
    """
    max_attempts = 3
    retry_delay = 15      # seconds to wait after a server-side (5xx) error
    efetch_limit = 10000  # documented per-request maximum for EFetch retmax

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    batch_size = min(batch_size, efetch_limit)

    webenv = search_results["WebEnv"]
    query_key = search_results["QueryKey"]
    count = int(search_results["Count"])

    # The context manager closes the output file even if a download fails.
    with open(out, "w") as out_handle:
        for start in range(0, count, batch_size):
            end = min(count, start + batch_size)
            print("Going to download record %i to %i of %i" % (start+1, end, count))

            for attempt in range(1, max_attempts + 1):
                try:
                    fetch_handle = Entrez.efetch(db="protein",
                                                 rettype="fasta", retmode="text",
                                                 retstart=start, retmax=batch_size,
                                                 webenv=webenv, query_key=query_key,
                                                 idtype="acc")
                    # Success: stop retrying so the batch is fetched only once.
                    break
                except HTTPError as err:
                    if not 500 <= err.code <= 599:
                        raise
                    print("Received error from server %s" % err)
                    print("Attempt %i of %i" % (attempt, max_attempts))
                    if attempt == max_attempts:
                        # Give up instead of writing a stale or missing batch.
                        raise
                    time.sleep(retry_delay)

            try:
                data = fetch_handle.read()
            finally:
                fetch_handle.close()
            out_handle.write(data)
    print("Done")

In [None]:
def get_search(term, extra=''):
    """Search the NCBI protein database for phage proteins by title phrase.

    ``term`` is wrapped as ``(term[Title])`` and combined with fixed filters:
    the title must contain "phage", must not contain several uncertainty
    words, and the sequence length must be between 50 and 1000000.

    Parameters
    ----------
    term : str
        Phrase that must occur in the record title.
    extra : str
        Additional query text appended verbatim after the fixed filters.

    Returns
    -------
    The parsed ESearch result. The search is run with ``usehistory="y"``, so
    the result can be handed to ``get_sequences``.
    """
    # NOTE(review): "putitive" looks like a misspelling of "putative"; it is
    # presumably kept on purpose to exclude mis-annotated titles - confirm.
    query = ('(' + term + '[Title]) AND phage[Title] NOT hypothetical[Title] '
             'NOT putative[Title] AND 50:1000000[SLEN] NOT putitive[Title] '
             'NOT probable[Title] NOT possible[Title] NOT unknown[Title] ' + extra)
    # retmax is intentionally not set (an earlier version tried retmax=2000).
    handle = Entrez.esearch(db="protein", term=query,
                            idtype="acc", usehistory="y")
    try:
        return Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()

In [None]:
def get_full_search(term, extra=''):
    """Search the NCBI protein database for phage proteins with a raw query.

    Unlike ``get_search``, ``term`` is used as written (the caller supplies
    its own field tags and parentheses) and is then combined with the fixed
    filters: the title must contain "phage", must not contain several
    uncertainty words, and the sequence length must be between 50 and
    1000000.

    Parameters
    ----------
    term : str
        Entrez query fragment placed at the start of the search expression.
    extra : str
        Additional query text appended after the fixed filters. A separating
        space is inserted so it cannot fuse with the preceding ``[Title]`` tag.

    Returns
    -------
    The parsed ESearch result. The search is run with ``usehistory="y"``, so
    the result can be handed to ``get_sequences``.
    """
    # NOTE(review): "putitive" looks like a misspelling of "putative"; it is
    # presumably kept on purpose to exclude mis-annotated titles - confirm.
    query = (term + ' AND phage[Title] NOT hypothetical[Title] '
             'NOT putative[Title] AND 50:1000000[SLEN] NOT putitive[Title] '
             'NOT probable[Title] NOT possible[Title] NOT unknown[Title]')
    extra = extra.strip()
    if extra:
        query += ' ' + extra
    # An organism restriction (AND "viruses"[porgn:__txid10239]) was drafted
    # for this query but is not applied.
    handle = Entrez.esearch(db="protein", term=query,
                            idtype="acc", usehistory="y")
    try:
        return Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()

In [None]:
# Phage proteins titled "major capsid" -> major_capsid.fasta
# (the batch size is passed explicitly here).
search_results = get_search("major capsid")
get_sequences(
    search_results,
    out="major_capsid.fasta",
    batch_size=25000,
)


In [None]:
# Phage proteins titled "minor capsid" -> minor_capsid.fasta
search_results = get_search("minor capsid")
get_sequences(
    search_results,
    out="minor_capsid.fasta",
)

In [None]:
#search_results = get_search('capsid',extra='NOT minor[Title] NOT major[Title]')
#print(int(search_results["Count"]))
#get_sequences(search_results,out='capsid.fasta')

In [None]:
# Phage proteins titled "baseplate" -> baseplate.fasta
search_results = get_search("baseplate")
get_sequences(
    search_results,
    out="baseplate.fasta",
)

In [None]:
# Phage proteins titled "major tail" -> major_tail.fasta
search_results = get_search("major tail")
get_sequences(
    search_results,
    out="major_tail.fasta",
)

In [None]:
# Phage proteins titled "minor tail" -> minor_tail.fasta
search_results = get_search("minor tail")
get_sequences(
    search_results,
    out="minor_tail.fasta",
)

In [None]:
# Phage proteins titled "portal" -> portal.fasta
search_results = get_search("portal")
get_sequences(
    search_results,
    out="portal.fasta",
)

In [None]:
# Phage proteins titled "tail fiber" -> tail_fiber.fasta
search_results = get_search("tail fiber")
get_sequences(
    search_results,
    out="tail_fiber.fasta",
)

In [None]:
# Phage proteins titled "collar" -> collar.fasta
search_results = get_search("collar")
get_sequences(
    search_results,
    out="collar.fasta",
)

In [None]:
# Tail shaft / sheath proteins need a compound title query, so the raw-query
# helper is used instead of get_search -> shaft.fasta
search_results = get_full_search("tail[Title] AND (shaft[Title] OR sheath[Title])")
get_sequences(
    search_results,
    out="shaft.fasta",
)