In [1]:
import time
from urllib.error import HTTPError

from Bio import Entrez
Entrez.email = "garbanyo@gmail.com"
#from numba import jit

In [18]:
def get_sequences(search_results, out="out.fasta", batch_size = 5000, start_batch=0,
                  max_attempts=3, retry_delay=15):
    """Download the FASTA records of an Entrez history search into ``out``.

    Records are fetched from the "protein" database in batches through the
    ``WebEnv``/``QueryKey`` pair stored in ``search_results`` and written to
    ``out`` in the order they are fetched.

    Parameters
    ----------
    search_results : mapping
        Parsed esearch result; must provide "WebEnv", "QueryKey" and "Count".
    out : str
        Path of the output file. It is opened in "w" mode, so an existing file
        is truncated (also when resuming with ``start_batch``).
    batch_size : int
        Number of records requested per efetch call.
    start_batch : int
        Index of the first batch to download; batches before it are skipped.
    max_attempts : int
        How many times one batch is requested before giving up.
    retry_delay : int or float
        Seconds to wait between attempts after a 5xx server error.

    Raises
    ------
    ValueError
        If ``max_attempts`` is smaller than 1.
    HTTPError
        Immediately for non-5xx errors, or after ``max_attempts`` consecutive
        5xx errors on the same batch.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    webenv = search_results["WebEnv"]
    query_key = search_results["QueryKey"]
    count = int(search_results["Count"])

    # The context manager closes the file even if a download fails part-way.
    with open(out, "w") as out_handle:
        for start in range(start_batch*batch_size, count, batch_size):
            end = min(count, start+batch_size)
            print("Going to download record %i to %i of %i (batch %i)"
                  % (start+1, end, count, start // batch_size))
            for attempt in range(1, max_attempts + 1):
                try:
                    fetch_handle = Entrez.efetch(db="protein",
                                                 rettype="fasta", retmode="text",
                                                 retstart=start, retmax=batch_size,
                                                 webenv=webenv, query_key=query_key,
                                                 idtype="acc")
                except HTTPError as err:
                    # Only server-side (5xx) failures are worth retrying.
                    if not 500 <= err.code <= 599:
                        raise
                    print("Received error from server %s" % err)
                    print("Attempt %i of %i" % (attempt, max_attempts))
                    if attempt == max_attempts:
                        # Out of attempts: fail loudly instead of writing a
                        # missing or stale batch to the output file.
                        raise
                    time.sleep(retry_delay)
                else:
                    # Success: stop retrying so the batch is fetched only once.
                    break
            try:
                data = fetch_handle.read()
            finally:
                fetch_handle.close()
            out_handle.write(data)
    print("Done")

In [3]:
def get_search(term, extra=''):
    """Run a title-restricted esearch on the "protein" database.

    ``term`` is wrapped as ``(term[Title])`` and combined with the fixed
    filters below (phage in the title, several annotation words excluded,
    sequence length 50:1000000). ``extra`` is appended verbatim after the
    trailing space of the fixed filters.

    The search uses ``usehistory="y"``, so the returned dict carries the
    "WebEnv"/"QueryKey" values that ``get_sequences`` reads.

    Parameters
    ----------
    term : str
        Text to look for in the record title.
    extra : str
        Additional query clauses appended to the end of the query.

    Returns
    -------
    The parsed esearch result as returned by ``Entrez.read``.
    """
    # NOTE(review): 'putitive' may be a deliberate filter for misspelled
    # titles or a leftover typo -- kept as-is; confirm.
    query = ('(' + term + '[Title]) AND phage[Title] NOT hypothetical[Title] '
             'NOT putative[Title] AND 50:1000000[SLEN] NOT putitive[Title] '
             'NOT probable[Title] NOT possible[Title] NOT unknown[Title] ' + extra)
    handle = Entrez.esearch(db="protein", term=query,
                            idtype="acc", usehistory="y")
    try:
        # Close the handle even when parsing the response fails.
        search_results = Entrez.read(handle)
    finally:
        handle.close()
    return search_results

In [4]:
def get_full_search(term, extra=''):
    """Run an esearch on the "protein" database with a caller-built term.

    Unlike ``get_search``, ``term`` is used as given (no ``[Title]`` wrapping),
    so the caller supplies its own field tags. The same fixed filters are
    appended (phage in the title, several annotation words excluded, sequence
    length 50:1000000), followed by ``extra``.

    The search uses ``usehistory="y"``, so the returned dict carries the
    "WebEnv"/"QueryKey" values that ``get_sequences`` reads.

    Parameters
    ----------
    term : str
        Complete leading query expression, including field tags.
    extra : str
        Additional query clauses. A single space is inserted before it when it
        does not already start with whitespace, so it cannot fuse with the
        preceding ``unknown[Title]`` clause.

    Returns
    -------
    The parsed esearch result as returned by ``Entrez.read``.
    """
    # NOTE(review): 'putitive' may be a deliberate filter for misspelled
    # titles or a leftover typo -- kept as-is; confirm.
    query = (term + ' AND phage[Title] NOT hypothetical[Title] '
             'NOT putative[Title] AND 50:1000000[SLEN] NOT putitive[Title] '
             'NOT probable[Title] NOT possible[Title] NOT unknown[Title]')
    if extra:
        # The fixed filter has no trailing space here; add the separator.
        query += extra if extra[0].isspace() else ' ' + extra
    handle = Entrez.esearch(db="protein", term=query,
                            idtype="acc", usehistory="y")
    # AND "viruses"[porgn:__txid10239]
    try:
        # Close the handle even when parsing the response fails.
        search_results = Entrez.read(handle)
    finally:
        handle.close()
    return search_results

In [20]:
# done
# Major capsid proteins: title search, download resumed at batch 21
# (record 105001 with the default batch_size of 5000).
# NOTE(review): get_sequences opens `out` in "w" mode, so the earlier batches
# are presumably kept in other files -- confirm before merging outputs.
search_results = get_search('major capsid')
get_sequences(search_results,out='major_capsid4.fasta',start_batch=21)
# 113612 -- same value as the total record count printed in this cell's output


Going to download record 105001 to 110000 of 113612 (batch 21)
Going to download record 110001 to 113612 of 113612 (batch 22)
Done


In [23]:
# done
# Minor capsid proteins: title search, full download into minor_capsid.fasta.
search_results = get_search('minor capsid')
get_sequences(search_results,out='minor_capsid.fasta')

Going to download record 1 to 2901 of 2901 (batch 0)
Done


In [None]:
#search_results = get_search('capsid',extra='NOT minor[Title] NOT major[Title]')
#print(int(search_results["Count"]))
#get_sequences(search_results,out='capsid.fasta')

In [22]:
# done
# Baseplate proteins: title search, full download into baseplate.fasta.
search_results = get_search('baseplate')
get_sequences(search_results,out='baseplate.fasta')

Going to download record 1 to 5000 of 76073 (batch 0)
Going to download record 5001 to 10000 of 76073 (batch 1)
Going to download record 10001 to 15000 of 76073 (batch 2)
Going to download record 15001 to 20000 of 76073 (batch 3)
Going to download record 20001 to 25000 of 76073 (batch 4)
Going to download record 25001 to 30000 of 76073 (batch 5)
Going to download record 30001 to 35000 of 76073 (batch 6)
Going to download record 35001 to 40000 of 76073 (batch 7)
Going to download record 40001 to 45000 of 76073 (batch 8)
Going to download record 45001 to 50000 of 76073 (batch 9)
Going to download record 50001 to 55000 of 76073 (batch 10)
Going to download record 55001 to 60000 of 76073 (batch 11)
Going to download record 60001 to 65000 of 76073 (batch 12)
Going to download record 65001 to 70000 of 76073 (batch 13)
Going to download record 70001 to 75000 of 76073 (batch 14)
Going to download record 75001 to 76073 of 76073 (batch 15)
Done


In [21]:
# done
# Major tail proteins: title search, download resumed at batch 6
# (record 30001 with the default batch_size of 5000).
# NOTE(review): get_sequences opens `out` in "w" mode, so the earlier batches
# are presumably kept in another file -- confirm before merging outputs.
search_results = get_search('major tail')
get_sequences(search_results,out='major_tail2.fasta',start_batch=6)

Going to download record 30001 to 35000 of 66513 (batch 6)
Going to download record 35001 to 40000 of 66513 (batch 7)
Going to download record 40001 to 45000 of 66513 (batch 8)
Going to download record 45001 to 50000 of 66513 (batch 9)
Going to download record 50001 to 55000 of 66513 (batch 10)
Going to download record 55001 to 60000 of 66513 (batch 11)
Going to download record 60001 to 65000 of 66513 (batch 12)
Going to download record 65001 to 66513 of 66513 (batch 13)
Done


In [8]:
# done
# Minor tail proteins: title search, full download into minor_tail.fasta.
search_results = get_search('minor tail')
get_sequences(search_results,out='minor_tail.fasta')

Going to download record 1 to 5000 of 94635
Going to download record 5001 to 10000 of 94635
Going to download record 10001 to 15000 of 94635
Going to download record 15001 to 20000 of 94635
Going to download record 20001 to 25000 of 94635
Going to download record 25001 to 30000 of 94635
Going to download record 30001 to 35000 of 94635
Going to download record 35001 to 40000 of 94635
Going to download record 40001 to 45000 of 94635
Going to download record 45001 to 50000 of 94635
Going to download record 50001 to 55000 of 94635
Going to download record 55001 to 60000 of 94635
Going to download record 60001 to 65000 of 94635
Going to download record 65001 to 70000 of 94635
Going to download record 70001 to 75000 of 94635
Going to download record 75001 to 80000 of 94635
Going to download record 80001 to 85000 of 94635
Going to download record 85001 to 90000 of 94635
Going to download record 90001 to 94635 of 94635
Done


In [6]:
# done
# Portal proteins: title search, full download into portal.fasta.
search_results = get_search('portal')
get_sequences(search_results,out='portal.fasta')

Going to download record 1 to 5000 of 210036
Going to download record 5001 to 10000 of 210036
Going to download record 10001 to 15000 of 210036
Going to download record 15001 to 20000 of 210036
Going to download record 20001 to 25000 of 210036
Going to download record 25001 to 30000 of 210036
Going to download record 30001 to 35000 of 210036
Going to download record 35001 to 40000 of 210036
Going to download record 40001 to 45000 of 210036
Going to download record 45001 to 50000 of 210036
Going to download record 50001 to 55000 of 210036
Going to download record 55001 to 60000 of 210036
Going to download record 60001 to 65000 of 210036
Going to download record 65001 to 70000 of 210036
Going to download record 70001 to 75000 of 210036
Going to download record 75001 to 80000 of 210036
Going to download record 80001 to 85000 of 210036
Going to download record 85001 to 90000 of 210036
Going to download record 90001 to 95000 of 210036
Going to download record 95001 to 100000 of 210036
Going

In [5]:
# done
# Tail fiber proteins: title search, full download into tail_fiber.fasta.
search_results = get_search('tail fiber')
get_sequences(search_results,out='tail_fiber.fasta')

Going to download record 1 to 5000 of 29133
Going to download record 5001 to 10000 of 29133
Going to download record 10001 to 15000 of 29133
Going to download record 15001 to 20000 of 29133
Going to download record 20001 to 25000 of 29133
Going to download record 25001 to 29133 of 29133
Done


In [7]:
# done
# Collar proteins: title search, full download into collar.fasta.
search_results = get_search('collar')
get_sequences(search_results,out='collar.fasta')

Going to download record 1 to 4224 of 4224
Done


In [6]:
# done
# Tail shaft/sheath proteins: get_full_search takes the term as written, so
# the [Title] tags and the OR group are spelled out here by the caller.
search_results = get_full_search('tail[Title] AND (shaft[Title] OR sheath[Title])')
get_sequences(search_results,out='shaft.fasta')

Going to download record 1 to 5000 of 37885
Going to download record 5001 to 10000 of 37885
Going to download record 10001 to 15000 of 37885
Going to download record 15001 to 20000 of 37885
Going to download record 20001 to 25000 of 37885
Going to download record 25001 to 30000 of 37885
Going to download record 30001 to 35000 of 37885
Going to download record 35001 to 37885 of 37885
Done


In [27]:
# Head-tail joining proteins: title search, download resumed at batch 4
# (record 20001 with the default batch_size of 5000).
# NOTE(review): 'joinning' looks like a typo for 'joining'; the recorded output
# below was produced with the term exactly as written -- confirm which spelling
# is intended before re-running.
# NOTE(review): get_sequences opens `out` in "w" mode, so the earlier batches
# are presumably kept in another file -- confirm before merging outputs.
search_results = get_search('head-tail joinning')
get_sequences(search_results,out='HTJls2.fasta',start_batch=4)

Going to download record 20001 to 25000 of 60270 (batch 4)
Going to download record 25001 to 30000 of 60270 (batch 5)
Going to download record 30001 to 35000 of 60270 (batch 6)
Going to download record 35001 to 40000 of 60270 (batch 7)
Going to download record 40001 to 45000 of 60270 (batch 8)
Going to download record 45001 to 50000 of 60270 (batch 9)
Going to download record 50001 to 55000 of 60270 (batch 10)
Going to download record 55001 to 60000 of 60270 (batch 11)
Going to download record 60001 to 60270 of 60270 (batch 12)
Done
