# Entrez Script Creation Playground

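The goal of this script is to fetch the sequences for a list of RefSeq IDs from NCBI Entrez without over-pinging the server. This will:

- read the RefSeq IDs from a text file (one ID per line),
- fetch the FASTA record for each ID one request at a time, pausing between requests,
- skip any ID whose request fails and print the error so it can be reviewed afterwards,
- save all of the fetched records to a single FASTA file.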

In [1]:
from io import StringIO
from time import sleep
from urllib.error import HTTPError

from Bio import Entrez, SeqIO

In [17]:
class BulkRefseqFetcher:
    """
    Fetch the FASTA records for a list of RefSeq IDs, one request at a time.

    IDs are read from a text file (one per line), each one is requested
    through `Entrez.efetch` with a pause between requests, and the parsed
    records are written to a single FASTA file. IDs whose request fails
    with an HTTP error are skipped and reported on stdout.
    """

    def __init__(self, email, delay=2):
        """
        Parameters
        ----------
        email : str
            Contact address assigned to `Entrez.email`.
        delay : int or float, optional
            Seconds to pause after every request (default 2).
        """

        self.email = email
        self.delay = delay

        # `self.fetcher` is the Bio.Entrez module itself, so setting the
        # email here sets it for every other user of Entrez as well.
        self.fetcher = Entrez
        self.fetcher.email = self.email

    def read_ids(self, refseq_id_path):
        """
        Read RefSeq IDs from a text file, one ID per line.

        Surrounding whitespace is stripped and blank lines are dropped, so a
        trailing newline does not produce an empty ID.

        Returns
        -------
        list of str
            The IDs in file order.
        """

        with open(refseq_id_path) as handle:
            stripped_lines = (line.strip() for line in handle)
            return [refseq_id for refseq_id in stripped_lines if refseq_id]

    def convert_query_to_seqrecord(self, query):
        """
        Parse the FASTA text behind an efetch handle.

        Parameters
        ----------
        query : file-like
            Object whose `read()` returns FASTA-formatted text.

        Returns
        -------
        list
            The records yielded by `SeqIO.parse` (one per FASTA entry).
        """
        return list(SeqIO.parse(StringIO(query.read()), 'fasta'))

    def save_fasta(self, fasta_list, path):
        """
        Write the records in `fasta_list` to `path` in FASTA format.
        """
        SeqIO.write(fasta_list, path, 'fasta')

    def bulk_efetch(self, refseq_id_path):
        """
        Main workhorse of the fetcher. Uses Entrez to efetch one RefSeq ID
        at a time and combines the parsed records into one list.

        If a request fails with an HTTP error (e.g. the ID doesn't exist or
        the server is being pinged too much), that ID is skipped and the
        error is printed to stdout, prefixed with the ID, for later review.

        Returns
        -------
        list
            Records for every ID that was fetched successfully, in file order.
        """

        refseq_ids = self.read_ids(refseq_id_path)
        refseq_fasta = []

        for refseq_id in refseq_ids:

            try:
                query = self.fetcher.efetch(db="nucleotide", id=refseq_id, rettype="fasta", retmode="text")
                try:
                    refseq_fasta.extend(self.convert_query_to_seqrecord(query))
                finally:
                    # Release the connection even if parsing fails.
                    query.close()

            except HTTPError as error:
                print(f"{refseq_id}: {error}")

            finally:
                # Pause after failed requests too; a rate-limit error is
                # exactly when the next request should not go out immediately.
                sleep(self.delay)

        return refseq_fasta

    def fetch_refseq_id_seqs(self, refseq_id_path, output_fasta_path):
        """
        Fetch every ID listed in `refseq_id_path` and save the records to
        `output_fasta_path`, printing progress and any errors to stdout.
        """
        print("Converting RefSeq IDs to sequence...")
        print("---")
        print("Errors:")

        refseq_fasta = self.bulk_efetch(refseq_id_path)

        print("---")
        print("Conversion Complete!")

        self.save_fasta(refseq_fasta, output_fasta_path)
        print(f"Saved output to: {output_fasta_path}")

In [13]:
# Load the RefSeq IDs: `x` holds the path to the ID file, `l` the list of IDs.
x = "../Data/refseq_ids.txt"

with open(x) as handle:
    # Strip whitespace and drop blank lines so a trailing newline in the file
    # does not add an empty ID to the list.
    l = [line.strip() for line in handle if line.strip()]

In [18]:
# Run the whole pipeline: read the IDs, fetch each record, write one FASTA file.
fetcher = BulkRefseqFetcher("rwl42@drexel.edu")
fetcher.fetch_refseq_id_seqs(
    refseq_id_path="../Data/refseq_ids.txt",
    output_fasta_path="test2.fa",
)

Converting RefSeq IDs to sequence...
---
Error List:
---
Conversion Complete!
Saved output to: test2.fa


In [8]:
# NOTE(review): `x` must be a list of SeqRecord objects here (as built by the
# SeqIO.parse cell). If this cell runs after `x = "../Data/refseq_ids.txt"`,
# a path string is handed to SeqIO.write instead — confirm the intended run order.
fetcher.save_fasta(x, "test.fasta")

In [None]:
# Alternative to the per-ID loop: request every ID in `l` with one efetch call.
Entrez.email = "rwl42@drexel.edu"
out = Entrez.efetch(
    db="nucleotide",
    id=",".join(l),
    rettype="fasta",
    retmode="text",
)

In [57]:
# Parse the FASTA text read from `out` into a list of records.
x = list(SeqIO.parse(StringIO(out.read()), "fasta"))

In [59]:
# Print the ID of every parsed record, one per line.
for record in x:
    print(record.id)

NR_126047.1
NR_024101.1
NR_122124.1
NR_026951.1
NR_160982.1
NR_046407.1
NR_003038.2
NR_103538.1
XR_245013.2
NR_040046.1
NR_024480.1
NR_027433.1
NR_131243.2
NR_146177.1
NR_110468.1
NR_146968.1
NR_120514.1
NR_027994.1
NR_135589.1
NR_109957.1
NR_046088.1
NR_170222.1
NR_024365.1
XR_001747452.1
NR_145490.1
NR_046514.1
NR_110016.1
NR_110793.1
NR_149071.1
NR_046756.1
NR_110622.1
NR_146731.1
NR_132993.1
NR_033910.1
NR_102279.1
NR_166513.1
NR_102711.1
NR_104137.1
NR_126347.1
NR_037850.2
NR_033947.1
NR_046853.2
NR_120664.1
NR_165033.1
NR_037875.1
NR_110259.1
NR_046730.1
NR_126402.1
NR_034111.1
NR_110576.1
NR_046284.1
XR_942230.3
NR_026832.1
NR_103730.1
NR_110237.1
NR_046544.1
NR_110570.1
NR_046203.2
NR_146479.1
NR_110331.1
NR_133642.1
NR_147887.1
NR_125968.1
NR_135764.1
NR_038367.1
NR_132371.1
NR_126407.1
NR_130914.1
NR_109772.1
NR_146891.1
NR_026742.1
NR_121651.1
NR_047686.1
NR_126417.1
NR_120429.1
NR_125943.1
NR_110154.1
XR_947473.2
NR_120502.1
NR_135285.1
NR_015399.1
NR_110500.1
NR_047700.1
N