Adapted from [https://github.com/PacktPublishing/Bioinformatics-with-Python-Cookbook-Second-Edition](https://github.com/PacktPublishing/Bioinformatics-with-Python-Cookbook-Second-Edition), Chapter 2.

1) Retrieving data from the NCBI databases using the Entrez interface

In [1]:
# first of all, we need to give our credentials to access the Entrez interface. 
from Bio import Entrez, SeqIO, Medline
Entrez.email = 'jvilla@uic.cat'

# let us check the available databases
handle = Entrez.einfo()
rec = Entrez.read(handle)
print(rec)

{'DbList': ['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'sparcle', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'ncbisearch', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'probe', 'proteinclusters', 'pcassay', 'biosystems', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']}


In [3]:
handle = Entrez.esearch(db="nucleotide", term='ACE2[Gene Name] AND "human"[Organism] ')
rec_list = Entrez.read(handle)
print(rec_list)

{'Count': '22', 'RetMax': '20', 'RetStart': '0', 'IdList': ['1700998533', '1700998531', '254939607', '1370522804', '1370522803', '1370522753', '568815575', '991820413', '991820412', '991820411', '991820410', '74273659', '71517132', '29127010', '25140225', '307686498', '66882028', '27978647', '66863989', '66863987'], 'TranslationSet': [{'From': '"human"[Organism]', 'To': '"Homo sapiens"[Organism]'}], 'TranslationStack': [{'Term': 'ACE2[Gene Name]', 'Field': 'Gene Name', 'Count': '1184', 'Explode': 'N'}, {'Term': '"Homo sapiens"[Organism]', 'Field': 'Organism', 'Count': '27615764', 'Explode': 'Y'}, 'AND'], 'QueryTranslation': 'ACE2[Gene Name] AND "Homo sapiens"[Organism]'}


In [4]:
id_list = rec_list['IdList']
hdl = Entrez.efetch(db='nucleotide', id=id_list, rettype='gb', retmax=rec_list['Count'])
recs = list(SeqIO.parse(hdl, 'gb'))

In [5]:
#print(recs) 
for rec in recs:
    if rec.name == 'BC039902':
        print(rec.name)
        print(rec.description)
        break

BC039902
Homo sapiens angiotensin I converting enzyme (peptidyl-dipeptidase A) 2, mRNA (cDNA clone MGC:47598 IMAGE:5243048), complete cds


In [6]:
for feature in rec.features:
    print('====\n',feature,'\n----')
    if feature.type == 'gene':
        print(feature.qualifiers['gene'])
    elif feature.type == 'exon':
        loc = feature.location
        print('Exon', loc.start, loc.end, loc.strand)
    else:
        print('not processed')

====
 type: source
location: [0:3348](+)
qualifiers:
    Key: clone, Value: ['MGC:47598 IMAGE:5243048']
    Key: clone_lib, Value: ['NIH_MGC_121']
    Key: db_xref, Value: ['taxon:9606']
    Key: lab_host, Value: ['DH10B']
    Key: mol_type, Value: ['mRNA']
    Key: note, Value: ['Vector: pCMV-SPORT6']
    Key: organism, Value: ['Homo sapiens']
    Key: tissue_type, Value: ['Brain, fetal, whole pooled']
 
----
not processed
====
 type: gene
location: [0:3348](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:59272', 'HGNC:HGNC:13557', 'MIM:300335']
    Key: gene, Value: ['ACE2']
    Key: gene_synonym, Value: ['ACEH', 'DKFZP434A014']
 
----
['ACE2']
====
 type: CDS
location: [18:2436](+)
qualifiers:
    Key: codon_start, Value: ['1']
    Key: db_xref, Value: ['GeneID:59272', 'HGNC:HGNC:13557', 'MIM:300335']
    Key: gene, Value: ['ACE2']
    Key: gene_synonym, Value: ['ACEH', 'DKFZP434A014']
    Key: product, Value: ['angiotensin I converting enzyme (peptidyl-dipeptidase A) 2']
    Key: 

In [7]:
for name, value in rec.annotations.items():
    print('%s=%s' % (name, value))

molecule_type=mRNA
topology=linear
data_file_division=PRI
date=24-JUL-2006
accessions=['BC039902']
sequence_version=1
keywords=['MGC']
source=Homo sapiens (human)
organism=Homo sapiens
taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']
references=[Reference(title='Generation and initial analysis of more than 15,000 full-length human and mouse cDNA sequences', ...), Reference(title='Direct Submission', ...)]
comment=Contact: MGC help desk
Email: cgapbs-r@mail.nih.gov
Tissue Procurement: Life Technologies, Inc.
cDNA Library Preparation: Life Technologies, Inc.
cDNA Library Arrayed by: The I.M.A.G.E. Consortium (LLNL)
DNA Sequencing by: National Institutes of Health Intramural
Sequencing Center (NISC),
Gaithersburg, Maryland;
Web site: http://www.nisc.nih.gov/
Contact: nisc_mgc@nhgri.nih.gov
Akhter,N., Ayele,K., Beckstrom-Sternberg,S.M., Benjamin,B

In [8]:
print(len(rec.seq))

3348


In [9]:
refs = rec.annotations['references']
for ref in refs:
    print(ref)
    if ref.pubmed_id != '':
        print(ref.pubmed_id)
        handle = Entrez.efetch(db="pubmed", id=[ref.pubmed_id],
                                rettype="medline", retmode="text")
        records = Medline.parse(handle)
        for med_rec in records:
            for k, v in med_rec.items():
                print('%s: %s' % (k, v))

location: [0:3348]
authors: Strausberg,R.L., Feingold,E.A., Grouse,L.H., Derge,J.G., Klausner,R.D., Collins,F.S., Wagner,L., Shenmen,C.M., Schuler,G.D., Altschul,S.F., Zeeberg,B., Buetow,K.H., Schaefer,C.F., Bhat,N.K., Hopkins,R.F., Jordan,H., Moore,T., Max,S.I., Wang,J., Hsieh,F., Diatchenko,L., Marusina,K., Farmer,A.A., Rubin,G.M., Hong,L., Stapleton,M., Soares,M.B., Bonaldo,M.F., Casavant,T.L., Scheetz,T.E., Brownstein,M.J., Usdin,T.B., Toshiyuki,S., Carninci,P., Prange,C., Raha,S.S., Loquellano,N.A., Peters,G.J., Abramson,R.D., Mullahy,S.J., Bosak,S.A., McEwan,P.J., McKernan,K.J., Malek,J.A., Gunaratne,P.H., Richards,S., Worley,K.C., Hale,S., Garcia,A.M., Gay,L.J., Hulyk,S.W., Villalon,D.K., Muzny,D.M., Sodergren,E.J., Lu,X., Gibbs,R.A., Fahey,J., Helton,E., Ketteman,M., Madan,A., Rodrigues,S., Sanchez,A., Whiting,M., Madan,A., Young,A.C., Shevchenko,Y., Bouffard,G.G., Blakesley,R.W., Touchman,J.W., Green,E.D., Dickson,M.C., Rodriguez,A.C., Grimwood,J., Schmutz,J., Myers,R.M., Butt