#### GenBank databases from the National Center for Biotechnology Information (NCBI) though an API

In [1]:
from Bio import Entrez, Medline, SeqIO

In [2]:
Entrez.email = "user@email.com" #enter a valid email address

In [3]:
handle = Entrez.einfo()
rec = Entrez.read(handle)
print(rec)

{'DbList': ['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'proteinclusters', 'pcassay', 'protfam', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']}


In [4]:
handle = Entrez.esearch(
    db="nucleotide", 
    term='CRT[Gene Name] AND "Plasmodium falciparum"[Organism]'
    )
rec_list = Entrez.read(handle)
if int(rec_list['RetMax']) < int(rec_list['Count']):
    handle = Entrez.esearch(
        db="nucleotide",
        term='CRT[Gen Name] AND "Plasmodium falciparum"[Organism]',
        retmax=rec_list['Count']
        )
id_list = rec_list['IdList']
hdl = Entrez.efetch(
    db='nucleotide', 
    id=id_list, 
    rettype='gb',
    retmax=rec_list['Count'] 
    )
recs = list(SeqIO.parse(hdl, 'gb'))

In [5]:
for rec in recs:
    if rec.name == 'KM288867':
        break

print(rec.name)
print(rec.description)

OQ672432
Plasmodium falciparum isolate CZS_15 chloroquine resistance transporter (crt) gene, partial cds


In [7]:
for feature in rec.features:
    if feature.type =='gene':
        print(feature.qualifiers['gene'])
    elif feature.type == 'exon':
        loc = feature.location
        print('Exon', loc.start, loc.end, loc.strand)
    else:
        print('not processed:\n%s' % feature)

not processed:
type: source
location: [0:145](+)
qualifiers:
    Key: country, Value: ['Brazil']
    Key: db_xref, Value: ['taxon:5833']
    Key: isolate, Value: ['CZS_15']
    Key: mol_type, Value: ['genomic DNA']
    Key: organism, Value: ['Plasmodium falciparum']

['crt']
not processed:
type: mRNA
location: [<0:>145](+)
qualifiers:
    Key: gene, Value: ['crt']
    Key: product, Value: ['chloroquine resistance transporter']

not processed:
type: CDS
location: [<0:>145](+)
qualifiers:
    Key: codon_start, Value: ['1']
    Key: gene, Value: ['crt']
    Key: note, Value: ['localized within the digestive vacuole membrane']
    Key: product, Value: ['chloroquine resistance transporter']
    Key: protein_id, Value: ['WHO19581.1']
    Key: translation, Value: ['CAHVFKLIFKEIKDNIFIYILSIIYLSVSVMNTIFAKRTLNKIGNYSF']



In [9]:
for name, value in rec.annotations.items():
    print('%s=%s' % (name, value))

molecule_type=DNA
topology=linear
data_file_division=INV
date=24-MAY-2023
accessions=['OQ672432']
sequence_version=1
keywords=['']
source=Plasmodium falciparum (malaria parasite P. falciparum)
organism=Plasmodium falciparum
taxonomy=['Eukaryota', 'Sar', 'Alveolata', 'Apicomplexa', 'Aconoidasida', 'Haemosporida', 'Plasmodiidae', 'Plasmodium', 'Plasmodium (Laverania)']
references=[Reference(title='Plasmodium falciparum Chloroquine-pfcrt Resistant Haplotypes in Brazilian Endemic Areas Four Decades after CQ Withdrawn', ...), Reference(title='Direct Submission', ...)]
structured_comment=defaultdict(<class 'dict'>, {'Assembly-Data': {'Sequencing Technology': 'Sanger dideoxy sequencing'}})


In [10]:
len(rec.seq)

145

In [11]:
refs = rec.annotations['references']
print(refs)
for ref in refs:
    if ref.pubmed_id != '':
        print(ref.pubmed_id)
        handle = Entrez.efetch(
            db="bubmed",
            id=[ref.pubmed_id],
            rettype="medline",
            retmode = "text"
        )
        records = Medline.parse(handle)
        for med_rec in records:
            for k, v in med_rec.items():
                print('%s: %s' % (k,v))

[Reference(title='Plasmodium falciparum Chloroquine-pfcrt Resistant Haplotypes in Brazilian Endemic Areas Four Decades after CQ Withdrawn', ...), Reference(title='Direct Submission', ...)]
