In [12]:
from Bio import Entrez, SeqIO
#Entrez: client module used below to query NCBI from Python
#SeqIO: parser for the sequence records retrieved from NCBI
#Contact e-mail registered on the Entrez module for the NCBI queries made below
Entrez.email = "steveko35@naver.com"

In [13]:
#Calling einfo without arguments; the parsed reply printed below holds the database names under 'DbList'
info = Entrez.einfo()
try:
    #Parses the reply from NCBI into Python objects
    database = Entrez.read(info)
finally:
    #Release the connection even if parsing fails
    info.close()
print(database)

{'DbList': ['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'proteinclusters', 'pcassay', 'protfam', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']}


In [14]:
#Searching the nucleotide database for [Gene Name] and [Organism]
#For a description of the parameters in esearch, consult: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
search_term = 'CRT[Gene Name] AND "Plasmodium falciparum"[Organism]'
seq = Entrez.esearch(db = 'nucleotide', term = search_term)
try:
    #Parses an XML file from NCBI into Python objects
    rec_list = Entrez.read(seq)
finally:
    seq.close()
#Standard search will limit reference records to 20
#Count and RetMax are converted to int before comparing: compared as text, '20' < '100' is False
#and the full result set would silently never be requested
total_hits = int(rec_list['Count'])
if int(rec_list['RetMax']) < total_hits:
    #Overriding the reference limit by repeating the search with retmax set to the total hit count
    seq = Entrez.esearch(db = 'nucleotide', term = search_term, retmax = total_hits)
    try:
        rec_list = Entrez.read(seq)
    finally:
        seq.close()

In [15]:
#Requests every matching nucleotide record from NCBI in GenBank ('gb') format
#If the target sequences have large sequences, avoid this procedure to prevent straining the NCBI database
#For a description of the parameters in efetch, consult: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
id_list = rec_list['IdList']
handle = Entrez.efetch(
    db = 'nucleotide',   #db: Database from which to retrieve records
    id = id_list,        #id: UID list from the database specified by db
    rettype = 'gb',      #rettype: Retrieval type
)

In [16]:
#Parses the GenBank-format ('gb') data behind the efetch handle into a list of record objects
try:
    #list() consumes the whole parser output, so the handle can be closed right after
    records = list(SeqIO.parse(handle, 'gb'))
finally:
    #Release the connection opened by efetch, even if parsing fails
    handle.close()

In [17]:
#Picking the record of interest by its name
target_name = 'KM288867'
for record in records:
    if record.name == target_name:
        #Stop here so that `record` keeps pointing at the match
        break
else:
    #Reached only when the loop never hit `break`: without this guard the last
    #fetched record (or a stale `record` from an earlier run) would be used instead
    raise LookupError('No record named %s among the %d fetched records' % (target_name, len(records)))
print(record.name)
# Human-readable description
print(record.description)

KM288867
Plasmodium falciparum clone PF3D7_0709000 chloroquine resistance transporter (CRT) gene, complete cds


In [18]:
#Walking over the features of the selected record (gene products, exon positions)
for feature in record.features:
    kind = feature.type
    if kind == 'gene':
        #Gene name as stored in the qualifiers dictionary
        print(feature.qualifiers['gene'])
        continue
    if kind == 'exon':
        #Exon location: start, end, strand
        loc = feature.location
        print('Exon', loc.start, loc.end, loc.strand)
        continue
    #Any other feature type is printed in full, flagged as not processed
    print('not processed:\n%s' % feature)

not processed:
type: source
location: [0:10000](+)
qualifiers:
    Key: clone, Value: ['PF3D7_0709000']
    Key: db_xref, Value: ['taxon:5833']
    Key: mol_type, Value: ['genomic DNA']
    Key: organism, Value: ['Plasmodium falciparum']

['CRT']
not processed:
type: mRNA
location: join{[2751:3543](+), [3720:3989](+), [4168:4341](+), [4513:4646](+), [4799:4871](+), [4994:5070](+), [5166:5249](+), [5376:5427](+), [5564:5621](+), [5769:5862](+), [6055:6100](+), [6247:6302](+), [6471:7598](+)}
qualifiers:
    Key: gene, Value: ['CRT']
    Key: product, Value: ['chloroquine resistance transporter']

not processed:
type: 5'UTR
location: [2751:3452](+)
qualifiers:
    Key: gene, Value: ['CRT']

not processed:
type: primer_bind
location: [2935:2958](+)
qualifiers:

not processed:
type: primer_bind
location: [3094:3121](+)
qualifiers:

not processed:
type: CDS
location: join{[3452:3543](+), [3720:3989](+), [4168:4341](+), [4513:4646](+), [4799:4871](+), [4994:5070](+), [5166:5249](+), [5376:54

In [19]:
#Printing every annotation of the selected record as "key = value"
annotations = record.annotations
for name in annotations:
    value = annotations[name]
    print('%s = %s' %(name, value))

#Length of the record's sequence
print(len(record.seq))

molecule_type = DNA
topology = linear
data_file_division = INV
date = 12-NOV-2014
accessions = ['KM288867']
sequence_version = 1
keywords = ['']
source = Plasmodium falciparum (malaria parasite P. falciparum)
organism = Plasmodium falciparum
taxonomy = ['Eukaryota', 'Sar', 'Alveolata', 'Apicomplexa', 'Aconoidasida', 'Haemosporida', 'Plasmodiidae', 'Plasmodium', 'Plasmodium (Laverania)']
references = [Reference(title='Versatile control of Plasmodium falciparum gene expression with an inducible protein-RNA interaction', ...), Reference(title='Direct Submission', ...)]
10000


In [20]:
from Bio import Medline

In [21]:
references = record.annotations['references']
#Retrieving the MEDLINE entry of every reference annotation that has a PubMed identifier
for reference in references:
    #Truthiness test skips an empty identifier as well as a missing (None) one
    if reference.pubmed_id:
        print(reference.pubmed_id)
        handle = Entrez.efetch(db = 'pubmed', id = [reference.pubmed_id], rettype = 'medline', retmode = 'text')
        #retmode: Specifies the data format of the returned data (in this case, text)
        try:
            #The parse result is iterated inside the try block, i.e. before the handle is closed
            medicalRecords = Medline.parse(handle)
            for med_rec in medicalRecords:
                for k, b in med_rec.items():
                    print('%s: %s' % (k, b))
        finally:
            #One connection is opened per reference; close each one instead of leaking it
            handle.close()

25370483
PMID: 25370483
OWN: NLM
STAT: MEDLINE
DCOM: 20160112
LR: 20181113
IS: 2041-1723 (Electronic) 2041-1723 (Linking)
VI: 5
DP: 2014 Nov 5
TI: Versatile control of Plasmodium falciparum gene expression with an inducible protein-RNA interaction.
PG: 5329
LID: 10.1038/ncomms6329 [doi]
AB: The available tools for conditional gene expression in Plasmodium falciparum are limited. Here, to enable reliable control of target gene expression, we build a system to efficiently modulate translation. We overcame several problems associated with other approaches for regulating gene expression in P. falciparum. Specifically, our system functions predictably across several native and engineered promoter contexts, and affords control over reporter and native parasite proteins irrespective of their subcellular compartmentalization. Induction and repression of gene expression are rapid, homogeneous and stable over prolonged periods. To demonstrate practical application of our system, we used it to re