## Entrez package

Thanks to the `Bio.Entrez` package we can communicate with the Entrez databases.
( https://www.ncbi.nlm.nih.gov/Class/MLACourse/Original8Hour/Entrez/ ) 

In [3]:
# Protein search by protein ID; the result is saved in GenBank (gb) format
from Bio import Entrez
from Bio import SeqIO
# Contact address for the Entrez requests made below (tell NCBI who you are).
Entrez.email = "zihaladavid@gmail.com"

def get_protein(protein_id):
    """Download one protein record from NCBI and save it as ``<protein_id>.gb``.

    The record is requested from the ``protein`` database in GenBank text
    form (``rettype="gb"``, ``retmode="text"``) and written to a file named
    ``protein_id + '.gb'``, relative to the current working directory. An
    existing file with that name is overwritten.

    Args:
        protein_id: Identifier passed to ``Entrez.efetch`` as ``id``; also
            used as the stem of the output file name.

    Returns:
        None.
    """
    net_handle = Entrez.efetch(db="protein", id=protein_id, rettype="gb", retmode="text")
    try:
        with open(protein_id + '.gb', "w") as out_handle:
            out_handle.write(net_handle.read())
    finally:
        # Release the network handle even if opening or writing the file fails.
        net_handle.close()

In [4]:
# Fetch and save a GenBank file for every sequence ID listed in rab20.fasta.
fasta_records = SeqIO.parse('rab20.fasta', 'fasta')
for record in fasta_records:
    get_protein(record.id)

In [None]:
# NOTE(review): `protein_id` is not assigned at notebook level in any visible
# cell - set it to one of the downloaded IDs before running this cell.
# Parse the saved GenBank file; the `with` block closes the file afterwards.
with open(protein_id + '.gb', mode="r") as handle:
    protein_record = SeqIO.read(handle, "genbank")

In [None]:
# Keep the first feature of the parsed record in `x` for inspection.
x = protein_record.features[0]

In [None]:
# Display the 'organism' qualifier of the feature stored in `x`.
x.qualifiers['organism']

In [None]:
# Print the database cross-references (db_xref) of every CDS feature.
# CDS features that carry no db_xref qualifier are skipped instead of
# raising KeyError. The former `x = ''` assignment was unused and overwrote
# the feature kept in `x`, so it has been dropped.
for feature in protein_record.features:
    if feature.type == 'CDS' and 'db_xref' in feature.qualifiers:
        print(feature.qualifiers['db_xref'])

In [5]:
# Gene search based on the ID of its corresponding protein product
import os
from Bio import Entrez
from Bio import SeqIO

# Always tell NCBI who you are: contact address for the Entrez requests below.
Entrez.email = "zihaladavid@gmail.com"

def read_protein(protein_id):
    """Return the GenBank record for ``protein_id``, downloading it if needed.

    The record is cached on disk as ``protein_id + '.gb'``, the same name
    ``get_protein`` writes, so files saved by either function are reused.
    A file cached under the older extension-less name ``protein_id`` is
    still honoured when no ``.gb`` file exists.

    Args:
        protein_id: Identifier passed to ``Entrez.efetch`` as ``id``; also
            the stem of the cache file name.

    Returns:
        The record parsed by ``SeqIO.read(..., "genbank")``.
    """
    filename = protein_id + '.gb'
    if not os.path.isfile(filename) and os.path.isfile(protein_id):
        # Backward compatibility: reuse a cache written without the suffix.
        filename = protein_id

    if not os.path.isfile(filename):  # record is not in the folder yet
        net_handle = Entrez.efetch(db="protein", id=protein_id, rettype="gb", retmode="text")
        try:
            # Download fully before creating the file, so a failed request
            # does not leave an empty file that would pass for a cache hit.
            payload = net_handle.read()
        finally:
            net_handle.close()
        with open(filename, "w") as out_handle:
            out_handle.write(payload)

    # The `with` block closes the file once the record has been parsed.
    with open(filename, mode="r") as handle:
        return SeqIO.read(handle, "genbank")

def get_gene_id(protein_record):
    """Return the gene ID cross-referenced by a protein record's CDS feature.

    Args:
        protein_record: Object exposing a ``features`` iterable whose items
            have a ``type`` string and a ``qualifiers`` mapping (for example
            the record returned by ``read_protein``).

    Returns:
        The text between the first and second ``:`` of the first ``db_xref``
        entry containing ``GeneID`` on a CDS feature (``'19332'`` for
        ``'GeneID:19332'``), or ``None`` when no such entry exists.
    """
    for feature in protein_record.features:
        if feature.type != "CDS":
            continue
        # A CDS feature without a db_xref qualifier is skipped rather than
        # raising KeyError.
        for item in feature.qualifiers.get("db_xref", []):
            # Require the separator so a malformed entry cannot raise IndexError.
            if 'GeneID' in item and ':' in item:
                return item.split(':')[1]
    return None
                
# Load the record for NP_035357.1, then extract and print its gene ID.
protein_record = read_protein('NP_035357.1')
gene_id = get_gene_id(protein_record)
print('gene id:', gene_id)

gene id: 19332


In [6]:
# number of publications for given author
from Bio import Entrez
Entrez.email = "A.N.Other@example.com"     # Always tell NCBI who you are
# Only the total is needed: the length of "IdList" is bounded by retmax,
# whereas "Count" reports the number of hits for the whole query. So no IDs
# are requested (retmax=0) and the count is read directly.
handle = Entrez.esearch(db="pubmed", term="David Žihala", retmax=0)
try:
    record = Entrez.read(handle)
finally:
    # Release the network handle even if parsing the response fails.
    handle.close()
int(record["Count"])

4

In [9]:
# coauthor search
from Bio import Entrez
from Bio import Medline

MAX_COUNT = 20          # upper bound on the number of publications fetched
TERM = 'Žihala David'   # PubMed search term

print('Getting {0} publications containing {1}...'.format(MAX_COUNT, TERM))
Entrez.email = 'A.N.Other@example.com'

# Step 1: search PubMed for the IDs of matching publications.
h = Entrez.esearch(db='pubmed', retmax=MAX_COUNT, term=TERM)
try:
    result = Entrez.read(h)
finally:
    h.close()
print('Total number of publications containing {0}: {1}'.format(TERM, result['Count']))
ids = result['IdList']

# Step 2: fetch the MEDLINE records and collect every distinct author name.
unique_authors = set()
if ids:  # skip the fetch entirely when the search returned no IDs
    h = Entrez.efetch(db='pubmed', id=ids, rettype='medline', retmode='text')
    try:
        records = Medline.parse(h)
        for record in records:
            # Default to an empty list: a string default such as '?' would be
            # iterated character by character and added as a bogus author.
            unique_authors.update(record.get('AU', []))
    finally:
        h.close()

# Sort once, after all records have been processed.
authors = sorted(unique_authors)
print('Authors: {0}'.format(', '.join(authors)))

Getting 20 publications containing Žihala David...
Total number of publications containing Žihala David: 4
Authors: Barlow LD, Bates PA, Becvar T, Brzon O, Butenko A, Cepicka I, Dacks JB, Derelle R, Elias M, Eme L, Hampl V, Herman EK, Hradilova M, Karnkowska A, Kleschenko Y, Klimes V, Kostygov AY, Lukes J, Macedo DH, Novak L, Opperdoes FR, Panek T, Petrzelkova R, Pipaliya SV, Podesvova L, Roger AJ, Sadlova J, Sokol M, Soukal P, Stairs CW, Susko E, Treitli SC, Vacek V, Volf P, Yurchenko V, Zadrobilkova E, Zihala D
