In [2]:
from Bio import Entrez
Entrez.email = 'annabalan267@gmail.com'

### 1. esearch searches articles in NCBI PubMed

In [3]:
handle = Entrez.esearch(db = "pubmed", term = "crustacyanin")
record = Entrez.read(handle)
print(record)

{'Count': '79', 'RetMax': '20', 'RetStart': '0', 'IdList': ['35247793', '35010161', '34436301', '33919403', '33465290', '32851672', '32596057', '32236233', '31813041', '30860355', '29683674', '29178679', '28851818', '26220698', '25797168', '25605312', '24782450', '23570752', '23510436', '23441225'], 'TranslationSet': [{'From': 'crustacyanin', 'To': '"crustacyanin"[All Fields] OR "crustacyanins"[Supplementary Concept] OR "crustacyanins"[All Fields]'}], 'QueryTranslation': '"crustacyanin"[All Fields] OR "crustacyanins"[Supplementary Concept] OR "crustacyanins"[All Fields]'}


### 2. efetch returns abstracts of 3 first articles

In [4]:
mshandle = Entrez.efetch(db="pubmed", id=record["IdList"][0:3], rettype="abstract", retmode="text")
print(mshandle.read())

1. Comp Biochem Physiol Part D Genomics Proteomics. 2022 Jun;42:100977. doi: 
10.1016/j.cbd.2022.100977. Epub 2022 Feb 16.

Searching and identifying pigmentation genes from Neocaridina denticulate 
sinensis via comparison of transcriptome in different color strains.

Lin S(1), Zhang L(2), Wang G(1), Huang S(1), Wang Y(1).

Author information:
(1)Key Laboratory of Healthy Mariculture for the East China Sea, Ministry of 
Agriculture, Fisheries College, Jimei University, Xiamen 361021, China.
(2)Key Laboratory of Healthy Mariculture for the East China Sea, Ministry of 
Agriculture, Fisheries College, Jimei University, Xiamen 361021, China. 
Electronic address: llzhang@jmu.edu.cn.

Aquaria species are characterized by their amazing colors and patterns. Research 
on the breeding molecular genetics of ornamental shrimps is surprisingly 
limited. We conducted a transcriptome analysis to investigate the expression of 
encoding genes in the integument of the strains Neocaridina denticulate 
si

### 3. esearch searches in the base all the sequences for a certain gene and species, returning a list if IDs

In [11]:
handle = Entrez.esearch(db = "nucleotide", term = "crustacyanin AND Homarus[orgn]") #orgn=organism
record = Entrez.read(handle)
print(record)
Entrez.efetch(db = "nucleotide", id = record["IdList"])

{'Count': '19', 'RetMax': '19', 'RetStart': '0', 'IdList': ['2065206281', '2065193120', '2065190079', '2065188392', '2065186734', '2065171964', '2065159855', '2065028193', '2056514813', '2056498811', '2056493211', '2056492151', '2056488359', '2056483089', '2056467578', '2056465480', '2056439345', '48433440', '48432807'], 'TranslationSet': [{'From': 'Homarus[orgn]', 'To': '"Homarus"[Organism]'}], 'TranslationStack': [{'Term': 'crustacyanin[All Fields]', 'Field': 'All Fields', 'Count': '174', 'Explode': 'N'}, {'Term': '"Homarus"[Organism]', 'Field': 'Organism', 'Count': '128126', 'Explode': 'Y'}, 'AND'], 'QueryTranslation': 'crustacyanin[All Fields] AND "Homarus"[Organism]'}


<_io.TextIOWrapper encoding='UTF-8'>

### searches taxon ID with a given name

In [9]:
handle = Entrez.esearch(db = "taxonomy", term = "Homarus americanus")
record = Entrez.read(handle)
print(record)
print(record['IdList'])

{'Count': '1', 'RetMax': '1', 'RetStart': '0', 'IdList': ['6706'], 'TranslationSet': [], 'TranslationStack': [{'Term': 'Homarus americanus[All Names]', 'Field': 'All Names', 'Count': '1', 'Explode': 'N'}, 'GROUP'], 'QueryTranslation': 'Homarus americanus[All Names]'}
['6706']


### 4. esearch+summary searches in database of proteins and nucl sequences with a name of a protein. Returns UID. actually its XML parcing.

In [20]:
handle = Entrez.esearch(db="protein", term="crustacyanin AND Homarus americanus[orgn]")
record = Entrez.read(handle)
for rec in record["IdList"]:
        temphandle = Entrez.read(Entrez.esummary(db="protein", id=rec, retmode="text"))
        print(temphandle[0]['Id']+"\t"+temphandle[0]['Caption']+"\t"+str(temphandle[0]['Length']))#+"\n")
##str(int(temphandle[0]['Length'])))

2068680993	XP_042236484	197
2068680990	XP_042236483	190
2068650119	XP_042225885	190
2068650116	XP_042225884	197
2068642615	XP_042227234	190
2068642613	XP_042227223	197
2068642611	XP_042227211	190
2068642608	XP_042227198	197
2068642605	XP_042227187	190
2068642602	XP_042227176	197
2056515232	KAG7177238	197
2056515231	KAG7177237	190
2056515230	KAG7177236	197
2056515229	KAG7177235	190
2056515228	KAG7177234	197
2056515227	KAG7177233	190
2056483091	KAG7166898	197
2056483090	KAG7166897	190
2056467580	KAG7160784	197
2056467579	KAG7160783	190


### 5. returns fasta and writes it in a file

In [21]:
handle = Entrez.esearch(db="protein", term="crustacyanin AND Homarus americanus[orgn]")
record = Entrez.read(handle)

Entrez.efetch(db="protein", id=record["IdList"], retmode="text", rettype="fasta").read()
with open("crcn.fasta", "w") as ouf:
    for rec in record["IdList"]:
        lne = Entrez.efetch(db="protein", id=rec, retmode="text", rettype="fasta").read()
        ouf.write(lne+"\n")
with open("crcn.fasta", "r") as fastaf:
    snippet = [next(fastaf) for x in range(5)]
    print(snippet)

['>XP_042236484.1 crustacyanin-C1 subunit-like [Homarus americanus]\n', 'MNSLSILLVFVASVAADKIPDFVVPGKCASVDRNKLWAEQTPNRNNYAGVWYQFALTNNPYQLIEKCVRN\n', 'EYSFDGEQFVITSTGIAYDGNLLKRNGKLYPNPFGEPHLSIDYENSFAAPLVILETDYSNYACLYSCIDY\n', 'NFGYHSDFSFIFSRSANLAEQYVKKCEAAFKNINVDTTRFVKTVQGSSCPYDTQKTL\n', '\n']


### 6. downloads a protein, takes a UID of a nucleotide

In [23]:
lhandle = Entrez.elink(dbfrom="nucleotide", db="protein", id="2065188392")
lrecord = Entrez.read(lhandle)
prothandle = lrecord[0]["LinkSetDb"][0]['Link'][0]['Id']
rrecord = Entrez.efetch(db="protein", id=prothandle, rettype="fasta", retmode="text")
with open ("prot_from_nt.fasta", "w") as ouf:
    ouf.write(rrecord.read()+"\n")

### 7. Downloads fasta sequences from a work with PMID (e.g. fron the first task)

In [24]:
lhandle = Entrez.elink(dbfrom="pubmed", db="nucleotide", id="20558169")
lrecord = Entrez.read(lhandle)
ids = []
for el in lrecord[0]["LinkSetDb"][0]["Link"]:
    ids.append(el['Id'])
rrecord = Entrez.efetch(db="nucleotide", id=ids[:4], rettype="fasta", retmode="text")
with open ("py_fasta_pmid.fasta", "w") as ouf:
    ouf.write(rrecord.read()+"\n")