# ACCESS BIOINFORMATICS DATABASES WITH BIO-PYTHON

1. [NCBI](#1.-NCBI)<br>
    1.1. [Nucleotide BLAST](#1.1.-Nucleotide-BLAST)<br>
    1.2. [Protein BLAST](#1.2.-Protein-BLAST)
    
2. [ENTREZ](#2.-ENTREZ)<br>
    2.1. [PUBMED](#2.1.-PUBMED)<br>
    2.2. [Nucleotide](#2.2.-Nucleotide)
    
3. [PDB](#3.-PDB)

4. [EXPASY](#4.-EXPASY)<br>
    4.1. [PROSITE](#4.1.-PROSITE)<br>
    4.2. [ScanProsite](#4.2.-ScanProsite)
    
5. [KEGG](#5.-KEGG)

# 1. NCBI

### Import Modules

In [2]:
from Bio.Blast import NCBIWWW
from Bio import SeqIO, SearchIO

In [5]:
#help(NCBIWWW)

## 1.1. Nucleotide BLAST

In [4]:
!ls

notebook.ipynb	nuc_seq.fasta  prot_seq.fasta


In [7]:
nuc_record = SeqIO.read('nuc_seq.fasta', format = 'fasta')
len(nuc_record)

774

In [8]:
nuc_record.description

'MT598137.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/IRN/PN-2142-S/2020 surface glycoprotein (S) gene, partial cds'

In [9]:
nuc_record.seq

Seq('ATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAAATTACCAGAT...GGT')

In [10]:
result_handle = NCBIWWW.qblast('blastn', 'nt', nuc_record.seq)
blast_result = SearchIO.read(result_handle, 'blast-xml')

In [11]:
print(blast_result[0:2])

Program: blastn (2.16.1+)
  Query: No (774)
         definition line
 Target: core_nt
   Hits: ----  -----  ----------------------------------------------------------
            #  # HSP  ID + description
         ----  -----  ----------------------------------------------------------
            0      1  gi|2522133207|emb|OX682734.1|  Severe acute respiratory...
            1      1  gi|2555084375|emb|OY596573.1|  Severe acute respiratory...


In [12]:
Seq = blast_result[0]
print(f"Sequence ID: {Seq.id}")
print(f"Sequence Description: {Seq.description}")

details = Seq[0]
print(f"E-value: {details.evalue}")

Sequence ID: gi|2522133207|emb|OX682734.1|
Sequence Description: Severe acute respiratory syndrome coronavirus 2 genome assembly, complete genome: monopartite
E-value: 0.0


In [13]:
print(f"alignment:\n{details.aln}")

alignment:
Alignment with 2 rows and 774 columns
ATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAA...GGT No
ATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAA...GGT gi|2522133207|emb|OX682734.1|


## 1.2. Protein BLAST

In [14]:
prot_record = SeqIO.read("prot_seq.fasta", format="fasta")
len(prot_record)

258

In [16]:
result_handle = NCBIWWW.qblast("blastp", "pdb", prot_record.seq)
blast_result = SearchIO.read(result_handle, "blast-xml")

In [17]:
print(blast_result[0:2])

Program: blastp (2.16.1+)
  Query: unnamed (258)
         protein product
 Target: pdb
   Hits: ----  -----  ----------------------------------------------------------
            #  # HSP  ID + description
         ----  -----  ----------------------------------------------------------
            0      1  pdb|8ELJ|A  Chain A, Spike glycoprotein [Severe acute r...
            1      1  pdb|7CAB|A  Chain A, Spike glycoprotein [Severe acute r...


In [18]:
Seq = blast_result [0]
print(f"Sequence ID: {Seq.id}")
print(f"Sequence Description: {Seq.description}")

details = Seq[0]
print(f"E-value: {details.evalue}")

Sequence ID: pdb|8ELJ|A
Sequence Description: Chain A, Spike glycoprotein [Severe acute respiratory syndrome coronavirus 2]
E-value: 0.0


In [19]:
print(f"alignment:\n {details.aln}")

alignment:
 Alignment with 2 rows and 258 columns
IAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLY...PIG unnamed
IAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLY...PIG pdb|8ELJ|A


------------------------------------------------------

# 2. ENTREZ

### Import Modules

In [20]:
from Bio import Entrez

In [22]:
# help(Entrez)

In [23]:
Entrez.email = "saglam.chd@gmail.com"

In [24]:
handle = Entrez.einfo()
record = Entrez.read(handle)
record['DbList']

['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'medgen', 'mesh', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'proteinclusters', 'pcassay', 'protfam', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']

## 2.1. PUBMED

In [25]:
handle = Entrez.einfo(db='pubmed')
record = Entrez.read(handle)

record['DbInfo']['Description']

'PubMed bibliographic record'

In [26]:
record['DbInfo']['Count']

'38407359'

In [27]:
handle = Entrez.esearch(db="pubmed", term="biopython")
record = Entrez.read(handle)

record['IdList']

['39883659', '39882099', '39717221', '39546778', '39507944', '39445816', '38808697', '38650605', '38365590', '38235175', '37810457', '37668712', '36818783', '36245797', '36094101', '35497637', '35496474', '35402671', '34735950', '34484417']

In [30]:
handle = Entrez.esummary(db="pubmed", id='39883659, 39882099')
records = Entrez.parse(handle)


for record in records:
    print(record['AuthorList'], 
          record['Title'], 
          record['PubDate'], 
          record['FullJournalName']
         )

['Ghorbani A', 'Rostami M', 'Ashrafi-Dehkordi E', 'Guzzi PH'] AutoPVPrimer: A comprehensive AI-Enhanced pipeline for efficient plant virus primer design and assessment. 2025 PloS one
['Wang W', 'Wong ER', 'Cato ML', 'Daly PC', 'Doody CG', 'Gillespie T', 'Ksor CN', 'Zainab A', 'Zainab Z', 'Crook M'] Quick Oil Red O, Paint3D and Biopython as an economical tool to measure total lipid levels in Caenorhabditis elegans. 2025 microPublication biology


In [35]:
handle = Entrez.efetch(db="pubmed", id="34484417")

print(handle.read())

b'<?xml version="1.0" ?>\n<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2025//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_250101.dtd">\n<PubmedArticleSet>\n<PubmedArticle><MedlineCitation Status="MEDLINE" Owner="NLM" IndexingMethod="Manual"><PMID Version="1">34484417</PMID><DateCompleted><Year>2021</Year><Month>12</Month><Day>06</Day></DateCompleted><DateRevised><Year>2022</Year><Month>04</Month><Day>26</Day></DateRevised><Article PubModel="Electronic-eCollection"><Journal><ISSN IssnType="Electronic">1748-6718</ISSN><JournalIssue CitedMedium="Internet"><Volume>2021</Volume><PubDate><Year>2021</Year></PubDate></JournalIssue><Title>Computational and mathematical methods in medicine</Title><ISOAbbreviation>Comput Math Methods Med</ISOAbbreviation></Journal><ArticleTitle>Identification of Immune-Related Genes in Sepsis due to Community-Acquired Pneumonia.</ArticleTitle><Pagination><StartPage>8020067</StartPage><MedlinePgn>8020067</MedlinePgn></Pagination

## 2.2. Nucleotide

In [36]:
handle = Entrez.esearch(db="nucleotide", retmax=10, term="Severe acute respiratory syndrome")
record = Entrez.read(handle)
record["IdList"]

['2912461298', '2912461297', '2912461296', '2912461295', '2912461294', '2912461293', '2912461292', '2912461291', '2912461290', '2912461289']

In [38]:
handle = Entrez.efetch(db='nucleotide', 
                       id="2912461298", 
                       rettype="gb", 
                       retmode="text"
                      )

print(handle.read())

LOCUS       OZ223683               29831 bp    DNA     linear   VRL 15-FEB-2025
DEFINITION  Severe acute respiratory syndrome coronavirus 2 isolate RNA genome
            assembly, complete genome: monopartite.
ACCESSION   OZ223683
VERSION     OZ223683.1
DBLINK      BioProject: PRJEB45305
            BioSample: SAMEA117690962
KEYWORDS    .
SOURCE      Severe acute respiratory syndrome coronavirus 2
  ORGANISM  Severe acute respiratory syndrome coronavirus 2
            Viruses; Riboviria; Orthornavirae; Pisuviricota; Pisoniviricetes;
            Nidovirales; Cornidovirineae; Coronaviridae; Orthocoronavirinae;
            Betacoronavirus; Sarbecovirus; Severe acute respiratory
            syndrome-related coronavirus.
REFERENCE   1
  AUTHORS   Sladecek,T.
  CONSRTM   Science Park
  TITLE     Direct Submission
  JOURNAL   Submitted (13-FEB-2025) Science Park of Comenius University of
            Bratislava, Ilkovicova 8, 841 04 Bratislava, Slovakia
FEATURES             Location/Qualifier

In [39]:
handle = Entrez.esearch(db='nucleotide', term='accD[Gene Name] AND "E. coli"[Organism]', retmax="20")
result_list = Entrez.read(handle)

In [40]:
id_list = result_list['IdList']
count = result_list['Count']

print(id_list)
print("\n")
print(count)

['2912461494', '2912461442', '2912461441', '2912461440', '2912461246', '2912461226', '2912461220', '2912461216', '2912461209', '2912461192', '2912461137', '2912461091', '2912461077', '2912461076', '2912461073', '2912461067', '2912461036', '2912461033', '2912461031', '2912461030']


315469


In [41]:
handle.close()

------------------------------------------------------

# 3. PDB

### Import Modules

In [42]:
from Bio.PDB import PDBParser,PDBList

In [47]:
#help(PDBList)

In [48]:
pdbl=PDBList()
pdbl.retrieve_pdb_file('7BYR', file_format='pdb', pdir='dir')

Downloading PDB structure '7byr'...


'dir/pdb7byr.ent'

In [49]:
parser = PDBParser()
structure = parser.get_structure('7BYR', 'dir/pdb7byr.ent')

In [50]:
for chain in structure[0]:
    print(f"chainid: {chain.id}")

chainid: A
chainid: B
chainid: C
chainid: H
chainid: L
chainid: D
chainid: E
chainid: F
chainid: G
chainid: I
chainid: J


In [51]:
resolution = structure.header["resolution"]
resolution

3.84

In [52]:
keywords = structure.header["keywords"]
keywords

'sars-cov-2, antigen, rbd, neutralizing antibody, viral protein'

------------------------------------------------------

# 4. EXPASY

## 4.1. PROSITE

### Import Modules

In [53]:
from Bio import ExPASy
from Bio.ExPASy import Prosite

In [55]:
#help(Prosite)

In [56]:
handle = ExPASy.get_prosite_raw('PS51442')
record = Prosite.read(handle)

In [57]:
print(record.description)

Coronavirus main protease (M-pro) domain profile.


In [66]:
print(record.pdb_structs[:10])

[]


In [61]:
handle = ExPASy.get_prosite_raw('PS00001')
record = Prosite.read(handle)
print(record.pattern)

N-{P}-[ST]-{P}.


## 4.2. ScanProsite

### Import Modules

In [67]:
from Bio.ExPASy import ScanProsite

In [68]:
prot_record = SeqIO.read("prot_seq.fasta", format="fasta")
len(prot_record.seq)

258

In [69]:
handle = ScanProsite.scan(seq=prot_record.seq, mirror="https://prosite.expasy.org/")
result = ScanProsite.read(handle)

In [70]:
result.n_match

1

In [71]:
result[0]

{'sequence_ac': 'USERSEQ1',
 'start': 1,
 'stop': 118,
 'signature_ac': 'PS51921',
 'score': '32.871',
 'level': '0'}

------------------------------------------------------

# 5. KEGG

### Import Modules

In [72]:
from Bio.KEGG import REST, Enzyme

In [74]:
#help(Enzyme)

In [77]:
request = REST.kegg_get("ec:5.4.2.2")
open("ec_5.4.2.2.txt","w").write(request.read())

309567

In [79]:
records = Enzyme.parse(open("ec_5.4.2.2.txt"))
record = list(records)[0]

record.classname

['Isomerases;',
 'Intramolecular transferases;',
 'Phosphotransferases (phosphomutases)']

In [80]:
record.pathway

[('PATH', 'ec00010', 'Glycolysis / Gluconeogenesis'),
 ('PATH', 'ec00030', 'Pentose phosphate pathway'),
 ('PATH', 'ec00052', 'Galactose metabolism'),
 ('PATH', 'ec00230', 'Purine metabolism'),
 ('PATH', 'ec00500', 'Starch and sucrose metabolism'),
 ('PATH', 'ec00520', 'Amino sugar and nucleotide sugar metabolism'),
 ('PATH', 'ec00521', 'Streptomycin biosynthesis'),
 ('PATH', 'ec01100', 'Metabolic pathways'),
 ('PATH', 'ec01110', 'Biosynthesis of secondary metabolites'),
 ('PATH', 'ec01120', 'Microbial metabolism in diverse environments')]

In [81]:
record.genes[:10]

[('HSA', ['5236', '55276']),
 ('PTR', ['456908', '461162']),
 ('PPS', ['100977295', '100993927']),
 ('GGO', ['101128874', '101131551']),
 ('PON', ['100190836', '100438793']),
 ('PPYG', ['129034752', '129035286']),
 ('NLE', ['100596081', '100600656']),
 ('HMH', ['116456694', '116457795']),
 ('SSYN', ['129458637', '129464875']),
 ('MCC', ['100424648', '699401'])]

In [82]:
list_genes = []

for x,y in record.genes:
    list_genes += x.split("\n")
    
print(list_genes[:10])

['HSA', 'PTR', 'PPS', 'GGO', 'PON', 'PPYG', 'NLE', 'HMH', 'SSYN', 'MCC']


------------------------------------------------------