## Blast.NCBIWWW

We can use the NCBIWWW module to run BLAST searches on the NCBI server.

Read more about it: http://biopython.org/DIST/docs/tutorial/Tutorial.html

In [1]:
from Bio.Blast import NCBIWWW
from Bio import Entrez
Entrez.email = "zihaladavid@gmail.com"  # Always tell NCBI who you are

# our sequence for blasting
# (adjacent string literals inside the parentheses are joined into one string,
# which avoids the fragile backslash continuation inside a quoted string)
seq = (
    'MKKPDLKVVLLGDMNVGKTSLLHRYMERRFQDTVSTVGGAFYLKQWGPYNISI'
    'WDTAGREQFHGLGSMYCRAASAVILTYDVSNMQSLLELEDRFLGLTDTASDDCIFAVVGNK'
    'IDLTEDYNSDSDIEGERPRTSSKIRRQVNLEDAFALYKRIMKYKMLDENVVPAAEKMCFET'
    'SAKTGYNVDVLFEGVFNMVIPLIVKKKASGLDETVNLAQSKPNKSKSRCCK'
)

# it is necessary to specify at least the type of blast search and the database;
# the with-statement closes the result handle once the XML has been saved
with NCBIWWW.qblast("blastp", "nr", sequence=seq) as handle, \
        open("my_blast.xml", "w") as res:
    res.write(handle.read())

In [None]:
# similar example as above, only with the sequence loaded from a file
import os

from Bio import SeqIO
from Bio import Entrez
from Bio.Blast import NCBIWWW
Entrez.email = "zihaladavid@gmail.com"  # Always tell NCBI who you are

infile = "mus_musculus_rab20.fasta"
record = SeqIO.read(infile, format="fasta")

# Strip only the final extension: splitting on the first '.' would truncate
# names such as "rab20.v2.fasta" or paths such as "./data/rab20.fasta".
# For "mus_musculus_rab20.fasta" the resulting name is unchanged.
outfile = os.path.splitext(infile)[0] + ".xml"

# the with-statement closes the result handle once the XML has been saved
with NCBIWWW.qblast("blastp", "refseq_protein", record.seq) as handle, \
        open(outfile, "w") as res:
    res.write(handle.read())

In [None]:
# easy way to blast all sequences from a file
from Bio import SeqIO
from Bio.Blast import NCBIWWW

def NCBI_blast(seq_record):
    """Run a remote blastp search against "nr" for one record and save the XML.

    Parameters
    ----------
    seq_record
        Object providing ``seq`` (the query sequence passed to qblast) and
        ``name`` (used as the prefix of the output file name).

    The raw result is written to ``<seq_record.name>_NCBI.xml``; nothing is
    returned.
    """
    # the with-statement closes the result handle even if writing the file fails
    with NCBIWWW.qblast("blastp", "nr", sequence=seq_record.seq) as net_handle:
        with open(seq_record.name + '_NCBI.xml', 'w') as res:
            res.write(net_handle.read())
        
# submit one remote BLAST search per record found in the FASTA file
fasta_records = SeqIO.parse('rab20.fasta', 'fasta')
for record in fasta_records:
    NCBI_blast(record)

## Blast.NCBIXML

One of the most important modules in Biopython. Thanks to NCBIXML we can easily work with BLAST results for one or more sequences (thousands, millions...).

In [1]:
# NCBIXML.read is for a file with results for one sequence;
# for more sequences use NCBIXML.parse
from Bio.Blast import NCBIXML

# we always need to open our file before parsing;
# the with-statement closes it again once the record has been read
with open('my_blast.xml') as blastout:
    blast_record = NCBIXML.read(blastout)

In [3]:
# show the built-in documentation of the parsed blast record object
help(blast_record)

Help on Blast in module Bio.Blast.Record object:

class Blast(Header, DatabaseReport, Parameters)
 |  Saves the results from a blast search.
 |  
 |  Members:
 |  descriptions        A list of Description objects.
 |  alignments          A list of Alignment objects.
 |  multiple_alignment  A MultipleAlignment object.
 |  + members inherited from base classes
 |  
 |  Method resolution order:
 |      Blast
 |      Header
 |      DatabaseReport
 |      Parameters
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from Header:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [2]:
# number of hits (entries in the alignments list of the blast record)
len(blast_record.alignments)

50

In [5]:
# available functions and attributes for one blast hit
dir(blast_record.alignments[0])

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'accession',
 'hit_def',
 'hit_id',
 'hsps',
 'length',
 'title']

In [4]:
# accession of every hit (note: this prints hit.accession, not hit.hit_id)
for hit in blast_record.alignments:
    print(hit.accession)

NP_001086022
OCA46054
NP_001017295
XP_012812505
NP_001087734
XP_018412509
OCT95381
OXB60592
XP_015706927
XP_021270904
PIO25014
XP_005488078
XP_416953
XP_003207895
XP_006851577
XP_005037529
XP_005009112
XP_006119758
XP_010573925
XP_002189536
XP_008494111
XP_015477424
XP_014113976
XP_021401254
OPJ66868
KQK78243
XP_005145302
XP_008634095
XP_014738728
XP_014799603
XP_024061847
XP_005238422
PKU41047
XP_013809485
XP_006260526
XP_019380939
XP_009897010
XP_017672819
XP_006017293
XP_003765785
XP_019385208
XP_004447077
XP_005303042
XP_007063971
ANG60437
XP_005317044
XP_003414027
XP_009554335
XP_007937102
XP_005441649


In [7]:
# hit description (hit_def) of the first hit
blast_record.alignments[0].hit_def

'RAB20, member RAS oncogene family L homeolog [Xenopus laevis] >gi|49119003|gb|AAH73712.1| MGC83654 protein [Xenopus laevis]'

In [8]:
# full hit name (title) of the first hit
blast_record.alignments[0].title

'gi|148232154|ref|NP_001086022.1| RAB20, member RAS oncogene family L homeolog [Xenopus laevis] >gi|49119003|gb|AAH73712.1| MGC83654 protein [Xenopus laevis]'

In [9]:
# length attribute of the first hit
# NOTE(review): presumably the length of the hit sequence itself rather than of
# the aligned region (each hsp has its own align_length attribute) - confirm
blast_record.alignments[0].length

226

In [10]:
# first hsp of the first hit
blast_record.alignments[0].hsps[0]

<Bio.Blast.Record.HSP at 0x7f36c004d5c0>

In [11]:
# available functions and attributes for one hsp
dir(blast_record.alignments[0].hsps[0])

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'align_length',
 'bits',
 'expect',
 'frame',
 'gaps',
 'identities',
 'match',
 'num_alignments',
 'positives',
 'query',
 'query_end',
 'query_start',
 'sbjct',
 'sbjct_end',
 'sbjct_start',
 'score',
 'strand']

In [12]:
# evalue (expect attribute) of the first hsp of the first hit
blast_record.alignments[0].hsps[0].expect

1.50693e-168

In [13]:
# frames (frame attribute) of the first hsp of the first hit
blast_record.alignments[0].hsps[0].frame

(0, 0)

In [14]:
# query start of the first hsp of the first hit
blast_record.alignments[0].hsps[0].query_start

1

In [15]:
# hit (subject) start of the first hsp of the first hit
blast_record.alignments[0].hsps[0].sbjct_start

1

In [16]:
# query end of the first hsp of the first hit
blast_record.alignments[0].hsps[0].query_end

226

In [17]:
# hit (subject) end of the first hsp of the first hit
blast_record.alignments[0].hsps[0].sbjct_end

226

In [18]:
# print accession for all blast hits
from Bio.Blast import NCBIXML

# the with-statement closes the file once the record has been parsed
with open('my_blast.xml') as blastout:
    blast_record = NCBIXML.read(blastout)

for hit in blast_record.alignments:
    print(hit.accession)

NP_001086022
OCA46054
NP_001017295
XP_012812505
NP_001087734
XP_018412509
OCT95381
OXB60592
XP_015706927
XP_021270904
PIO25014
XP_005488078
XP_416953
XP_003207895
XP_006851577
XP_005037529
XP_005009112
XP_006119758
XP_010573925
XP_002189536
XP_008494111
XP_015477424
XP_014113976
XP_021401254
OPJ66868
KQK78243
XP_005145302
XP_008634095
XP_014738728
XP_014799603
XP_024061847
XP_005238422
PKU41047
XP_013809485
XP_006260526
XP_019380939
XP_009897010
XP_017672819
XP_006017293
XP_003765785
XP_019385208
XP_004447077
XP_005303042
XP_007063971
ANG60437
XP_005317044
XP_003414027
XP_009554335
XP_007937102
XP_005441649


In [19]:
# print accession and evalue for every hsp of every hit, in the format:
# 'id: <accession>' then 'evalue:<evalue>', followed by a blank line
from Bio.Blast import NCBIXML

# the with-statement closes the file once the record has been parsed
with open('my_blast.xml') as blastout:
    blast_record = NCBIXML.read(blastout)

for aln in blast_record.alignments:
    for hsp in aln.hsps:
        print('id: {}\nevalue:{}\n'.format(aln.accession, hsp.expect))

id: NP_001086022
evalue:1.50693e-168

id: OCA46054
evalue:5.54271e-158

id: NP_001017295
evalue:6.71443e-158

id: XP_012812505
evalue:6.94334e-157

id: NP_001087734
evalue:9.68184e-155

id: XP_018412509
evalue:1.34786e-141

id: OCT95381
evalue:3.88831e-122

id: OXB60592
evalue:6.78537e-122

id: XP_015706927
evalue:7.01215e-122

id: XP_021270904
evalue:7.57121e-122

id: PIO25014
evalue:1.27013e-121

id: XP_005488078
evalue:5.50246e-121

id: XP_416953
evalue:7.07967e-121

id: XP_003207895
evalue:2.28673e-120

id: XP_006851577
evalue:2.77902e-120

id: XP_005037529
evalue:8.15145e-120

id: XP_005009112
evalue:1.08383e-119

id: XP_006119758
evalue:1.27482e-119

id: XP_010573925
evalue:2.60407e-119

id: XP_002189536
evalue:3.69767e-119

id: XP_008494111
evalue:4.12586e-119

id: XP_015477424
evalue:4.80988e-119

id: XP_014113976
evalue:5.92306e-119

id: XP_021401254
evalue:7.70461e-119

id: OPJ66868
evalue:1.08209e-118

id: KQK78243
evalue:1.08209e-118

id: XP_005145302
evalue:1.22069e-118

i