In [3]:
# import Biopythin functions
from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
from Bio import Entrez

# Loading and viewing sequences

In [4]:
# load the gene sequence
gene_id1 = 'NM_001012505.1' # human Foxp1
gene_id1 = 'NM_005522.4' # human HOXA1

Entrez.email = 'A.N.Other@example.com'

# read the first sequence
handle = Entrez.efetch(db="nucleotide", id=gene_id1, rettype="gb", retmode="text")
gene = SeqIO.read(handle, "genbank")
handle.close()

print(gene.description)

Homo sapiens homeobox A1 (HOXA1), transcript variant 1, mRNA


In [5]:
# this is the whole transcript
# location of the coding sequence is here
for f in gene.features:
    if f.type=='CDS':
        print('Coding sequence at:',f.location)
        cds_loc = f.location


('Coding sequence at:', FeatureLocation(ExactPosition(94), ExactPosition(1102), strand=1))


In [6]:
print(gene.seq[cds_loc.start:cds_loc.start+100])
letters="CTCAAGCGTGAGGCCGAGACCCTACGGGAGCGGGAAGGC"

ATGGACAATGCAAGAATGAACTCCTTCCTGGAATACCCCATACTTAGCAGTGGCGACTCGGGGACCTGCTCAGCCCGAGCCTACCCCTCGGACCATAGGA


# Run and parse a BLAST

In [7]:
# run a BLAST on a 100 BP part of the transcript, starting at the start codon

# for available databases, see:
# ftp://ftp.ncbi.nlm.nih.gov/pub/factsheets/HowTo_BLASTGuide.pdf

database = 'refseq_genomic' #refseq_rna' #'refseq_genomic'
#result_handle = NCBIWWW.qblast("blastn", database, gene.seq[cds_loc.start:cds_loc.start+100])
result_handle = NCBIWWW.qblast("blastn", database, letters)


In [8]:
# Parse the retuned structure
blast_records = NCBIXML.parse(result_handle)

In [9]:
# take the first record (we only did one search, so there is only one)
item = next(blast_records)
print("a")


a


In [10]:
E_VALUE_THRESH = 1#0.05

for alignment in item.alignments:
    for hsp in alignment.hsps:
        if hsp.expect < E_VALUE_THRESH:
            print('****************************')
            print('sequence:', alignment.title)
            print('length:', alignment.length)
            print('e value:', hsp.expect)
            print(hsp.query[0:75] + '...')
            print(hsp.match[0:75] + '...')
            print(hsp.sbjct[0:75] + '...')

****************************
('sequence:', u'gi|240958148|ref|NW_002890726.1| Pongo abelii isolate ISIS 71 chromosome 20 genomic scaffold, P_pygmaeus_2.0.2')
('length:', 616788)
('e value:', 1.57037e-09)
CTCAAGCGTGAGGCCGAGACCCTACGGGAGCGGGAAGGC...
|||||||||||||||||||||||||||||||||||||||...
CTCAAGCGTGAGGCCGAGACCCTACGGGAGCGGGAAGGC...
****************************
('sequence:', u'gi|1215891260|ref|NW_018757324.1| Papio anubis isolate 1X1155 chromosome 10 genomic scaffold, Panu_3.0, whole genome shotgun sequence')
('length:', 362516)
('e value:', 1.57037e-09)
CTCAAGCGTGAGGCCGAGACCCTACGGGAGCGGGAAGGC...
|||||||||||||||||||||||||||||||||||||||...
CTCAAGCGTGAGGCCGAGACCCTACGGGAGCGGGAAGGC...
****************************
('sequence:', u'gi|289736363|ref|NW_003185857.1| Callithrix jacchus chromosome 5 unlocalized genomic scaffold, Callithrix jacchus-3.2 CJA5_random_071, whole genome shotgun sequence')
('length:', 137040)
('e value:', 1.57037e-09)
CTCAAGCGTGAGGCCGAGACCCTACGGGAGCGGGAAGGC...
||||||||||