# Introduction to the bioinformatics armory




## Technical setup

In [9]:
## Behind a firewall, the ExPASy module fails with the error "SSL: CERTIFICATE_VERIFY_FAILED".
## The code below works around that error by switching off HTTPS certificate verification.
## source: http://blog.pengyifan.com/how-to-fix-python-ssl-certificate_verify_failed/
import os, ssl
# SECURITY NOTE: this replaces the ssl module's default HTTPS context factory
# with the unverified one, so certificates are no longer checked by code that
# relies on that default. Setting PYTHONHTTPSVERIFY to a non-empty value
# leaves the default untouched.
if not os.environ.get('PYTHONHTTPSVERIFY', ''):
    # Only override when this Python build actually offers the unverified factory.
    if getattr(ssl, '_create_unverified_context', None):
        ssl._create_default_https_context = ssl._create_unverified_context

## Question ini

Count the nucleotides (A, C, G and T) in a DNA sequence with *Bio.Seq* from Biopython.

In [2]:
from Bio.Seq import Seq

def countNt(str):
    """Count the symbols A, C, G and T in a DNA string.

    The text is wrapped in a Biopython ``Seq`` and each symbol is counted
    with ``Seq.count``. Matching is case-sensitive: only upper-case letters
    are counted.

    Args:
        str: DNA sequence text. (The name shadows the built-in ``str`` inside
            this function only; it is kept for compatibility with callers.)

    Returns:
        A list of four counts in the order A, C, G, T.
    """
    sequence = Seq(str)
    return [sequence.count(base) for base in 'ACGT']

def printCountNt(s):
    """Print the A, C, G and T counts of ``s`` on one line, separated by spaces.

    Args:
        s: DNA sequence text, passed unchanged to ``countNt``.
    """
    counts = countNt(s)
    # print() converts each count with str() and joins them with single spaces.
    print(*counts)
    
# Sample dataset from the problem statement.
myStr = "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC"

# A bare countNt(myStr) call in the middle of a cell displays nothing and its
# result is thrown away, so only the printing helper is invoked here.
printCountNt(myStr)

20 12 17 21


In [3]:
# Only the first line of the dataset file is read. It is handed to
# printCountNt as-is, including any trailing newline.
with open('rosalind_ini.txt') as f:
    line = f.readline()

printCountNt(line)

225 231 220 239


## Question dbpr: protein database

In [4]:
from Bio import ExPASy
from Bio import SwissProt

# Fetch the raw Swiss-Prot entry of one protein (looked up by its ID) and parse it.
handle = ExPASy.get_sprot_raw('B5ZC00')
try:
    record = SwissProt.read(handle)
finally:
    # Release the handle as soon as the entry has been parsed.
    handle.close()

# Tip: dir(record) lists the attributes of the parsed entry when exploring
# interactively. The first cross-reference is displayed as the cell output.
record.cross_references[0]

('EMBL', 'CP001184', 'ACI60310.1', '-', 'Genomic_DNA')

In [27]:
def getBP(proteinId):
    """Return the GO biological-process terms annotated on a Swiss-Prot entry.

    The raw entry is downloaded through ``ExPASy.get_sprot_raw`` and parsed
    with ``SwissProt.read``. Every cross-reference whose database field is
    ``'GO'`` and whose third field starts with ``'P:'`` is kept, with that
    two-character prefix removed.

    Args:
        proteinId: Protein identifier understood by ``ExPASy.get_sprot_raw``,
            e.g. ``'Q5SLP9'``.

    Returns:
        A list of term names (strings) in the order they appear in the entry;
        empty when the entry has no matching cross-references.
    """
    handle = ExPASy.get_sprot_raw(proteinId)
    try:
        entry = SwissProt.read(handle)
    finally:
        # Always release the handle, even if parsing fails.
        handle.close()
    return [
        cr[2][2:]
        for cr in entry.cross_references
        # The length check skips cross-references that have no third field.
        if len(cr) > 2 and cr[0] == 'GO' and cr[2].startswith('P:')
    ]

# Sample run: list the terms for Q5SLP9, one per line.
q5slp9Res = getBP('Q5SLP9')
print(*q5slp9Res, sep='\n')

DNA recombination
DNA repair
DNA replication


In [42]:
## The protein ID is the first line of the dataset file; rstrip('\n') removes
## the trailing newline (f.readline().splitlines()[0] would work as well).
with open('rosalind_dbpr.txt') as f:
    proid = f.readline().rstrip('\n')

print(*getBP(proid), sep='\n')

regulation of blood pressure
vasodilation


## Problem gbk

Count the nucleotide GenBank entries for the given genus that were published between the specified dates.


In [58]:
from Bio import Entrez

def gbkEntries(organism, minDate, maxDate):
    """Count GenBank nucleotide entries of an organism published in a date range.

    Runs an Entrez ``esearch`` against the ``nucleotide`` database, combining
    an ``[Organism]`` filter with a publication-date (``[PDAT]``) range.

    Args:
        organism: Organism (e.g. genus) name, such as ``"Anthoxanthum"``.
        minDate: Start of the range, written like ``"2003/07/25"``.
        maxDate: End of the range, in the same format.

    Returns:
        The ``"Count"`` field of the search result. Entrez reports it as a
        string (e.g. ``'7'``), and it is returned unchanged.

    Side effects:
        Sets ``Entrez.email`` for the whole session.
    """
    Entrez.email = 'jitao_david.zhang@roche.com'
    # Range syntax is ("from"[PDAT] : "to"[PDAT]); no square brackets belong
    # around the range expression itself.
    term = '"{organism}"[Organism] AND ("{mindate}"[PDAT] : "{maxdate}"[PDAT])'.format(
        organism=organism, mindate=minDate, maxdate=maxDate)
    handle = Entrez.esearch(db="nucleotide", term=term)
    try:
        record = Entrez.read(handle)
    finally:
        # Always release the handle, even if parsing fails.
        handle.close()
    return record["Count"]

# Sample run from the problem statement.
gbkEntries(organism="Anthoxanthum", minDate="2003/07/25", maxDate="2005/12/27")

'7'

In [63]:
# The dataset file holds three lines: genus, start date, end date.
with open('rosalind_gbk.txt') as f:
    species, startdate, enddate = (f.readline().rstrip('\n') for _ in range(3))

print(species, startdate, enddate)
print(gbkEntries(species, startdate, enddate))

Prorocentrum 2002/08/10 2003/12/09
103


## Problem frmt: data formats

In [19]:
## programming access to GenBank
from Bio import Entrez
from Bio import SeqIO
Entrez.email = "jitao_david.zhang@roche.com"
# Each accession is its own list element, so the request carries a clean
# comma-separated ID list instead of one pre-joined string.
handle = Entrez.efetch(db="nucleotide", id=["FJ817486", "JX069768", "JX469983"], rettype="fasta")
try:
    records = list(SeqIO.parse(handle, "fasta"))
finally:
    # Release the handle once all records have been read.
    handle.close()
print(records[0].id)
print(dir(records[0]))
print(len(records[-1].seq)) ## -1 refers to the last record

FJ817486.1
['__add__', '__bool__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__iter__', '__le__', '__le___', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_per_letter_annotations', '_seq', '_set_per_letter_annotations', '_set_seq', 'annotations', 'dbxrefs', 'description', 'features', 'format', 'id', 'letter_annotations', 'lower', 'name', 'reverse_complement', 'seq', 'upper']
771


In [1]:
def shortestStrings(ids):
    """Fetch GenBank nucleotide records and print the shortest one as FASTA.

    Args:
        ids: GenBank accessions, either as one string separated by whitespace
            and/or commas (e.g. ``'FJ817486 JX069768 JX469983'``) or as an
            iterable of individual accession strings.

    Prints:
        A ``>description`` header line followed by the sequence of the record
        with the fewest residues. On a tie, the first such record in the order
        returned by Entrez is printed.

    Raises:
        ValueError: If no IDs are given or no records are returned.

    Side effects:
        Sets ``Entrez.email`` for the whole session.
    """
    from Bio import Entrez
    from Bio import SeqIO
    Entrez.email = "jitao_david.zhang@roche.com"

    # Normalise the input to a list of single accessions, so the request
    # carries a proper ID list instead of one space-separated string.
    if isinstance(ids, str):
        id_list = ids.replace(',', ' ').split()
    else:
        id_list = [str(one_id).strip() for one_id in ids]
    if not id_list:
        raise ValueError("no GenBank IDs given")

    handle = Entrez.efetch(db="nucleotide", id=id_list, rettype="fasta")
    try:
        records = list(SeqIO.parse(handle, "fasta"))
    finally:
        # Always release the handle, even if parsing fails.
        handle.close()
    if not records:
        raise ValueError("no records returned for IDs: {}".format(' '.join(id_list)))

    # min() returns the first record of minimal length, so no numpy is needed.
    shortest = min(records, key=lambda rec: len(rec.seq))
    print(">{desc}\n{seq}".format(desc=shortest.description,
                                  seq=shortest.seq))

    
# Sample run from the problem statement.
shortestStrings(ids='FJ817486 JX069768 JX469983')

>JX469983.1 Zea mays subsp. mays clone UT3343 G2-like transcription factor mRNA, partial cds
ATGATGTATCATGCGAAGAATTTTTCTGTGCCCTTTGCTCCGCAGAGGGCACAGGATAATGAGCATGCAAGTAATATTGGAGGTATTGGTGGACCCAACATAAGCAACCCTGCTAATCCTGTAGGAAGTGGGAAACAACGGCTACGGTGGACATCGGATCTTCATAATCGCTTTGTGGATGCCATCGCCCAGCTTGGTGGACCAGACAGAGCTACACCTAAAGGGGTTCTCACTGTGATGGGTGTACCAGGGATCACAATTTATCATGTGAAGAGCCATCTGCAGAAGTATCGCCTTGCAAAGTATATACCCGACTCTCCTGCTGAAGGTTCCAAGGACGAAAAGAAAGATTCGAGTGATTCCCTCTCGAACACGGATTCGGCACCAGGATTGCAAATCAATGAGGCACTAAAGATGCAAATGGAGGTTCAGAAGCGACTACATGAGCAACTCGAGGTTCAAAGACAACTGCAACTAAGAATTGAAGCACAAGGAAGATACTTGCAGATGATCATTGAGGAGCAACAAAAGCTTGGTGGATCAATTAAGGCTTCTGAGGATCAGAAGCTTTCTGATTCACCTCCAAGCTTAGATGACTACCCAGAGAGCATGCAACCTTCTCCCAAGAAACCAAGGATAGACGCATTATCACCAGATTCAGAGCGCGATACAACACAACCTGAATTCGAATCCCATTTGATCGGTCCGTGGGATCACGGCATTGCATTCCCAGTGGAGGAGTTCAAAGCAGGCCCTGCTATGAGCAAGTCA


In [2]:
# The dataset file lists all accessions on its first line.
with open('rosalind_frmt.txt') as f:
    frmt_ids = f.readline().rstrip('\n')

print(frmt_ids)
shortestStrings(frmt_ids)

NM_001168970 JX469985 JX205496 JF927165 JX308821 NM_001251956 JX469983 JX428803 JQ011276 NM_001266228
>NM_001168970.1 Papio anubis huntingtin interacting protein K (HYPK), mRNA
ATGCGGCGGCGTGGTGAGATAGATATGGCGACTGAGGGGGATGTGGAGCTGGAGTTAGAGACTGAGACCAGTGGACCAGAGCGGCCTCCCGAGAAGCCACGGAAGCATGACAGCGGTGCCGCGGACTTGGAGCGGGTCACCGACTATGCGGAGGAGAAGGAGATCCAGAGTTCCAATCTGGAGACGGCCATGTCTGTGATTGGAGACAGAAGGTCCCGGGAGCAGAAAGCCAAACAGGAGCGGGAGAAAGAACTGGCAAAAGTCACTATCAAGAAGGAAGATCTGGAGCTGATAATGACCGAGATGGAGATATCTCGAGCAGCAGCAGAACGCAGCTTGCGGGAACACATGGGCAACGTGGTAGAGGCGCTTATTGCCCTAACCAACTGA
