public
Description: repository for the code featured in the blog
Homepage: http://python.genedrift.org
Clone URL: git://github.com/nuin/beginning-python-for-bioinformatics.git
Search Repo:
new scripts added with extra comments
Paulo Nuin (author)
Fri May 09 10:57:48 -0700 2008
commit  bff13f0a739d8c120b8b789154a28a6e0bd3fe85
tree    ff0396d935751ea9a0c1d8a339a848ac0c2310e3
parent  f3a50a14796a2e7127a94e5107518f3340e05b40
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
0
@@ -1 +1,46 @@
0
+#!/usr/bin/env python
0
+
0
+'''
0
+simple script to find motifs on DNA sequences using regex
0
+the script is interactive
0
+'''
0
+
0
+# we use the RegEx module
0
+import re
0
+import string
0
+
0
+#still keep the file fixed
0
+dnafile = "AY162388.seq"
0
+
0
+#opening the file, reading the sequence and storing in a list
0
+seqlist = open(dnafile, 'r').readlines()
0
+
0
+#let's join the the lines in a temporary string
0
+temp = ''.join(seqlist)
0
+
0
+#assigning our sequence, with no carriage returns to our
0
+#final variable/object
0
+sequence = temp.replace('\n', '')
0
+
0
+#we start to deal with user input
0
+#first we use a boolean variable to check for valid input
0
+inputfromuser = True
0
+
0
+#while loop: while there is an motif larger than 0
0
+#the loop continues
0
+while inputfromuser:
0
+ #raw_input received the user input as string
0
+ inmotif = raw_input('Enter motif to search: ')
0
+ #now we check for the size of the input
0
+ if len(inmotif) >= 1:
0
+ #we compile a regex with the input given
0
+ motif = re.compile('%s' % inmotif)
0
+ #looking to see if the entered motif is in the sequence
0
+ if re.search(motif, sequence):
0
+ print 'Yep, I found it'
0
+ else:
0
+ print 'Sorry, try another one'
0
+ else:
0
+ print 'Done, thanks for using motif_search'
0
+ inputfromuser = False
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
0
@@ -1 +1,65 @@
0
+#!/usr/bin/env python
0
+
0
+'''
0
+another script to find motifs on DNA sequences with more features than
0
+code_05.py
0
+'''
0
+
0
+# we use the RegEx module
0
+import re
0
+import string
0
+#we also import the sys module
0
+import sys
0
+
0
+#set the variable to control the loop
0
+fileinput = True
0
+while fileinput == True:
0
+ #ask user for the input
0
+ filename = raw_input('Enter file name:')
0
+ if len(filename) > 0:
0
+ #we try to open the file
0
+ try:
0
+ dnafile = open(filename, 'r')
0
+ #success! we finish the loop and move to the next input
0
+ fileinput = False
0
+ except:
0
+ #no dice, file does not exist
0
+ #keep the loop on and ask again
0
+ print 'File does not exist'
0
+ else:
0
+# fileinput = False
0
+ sys.exit()
0
+
0
+
0
+#opening the file, reading the sequence and storing in a list
0
+seqlist = open(filename, 'r').readlines()
0
+
0
+#let's join the the lines in a temporary string
0
+temp = ''.join(seqlist)
0
+
0
+#assigning our sequence, with no carriage returns to our
0
+#final variable/object
0
+sequence = temp.replace('\n', '')
0
+
0
+#we start to deal with user input
0
+#first we use a boolean variable to check for valid input
0
+inputfromuser = True
0
+
0
+#while loop: while there is an motif larger than 0
0
+#the loop continues
0
+while inputfromuser:
0
+ #raw_input received the user input as string
0
+ inmotif = raw_input('Enter motif to search: ')
0
+ #now we check for the size of the input
0
+ if len(inmotif) >= 1:
0
+ #we compile a regex with the input given
0
+ motif = re.compile('%s' % inmotif)
0
+ #looking to see if the entered motif is in the sequence
0
+ if re.search(motif, sequence):
0
+ print 'Yep, I found it'
0
+ else:
0
+ print 'Sorry, try another one'
0
+ else:
0
+ print 'Done, thanks for using motif_search'
0
+ inputfromuser = False
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
0
@@ -1 +1,43 @@
0
+#!/usr/bin/env python
0
+
0
+'''
0
+counting the nucleotides in a sequence, iterating
0
+through lists
0
+'''
0
+
0
+#let's keep the file fixed for now
0
+dnafile = "AY162388.seq"
0
+
0
+#opening the file, reading the sequence and storing in a list
0
+file = open(dnafile, 'r')
0
+
0
+#initialize a string to receive the data
0
+sequence = ''
0
+for line in file:
0
+ sequence += line.strip() #notice the strip, to remove \n
0
+
0
+#"exploding" the sequence in a list
0
+seqlist = list(sequence)
0
+
0
+#initializing integers to store the counts
0
+totalA = 0
0
+totalC = 0
0
+totalG = 0
0
+totalT = 0
0
+
0
+#checking each item in the list and updating counts
0
+for base in seqlist:
0
+ if base == 'A':
0
+ totalA += 1
0
+ elif base == 'C':
0
+ totalC += 1
0
+ elif base == 'G':
0
+ totalG += 1
0
+ elif base == 'T':
0
+ totalT += 1
0
+
0
+print str(totalA) + ' As found'
0
+print str(totalC) + ' Cs found'
0
+print str(totalG) + ' Gs found'
0
+print str(totalT) + ' Ts found'
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
0
@@ -1 +1,26 @@
0
+#!/usr/bin/env python
0
+
0
+'''script that counts the number of bases in a DNA sequence
0
+showing the string.count() method'''
0
+
0
+#still keep the file fixed
0
+dnafile = "AY162388.seq"
0
+
0
+#opening the file, reading the sequence and storing in a list
0
+seqlist = open(dnafile, 'r').readlines()
0
+
0
+#let's join the the lines in a temporary string
0
+temp = ''.join(seqlist)
0
+
0
+#counting
0
+totalA = temp.count('A')
0
+totalC = temp.count('C')
0
+totalG = temp.count('G')
0
+totalT = temp.count('T')
0
+
0
+#printing results
0
+print str(totalA) + ' As found'
0
+print str(totalC) + ' Cs found'
0
+print str(totalG) + ' Gs found'
0
+print str(totalT) + ' Ts found'
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
0
@@ -1 +1,57 @@
0
+#!/usr/bin/env python
0
+
0
+'''
0
+script that counts the number of each nucleotide in a sequence using
0
+user input and saving to a file.
0
+'''
0
+
0
+import sys
0
+import re
0
+
0
+
0
+fileentered = True #flag that determines if a filename has been entered
0
+
0
+while fileentered == True:
0
+ #ask the user to input a filename
0
+ filename = raw_input('Please enter a file to check: ')
0
+ #if a filename was entered, go ...
0
+ if len(filename) >= 1:
0
+ try:
0
+ #open the file
0
+ seqlist = open(filename, 'r').readlines()
0
+ #sequence is read as a list, convert to string
0
+ sequence = ''.join(seqlist)
0
+ #remove carriage returns
0
+ sequence = sequence.replace('\n', '')
0
+ #counting
0
+ totalA = sequence.count('A')
0
+ totalC = sequence.count('C')
0
+ totalG = sequence.count('G')
0
+ totalT = sequence.count('T')
0
+ #create a regex object with non-nucleotide letters to check for "errors"
0
+ otherletter = re.compile('[BDEFHIJKLMNOPQRSUVXZ]+')
0
+ #find possible non-nucleotides
0
+ extra = re.findall(otherletter, sequence)
0
+ #open an output filename to output counts
0
+ output = open(filename + '.count', 'w')
0
+ #writing the output
0
+ output.write('Count report for file ' + filename + '\n')
0
+ output.write('A = ' + str(totalA) + '\n')
0
+ output.write('C = ' + str(totalC) + '\n')
0
+ output.write('G = ' + str(totalG) + '\n')
0
+ output.write('T = ' + str(totalT) + '\n')
0
+ #if there are non-nucleotides in the sequence, report them
0
+ if len(extra) > 0:
0
+ output.write('Also were found ' + str(len(extra)) + ' errors\n')
0
+ for i in extra:
0
+ output.write(i + ' ')
0
+ else:
0
+ output.write('No error found')
0
+ print 'Result file saved on ' + filename + '.count'
0
+ except:
0
+ print 'File not found. Please try again.'
0
+ else:
0
+ #if no filename entered, exit
0
+ fileentered = False
0
+ sys.exit()
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
0
@@ -1 +1,27 @@
0
+#! /usr/bin/env python
0
+
0
+'''
0
+Python functions
0
+'''
0
+
0
+def add_tail(seq):
0
+ '''function that adds a poly-T tail to sequences'''
0
+ result = seq + 'TTTTTTTTTTTTTTTTTTTTT'
0
+ return result
0
+
0
+#opening the file
0
+dnafile = 'AY162388.seq'
0
+file = open(dnafile, 'r')
0
+
0
+#reading the sequence from the file
0
+sequence = ''
0
+for line in file:
0
+ sequence += line.strip()
0
+
0
+#printing result
0
+print sequence
0
+#calling the function to add the tail
0
+sequence = add_tail(sequence)
0
+#printing new sequence
0
+print sequence
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
0
@@ -1 +1,34 @@
0
+#!/usr/bin/env python
0
+
0
+'''
0
+making a function to count nucleotides
0
+'''
0
+
0
+import sys
0
+
0
+def count_nucleotide_types(seq):
0
+ '''counting nucleotides and returning a list with counts'''
0
+ result = []
0
+ totalA = seq.count('A')
0
+ totalC = seq.count('C')
0
+ totalG = seq.count('G')
0
+ totalT = seq.count('T')
0
+
0
+ result.append(totalA)
0
+ result.append(totalC)
0
+ result.append(totalG)
0
+ result.append(totalT)
0
+
0
+ return result
0
+
0
+#opening the file
0
+sequencefile = open(sys.argv[1], 'r').readlines()
0
+#joining a sequence as a list into a string
0
+sequence = ''.join(sequencefile)
0
+#replacing carriage returns
0
+sequence = sequence.replace('\n', '')
0
+#counting the nucleotides
0
+values = count_nucleotide_types(sequence)
0
+#printing the results
0
+print values
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
0
@@ -1 +1,33 @@
0
+#!/usr/bin/env python
0
+
0
+'''
0
+an extremely simple dice game
0
+'''
0
+
0
+#we need the random module
0
+import random
0
+import string
0
+
0
+#generating two dices, between 1 and 6 for the human player
0
+dice1 = random.randint(1, 6)
0
+dice2 = random.randint(1, 6)
0
+
0
+#generating two dices, between 1 and 6 for the computer player
0
+computerdice1 = random.randint(1, 6)
0
+computerdice2 = random.randint(1, 6)
0
+
0
+#summing up both dices for each player
0
+mine = dice1 + dice2
0
+his_hers = computerdice1 + computerdice2
0
+
0
+#printing the values
0
+print 'mine = ' + str(mine) + ' vs. computer = ' + str(his_hers)
0
+
0
+#chdking the results and proclaiming the winner
0
+if mine > his_hers:
0
+ print "I won"
0
+elif mine < his_hers:
0
+ print "Computer won"
0
+else:
0
+ print "Tie. Try again"
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
0
@@ -1 +1,27 @@
0
+#!/usr/bin/env python
0
+
0
+'''
0
+very simple script to generate random DNA sequences
0
+'''
0
+
0
+#random module is needed
0
+import random
0
+import sys
0
+
0
+#sequence length is a parameter
0
+length = int(sys.argv[1])
0
+
0
+#template DNA is a list with ACGT repeats
0
+dnaseq = list('ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT')
0
+
0
+#print the template
0
+print dnaseq
0
+
0
+result = ''
0
+for i in range(length):
0
+ #for the simulated sequence we use random.choice
0
+ #that randonly selects items of a list
0
+ result += random.choice(dnaseq)
0
+
0
+print result
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
0
@@ -1 +1,41 @@
0
+#!/usr/bin/env python
0
+
0
+'''
0
+a more elaborated script to generate random DNA sequences
0
+'''
0
+
0
+import random
0
+import sys
0
+
0
+def simulate_sequence(length):
0
+ '''function the generates the simulations'''
0
+ #list with nucleotides
0
+ dna = ['A', 'C', 'G', 'T']
0
+ #initializing the sequence
0
+ sequence = ''
0
+ #iterates over the input sequence length ...
0
+ for i in range(length):
0
+ #and chooses randomly the nucletides
0
+ sequence += random.choice(dna)
0
+ #returns simulated sequence
0
+ return sequence
0
+
0
+#first parameter is the number of sequences to generate
0
+setsize = int(sys.argv[1])
0
+#minimum and maximum sequence lengths
0
+minlength = int(sys.argv[2])
0
+maxlength = int(sys.argv[3])
0
+
0
+#initializes a list to store the sequence set
0
+sequenceset = []
0
+for i in range(setsize):
0
+ #generate a random integer between min and max seq lenght
0
+ rlength = random.randint(minlength, maxlength)
0
+ #appending to the sequence set and calling simulated sequence
0
+ #function
0
+ sequenceset.append(simulate_sequence(rlength))
0
+
0
+#printing output
0
+for sequence in sequenceset:
0
+ print sequence
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
0
@@ -1 +1,65 @@
0
+#!/usr/bin/env python
0
+
0
+'''
0
+a even more elaborated DNA sequence simulation script with
0
+sequence identity calculation (not overall, just neighbours)
0
+'''
0
+
0
+import random
0
+import sys
0
+
0
+def simulate_sequence(length):
0
+ '''function that simulates the sequences'''
0
+ #nucleotides list
0
+ dna = ['A', 'C', 'G', 'T']
0
+ sequence = ''
0
+ #randomly picking from the nucleotide list
0
+ for i in range(length):
0
+ sequence += random.choice(dna)
0
+ return sequence
0
+
0
+def nucleotide_percentage(sequence):
0
+ #counting the nucleotides
0
+ print str(sequence.count('A')) + ' As ',
0
+ print str(sequence.count('C')) + ' Cs ',
0
+ print str(sequence.count('G')) + ' Gs ',
0
+ print str(sequence.count('T')) + ' Ts '
0
+
0
+def sequence_identity(seqset):
0
+ '''function that calculates sequence identies'''
0
+ iden = []
0
+ count = 0.0
0
+ #iterates through the sequences in the set -1
0
+ #and calculates sequence identities
0
+ for x in range(len(seqset) - 1):
0
+ print str(x), str(x+1)
0
+ for n in range(len(seqset[x])):
0
+ #iterates over all nucleotides and checks for identical ones
0
+ if seqset[x][n] == seqset[x + 1][n]:
0
+ count += 1
0
+ iden.append(count / len(seqset[x]))
0
+ count = 0.0
0
+ return iden
0
+
0
+#input parameters
0
+setsize = int(sys.argv[1])
0
+minlength = int(sys.argv[2])
0
+maxlength = int(sys.argv[3])
0
+
0
+#generates simulated sequence sets
0
+sequenceset = []
0
+for i in range(setsize):
0
+ rlength = random.randint(minlength, maxlength)
0
+ sequenceset.append(simulate_sequence(rlength))
0
+
0
+#calculate sequence identities
0
+identity = sequence_identity(sequenceset)
0
+
0
+#prints the results
0
+for i in range(len(sequenceset)):
0
+ print sequenceset[i]
0
+ if i < len(sequenceset) - 1:
0
+ print 'sequence identity to next sequence : ' + str(identity[i])
0
+ nucleotide_percentage(sequenceset[i])
0
+ print
...
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
0
@@ -1 +1,12 @@
0
+#! /usr/bin/env python
0
+
0
+'''
0
+extremely simple script to DNA transcription
0
+'''
0
+
0
+
0
+myDNA = 'ACGTTGCAACGTTGCAACGTTGCA'
0
+#string buil-in replace method
0
+myRNA = myDNA.replace('T', 'U')
0
+print myRNA
...
 
 
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
12
13
0
@@ -1 +1,14 @@
0
+#! /usr/bin/env python
0
+
0
+'''
0
+reading a sequence file and printing first and last lines
0
+'''
0
+
0
+dnafile = "AY162388.seq"
0
+file = open(dnafile, 'r').readlines()
0
+print 'I want the first line'
0
+print file[0]
0
+print 'now the last line'
0
+#print file[len(file)-1]
0
+print file[-1]
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
0
@@ -1 +1,26 @@
0
+#!/usr/bin/env python
0
+
0
+'''
0
+translating DNA into proteins
0
+'''
0
+
0
+#import our homemade module that has the DNA
0
+#translating function
0
+import dnatranslate
0
+
0
+#OK, we are using the same DNA file
0
+dnafile = open("AY162388.seq", 'r').readlines()
0
+
0
+#opening the file and stripping and joining the lines
0
+sequence = ''
0
+for line in dnafile:
0
+ sequence += line.strip()
0
+
0
+#call the function in our module and translating the sequence
0
+protein = dnatranslate.translate_dna(sequence)
0
+
0
+#output, simple, we could make it better
0
+print sequence, len(sequence)
0
+print
0
+print protein, len(protein)
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
0
@@ -1 +1,17 @@
0
+#!/usr/bin/env python
0
+
0
+import dnatranslate
0
+
0
+dnafile = open("AY162388.seq", 'r').readlines()
0
+
0
+sequence = ''
0
+for line in dnafile:
0
+ sequence += line.strip()
0
+
0
+
0
+protein = dnatranslate.translate_dna(sequence)
0
+
0
+print sequence, len(sequence)
0
+print
0
+print protein, len(protein)
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
0
@@ -1 +1,23 @@
0
+#!/usr/bin/env python
0
+
0
+'''
0
+using the fasta module to read sequences
0
+'''
0
+
0
+#import our freshly created module
0
+import fasta
0
+import sys
0
+
0
+#read the fasta file in one line: open the file, read the contents
0
+#and send it to the fasta reading function
0
+sequences = fasta.read_fasta(open(sys.argv[1], 'r').readlines())
0
+
0
+temp = []
0
+for i in sequences:
0
+ #print the sequence name
0
+ print i.name
0
+ #use range with a step of 80, printing 80 characters at
0
+ #a time. The value could be set by a input parameter
0
+ for j in range(0,len(i.sequence),80):
0
+ print i.sequence[j:j+80]

Comments

    No one has commented yet.