forked from xuwd11/Coursera-Bioinformatics
-
Notifications
You must be signed in to change notification settings - Fork 0
/
13_GreedyMotifSearch.py
126 lines (114 loc) · 3.45 KB
/
13_GreedyMotifSearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# python3
import sys
import math
import numpy as np
def Profile(motifs):
k = len(motifs[0])
n = len(motifs)
s = 1 / n
seq1 = 'ACGTacgt01230123'
seq_dict = { seq1[i]:int(seq1[i+8]) for i in range(8) }
P = [[0 for _ in range(k)] for __ in range(4)]
for motif in motifs:
for i in range(k):
P[seq_dict[motif[i]]][i] += s
return P
def PseudoProfile(motifs):
k = len(motifs[0])
n = len(motifs)
s = 1 / (n + 4)
seq1 = 'ACGTacgt01230123'
seq_dict = { seq1[i]:int(seq1[i+8]) for i in range(8) }
P = [[1 for _ in range(k)] for __ in range(4)]
for motif in motifs:
for i in range(k):
P[seq_dict[motif[i]]][i] += s
return P
def Consensus(motifs):
P = Profile(motifs)
k = len(P[0])
Pt = [[row[i] for row in P] for i in range(k)]
seq_dict = ['A', 'C', 'G', 'T']
return ''.join([seq_dict[np.argmax(Pt[i])] for i in range(k)])
def Score(motifs):
k = len(motifs[0])
n = len(motifs)
seq1 = 'ACGTacgt01230123'
seq_dict = { seq1[i]:int(seq1[i+8]) for i in range(8) }
P = [[0 for _ in range(4)] for __ in range(k)]
for motif in motifs:
for i in range(k):
P[i][seq_dict[motif[i]]] += 1
Sm = 0
for i in range(k):
Sm += max(P[i])
return n * k - Sm
def Count(motifs):
k = len(motifs[0])
n = len(motifs)
seq1 = 'ACGTacgt01230123'
seq_dict = { seq1[i]:int(seq1[i+8]) for i in range(8) }
P = [[0 for _ in range(k)] for __ in range(4)]
for motif in motifs:
for i in range(k):
P[seq_dict[motif[i]]][i] += 1
return P
def Pr(pattern, profile):
seq1 = 'ACGTacgt01230123'
seq_dict = { seq1[i]:int(seq1[i+8]) for i in range(8) }
p = 1
k = len(pattern)
for i in range(k):
p *= profile[seq_dict[pattern[i]]][i]
return p
def ProfileMostPr_kmer(seq, k, profile):
l = len(seq)
pmax = -1
imax = -1
for i in range(l-k+1):
p = Pr(seq[i:i+k], profile)
if p > pmax:
pmax = p
imax = i
return seq[imax:imax+k]
def GreedyMotifSearch(dna, k, t):
BestMotifs = [dna[i][0:k] for i in range(t)]
BestScore = float('inf')
dna1 = dna[0]
l1 = len(dna1)
for i in range(l1-k+1):
motifs = []
motifs.append(dna1[i:i+k])
for i in range(1, t):
P = Profile(motifs)
motifs.append(ProfileMostPr_kmer(dna[i], k, P))
currScore = Score(motifs)
if currScore < BestScore:
BestMotifs = motifs
BestScore = currScore
return BestMotifs
def GreedyMotifSearch2(dna, k, t):
#GreedyMotifSearch with pseudocounts
BestMotifs = [dna[i][0:k] for i in range(t)]
BestScore = float('inf')
dna1 = dna[0]
l1 = len(dna1)
for i in range(l1-k+1):
motifs = []
motifs.append(dna1[i:i+k])
for i in range(1, t):
P = PseudoProfile(motifs)
motifs.append(ProfileMostPr_kmer(dna[i], k, P))
currScore = Score(motifs)
if currScore < BestScore:
BestMotifs = motifs
BestScore = currScore
return BestMotifs
if __name__ == "__main__":
data = list(sys.stdin.read().strip().split())
k = int(data[0])
t = int(data[1])
dna = data[2:]
BestMotifs = GreedyMotifSearch2(dna, k, t)
for motif in BestMotifs:
print(motif)