forked from xuwd11/Coursera-Bioinformatics
-
Notifications
You must be signed in to change notification settings - Fork 0
/
61_07_ShortestNonsharedSubstring.py
281 lines (250 loc) · 10.5 KB
/
61_07_ShortestNonsharedSubstring.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
# python3
import sys
import queue
import numpy as np
from copy import deepcopy
'''
Shortest Non-Shared Substring Problem: Find the shortest substring of one string that does not appear in another string.
Input: Strings Text1 and Text2.
Output: The shortest substring of Text1 that does not appear in Text2.
Sample Input:
CCAAGCTGCTAGAGG
CATGCTGGGCTGGCT
Sample Output:
AA
'''
class Node:
total = 0
def __init__(self, isLeaf = True, startIdx = 0, depth = 0, parentEdge = None):
Node.total += 1
self.id = self.total
self.isLeaf = isLeaf
self.startIdx = startIdx
self.depth = depth
self.parentEdge = parentEdge
class Edge:
def __init__(self, startIdx, endIdx, text, startNode, endNode, leafLabel = None):
self.startIdx = startIdx
self.endIdx = endIdx
self.text = text
self.startNode = startNode
self.endNode = endNode
self.leafLabel = leafLabel
def length(self):
return self.endIdx - self.startIdx + 1
def str(self):
return self.text[self.startIdx:self.endIdx+1]
def startChar(self):
return self.text[self.startIdx]
def __str__(self):
return self.startChar()
class dfs:
def __init__(self, G, root):
self.G = G
self.root = root
def search(self):
G = self.G
root = self.root
forward = 1 # traversing edge (v,w) from v to w
reverse = -1 # returning backwards on (v,w) from w to v
nontree = 0 # edge (v,w) is not part of the DFS tree
"""
Generate sequence of triples (v,w,edgetype) for DFS of graph G.
The subsequence for each root of each tree in the DFS forest starts
with (root,root,forward) and ends with (root,root,reverse).
If the initial vertex is given, it is used as the root and vertices
not reachable from it are not searched.
"""
visited = set()
for v in [root]:
if v not in visited:
yield v,v,forward
visited.add(v)
stack = [(v,iter([edge.endNode for edge in G[v].values()]))]
while stack:
parent,children = stack[-1]
try:
child = next(children)
if child in visited:
yield parent,child,nontree
else:
yield parent,child,forward
visited.add(child)
stack.append((child,iter([edge.endNode for edge in G[child].values()])))
except StopIteration:
stack.pop()
if stack:
yield stack[-1][0],parent,reverse
yield v,v,reverse
def postorder(self):
"""Generate all vertices of graph G in depth-first postorder."""
forward = 1 # traversing edge (v,w) from v to w
reverse = -1 # returning backwards on (v,w) from w to v
nontree = 0 # edge (v,w) is not part of the DFS tree
for v,w,edgetype in self.search():
if edgetype is reverse:
yield w
class SuffixTree: #Naive algorithm
def __init__(self):
#text1, text2 = self._input()
text1, text2 = self.readFromFile()
self.text = text1+'#'+text2+'$'
self.l1, self.l2 = len(text1), len(text2)
self.root = Node()
self.tree = dict()
self.build(self.root, self.text)
#lss = self.longestSharedSubstring(self.tree, self.root, self.text, self.l1, self.l2)
#print(lss)
sns = self.shortestNonsharedSubstring(self.tree, self.root, self.text, self.l1, self.l2)
print(sns)
def _input(self):
data = sys.stdin.read().strip().split()
return data[0], data[1]
def readFromFile(self, fileName = 'input.txt'):
f = open(fileName, 'r')
data = f.read().strip().split()
f.close()
return data[0], data[1]
def match(self, i, root, text):
l = len(text)
currNode = root
atNode = True
for j in range(i, l):
if atNode:
currPos = 0
if not text[j] in self.tree[currNode]:
return (currNode, None, j, -1)
else:
currEdge = self.tree[currNode][text[j]]
currString = currEdge.str()
lrString = len(currString) - 1
if lrString == 0:
currNode = currEdge.endNode
continue
else:
atNode = False
else:
currPos += 1
if text[j] != currString[currPos]:
return (currNode, currEdge, j, currEdge.startIdx + currPos)
else:
lrString -= 1
if lrString == 0:
currNode = currEdge.endNode
atNode = True
def addEdge(self, node, startIdx, endIdx, leafLabel):
newEndNode = Node(True, leafLabel, endIdx-leafLabel)
self.tree[newEndNode] = dict()
newEdge = Edge(startIdx, endIdx, self.text, node, newEndNode, leafLabel)
newEndNode.parentEdge = newEdge
self.tree[node][newEdge.startChar()] = newEdge
node.isLeaf = False
def splitEdge(self, edge, startIdx, endIdx, cutIdx, leafLabel):
newNode = Node(False, leafLabel, startIdx-leafLabel)
newEndNode = Node(True, leafLabel, endIdx-leafLabel)
newEdge = Edge(startIdx, endIdx, self.text, newNode, newEndNode, leafLabel)
newEndNode.parentEdge = newEdge
self.tree[newNode] = dict()
self.tree[newNode][newEdge.startChar()] = newEdge
self.tree[newEndNode] = dict()
edge2 = Edge(cutIdx, edge.endIdx, self.text, newNode, edge.endNode, leafLabel)
edge.endNode.parentEdge = edge2
self.tree[newNode][edge2.startChar()] = edge2
self.tree[edge.startNode][edge.startChar()].endIdx = cutIdx - 1
self.tree[edge.startNode][edge.startChar()].endNode = newNode
def build(self, root, text):
l = len(text)
node = Node(True, 0, l-1)
edge1 = Edge(0, l-1, text, root, node)
node.parentEdge = edge1
self.tree[root] = dict()
self.tree[root][edge1.startChar()] = edge1
self.tree[node] = dict()
for i in range(1, l):
currNode, currEdge, j, cutIdx = self.match(i, root, text)
if not currEdge:
self.addEdge(currNode, j, l-1, i)
else:
self.splitEdge(currEdge, j, l-1, cutIdx, i)
def exploreEdges(self, tree):
results = []
for node in tree.keys():
for edge in tree[node].values():
results.append(edge.str())
return results
def printTree(self, tree, color):
for node in tree.keys():
print(node.id, node.startIdx, node.depth, color[node])
for edge in tree[node].values():
if edge.endNode == None:
e = None
else:
e = edge.endNode.id
print(edge.startIdx, edge.endIdx, edge.startNode.id, e, color[edge.endNode], edge.str())
print('')
def printColors(self, color):
for v, c in color.items():
print(v.id, c)
def saveEdges(self, tree):
f = open('result.txt', 'w')
f.write('\n'.join(self.exploreEdges(tree)))
f.close()
def longestSharedSubstring(self, tree, root, text, l1, l2):
# 0: red; a path ending in it spells out a substring that appears in text2 but not in text1
# 1: blue; a path ending in it spells out a substring that appears in text1 but not in text2
# 2: purple; a path ending in a purple node in the suffix tree of Text1 and Text2 spells out a substring shared by
# Text1 and Text2
post = list(dfs(tree, root).postorder())
color = dict()
for v in post:
if v.isLeaf:
color[v] = 0 if v.depth <= l2 else 1
else:
ccolors = [color[w.endNode] for w in tree[v].values()]
if 1 == len(set(ccolors)):
color[v] = ccolors[0]
else:
color[v] = 2
# Find longest shared substring
best = (0, 0)
for v in post:
if 2 == color[v] and v.depth > best[1]:
best = (v.startIdx, v.depth)
return text[best[0]:best[0]+best[1]]
def shortestNonsharedSubstring(self, tree, root, text, l1, l2):
# 0: red; a path ending in it spells out a substring that appears in text2 but not in text1
# 1: blue; a path ending in it spells out a substring that appears in text1 but not in text2
# 2: purple; a path ending in a purple node in the suffix tree of Text1 and Text2 spells out a substring shared by
# Text1 and Text2
post = list(dfs(tree, root).postorder())
color = dict()
for v in post:
if v.isLeaf:
color[v] = 0 if v.parentEdge.startIdx > l1 else 1
else:
ccolors = [color[w.endNode] for w in tree[v].values()]
if 1 == len(set(ccolors)):
color[v] = ccolors[0]
else:
color[v] = 2
# Find shortest nonshared substring
best = (None, np.inf)
for v in post:
if 1 != color[v] or None == v.parentEdge:
continue
if v.isLeaf:
leafLength = l1 - v.parentEdge.startIdx # length of text1 part at leaf edge
if leafLength > 0:
currLength = v.parentEdge.startIdx+1-v.startIdx
if currLength < best[1]:
best = (v.startIdx, currLength)
else:
currLength = v.depth+1-v.parentEdge.length()
if currLength < best[1]:
best = (v.startIdx, currLength)
if best[0] != None:
return text[best[0]:best[0]+best[1]]
else:
return 'NA'
if __name__ == "__main__":
SuffixTree()