In [12]:
'''
Created on Dec 21, 2012

@author: Carl Raymond
'''

from operator import itemgetter
from functools import reduce

def maxOverlap(head, tail):
    '''
    Return the length of the longest suffix of head that equals a prefix of tail.

    Candidate lengths are tried from the longest possible (the length of the shorter
    sequence) down to 1, so the first match found is the maximum overlap.  Returns 0
    when no non-empty suffix of head matches a prefix of tail.
    '''
    longest = min(len(head), len(tail))
    matches = (size for size in range(longest, 0, -1) if head[-size:] == tail[:size])
    return next(matches, 0)

# Simple greedy algorithm for finding a shortest superstring.  It is a heuristic and may not give the
# optimal answer in the general case.  The starting point is the sequence which is the worst follower
# (having the minimum maxoverlap compared to the others as heads when it is the tail).  Then repeatedly
# append the successor: the not-yet-used sequence with the most overlap with the current one.
# Sequences already placed are excluded from the search, so the chain can never cycle back on itself.
# (Original hypothesis: with inputs that overlap by more than half, the best successor is never a
# sequence that was already used, in which case the exclusion changes nothing.)
with open('data/rosalind_long.txt') as spec:
    # Skip blank lines: an empty "sequence" overlaps nothing and would be picked as the start.
    data = [seq for seq in map(str.strip, spec) if seq]
count = len(data)
print("{0} sequences.".format(count))

# min() below cannot choose a starting sequence from an empty list; stop with a clear message.
if not data:
    raise SystemExit("No sequences found in input file.")

# Compute overlap of all pairs.  Omit self-comparisons. overlap[i][j] contains the maximum overlap
# between head sequence i and tail sequence j; the diagonal is -1 so a sequence never follows itself.
overlap = [[-1 if h == t else maxOverlap(data[h], data[t]) for t in range(count)] for h in range(count)]

print("Overlaps:")
for row in overlap:
    print(row)

# Find starting point: choose the sequence that's the worst follower, by having the minimum of maximum
# overlaps with other sequences when it's the tail.
# maxfollow[i] is the largest value in column i, i.e. the best overlap any head has with tail i.
maxfollow = [max(row[i] for row in overlap) for i in range(count)]
print(maxfollow)
# Find the index of the worst follower. It is the leader.
index, score = min(enumerate(maxfollow), key=itemgetter(1))
print(f" 0: {index:2} ({score:3})")

# Build the superstring as a list of pieces, joined once at the end.
superstring = [data[index]]
used = {index}

# Find each successor, which has the most overlap with the previous sequence among unused sequences.
# At step n only n+1 sequences are used, so there is always at least one candidate left.
for n in range(count - 1):
    candidates = ((j, s) for j, s in enumerate(overlap[index]) if j not in used)
    nextindex, score = max(candidates, key=itemgetter(1))
    print(f"{n+1:2}: {nextindex:2} ({score:3})")
    # Only the part of the successor that extends past the overlap is new.
    superstring.append(data[nextindex][score:])
    used.add(nextindex)
    index = nextindex


result = "".join(superstring)
print("Result:")
print(result)
print("Total length: {0}".format(len(result)))


50 sequences.
Overlaps:
[-1, 0, 0, 0, 2, 1, 1, 1, 0, 0, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 3, 1, 1, 0, 1, 1, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 2, 0, 1, 1, 2, 0, 1, 0, 0, 0, 704, 359, 1, 0]
[0, -1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 2, 188, 0, 0, 0, 0, 586, 1, 2, 1, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 1, 1, 0, 0, 1, 0, 2, 0, 0, 0, 1, 0]
[1, 0, -1, 620, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 0, 0, 155, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 2, 2, 0, 0]
[2, 0, 0, -1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 196, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 535, 2, 0, 0, 0, 1, 1, 0, 2, 1, 0, 0, 0, 0, 0, 1, 0]
[3, 0, 0, 0, -1, 0, 0, 0, 1, 0, 117, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 3, 1, 0, 1, 0, 2, 0, 1, 2, 1, 0, 1, 0, 0, 2, 528]
[0, 1, 1, 0, 1, -1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2, 2, 0, 0, 1, 0, 0, 0, 1, 0, 546, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[0, 1, 1, 0, 1, 0, -1, 5, 243, 0, 1, 0, 1,