In [69]:
'''
Created on Nov. 24 2015
Updated July 2020
@author: Carl J. Raymond
'''

# Solves CTEA. Modified from EDTA to update for Python 3
# and count optimal alignments

from fasta import read
from enum import Flag, auto

class EditOp(Flag):
    """Edit operations that can lead into a cell of the alignment cost matrix.

    Each member is a distinct bit, so operations that tie for the best cost
    can be combined with ``|`` and tested for with ``&``.
    """

    MATCH = 1 << 0  # characters are equal; consume one from both S and T
    SUBST = 1 << 1  # characters differ; substitute, consuming one from both
    GAP_S = 1 << 2  # gap in S; consume one character of T
    GAP_T = 1 << 3  # gap in T; consume one character of S
    
    #def __repr__(self):
    #    return '<%s>' % self.name
    
# Parse the input file. read() is expected to yield exactly two records,
# one for each sequence to align.
with open("data/test_ctea.txt") as spec:
    data_s, data_t = read(spec)

# Each record unpacks into two fields; the first is ignored and the second
# is used as the sequence text.
_, S = data_s
_, T = data_t

# Hard-coded sequence pairs for quick experiments. Uncomment one line to
# override what was read from the file.
#S, T = "PLEASANTLY", "MEANLY"
#S, T = "MEANLY", "PLEASANTLY"
#S, T = "A", "EAS"
#S, T = "PLEASANTLYMEANLY", "MEANLYPLEASANTLY"
#S, T = "YLTNASAELPYLTNASAELP", "YLNAEMYLNAEM"

len_s, len_t = len(S), len(T)
print(f"S (length {len_s}): {S}")
print(f"T (length {len_t}): {T}")

# Unit penalties used when filling the cost matrix:
#   cost_gap_S      - placing a gap in S (a character of T is consumed)
#   cost_gap_T      - placing a gap in T (a character of S is consumed)
#   cost_substitute - aligning two characters that differ
cost_gap_S, cost_gap_T, cost_substitute = 1, 1, 1

# Allocate and initialize the cost array. Each cell is a tuple:
# (cost, best choice EditOps, num. of best choice EditOps)
# with the best cost at this point, one or more EditOps giving that cost,
# and the count of such EditOps.
# The EditOp token MATCH means that the characters matched; SUBST
# means they didn't match, and the choice is to substitute; GAP_S means that a
# gap was placed in S (a position in T was consumed); GAP_T means that a gap
# was placed in T (a position in S was consumed).
# The array is indexed cost[j][i], with j counting positions of T and i
# counting positions of S. cost[0][i] = (i,EditOp.GAP_T,1) for all i > 0 and
# cost[j][0] = (j,EditOp.GAP_S,1) for all j > 0; the origin cost[0][0] is
# tagged MATCH. This represents the cost of a gap spanning positions 1..i of S
# or 1..j of T.

# Build the cost matrix one row at a time. Rows are indexed by j, the number
# of characters of T consumed, and columns by i, the number of characters of
# S consumed, so a cell is addressed as cost[j][i].

# First row (j == 0): nothing of T is consumed, so reaching column i takes i
# gaps in T, each consuming one character of S. The boundary is scaled by
# cost_gap_T so it stays consistent with the interior cells if the gap cost
# is changed. The origin is tagged MATCH only as a placeholder.
lastrow = [ (i * cost_gap_T, EditOp.MATCH if i==0 else EditOp.GAP_T, 1) for i in range(len_s+1)]
cost = [ lastrow ]

# Create each new row, referring to the previous row, and append to the cost matrix
for j in range(len_t):
    # First column (i == 0): nothing of S is consumed, so reaching row j+1
    # takes j+1 gaps in S, each consuming one character of T.
    thisrow = [ ((j+1) * cost_gap_S, EditOp.GAP_S, 1) ]
    for i in range(len_s):

        # Cost of the diagonal neighbour, where both prefixes are one
        # character shorter.
        prev_cost = lastrow[i][0]
        if S[i] == T[j]:
            # Match. The diagonal cost carries over unchanged. Only MATCH is
            # recorded for this cell; the gap moves are not compared here.
            thisrow.append( (prev_cost, EditOp.MATCH, 1) )
            continue

        # Mismatch. Find minimum cost of three possibilities. Accumulate
        # all possibilities that have the same minimum cost.
        # 1. Substitute
        substitute = prev_cost + cost_substitute
        op = (substitute, EditOp.SUBST, 1)

        # 2. Gap in S, consume in T (arrive from the cell directly above)
        gap_s = lastrow[i+1][0] + cost_gap_S
        if gap_s < op[0]:
            # Best so far
            op = (gap_s, EditOp.GAP_S, 1)
        elif gap_s == op[0]:
            # Same cost as current best. Accumulate EditOp.
            op = (op[0], op[1] | EditOp.GAP_S, op[2]+1)

        # 3. Gap in T, consume in S (arrive from the cell directly to the left)
        gap_t = thisrow[i][0] + cost_gap_T
        if gap_t < op[0]:
            # Best so far
            op = (gap_t, EditOp.GAP_T, 1)
        elif gap_t == op[0]:
            # Same cost as current best. Accumulate EditOp.
            op = (op[0], op[1] | EditOp.GAP_T, op[2]+1)

        thisrow.append(op)

    cost.append(thisrow)
    lastrow = thisrow

#for row in cost:
#    for k in row:
#        print(f"({k[0]}, {k[1]})", end='')
#    print()

# The last cell of the last row covers all of S and all of T, so its cost
# component is the edit distance.
dist = cost[-1][-1][0]
print(f"Edit distance: {dist}")





# Recover one lowest-cost alignment by walking the cost matrix from the far
# corner back to the origin. Characters are collected in reverse order and
# flipped once the walk is finished.
i, j = len_s, len_t
revalign_s = []
revalign_t = []
diagonal_ops = EditOp.MATCH | EditOp.SUBST
while i > 0 or j > 0:

    ops = cost[j][i][1]
    # When several operations tie for a cell, the diagonal move is preferred,
    # then a gap in S, then a gap in T.
    if ops & diagonal_ops:
        # Match or substitution: consume one character from both S and T.
        i, j = i - 1, j - 1
        revalign_s.append(S[i])
        revalign_t.append(T[j])
    elif ops & EditOp.GAP_S:
        # Gap in S: only T advances.
        j -= 1
        revalign_s.append('-')
        revalign_t.append(T[j])
    elif ops & EditOp.GAP_T:
        # Gap in T: only S advances.
        i -= 1
        revalign_s.append(S[i])
        revalign_t.append('-')

align_s = "".join(reversed(revalign_s))
align_t = "".join(reversed(revalign_t))

# Show the alignment, then save the edit distance followed by the two
# aligned strings, one per line.
print(align_s)
print(align_t)
# The distance is held in `dist`; the previous name `ed` is not defined in
# this script. Plain "w" is enough because the file is never read back.
with open("data/rosalind_ctea.out", "w") as output:
    output.write(f"{dist}\n")
    output.write(f"{align_s}\n")
    output.write(f"{align_t}\n")

    


S (length 10): PLEASANTLY
T (length 6): MEANLY
Edit distance: 5
PLEASANTLY
-ME--AN-LY
