In [12]:
'''
Created on Aug. 16 2020
@author: Carl J. Raymond
'''

# Solves GLOB. Modified from CTEA

from scoring.blosum62 import BLOSUM62
bl62 = BLOSUM62()
from fasta import read
from enum import Flag, auto

class EditOp(Flag):
    """Bit flags naming the alignment moves that can lead into a matrix cell.

    Members can be OR-ed together so that a single cell can remember every
    move that ties for the best score.
    """

    MATCH = 1 << 0  # diagonal move, characters aligned
    SUBST = 1 << 1  # diagonal move, characters substituted
    GAP_S = 1 << 2  # gap placed in S, a character of T consumed
    GAP_T = 1 << 3  # gap placed in T, a character of S consumed
    
# Load the two records to align from the Rosalind dataset file.
with open("data/rosalind_glob.txt") as spec:
    data_s, data_t = read(spec)

# Each record unpacks into two items; only the second one (presumably the
# sequence text, the first being the FASTA header) is used below.
# For a quick manual check, S and T can be overridden with short strings
# such as "PLEASANTLY" and "MEANLY".
(_, S), (_, T) = data_s, data_t
len_s, len_t = len(S), len(T)
print(f"S (length {len_s}): {S}")
print(f"T (length {len_t}): {T}")

# Score added for each gap position: cost_gap_S when the gap is placed in S,
# cost_gap_T when the gap is placed in T.
cost_gap_S = cost_gap_T = -5


# Allocate and initialize the score/count matrix. Each cell is a tuple:
# (score, best choice EditOps, number of optimal alignments)
# with the best score at this point, one or more EditOps giving that score,
# and the number of optimal alignments with that score at this point.
# The EditOp tokens MATCH and SUBST mark a diagonal step that consumes one
# character from each of S and T (SUBST is meant for unequal characters);
# GAP_S means that a gap was placed in S while a position in T was consumed;
# GAP_T means that a gap was placed in T while a position in S was consumed.
# The matrix is indexed cost[j][i], with j counting positions in T and i
# counting positions in S. cost[0][i] = (i*cost_gap_T, EditOp.GAP_T, 1) for
# all i > 0 and cost[j][0] = (j*cost_gap_S, EditOp.GAP_S, 1) for all j > 0.

# Work row-by-row, computing each position in the current row by looking
# at the current row's previous position and the previous row.

# Create first row of the score matrix (j == 0): nothing of T has been
# consumed yet, so every step consumes a character of S against a gap in T.
# The origin cell is tagged MATCH only as a placeholder; it has no predecessor.
thisrow = [(i * cost_gap_T, EditOp.MATCH if i == 0 else EditOp.GAP_T, 1) for i in range(len_s + 1)]
cost = [thisrow]

# Create each new row, referring to the previous row, and append to the matrix.
# Rows are indexed by position in T (j), columns by position in S (i).
for j in range(len_t):
    lastrow = thisrow
    # First column (i == 0): nothing of S has been consumed, so the only way
    # here is a run of gaps in S while consuming T.
    thisrow = [((j + 1) * cost_gap_S, EditOp.GAP_S, 1)]
    for i in range(len_s):

        # Diagonal step: align S[i] with T[j]. Tag it MATCH when the
        # characters are equal and SUBST otherwise, so the traceback can tell
        # the two apart when it builds the trace line.
        prev_score = lastrow[i][0]
        prev_counts = lastrow[i][2]
        diag_op = EditOp.MATCH if S[i] == T[j] else EditOp.SUBST
        op = (prev_score + bl62.score(S[i], T[j]), diag_op, prev_counts)

        # Compare the other operations. Keep the maximum score, and accumulate
        # every operation (and its alignment count) that ties for that maximum.

        # Gap in S, consume in T
        gap_s = lastrow[i + 1][0] + cost_gap_S
        if gap_s > op[0]:
            # Best so far
            op = (gap_s, EditOp.GAP_S, lastrow[i + 1][2])
        elif gap_s == op[0]:
            # Same score as current best. Accumulate EditOp and counts.
            op = (op[0], op[1] | EditOp.GAP_S, op[2] + lastrow[i + 1][2])

        # Gap in T, consume in S
        gap_t = thisrow[i][0] + cost_gap_T
        if gap_t > op[0]:
            # Best so far
            op = (gap_t, EditOp.GAP_T, thisrow[i][2])
        elif gap_t == op[0]:
            # Same score as current best. Accumulate EditOp and counts.
            op = (op[0], op[1] | EditOp.GAP_T, op[2] + thisrow[i][2])

        thisrow.append(op)

    cost.append(thisrow)

#for row in cost:
#    for k in row:
#        print(f"({k[0]}, {k[1]})", end='')
#    print()

# The best global alignment score and the number of alignments achieving it
# are both stored in the far corner cell of the matrix.
corner = cost[len_t][len_s]

dist = corner[0]
print(f"Edit distance: {dist}")

optimalcount = corner[2]
print(f"Number of optimal alignments: {optimalcount}")
print(f"Mod counts: {optimalcount % (2**27 - 1)}")

# Trace a path from the far corner of the matrix back to the origin,
# collecting the aligned characters in reverse order. Where several
# operations tie, the diagonal step is preferred, then a gap in S, then a
# gap in T, so this yields just one of the possibly many optimal alignments.
i, j = len_s, len_t
revalign_s = []
revalign_t = []
revtrace = []
diagonal = EditOp.MATCH | EditOp.SUBST
while i > 0 or j > 0:

    action = cost[j][i][1]
    tracechar = '*'
    if action & diagonal:
        # Consume in S and T.
        i -= 1
        j -= 1
        revalign_s.append(S[i])
        revalign_t.append(T[j])
        if action & EditOp.MATCH:
            tracechar = ' '

    elif action & EditOp.GAP_S:
        # Gap in S, consume in T
        j -= 1
        revalign_s.append('-')
        revalign_t.append(T[j])
    elif action & EditOp.GAP_T:
        # Gap in T, consume in S
        i -= 1
        revalign_s.append(S[i])
        revalign_t.append('-')

    revtrace.append(tracechar)

# The pieces were collected back-to-front; flip them into reading order.
align_s = "".join(reversed(revalign_s))
align_t = "".join(reversed(revalign_t))
trace = "".join(reversed(revtrace))

# Show the chosen alignment on screen, then write the full result
# (score, alignment count, count modulo 2^27-1, alignment, trace) to a file.
print(align_s)
print(align_t)
#print(trace)

# 2^27 - 1 = 134217727. The previous literal 13417727 was missing a digit,
# so the file disagreed with the "Mod counts" value printed on screen.
count_modulus = 2**27 - 1
with open("data/rosalind_glob.out", "w") as output:
    output.write(f"Optimal distance: {dist}\n")
    output.write(f"Number of optimal alignments: {optimalcount}\n")
    output.write(f"Modulo 2^27-1: {optimalcount % count_modulus}\n")
    output.write(f"{align_s}\n")
    output.write(f"{align_t}\n")
    output.write(f"{trace}\n")


S (length 781): YWCVICMYCNIYWGNTVRINCKQWYIMAFRYAGGGRVCLPKDNCGEFTWHDYQTIPFNHHNMWGQDCDMDKYLDALEHLVPRQWWTWDKLQNEQYIAPGDIELYPPWGDTEPDPLANKAMENCHTMIGWPPAKKQGKGTGTAKGSFRRTRKPQRDFHGMGQQTDSSFNVEFSTANISSRAHNMIFINFIKGHMYSIYESNFCQKMEMPGFDDQMQGTERNPFHKMYIAHSMSQVDELIDNSQFVSIEPIFSAGAEWHWSMWTSKSCYVPMEVTGTQVHIGQNESWDAAKSPTIGRRAFTPVRPMQEQWSPHQKTSWFNYRMGITFPQPWNSFCAWKSAYVFTSEIMGTFYNLFSKDRSYFRPCGFNRFIGFEEFSSEEPIPDPTEPISSAVVLTGFFFGLCGLTCCMLTWWSNPCVPSCWYPSKIMPILCWYIPTWQHSNHHRFPCAHMYEQGRSLYDDPNFMQFAAAHAMRCVMQDAWRMVYKFGKTHLALAPYNRSTTPQLEYGMELDGPKFRHYGVKAYMLWSILCRWNGEFGNEAAGVPEVKNMCDELSIITPMTCEFGSVHVCFMQADLFCWSPKFPTNNKAIGDRNFDLDPHHHQIRTRSSVVDAACSVWEGGMHTRTLTMMADTFSNSEPQINLPSEHKEAIAPGEGLMASQYSADHVMTSCFINEPYIDWTESRCRWSMARPQALANLTGLGGKKNETCHQCYMQNCFMPIAGTQPWQGFLRRRISINRADTNTKVEQQYIKMYRGSEFKHGFTHGVSKMNFYPNPVFEVDQT
T (length 789): YWCALFCMYCNIYWGNTVRINCKPWYIMAFRAAGGKDYIEYRSYQTIPFNHHNMWGQDCDMDKVPRQGIWTWDKYCYNGFAELYPPGGDTEPDPLVTLWFFNKAMENCHTMIGWPPAKKQGKGTGTAKGSFRRGYQRAKGQQTDSSFNSSRAHNHMYSMEMPGFDDQMQTEWDPRNPFWKMWIAWG

In [9]:
bl62.score('P', 'M')


-2

In [10]:
bl62.score('L', 'M')

2