# Dyanmic Programming - Sequence Alignment

In [4]:
import numpy as np
from numpy import array

## modified from https://gist.github.com/num3ric/1222752
# define letter values for array manipulation
A, C, G, T = 0, 1, 2, 3
int_to_char = {0:'A', 1:'C', 2:'G', 3:'T'}

# set params for scoring mis-/matches and skips
indelPenlty = -1
matchScoring = array([[1,-1,-1,-1],
                 [-1,1,-1,-1],
                 [-1,-1,1,-1],
                 [-1,-1,-1,1]])

#
class SequenceAligner(object):
    def __init__(self, seq1, seq2):
        self.seq1 = seq1
        self.seq2 = seq2
        self.D = np.zeros((self.seq1.size+1, self.seq2.size+1), dtype=np.int16)
        self.getscoreMatrix()

    # create matrix via DP
    def getscoreMatrix(self):
        # penalty for full skips
        for i in range(self.seq1.size+1):
            self.D[i,0] = i*indelPenlty
        for j in range(self.seq2.size+1):
            self.D[0,j] = j*indelPenlty
        # build out full matrix
        for i in range(1, self.seq1.size+1):
            for j in range(1, self.seq2.size+1):
                self.D[i,j]=max(self.D[i-1, j-1] + matchScoring[self.seq1[i-1],self.seq2[j-1]],
                                self.D[i-1, j] + indelPenlty,
                                self.D[i, j-1] + indelPenlty)

    # find optimal matching path
    def traceback(self):
        alignment= []
        i = self.seq1.size
        j = self.seq2.size
        while i >0 and j>0:
            # traceback with a match
            if self.D[i-1, j-1] + matchScoring[self.seq1[i-1],self.seq2[j-1]] == self.D[i,j]:
                alignment.append(self._get_aligned_pair(i, j))
                i -= 1
                j -= 1
            # traceback with an indelPenlty for seq1
            elif self.D[i-1, j] + indelPenlty == self.D[i,j]:
                alignment.append(self._get_aligned_pair(i, 0))
                i -= 1
            # traceback with an indelPenlty for seq2                
            else:
                alignment.append(self._get_aligned_pair(0, j))
                j -= 1
        # full traceback with an indelPenlty for seq1
        while i > 0:
            alignment.append(self._get_aligned_pair(i, 0))
            i -= 1
        # full traceback with an indelPenlty for seq2
        while j > 0:
            alignment.append(self._get_aligned_pair(0, j))
            j -= 1
        alignment.reverse()
        return alignment  

    # return aligned nucleotides
    def _get_aligned_pair(self, i, j):
        n1 = int_to_char[self.seq1[i-1]] if i>0 else '_'
        n2 = int_to_char[self.seq2[j-1]] if j>0 else '_'
        return (n1, n2)

def print_sequences(pairs):
    top_seq = []
    bottom_seq = []
    for (b, t) in pairs:
        bottom_seq.append(b)
        top_seq.append(t)
    for n in top_seq:
        print (n,)
    print (' ')
    for n in bottom_seq:
        print (n,)

if __name__ == "__main__":
    s1 = array([G, T, A, C, A, G, T, A], dtype=np.int16)
    s2 = array([G, G, T, A, C, G, T], dtype=np.int16)
    aligner = SequenceAligner(s1, s2)
    aligned = aligner.traceback()
    print_sequences(aligned)

G
G
T
A
C
_
G
T
_
 
_
G
T
A
C
A
G
T
A
