In [2]:
def print_scoring_matrix(H, seq1, seq2):
    """Print the scoring matrix H as a text table.

    The header row lists the characters of ``seq1`` (one per column) and
    every following row starts with the matching character of ``seq2``
    followed by the values of ``H[i]``.

    Columns are at least 4 characters wide. When a score needs 4 or more
    characters the width grows to ``len(longest score) + 1`` so neighbouring
    values never run together (a fixed width of 4 would print 1000 and 2000
    as ``10002000``).

    Args:
        H: matrix indexed as ``H[row][column]``; one row per character of
            ``seq2``.
        seq1: sequence labelling the columns.
        seq2: sequence labelling the rows.
    """
    # Stringify only the rows that are actually printed (one per seq2 char).
    cells = [[str(value) for value in H[i]] for i in range(len(seq2))]
    widest = max((len(text) for row in cells for text in row), default=0)
    width = max(4, widest + 1)

    # Header: one blank cell for the row-label column, then the seq1 labels.
    header = ' ' * width
    for char in seq1:
        header += char + ' ' * (width - 1)
    print(header)

    for char, row in zip(seq2, cells):
        row_string = char + ' ' * (width - 1)
        for text in row:
            row_string += text + ' ' * (width - len(text))
        print(row_string)

In [12]:
#random sequence generation, for testing
import random
def random_sequence(sequence_length: int) -> str:
    """Return a random string of ``sequence_length`` characters from 'ATCG'.

    Each character is drawn independently with ``random.choice``, so the
    result depends on the state of the global ``random`` generator.
    """
    letters = []
    for _ in range(sequence_length):
        letters.append(random.choice('ATCG'))
    return ''.join(letters)

In [13]:
#similarity score
def S(a, b):
    """Similarity score of two symbols: 3 when they are equal, -3 otherwise."""
    MATCH, MISMATCH = 3, -3
    if a == b:
        return MATCH
    return MISMATCH
#S = lambda a, b: 3 if a==b else -3

In [14]:
def scoring_matrix(seq1, seq2, W_1 = 2, score = None):
    """Fill the local-alignment scoring matrix for two sequences.

    Index 0 of each sequence is treated as a placeholder: row 0 and column 0
    of the matrix stay zero and ``seq1[0]`` / ``seq2[0]`` are never scored
    (``smith_waterman`` prepends ``'_'`` to both sequences before calling).

    Args:
        seq1: sequence laid out along the columns (index ``i``).
        seq2: sequence laid out along the rows (index ``j``).
        W_1: linear gap penalty subtracted for every insert/delete step.
        score: optional callable ``score(a, b)`` returning the similarity of
            two symbols. Defaults to the notebook-level function ``S``.

    Returns:
        A tuple ``(H, arrows, max_in_matrix)`` where

        * ``H[j][i]`` is the best local score ending at ``seq1[i]``/``seq2[j]``,
        * ``arrows`` maps every non-zero cell ``(i, j)`` to the list of
          predecessor cells that produce its value (diagonal first, then
          left, then up),
        * ``max_in_matrix`` is ``[value, (i, j)]`` for the highest score; on
          ties the cell visited last (row by row) is kept.
    """
    if score is None:
        # Resolved at call time so the default keeps using the notebook's S.
        score = S

    n_cols, n_rows = len(seq1), len(seq2)
    H = [[0] * n_cols for _ in range(n_rows)]

    max_in_matrix = [0, (0, 0)]
    arrows = {}

    for j in range(1, n_rows):
        for i in range(1, n_cols):
            # Three ways to reach (i, j): align both symbols (match or
            # mismatch), or open a gap in one of the two sequences.
            diagonal = H[j-1][i-1] + score(seq1[i], seq2[j])
            delete = H[j][i-1] - W_1
            insert = H[j-1][i] - W_1
            value = max(0, diagonal, delete, insert)

            H[j][i] = value

            # Zero cells end a local alignment, so they get no arrows.
            if value == 0:
                continue

            candidates = (
                (diagonal, (i-1, j-1)),
                (delete, (i-1, j)),
                (insert, (i, j-1)),
            )
            arrows[(i, j)] = [cell for cand, cell in candidates if cand == value]

            if value >= max_in_matrix[0]:
                max_in_matrix = [value, (i, j)]
    return (H, arrows, max_in_matrix)

In [15]:
def backtrack(H, pos, arrows):
    """Trace back from the cell stored in ``pos[1]`` until a zero cell.

    Args:
        H: scoring matrix indexed as ``H[j][i]``.
        pos: pair whose second item is the starting cell ``(i, j)``.
        arrows: mapping ``(i, j) -> list of predecessor cells``; only the
            first predecessor of each cell is followed.

    Returns:
        The visited cells as ``(i, j)`` tuples, ordered from the start of the
        trace-back's end (earliest cell) to the starting cell. The zero cell
        that stops the walk is not included.
    """
    col, row = pos[1]
    trail = []
    while True:
        if not H[row][col]:
            break
        trail.append((col, row))
        col, row = arrows[(col, row)][0]
    trail.reverse()
    return trail

In [16]:
def get_optimal_allignment(path, seq1, seq2):
    """Turn a trace-back path into the two aligned strings.

    For every cell ``(i, j)`` of ``path`` a symbol is emitted for each
    sequence: the sequence's own character when its index changed compared
    with the previous cell, or ``'-'`` (a gap) when it did not.

    The first cell has no previous cell inside ``path``, so it always emits
    both characters. (Comparing it with ``path[-1]`` through a negative index
    would make a one-cell path compare the cell with itself and yield
    ``('-', '-')`` instead of the matched pair.)

    Args:
        path: list of ``(i, j)`` cells ordered from alignment start to end.
        seq1: sequence indexed by ``i``.
        seq2: sequence indexed by ``j``.

    Returns:
        Tuple ``(seq1_allign, seq2_allign)`` of equal-length strings; two
        empty strings for an empty path.
    """
    seq1_allign, seq2_allign = [], []
    previous = None
    for i, j in path:
        advanced_i = previous is None or previous[0] != i
        advanced_j = previous is None or previous[1] != j
        seq1_allign.append(seq1[i] if advanced_i else '-')
        seq2_allign.append(seq2[j] if advanced_j else '-')
        previous = (i, j)
    return ''.join(seq1_allign), ''.join(seq2_allign)

In [17]:
def print_optimal_allignment(seq1_allign, seq2_allign):
    """Print two aligned strings with a marker line between them.

    The middle line has ``'|'`` at every position where both strings hold the
    same character and a space everywhere else. Positions are taken from
    ``seq1_allign``; ``seq2_allign`` is indexed at the same positions.
    """
    print(seq1_allign)
    markers = []
    for idx in range(len(seq1_allign)):
        if seq1_allign[idx] == seq2_allign[idx]:
            markers.append('|')
        else:
            markers.append(' ')
    print(''.join(markers))
    print(seq2_allign)

In [18]:
def smith_waterman(seq1, seq2, print_allign = False, print_matrix = False):
    """Locally align two sequences and return the aligned strings.

    Both sequences are prefixed with ``'_'`` before the scoring matrix is
    built, then the path from the highest-scoring cell is traced back and
    converted into a pair of aligned strings.

    Args:
        seq1: first sequence.
        seq2: second sequence.
        print_allign: when true, print the alignment with match markers.
        print_matrix: when true, print the scoring matrix.

    Returns:
        The pair ``(seq1_allign, seq2_allign)`` produced by
        ``get_optimal_allignment``.
    """
    padded1 = '_' + seq1
    padded2 = '_' + seq2

    H, arrows, best_cell = scoring_matrix(padded1, padded2)
    trail = backtrack(H, best_cell, arrows)
    alligned = get_optimal_allignment(trail, padded1, padded2)

    if print_matrix:
        print_scoring_matrix(H, padded1, padded2)
    if print_allign:
        print_optimal_allignment(alligned[0], alligned[1])

    return alligned

In [19]:
smith_waterman('TGTTACGG', 'GGTTGACTA', True, True)

    _   T   G   T   T   A   C   G   G   
_   0   0   0   0   0   0   0   0   0   
G   0   0   3   1   0   0   0   3   3   
G   0   0   3   1   0   0   0   3   6   
T   0   3   1   6   4   2   0   1   4   
T   0   3   1   4   9   7   5   3   2   
G   0   1   6   4   7   6   4   8   6   
A   0   0   4   3   5   10  8   6   5   
C   0   0   2   1   3   8   13  11  9   
T   0   3   1   5   4   6   11  10  8   
A   0   1   0   3   2   7   9   8   7   
GTT-AC
||| ||
GTTGAC


('GTT-AC', 'GTTGAC')

In [20]:
smith_waterman(random_sequence(1000), random_sequence(1000))

('TTAG-TCT-GAGCATCGT-TTGC-TC-ATGTG-T-C-G-C-CTGT-TTCC-A--GGTAGCCGCTG-TATTCTGCTGGG--A-AACC-C-----CGTATCGCT--ACT-TC-GAAGTTC-----AGC-A-GATA-TCAC-TCTTA-TATCAGTCACA--GTCAATCGA-GC-GCCA-GTCCAGA-AGT--CCG---CC-CC-TCAACCAT--GA--CTGCCAA--CGATCTCTCGGACCCTTCGCTCGATAGCGGACCCTCACC-GAAG-CCGC-CGAGGCG----GAA--CTGTGGTGATGAG--C-T-TA-ACACTC--CACGTGATTGAAACGTGCCAAATCAGG-TG--T--TC-TGATGATCAGCTGGTA-GACG-TTCAACGTATGGTAAGCAAAAACA--GACTTAGTTGGCCGCCTCCGTACCA-CGACTCGAACGGTTTGAGTCGCA-GCGC-CTTAATATCTT---T-A-G-A-A--AAT-GGT----CAT--TA-TAC-GGA--ACC-C-G-A-A-CC--C-GTGTACGCCATCTG-TCTGTAGGCTACCTGC-TAGTATGCGTAGACTTACCGCCT-AT-CG---GGT-CAG-ACG--CGCAA--C-AGCG-GA-TGTG--A-GTTTGTGCGGTAATACATCCGGAGTG--CGATATAAGAAC--AGCAA-TCA-TTA---GAG-A-CCCAGTC--T-ACG-TCGCT-GGTTATACT--ATGTCAACGGC--A-GGGTAGGC-C--C-C-G-GGGCCT---G---GG---GGACACCGA-ACCT-C-ATCCTGC--CAAGGAAT-CCTC--TC-TCA-CATTG-G-TGAAA-TGAATGCAGCCCCCG-T-AT--C--G--TTA-CGGGGC--TCCGCTATT-G-GTTAAC-T-CAGTCAAT-AAGGGCC--C--GCCGACCCCTTGCATTGGATTACAAT-GAGTCATTCCGTAGAGTCTG-GATCGCGCT-GCGCTCGAAAAAC-A