In [1]:
from array import array
from collections import deque
import numpy as np

In [2]:
def index2d_to_1d(i, j, m):
    """Convert a two-dimensional index into a one-dimensional (flat) index.

    Args:
        i: row index.
        j: column index.
        m: row length, i.e. the number of columns in the 2D layout.

    Returns:
        The offset of cell (i, j) in a row-major flat buffer.
    """
    row_offset = i * m
    return row_offset + j

# Локальное выравнивание

In [29]:
def local_alignment_weights(s, t, match_weight, mismatch_weight, gap_weight, gap='_'):
    """Fill the score table for a local alignment of ``s`` and ``t``.

    Each cell (i, j) holds the best score of an alignment ending at
    ``s[i - 1]`` / ``t[j - 1]``; a score is never allowed to drop below 0
    (Smith-Waterman style recurrence with a linear gap penalty).

    Args:
        s, t: indexable sequences of comparable symbols.
        match_weight: score added when two symbols are equal.
        mismatch_weight: score added when two symbols differ.
        gap_weight: score added for a gap in either sequence.
        gap: unused here; accepted so the signature mirrors ``local_alignment``.

    Returns:
        A tuple ``(C, max_i, max_j)`` where ``C`` is the flat, row-major
        ``(len(s) + 1) x (len(t) + 1)`` table (index it with
        ``index2d_to_1d(i, j, len(t) + 1)``) and ``(max_i, max_j)`` is the
        first cell, in row-major order, that holds the maximum score.
        When either sequence is empty no cell is scored and ``(0, 0)`` is
        returned as the end cell.
    """
    n = len(s)
    m = len(t)
    width = m + 1

    # 'l' is at least 32 bits wide; the former 'h' (16-bit) raised
    # OverflowError as soon as a score left the range [-32768, 32767].
    C = array('l', [0]) * ((n + 1) * width)

    max_weight = None
    # (0, 0) is the natural "empty alignment" end cell; it is only kept when
    # the loops below never run, i.e. when s or t is empty.
    max_i, max_j = 0, 0

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            substitution = match_weight if s[i - 1] == t[j - 1] else mismatch_weight
            weight = max(
                0,  # free ride: start a fresh alignment at this cell
                C[index2d_to_1d(i - 1, j - 1, width)] + substitution,
                C[index2d_to_1d(i, j - 1, width)] + gap_weight,
                C[index2d_to_1d(i - 1, j, width)] + gap_weight,
            )
            C[index2d_to_1d(i, j, width)] = weight

            # Strict comparison keeps the first maximal cell encountered.
            if max_weight is None or max_weight < weight:
                max_weight = weight
                max_i, max_j = i, j

    return C, max_i, max_j

def local_alignment(s, t, match_weight, mismatch_weight, gap_weight, gap='_'):
    """Compute a local alignment of ``s`` and ``t`` and render it as two strings.

    The best-scoring local region is enclosed in parentheses. The unaligned
    flanks of both sequences are written outside the parentheses and padded
    with ``gap`` so that both returned strings have the same length.

    Args:
        s, t: indexable sequences of single-character strings.
        match_weight: score added when two symbols are equal.
        mismatch_weight: score added when two symbols differ.
        gap_weight: score added for a gap in either sequence.
        gap: symbol used to render gaps and padding.

    Returns:
        A tuple ``(s_aligned, t_aligned)`` of strings.
    """
    s_len = len(s)
    t_len = len(t)
    width = t_len + 1  # row length of the flat score table

    C, i_end, j_end = local_alignment_weights(s, t, match_weight, mismatch_weight, gap_weight, gap)
    if i_end is None or j_end is None:
        # No cell was scored (an input is empty): treat the alignment as
        # empty instead of failing on a None comparison below.
        i_end, j_end = 0, 0

    s_a, t_a = deque(), deque()

    # Trace back from the best cell until the score drops to zero.
    i, j = i_end, j_end
    while i > 0 and j > 0 and C[index2d_to_1d(i, j, width)] > 0:
        substitution = match_weight if s[i - 1] == t[j - 1] else mismatch_weight
        if C[index2d_to_1d(i, j, width)] == C[index2d_to_1d(i - 1, j - 1, width)] + substitution:
            s_a.appendleft(s[i - 1])
            t_a.appendleft(t[j - 1])
            i -= 1
            j -= 1
        elif C[index2d_to_1d(i, j - 1, width)] > C[index2d_to_1d(i - 1, j, width)]:
            s_a.appendleft(gap)
            t_a.appendleft(t[j - 1])
            j -= 1
        else:
            s_a.appendleft(s[i - 1])
            t_a.appendleft(gap)
            i -= 1

    # Left flank: emit the unaligned prefixes, padding the shorter one on the left.
    s_a.appendleft('(')
    t_a.appendleft('(')
    while i > 0 or j > 0:
        if i > 0:
            s_a.appendleft(s[i - 1])
            i -= 1
        else:
            s_a.appendleft(gap)
        if j > 0:
            t_a.appendleft(t[j - 1])
            j -= 1
        else:
            t_a.appendleft(gap)

    # Right flank: emit the unaligned suffixes, padding the shorter one on the right.
    i, j = i_end + 1, j_end + 1
    s_a.append(')')
    t_a.append(')')
    while i <= s_len or j <= t_len:
        if i <= s_len:
            s_a.append(s[i - 1])
            i += 1
        else:
            s_a.append(gap)
        if j <= t_len:
            t_a.append(t[j - 1])
            j += 1
        else:
            t_a.append(gap)

    return ''.join(s_a), ''.join(t_a)

# Глобальное выравнивание

In [30]:
def alignment_weights(s, t, match_weight, mismatch_weight, gap_weight, gap='_'):
    """Fill the score table for a global alignment of ``s`` and ``t``.

    Cell (i, j) holds the best score of aligning the first ``i`` symbols of
    ``s`` with the first ``j`` symbols of ``t`` using a linear gap penalty.

    Args:
        s, t: indexable sequences of comparable symbols.
        match_weight: score added when two symbols are equal.
        mismatch_weight: score added when two symbols differ.
        gap_weight: score added for a gap in either sequence.
        gap: unused here; accepted so the signature mirrors ``alignment``.

    Returns:
        The flat, row-major ``(len(s) + 1) x (len(t) + 1)`` table; index it
        with ``index2d_to_1d(i, j, len(t) + 1)``.
    """
    n = len(s)
    m = len(t)
    width = m + 1

    # 'l' is at least 32 bits wide; the former 'h' (16-bit) raised
    # OverflowError once a border or cell score left [-32768, 32767].
    C = array('l', [0]) * ((n + 1) * width)

    # Borders: aligning a prefix against the empty string costs one gap per symbol.
    for j in range(width):
        C[index2d_to_1d(0, j, width)] = j * gap_weight
    for i in range(n + 1):
        C[index2d_to_1d(i, 0, width)] = i * gap_weight

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            substitution = match_weight if s[i - 1] == t[j - 1] else mismatch_weight
            C[index2d_to_1d(i, j, width)] = max(
                C[index2d_to_1d(i - 1, j - 1, width)] + substitution,
                C[index2d_to_1d(i, j - 1, width)] + gap_weight,
                C[index2d_to_1d(i - 1, j, width)] + gap_weight,
            )
    return C

def alignment(s, t, match_weight, mismatch_weight, gap_weight, gap='_'):
    """Compute a global alignment of ``s`` and ``t`` and render it as two strings.

    Args:
        s, t: indexable sequences of single-character strings.
        match_weight: score added when two symbols are equal.
        mismatch_weight: score added when two symbols differ.
        gap_weight: score added for a gap in either sequence.
        gap: symbol used to render gaps.

    Returns:
        A tuple ``(s_aligned, t_aligned)`` of equal-length strings that
        together contain every symbol of ``s`` and of ``t``.
    """
    s_len = len(s)
    t_len = len(t)
    width = t_len + 1  # row length of the flat score table

    C = alignment_weights(s, t, match_weight, mismatch_weight, gap_weight, gap)

    s_a, t_a = deque(), deque()
    i, j = s_len, t_len
    # Walk back to the origin (0, 0). Stopping as soon as one index hits 0
    # would drop the leading symbols of the other sequence, so keep going
    # along the table border and pair those symbols with gaps.
    while i > 0 or j > 0:
        if i > 0 and j > 0:
            substitution = match_weight if s[i - 1] == t[j - 1] else mismatch_weight
            came_diagonally = (
                C[index2d_to_1d(i, j, width)]
                == C[index2d_to_1d(i - 1, j - 1, width)] + substitution
            )
            came_from_left = (
                C[index2d_to_1d(i, j - 1, width)] > C[index2d_to_1d(i - 1, j, width)]
            )
        else:
            # On a border only one move is possible.
            came_diagonally = False
            came_from_left = i == 0

        if came_diagonally:
            s_a.appendleft(s[i - 1])
            t_a.appendleft(t[j - 1])
            i -= 1
            j -= 1
        elif came_from_left:
            s_a.appendleft(gap)
            t_a.appendleft(t[j - 1])
            j -= 1
        else:
            s_a.appendleft(s[i - 1])
            t_a.appendleft(gap)
            i -= 1

    return ''.join(s_a), ''.join(t_a)

# Сравнение выравниваний

## Пример 1

In [31]:
# Print both rows of the local alignment, one per line.
print(*local_alignment('actgag', 'gctact', 1, -1, -1), sep='\n')

___(act)gag
gct(act)___


In [32]:
# Print both rows of the global alignment, one per line.
print(*alignment('actgag', 'gctact', 1, -1, -1), sep='\n')

actga_g
gct_act


## Пример 2

Создадим две строки, которые содержат большой совпадающий участок в центре и различаются по краям.

In [33]:
import random

# Fixed seed so the generated sequences are reproducible.
random.seed(0)

common_subseq = 'BIOINFORMATICS'
population = 'QWERTYUIOPASDFGHJKLZXCVBNM'

# Random flanks around the shared core. The draws happen left to right:
# seq1 prefix, seq1 suffix, seq2 prefix, seq2 suffix.
seq1 = (
    ''.join(random.choices(population, k=20))
    + common_subseq
    + ''.join(random.choices(population, k=12))
)
seq2 = (
    ''.join(random.choices(population, k=6))
    + common_subseq
    + ''.join(random.choices(population, k=30))
)

In [34]:
# Display the first generated sequence.
seq1

'CZAUFAXIDHBFIZJUBMCBBIOINFORMATICSOLBKDESHBMDV'

In [35]:
# Display the second generated sequence.
seq2

'UXGQLABIOINFORMATICSCKQDVUOVTGUMXSEOFNEGLGCGMHHSHA'

In [48]:
# Print both rows of the local alignment, one per line.
print(*local_alignment(seq1, seq2, 1, -1, -2), sep='\n')

CZAUFAXIDHBFIZJUBMCB(BIOINFORMATICS)OLBKDESHBMDV__________________
______________UXGQLA(BIOINFORMATICS)CKQDVUOVTGUMXSEOFNEGLGCGMHHSHA


In [49]:
# Print both rows of the global alignment, one per line.
print(*alignment(seq1, seq2, 1, -1, -2), sep='\n')

CZAUFAXIDHBFIZJUBMCBBIOINFORMATICS_O___LBKDESHBMDV
UXGQLABIOINFORMATICSCKQDVUOVTGUMXSEOFNEGLGCGMHHSHA


## Пример 3

Попробуем сравнить выравнивания на случайных строках.

In [50]:
# Fixed seed so the generated sequences are reproducible.
random.seed(0)

population = 'ACTG'

# Join the drawn symbols into strings so the sequences have the same type
# as the other examples in this notebook (random.choices returns a list).
seq1 = ''.join(random.choices(population, k=60))
seq2 = ''.join(random.choices(population, k=80))

In [51]:
# Print both rows of the local alignment, one per line.
print(*local_alignment(seq1, seq2, 1, -1, -1), sep='\n')

____________________(GGCCTCGCCTGTCGTCGGG_GCTG_TCACTGGCGCGTATCG_TA____CGACGATAGG_CAC)TGATTT
GTGTTCTCTCAATTCAGGGG(GGTCTCGGGTGTC_T_GGGATCTGAT_A__AGCGAATAT_GTTATTTCCGAAGAAAGGACAC)ATCA__


In [52]:
# Print both rows of the global alignment, one per line.
print(*alignment(seq1, seq2, 1, -1, -1), sep='\n')

G_G____C_C___TC__GCCTGTCGTCGGG_G_CTGTCA_CTG____GCGCGTATCG_TA____CGACGATAGG_CACTGATTT
GTGTTCTCTCAATTCAGGGGGGTC_TCGGGTGTCTGGGATCTGATAAGCGAATAT_GTTATTTCCGAAGAAAGGACAC__ATCA
