In [112]:
import itertools
import math
import string
from copy import deepcopy

import blosum as bl
import numpy as np
from enum import IntEnum

In [141]:
class Score(IntEnum):
    """Scoring constants used by smith_waterman when filling the score matrix."""
    # Added to the diagonal predecessor when the two compared symbols are equal.
    MATCH = 1
    # Added to the diagonal predecessor when the two compared symbols differ.
    MISMATCH = -1
    # Added for a vertical or horizontal step. It has the same value as
    # MISMATCH, so inside the enum GAP is an alias of MISMATCH
    # (Score.GAP is Score.MISMATCH); only the numeric value is used.
    GAP = -1

# Assigning the constant values for the traceback
class Trace(IntEnum):
    """Direction codes stored in the traceback matrix of smith_waterman."""
    # The cell score is 0: the traceback ends here.
    STOP = 0
    # The cell score came from the cell to the left (gap written into the first sequence).
    LEFT = 1
    # The cell score came from the cell above (gap written into the second sequence).
    UP = 2
    # The cell score came from the upper-left cell (both symbols are consumed).
    DIAGONAL = 3

# Implementing the Smith Waterman local alignment
def smith_waterman(seq1, seq2):
    """Compute a Smith-Waterman local alignment of two sequences.

    Scoring is taken from ``Score`` (match / mismatch / linear gap) and cell
    scores are clamped at 0. When several predecessors give the same best
    score the traceback prefers LEFT, then UP, then DIAGONAL. When several
    cells share the maximum score, the last one in row-major order is used
    as the end of the alignment.

    Args:
        seq1: First sequence (rows of the matrix).
        seq2: Second sequence (columns of the matrix).

    Returns:
        A tuple ``(aligned_seq1, aligned_seq2, max_score)`` where the two
        aligned strings have equal length and use ``'-'`` for gaps. If either
        sequence is empty the result is ``("", "", 0.0)``.
    """
    # Generating the empty matrices for storing scores and tracing
    rows = len(seq1) + 1
    cols = len(seq2) + 1
    matrix = np.zeros(shape=(rows, cols))
    tracing_matrix = np.zeros(shape=(rows, cols))

    # Best cell seen so far. Starting at the origin with score 0 (instead of a
    # -1 sentinel) makes empty input report a score of 0 rather than -1; for
    # non-empty input the first cell always overwrites these values.
    max_score = 0.0
    max_index = (0, 0)

    # Calculating the scores for all cells in the matrix
    for i in range(1, rows):
        for j in range(1, cols):
            # Diagonal step: match or mismatch of the two current symbols
            match_value = Score.MATCH if seq1[i - 1] == seq2[j - 1] else Score.MISMATCH
            diagonal_score = matrix[i - 1, j - 1] + match_value

            # Vertical and horizontal steps: a gap in one of the sequences
            vertical_score = matrix[i - 1, j] + Score.GAP
            horizontal_score = matrix[i, j - 1] + Score.GAP

            # Local alignment: never let a cell drop below zero
            best = max(0, diagonal_score, vertical_score, horizontal_score)
            matrix[i, j] = best

            # Remember where the value came from (tie order: LEFT, UP, DIAGONAL)
            if best == 0:
                tracing_matrix[i, j] = Trace.STOP
            elif best == horizontal_score:
                tracing_matrix[i, j] = Trace.LEFT
            elif best == vertical_score:
                tracing_matrix[i, j] = Trace.UP
            else:
                tracing_matrix[i, j] = Trace.DIAGONAL

            # ">=" keeps the last cell among equal maxima
            if matrix[i, j] >= max_score:
                max_index = (i, j)
                max_score = matrix[i, j]

    # Walk back from the best cell until a STOP cell is reached, collecting
    # the aligned symbols in reverse order.
    reversed_seq1 = []
    reversed_seq2 = []
    cur_i, cur_j = max_index
    while tracing_matrix[cur_i, cur_j] != Trace.STOP:
        step = tracing_matrix[cur_i, cur_j]
        if step == Trace.DIAGONAL:
            reversed_seq1.append(seq1[cur_i - 1])
            reversed_seq2.append(seq2[cur_j - 1])
            cur_i -= 1
            cur_j -= 1
        elif step == Trace.UP:
            reversed_seq1.append(seq1[cur_i - 1])
            reversed_seq2.append('-')
            cur_i -= 1
        else:  # Trace.LEFT
            reversed_seq1.append('-')
            reversed_seq2.append(seq2[cur_j - 1])
            cur_j -= 1

    # Reversing the order of the sequences
    aligned_seq1 = "".join(reversed_seq1)[::-1]
    aligned_seq2 = "".join(reversed_seq2)[::-1]

    return aligned_seq1, aligned_seq2, max_score

Вход: длина слова (по умолчанию 4, можно изменить) и нуклеотидные последовательности.
Выход: упорядоченный список диагоналей с наибольшим числом совпадений.
Сравнение индексов (длина слова, последовательности).

In [36]:
def to_quadriple(str):
    """Read a nucleotide string as a base-4 number and return it as a float.

    Digits are A=0, C=1, G=2, T=3 and the rightmost character is the least
    significant digit. Any other character adds nothing to the value but
    still occupies a digit position.
    """
    digit_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    total = 0.0
    for position, base in enumerate(reversed(str)):
        if base in digit_of:
            total += math.pow(4, position) * digit_of[base]
    return total


In [37]:
def to_map(seq: str, k: int) -> dict:
    """Index every word of length ``k`` in ``seq`` by its start positions.

    Args:
        seq: Sequence to index.
        k: Word length; must be at least 1.

    Returns:
        A dict mapping each length-``k`` substring to the ascending list of
        positions where it starts. Empty when ``seq`` is shorter than ``k``.

    Raises:
        ValueError: If ``k`` is smaller than 1 (such a word length would
            yield empty or truncated keys).
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    positions_by_word = {}
    for start in range(len(seq) - k + 1):
        positions_by_word.setdefault(seq[start:start + k], []).append(start)
    return positions_by_word

In [198]:
def sub_maps_diff_pos(map_1, map_2):
    """Group shared-word hits by diagonal offset and wrap each group in a Diag.

    For every word present in both maps and every pair of its positions
    (pos1 from map_1, pos2 from map_2), pos1 is recorded under the offset
    pos2 - pos1. Each offset's position list is then converted with
    convert_diag_to_xy and turned into a Diag.

    Returns:
        A list with one Diag per offset, in order of first appearance.
    """
    positions_by_offset = {}
    for word, starts_1 in map_1.items():  # e.g. word 'AA', starts_1 [1, 2, 3]
        starts_2 = map_2.get(word)
        if starts_2 is None:
            continue
        for start_1 in starts_1:
            for start_2 in starts_2:
                positions_by_offset.setdefault(start_2 - start_1, []).append(start_1)

    return [
        Diag(convert_diag_to_xy(offset, starts))
        for offset, starts in positions_by_offset.items()
    ]

def cutt_off(dist, count=1):
    """Return the entries of ``dist`` whose value holds more than ``count`` items."""
    return {key: items for key, items in dist.items() if len(items) > count}


In [40]:
def sub_maps(map_1, map_2):
    """Map each word found in both inputs to all of its position differences.

    For a shared word, the list contains pos2 - pos1 for every pos1 in
    map_1[word] and every pos2 in map_2[word], with pos1 varying slowest.
    Words missing from map_2 are left out.
    """
    shared_words = (word for word in map_1 if map_2.get(word) is not None)
    return {
        word: [pos2 - pos1 for pos1 in map_1[word] for pos2 in map_2[word]]
        for word in shared_words
    }

In [91]:
def convert_to_matrix(diffs, n1, n2):
    """Build an n1 x n2 matrix with 1 at every (pos1, pos1 + offset) hit.

    ``diffs`` maps a diagonal offset to the list of pos1 values recorded for
    it; all other cells of the returned float matrix stay 0.
    """
    grid = np.zeros((n1, n2))
    for offset in diffs:
        for row in diffs[offset]:
            grid[row, row + offset] = 1
    return grid


In [106]:
# Small nucleotide example: index both sequences by 2-letter words and
# collect the diagonals they share.
# NOTE(review): sub_maps_diff_pos returns a list of Diag objects, while the
# saved output under this cell shows a dict and a matrix - presumably left
# over from an earlier version of the code; re-run to refresh it.
s1 = 'GCATCGGC'
s2 = 'CCATCGCCATCG'

m1 = to_map(s1, 2)
m2 = to_map(s2, 2)
# diff = sub_maps(m1, m2)
diff2 = sub_maps_diff_pos(m1, m2)
# print(m1)
# print(m2)
# print(diff)
print(diff2)
# print(convert_to_matrix(diff2, len(s1), len(s2)))


{5: [0], 9: [0], -1: [6], 3: [6], 0: [1, 2, 3, 4], 6: [1, 4]}
[[0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [205]:
class Diag:
    """Bounding box ``(x_min, y_min) -> (x_max, y_max)`` of hits on a diagonal.

    ``x`` values are positions in the first sequence and ``y`` values are
    positions in the second one. Instances are compared by ``len()`` only
    (``<`` and ``<=``); equality and hashing stay identity-based, so distinct
    objects can be used as separate dict keys.
    """

    def __init__(self, li):
        """Create a box from ``li = (x_min, y_min, x_max, y_max)``."""
        self.x_min = li[0]
        self.y_min = li[1]
        self.x_max = li[2]
        self.y_max = li[3]

    def __add__(self, diag):
        """Return the smallest box that contains both ``self`` and ``diag``."""
        return Diag([
            min(self.x_min, diag.x_min),
            min(self.y_min, diag.y_min),
            max(diag.x_max, self.x_max),
            max(diag.y_max, self.y_max),
        ])

    def __len__(self):
        """Return the x extent plus the y extent (0 for a single point)."""
        return self.x_max - self.x_min + self.y_max - self.y_min

    def __getitem__(self, item):
        """Return the single-point Diag ``item`` steps down the diagonal.

        Raises:
            KeyError: If ``item`` is negative or the point lies past
                ``x_max`` / ``y_max``.
        """
        # A negative offset would address a point before (x_min, y_min).
        if item < 0:
            raise KeyError(item)
        x = self.x_min + item
        y = self.y_min + item
        if x > self.x_max or y > self.y_max:
            raise KeyError(item)
        return Diag([x, y, x, y])

    def __iter__(self):
        """Yield the single-point Diags from the start corner to the end.

        Defined explicitly because the implicit ``__getitem__`` iteration
        only stops on IndexError, while ``__getitem__`` raises KeyError.
        """
        offset = 0
        while self.x_min + offset <= self.x_max and self.y_min + offset <= self.y_max:
            x = self.x_min + offset
            y = self.y_min + offset
            yield Diag([x, y, x, y])
            offset += 1

    def __le__(self, other):
        return len(self) <= len(other)

    def __lt__(self, other):
        return len(self) < len(other)

    def __repr__(self):
        # Makes lists of diagonals readable when printed.
        return f"Diag([{self.x_min}, {self.y_min}, {self.x_max}, {self.y_max}])"

    def __str__(self):
        return f"({self.x_min}, {self.y_min}) -> ({self.x_max}, {self.y_max})"

In [153]:
def convert_diag_to_xy(gap, poss1):
    """Return ``(x_min, y_min, x_max, y_max)`` for positions on one diagonal.

    ``poss1`` holds the x positions and every y is ``x + gap``. For an empty
    ``poss1`` the result is ``(inf, inf, -inf, -inf)``.
    """
    xs = list(poss1)
    if not xs:
        return float('inf'), float('inf'), float('-inf'), float('-inf')
    lowest, highest = min(xs), max(xs)
    return lowest, lowest + gap, highest, highest + gap


In [224]:
def operate(str1, str2, substr_len=2, gap=-10, blosum_cf=5):
    """Find, score and pairwise-join seed diagonals between two sequences.

    Steps:
      1. Index both sequences by words of length ``substr_len`` and take the
         diagonals returned by ``get_top_10(sub_maps_diff_pos(...))``.
      2. Score each diagonal with BLOSUM62 and drop those below ``blosum_cf``.
      3. For every ordered pair of distinct surviving diagonals where the
         second starts neither above nor to the left of the first, compute
         ``score(d1) + score(d2) + dist * gap`` and keep the joined diagonal
         when that value beats both individual scores.

    Args:
        str1: First sequence (x axis of the diagonals).
        str2: Second sequence (y axis of the diagonals).
        substr_len: Word length used for seeding.
        gap: Per-position penalty multiplied by the distance between two
            joined diagonals (expected to be negative).
        blosum_cf: Minimum BLOSUM62 score a diagonal needs to be kept.

    Returns:
        None. The considered pairs, their scores and the joined diagonals
        are printed.
    """
    map_1 = to_map(str1, substr_len)
    map_2 = to_map(str2, substr_len)
    diags = get_top_10(sub_maps_diff_pos(map_1, map_2))
    blosum_matrix = bl.BLOSUM(62)

    diag_scores = {}
    diffs2 = []
    for d1 in diags:
        score = 0.0
        # NOTE(review): this range stops one point before (x_max, y_max) and
        # ignores the trailing letters of the last word - confirm whether the
        # final point(s) should be scored as well.
        for i in range(d1.x_max - d1.x_min):
            point = d1[i]
            # NOTE(review): two-letter string keys look like the blosum 1.x
            # lookup style - verify against the installed package version.
            score += blosum_matrix[str1[point.x_min] + str2[point.y_min]]
        if score < blosum_cf:
            continue
        diag_scores[d1] = score
        diffs2.append(d1)

    diff_concat_scores = {}

    for d1 in diffs2:
        for d2 in diffs2:
            # Joining a diagonal with itself would only double its score.
            if d2 is d1:
                continue
            if d2.x_min >= d1.x_min and d2.y_min >= d1.y_min:
                # so that the second diagonal is neither above nor to the left of the first
                print(f"diagonal 1: {d1}, diagonal 2: {d2}")
                if d2.x_min <= d1.x_max:
                    dist = d2.y_min - d1.y_min
                elif d2.y_min <= d1.y_max:
                    dist = d2.x_min - d1.x_min
                else:
                    dist = d2.y_min - d1.y_min + d2.x_min - d1.x_min
                res = diag_scores[d1] + diag_scores[d2] + dist * gap
                if res > diag_scores[d1] and res > diag_scores[d2]:
                    print(res)
                    diff_concat_scores[d1 + d2] = res

    for diag in diff_concat_scores:
        print(diag)
        print(diff_concat_scores[diag])
    return


# Amino-acid test sequence (one string literal continued across lines with "\").
s3 = 'MALRKGGLALALLLLSWVALGPRSLEGADPGTPGEAEGPACPAACVCSYDDDADELSVFCSSRNLTRLPDGVPGGTQALW\
LDGNNLSSVPPAAFQNLSSLGFLNLQGGQLGSLEPQALLGLENLCHLHLERNQLRSLALGTFAHTPALASLGLSNNRLSR\
LEDGLFEGLGSLWDLNLGWNSLAVLPDAAFRGLGSLRELVLAGNRLAYLQPALFSGLAELRELDLSRNALRAIKANVFVQ\
LPRLQKLYLDRNLIAAVAPGAFLGLKALRWLDLSHNRVAGLLEDTFPGLLGLRVLRLSHNAIASLRPRTFKDLHFLEELQ\
LGHNRIRQLAERSFEGLGQLEVLTLDHNQLQEVKAGAFLGLTNVAVMNLSGNCLRNLPEQVFRGLGKLHSLHLEGSCLGR\
IRPHTFTGLSGLRRLFLKDNGLVGIEEQSLWGLAELLELDLTSNQLTHLPHRLFQGLGKLEYLLLSRNRLAELPADALGP\
LQRAFWLDVSHNRLEALPNSLLAPLGRLRYLSLRNNSLRTFTPQPPGLERLWLEGNPWDCGCPLKALRDFALQNPSAVPR\
FVQAICEGDDCQPPAYTYNNITCASPPEVVGLDLRDLSEAHFAPC'

# Variant of s3: a few stretches are replaced (including letters such as
# U, J and B) and the second-to-last line is shorter than in s3.
s4 = 'MALRKGGLALALLLLSWVALGPRSLEGADPGTPGEAEGPACPAACVCSYDDDADELSVFCSSRNLTRLPDGVPGGTYUIW\
LDGNNLSSVPPAAFQNLSSLGFLNLQGGQLGSLEPQALLGLENLCHLHLERNQLRSLALGTFAHTPALASLGLSNNRLSR\
LEDGLFEGLGSLWDLNLGWNSLAVLPDAAFRGLGSLRELVLUGNRLAYLQPALFSGLAELRELDLSRNALRAIKANVFVQ\
LPRLQKLYLDRNLIAAVAPGAFLGLKALRWLDLSHNRVAGLLEDTFPGLLGLRVLRLSHNAIASLRPRTFKDLHFLEELQ\
LGHNRIRQLAERSFEGLGQLEVLTLDHNQLQEVKAGAFLGLTNVAVMNLSGNCLRNLPEQVFRGLGKLHSLHLEGSCLGR\
IRPHTFTGBJJIBJNJKNKNKNVGIEEQSLWGLAELLELDLTSNQLTHLPHRLFQGLGKLEYLLLSRNRLAELPADALGP\
LQRAFWLDVSHNRLEALPNSLLAPLGRLRYLSLRNNSLRTFTPQPPGLERLWLEGNPWDCGCPLKAHGGDRYYVPR\
FVQAICEGDDCQPPAYTYNNITCASPPEVVGLDLRDLSEAHFAPC'

# Run the diagonal search on the two sequences with 2-letter words and a
# gap penalty of -10; operate prints its results and returns nothing.
operate(s3, s4, substr_len=2, gap=-10)


In [183]:
class Diagonal:
    """A point on a diagonal: position in the first sequence plus an offset."""

    def __init__(self, gap, pos1):
        """Store ``gap`` and ``pos1`` and derive ``pos2`` as ``pos1 + gap``."""
        self.gap = gap
        self.pos1 = pos1
        self.pos2 = self.pos1 + self.gap

    def concat(self, diag):
        """Placeholder for joining with another diagonal; currently does nothing."""
        return None

    def __str__(self):
        return f"{self.pos1}, {self.pos2}, gap={self.gap}"


def set_diagonals(arr):
    """Split ``arr`` into runs of neighbouring values and wrap each in a Diagonal.

    Consecutive items stay in the same run while they differ by at most 1.
    Each run is turned into ``Diagonal(first, last - first)``.

    Args:
        arr: Iterable of numbers, in the order they should be grouped.

    Returns:
        A list with one Diagonal per run; an empty list for empty input.
    """
    res = []
    run = []

    for value in arr:
        if not run or abs(run[-1] - value) <= 1:
            run.append(value)
        else:
            # NOTE(review): Diagonal's signature is (gap, pos1) but it is
            # called with (first value of the run, extent of the run) -
            # confirm the intended argument mapping.
            res.append(Diagonal(run[0], run[-1] - run[0]))
            run = [value]

    # Flush the last run; skipped for empty input, which used to raise
    # IndexError when reading the first element of an empty run.
    if run:
        res.append(Diagonal(run[0], run[-1] - run[0]))

    return res

def get_top_10(dist):
    """Return up to ten of the largest items of ``dist``, largest first.

    Items are ordered with their own ``<`` comparison; items that compare
    equal keep their original relative order.

    Args:
        dist: Iterable of mutually comparable items.

    Returns:
        A new list with at most ten items in descending order.
    """
    # Sort descending: an ascending sort followed by [:10] would return the
    # ten smallest items instead of the top ten.
    return sorted(dist, reverse=True)[:10]


In [57]:
# Diagonals shared by the 2-letter word maps m1 and m2, limited by get_top_10.
cutten = get_top_10(sub_maps_diff_pos(m1, m2))

[1, 1, 4, 4]
{5: [0], -1: [6], 0: [1, 2, 3, 4], 6: [1, 2, 3, 4]}
