In [1]:
import numpy as np

In [2]:
def min_dist(dist_matrix):
    """Locate the smallest strictly positive entry above the main diagonal.

    Only cells whose column index is greater than their row index are
    examined. Entries that are not greater than zero are skipped, so zeroed
    rows/columns never win. Ties go to the first cell met in row-major order.

    Parameters
    ----------
    dist_matrix : numpy.ndarray
        Two-dimensional array of pairwise distances (must expose ``.shape``).

    Returns
    -------
    tuple
        ``(distance, i, j)`` for the selected cell, or ``(None, 0, 0)`` when
        no positive entry exists above the diagonal.
    """
    n_rows, n_cols = dist_matrix.shape
    best = (None, 0, 0)
    for row in range(n_rows):
        for col in range(row + 1, n_cols):
            candidate = dist_matrix[row, col]
            # Non-positive (and NaN) cells are never candidates.
            if not candidate > 0:
                continue
            if best[0] is None or candidate < best[0]:
                best = (candidate, row, col)
    return best

In [3]:
def recalculate_distances_weighted(dist_matrix, i, j):
    """Return a new distance matrix in which cluster ``j`` is merged into ``i``.

    The new row/column ``i`` is the plain arithmetic mean of the old rows/
    columns ``i`` and ``j``. Row and column ``j`` and the diagonal cell
    ``[i, i]`` are then set to zero. The input matrix is left untouched.

    Parameters
    ----------
    dist_matrix : array-like
        Square matrix of pairwise distances.
    i, j : int
        Indices of the two clusters being merged; ``i`` keeps the result.

    Returns
    -------
    numpy.ndarray
        Updated copy. Floating-point inputs keep their dtype; any other
        dtype is promoted to ``float64``.
    """
    dist_matrix = np.array(dist_matrix)  # np.array copies by default
    # An integer matrix would silently truncate the averaged distances when
    # they are written back (e.g. 6.5 -> 6), so promote it to float first.
    if not np.issubdtype(dist_matrix.dtype, np.inexact):
        dist_matrix = dist_matrix.astype(np.float64)

    merged_row = (dist_matrix[i, :] + dist_matrix[j, :]) / 2
    merged_col = (dist_matrix[:, i] + dist_matrix[:, j]) / 2
    dist_matrix[i, :] = merged_row
    dist_matrix[:, i] = merged_col

    # Assign zeros instead of multiplying by zero: ``inf * 0`` and ``nan * 0``
    # are NaN, which would leave the retired row/column non-zero.
    dist_matrix[j, :] = 0
    dist_matrix[:, j] = 0
    dist_matrix[i, i] = 0

    return dist_matrix

In [4]:
def recalculate_distances_unweighted(dist_matrix, i, j, size_i, size_j):
    """Return a new distance matrix with cluster ``j`` merged into ``i``.

    The new row/column ``i`` is the size-weighted mean of the old rows/
    columns ``i`` and ``j``:
    ``(size_i * d[i] + size_j * d[j]) / (size_i + size_j)``.
    Row and column ``j`` and the diagonal cell ``[i, i]`` are then set to
    zero. The input matrix is left untouched.

    Parameters
    ----------
    dist_matrix : array-like
        Square matrix of pairwise distances.
    i, j : int
        Indices of the two clusters being merged; ``i`` keeps the result.
    size_i, size_j : number
        Weights applied to clusters ``i`` and ``j`` (``upgma`` passes the
        cluster sizes).

    Returns
    -------
    numpy.ndarray
        Updated copy. Floating-point inputs keep their dtype; any other
        dtype is promoted to ``float64``.
    """
    dist_matrix = np.array(dist_matrix)  # np.array copies by default
    # An integer matrix would silently truncate the averaged distances when
    # they are written back (e.g. 5.75 -> 5), so promote it to float first.
    if not np.issubdtype(dist_matrix.dtype, np.inexact):
        dist_matrix = dist_matrix.astype(np.float64)

    total = size_i + size_j
    merged_row = (size_i * dist_matrix[i, :] + size_j * dist_matrix[j, :]) / total
    merged_col = (size_i * dist_matrix[:, i] + size_j * dist_matrix[:, j]) / total
    dist_matrix[i, :] = merged_row
    dist_matrix[:, i] = merged_col

    # Assign zeros instead of multiplying by zero: ``inf * 0`` and ``nan * 0``
    # are NaN, which would leave the retired row/column non-zero.
    dist_matrix[j, :] = 0
    dist_matrix[:, j] = 0
    dist_matrix[i, i] = 0

    return dist_matrix

In [5]:
def update_sizes(sizes, i, j):
    """Return a copy of ``sizes`` after folding entry ``j`` into entry ``i``.

    Entry ``i`` becomes the sum of the two entries and entry ``j`` is reset
    to zero. The argument itself is not modified.
    """
    merged = sizes.copy()
    merged[i], merged[j] = merged[i] + merged[j], 0
    return merged

In [6]:
def merge_subtrees(trees, i, j, length):
    """Return a copy of ``trees`` with entries ``i`` and ``j`` joined.

    Slot ``i`` receives the tuple ``(trees[i], trees[j], length)`` and slot
    ``j`` is replaced by ``0``. The argument itself is not modified.
    """
    merged = trees.copy()
    merged[i] = (trees[i], trees[j], length)
    merged[j] = 0
    return merged

In [7]:
def deduce_lengths(tree, length):
    """Rewrite a nested merge tree so that every node carries a difference.

    A non-tuple ``tree`` is treated as a leaf and returned as
    ``(tree, length)``. A tuple is unpacked as ``(left, right, value)``; both
    children are processed recursively with ``value`` as their ``length``,
    and the node's third element becomes ``length - value``.
    """
    if not isinstance(tree, tuple):
        return (tree, length)

    left, right, height = tree
    return (
        deduce_lengths(left, height),
        deduce_lengths(right, height),
        length - height,
    )

In [8]:
def to_newick(tree):
    """Serialise a tree of nested tuples into a parenthesised string.

    Nodes with more than two elements are unpacked as
    ``(left, right, length)`` and rendered as ``(left,right):length`` with
    the length shown to two decimals. Other nodes are unpacked as
    ``(label, length)`` and rendered as ``label:length`` with the length
    formatted by its default ``str`` conversion. The top-level node's own
    length is omitted and no trailing ``;`` is appended.
    """
    def render(node):
        if len(node) <= 2:
            label, branch = node
            return f"{label}:{branch}"
        left, right, branch = node
        return f"({render(left)},{render(right)}):{branch:.2f}"

    left, right, _ = tree
    return f"({render(left)},{render(right)})"

In [9]:
def wpgma(trees, dist_matrix):
    """Build a tree by repeatedly merging the closest pair of clusters.

    On each of the ``len(trees) - 1`` rounds the pair returned by
    ``min_dist`` is merged: the distance matrix is updated with
    ``recalculate_distances_weighted`` and the two subtrees are joined with
    half of their distance stored on the new node. The final tree is passed
    through ``deduce_lengths``. Neither argument is modified, because the
    helpers return copies.

    Parameters
    ----------
    trees : list
        Initial leaves, one per row/column of ``dist_matrix``.
    dist_matrix : numpy.ndarray
        Square matrix of pairwise distances.

    Returns
    -------
    tuple
        Nested tuples as produced by ``deduce_lengths``.

    Raises
    ------
    ValueError
        If fewer than two trees are supplied, or if no strictly positive
        distance is left to merge on (``min_dist`` returned ``None``).
    """
    n_clusters = len(trees)
    if n_clusters < 2:
        # Without at least one merge there is no root index to read back.
        raise ValueError("wpgma requires at least two trees")

    for _ in range(n_clusters - 1):
        dist, i, j = min_dist(dist_matrix)
        if dist is None:
            raise ValueError(
                "wpgma could not find a positive distance between the remaining clusters"
            )
        dist_matrix = recalculate_distances_weighted(dist_matrix, i, j)
        trees = merge_subtrees(trees, i, j, dist / 2)

    # The last merge stored the complete tree in slot ``i``.
    root = trees[i]
    return deduce_lengths(root, root[-1])

In [10]:
def upgma(trees, dist_matrix):
    """Build a tree by repeatedly merging the closest pair of clusters.

    On each of the ``len(trees) - 1`` rounds the pair returned by
    ``min_dist`` is merged: the distance matrix is updated with
    ``recalculate_distances_unweighted`` using the current cluster sizes,
    the two subtrees are joined with half of their distance stored on the
    new node, and the sizes are updated. The final tree is passed through
    ``deduce_lengths``. Neither argument is modified, because the helpers
    return copies.

    Parameters
    ----------
    trees : list
        Initial leaves, one per row/column of ``dist_matrix``; each starts
        with size 1.
    dist_matrix : numpy.ndarray
        Square matrix of pairwise distances.

    Returns
    -------
    tuple
        Nested tuples as produced by ``deduce_lengths``.

    Raises
    ------
    ValueError
        If fewer than two trees are supplied, or if no strictly positive
        distance is left to merge on (``min_dist`` returned ``None``).
    """
    n_clusters = len(trees)
    if n_clusters < 2:
        # Without at least one merge there is no root index to read back.
        raise ValueError("upgma requires at least two trees")

    sizes = [1] * n_clusters
    for _ in range(n_clusters - 1):
        dist, i, j = min_dist(dist_matrix)
        if dist is None:
            raise ValueError(
                "upgma could not find a positive distance between the remaining clusters"
            )
        dist_matrix = recalculate_distances_unweighted(
            dist_matrix, i, j, sizes[i], sizes[j]
        )
        trees = merge_subtrees(trees, i, j, dist / 2)
        sizes = update_sizes(sizes, i, j)

    # The last merge stored the complete tree in slot ``i``.
    root = trees[i]
    return deduce_lengths(root, root[-1])

# Тестовые данные

In [11]:
# Four-leaf example: labels and their symmetric pairwise distance matrix.
test_1_trees = list("ABCD")
test_1_dist = np.array(
    [
        [0, 16, 16, 10],
        [16, 0, 8, 8],
        [16, 8, 0, 4],
        [10, 8, 4, 0],
    ],
    dtype=np.float64,
)

In [12]:
# Six-leaf example: labels and their symmetric pairwise distance matrix.
test_2_trees = list("ABCDEF")
test_2_dist = np.array(
    [
        [0, 5, 4, 7, 6, 8],
        [5, 0, 7, 9, 10, 11],
        [4, 7, 0, 7, 6, 8],
        [7, 9, 7, 0, 5, 9],
        [6, 10, 6, 5, 0, 8],
        [8, 11, 8, 9, 8, 0],
    ],
    dtype=np.float64,
)

# Проверка WPGMA

## Тест 1

In [13]:
# Cluster the four-leaf example with wpgma and show the tree as a Newick-style string.
to_newick(wpgma(test_1_trees, test_1_dist))

'(A:7.25,(B:4.0,(C:2.0,D:2.0):2.00):3.25)'

## Тест 2

In [14]:
# Cluster the six-leaf example with wpgma and show the tree as a Newick-style string.
to_newick(wpgma(test_2_trees, test_2_dist))

'((((A:2.0,C:2.0):1.00,B:3.0):1.00,(D:2.5,E:2.5):1.50):0.50,F:4.5)'

# Проверка UPGMA

## Тест 1

In [15]:
# Cluster the four-leaf example with upgma and show the tree as a Newick-style string.
to_newick(upgma(test_1_trees, test_1_dist))

'(A:7.0,(B:4.0,(C:2.0,D:2.0):2.00):3.00)'

## Тест 2

In [16]:
# Cluster the six-leaf example with upgma and show the tree as a Newick-style string.
to_newick(upgma(test_2_trees, test_2_dist))

'((((A:2.0,C:2.0):1.00,B:3.0):0.75,(D:2.5,E:2.5):1.25):0.65,F:4.4)'