# `TERMITE`
sTrand symmEtric tRiple MIncuTsupErtree

In [1]:
from itertools import combinations
from collections import Counter, defaultdict

from cogent3 import load_aligned_seqs, PhyloNode
import numpy as np
import networkx as nx

In [2]:
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions
tfb = tfp.bijectors

tf.executing_eagerly()  # need to check whether this is the default for tensorflow > 2

2021-09-13 06:30:14.947332: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1


True

In [3]:
# this stops tensorflow from snaffling all of the GPU
# thanks https://stackoverflow.com/questions/34199233/how-to-prevent-tensorflow-from-allocating-the-totality-of-a-gpu-memory
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
  tf.config.experimental.set_memory_growth(gpu, True)

2021-09-13 06:30:15.998917: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2021-09-13 06:30:16.072549: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-13 06:30:16.072853: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 2080 with Max-Q Design computeCapability: 7.5
coreClock: 1.095GHz coreCount: 46 deviceMemorySize: 7.79GiB deviceMemoryBandwidth: 357.69GiB/s
2021-09-13 06:30:16.072871: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2021-09-13 06:30:16.074331: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10
2021-09-13 06:30:16.075550: I tensorflow/s

## Data import
Reads an alignments and creates a list of 4 x 4 x 4 joint frequencies tensors.

In [4]:
def get_triples(aln, nuc_order='ACGT', codon_position=None, verbose=False):
    if codon_position:
        aln = aln[codon_position - 1::3]
    aln = aln.no_degenerates()
    if verbose:
        print(f'Got {len(aln)} positions')
    assert len(aln) <= np.iinfo(np.int32).max
    triples = []
    nuc_map = {n:i for i, n in enumerate(nuc_order)}
    for triple in combinations(range(aln.num_seqs), 3):
        F = np.zeros([4, 4, 4], dtype=np.int32)
        subaln = aln.get_sub_alignment(seqs=triple)
        for a, b, c in subaln:
            F[nuc_map[a], nuc_map[b], nuc_map[c]] += 1
        triples.append([tuple(subaln.names), F])
    return triples

## Triple fitting functions
Collection of functions for concurrent fitting of many triples on CPUs and GPUs. Model is rooted, continuous-time, and strand-symmetric.

Also some functions for using Akaike-ish weights to build Semple and Steel-ish graphs.

In [5]:
@tf.function()
def transform_P_matrix(params):
    params = tf.exp(params)
    Q0 = tf.concat([[-tf.reduce_sum(params[0])], params[0]],
                   axis=0)
    Q1 = tf.concat([[params[1,0]], [-tf.reduce_sum(params[1])], params[1,1:]],
                   axis=0)
    Q = tf.concat([[Q0], [Q1], [Q1[::-1]], [Q0[::-1]]], axis=0)
    return tf.linalg.expm(Q)

@tf.function()
def transform(params):
    pi = tfb.SoftmaxCentered()(params[0])
    Pa = transform_P_matrix(params[1:3])
    Pm = transform_P_matrix(params[3:5])
    Pb = transform_P_matrix(params[5:7])
    Pc = transform_P_matrix(params[7:9])
    return pi, Pa, Pm, Pb, Pc
    
@tf.function()
def _loss(params_data):
    params, data = params_data
    pi, Pa, Pm, Pb, Pc = transform(params)
    J = tf.einsum('i,ij,ik,ku,kv', pi, Pa, Pm, Pb, Pc)
    loss = tf.reduce_sum(tf.keras.losses.KLDivergence()(J, data))
    return loss
    
@tf.function()
def loss(params, data):
    # could do better managing the variance in the shared matrix case here
    return tf.reduce_sum(tf.vectorized_map(_loss, (params, data)))

@tf.function()
def training_step(parameters, data, optimizer, unscrambler):
    with tf.GradientTape() as tape:
        unscrambled = _unscramble(parameters, unscrambler)
        loss_value = loss(unscrambled, data)
    gradients = tape.gradient(loss_value, parameters)
    return loss_value, gradients

# thanks https://github.com/mlgxmez/thelongrun_notebooks/blob/master/MLE_tutorial.ipynb
def mle_fit(data, loss, parameters, optimizer, unscrambler, steps=500, verbose=False):
    for i in range(steps):
        loss_value, gradients = training_step(parameters, data, optimizer, unscrambler)
        optimizer.apply_gradients([(gradients, parameters)])
        
        if i % 100 == 0:
            if verbose:
                iter_info = f"Step: {optimizer.iterations.numpy()}, initial loss: {loss_value.numpy()}"
                print(iter_info)

@tf.function()
def _unscramble(parameters, unscramble):
    unscrambled = []
    for t1 in unscramble:
        unscrambled.append(tf.stack([parameters[i] for i in t1]))
    return tf.stack(unscrambled)

def fit_triples(triples, learning_rate=0.01, cherries_share_matrices=True, steps=3000, verbose=False):
    K = 0
    cherry_loc = {}
    unscrambler = []
    data = []
    for names, F in triples:
        J = (F/F.sum()).astype(np.float32)
        for ix in [[0, 1, 2], [1, 2, 0], [2, 0, 1]]:
            t1 = list(range(K, K+5)) # for pi and two Ps
            K += 5
            cherry = [names[ix[1]], names[ix[2]]]
            frozen_cherry = frozenset(cherry)
            if not cherries_share_matrices or frozen_cherry not in cherry_loc:
                new_loc = {cherry[0]: [K,K+1], cherry[1]: [K+2,K+3]}
                K += 4
                cherry_loc[frozen_cherry] = new_loc
            t1.extend(cherry_loc[frozen_cherry][cherry[0]]) # for the first cherry
            t1.extend(cherry_loc[frozen_cherry][cherry[1]]) # for the second cherry
            unscrambler.append(t1)
        
            data.append(J.transpose(ix))

    normal_initializer = tf.random_normal_initializer()
    parameters = tf.Variable(normal_initializer(shape=[K, 3], dtype=tf.float32), name='params')
    data = tf.stack(data)
    
    if verbose:
        print(f'Fitting {data.shape[0]} triples')
    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)
    mle_fit(data, loss, parameters, optimizer, unscrambler, steps=steps, verbose=verbose)
    
    parameters = _unscramble(parameters, unscrambler)
    losses = tf.vectorized_map(_loss, (parameters, data)).numpy()
    losses = [losses[i:i+3] for i in range(0, len(losses), 3)]
    fits = [[p.numpy() for p in transform(params)] for params in parameters]
    fits = [fits[i:i+3] for i in range(0, len(fits), 3)]
    return losses, fits

def cherry_weights(ls, N):
    ls = N*ls
    delta = ls - ls.min()
    weights = np.exp(-delta)
    return weights/weights.sum()

def get_edges(triples, losses):
    edges = Counter()
    for losses, (names, F) in zip(losses, triples):
        weights = cherry_weights(losses, F.sum())
        for name, weight in zip(names, weights):
            edges[frozenset(names) - {name}] += weight
    return edges

def get_Ps(cherry_names, triples, fits):
    Ps = {}
    ixes = [[0, 1, 2], [1, 2, 0], [2, 0, 1]]
    for (names, _), triple_fit in zip(triples, fits):
        if set(cherry_names) < set(names):
            for name, ix, fit in zip(names, ixes, triple_fit):
                if name not in cherry_names:
                    if names[ix[1]] == cherry_names[0]:
                        return fit[-2], fit[-1]
                    return fit[-1], fit[-2]

## Tree building algorithm
Where the magic happens.

In [6]:
def termite(triples, learning_rate=0.01, steps=3000, verbose=False):
    losses, _ = fit_triples(triples, cherries_share_matrices=False,
                               learning_rate=learning_rate, steps=steps, verbose=verbose)
    tree = termite_tree(triples, losses, verbose=verbose)
    return tree
    
def termite_tree(triples, losses, verbose=False):
    edges = get_edges(triples, losses)
    if verbose:
        print('Graph:')
        for edge, weight in edges.items():
            print(edge, weight)
    G = nx.Graph()
    for edge, weight in edges.items():
        G.add_edge(*edge, weight=weight)
    cut_value, partition = nx.stoer_wagner(G)
    if verbose:
        print(f'Cut value: {cut_value}, Partition:\n{partition}')
    assert len(partition) == 2, 'polytomy detected. bailing'
    this_node = PhyloNode()
    for part in partition:
        if len(part) <= 1:
            this_node.append(PhyloNode(part.pop()))
            continue
        elif len(part) == 2:
            child = PhyloNode()
            for grandchild in part:
                child.append(PhyloNode(grandchild))
            this_node.append(child)
            continue
    
        part = set(part)
        part_losses = []
        part_triples = []
        for losses, (names, F) in zip(losses, triples):
            if set(names) <= part:
                part_losses.append(losses)
                part_triples.append((names, F))
        this_node.append(termite_tree(part_triples, part_losses, verbose=verbose))
    return this_node

# Some examples
## Example 1
Fit a rooted phylogeny of 5 mammals.

In [7]:
aln = load_aligned_seqs('/home/ben/Data/pentads/ENSG00000197102.fa.gz', moltype="dna")
# aln = load_aligned_seqs('/home/ben/Data/pentads/ENSG00000131018.fa.gz', moltype="dna")
# aln = load_aligned_seqs('/home/ben/Data/pentads/ENSG00000179869.fa.gz', moltype="dna")
# aln = load_aligned_seqs('brca1.fasta', moltype='dna')

### All at once
First run `termite` all the way through.

In [8]:
%%time
triples = get_triples(aln, codon_position=3, verbose=True)
tree = termite(triples, verbose=True)
print(tree.ascii_art())

Got 4406 positions


2021-09-13 06:30:22.894771: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-09-13 06:30:22.916349: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2599990000 Hz
2021-09-13 06:30:22.916984: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55a8b985c290 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-09-13 06:30:22.916996: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2021-09-13 06:30:22.975985: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2

Fitting 30 triples


2021-09-13 06:30:37.314545: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10
2021-09-13 06:30:37.478705: I tensorflow/core/kernels/cuda_solvers.cc:180] Creating CudaSolver handles for stream 0x55a8b593be90
2021-09-13 06:30:37.478853: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10
2021-09-13 06:30:37.672809: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10


Step: 1, initial loss: 2.002349615097046
Step: 101, initial loss: 1.6990163326263428
Step: 201, initial loss: 1.1770401000976562
Step: 301, initial loss: 0.4651837646961212
Step: 401, initial loss: 0.33783480525016785
Step: 501, initial loss: 0.3070788085460663
Step: 601, initial loss: 0.29280954599380493
Step: 701, initial loss: 0.2708946168422699
Step: 801, initial loss: 0.253675639629364
Step: 901, initial loss: 0.22664757072925568
Step: 1001, initial loss: 0.21593135595321655
Step: 1101, initial loss: 0.19900184869766235
Step: 1201, initial loss: 0.19339296221733093
Step: 1301, initial loss: 0.18176187574863434
Step: 1401, initial loss: 0.16928890347480774
Step: 1501, initial loss: 0.1574772596359253
Step: 1601, initial loss: 0.15164227783679962
Step: 1701, initial loss: 0.145457461476326
Step: 1801, initial loss: 0.14200682938098907
Step: 1901, initial loss: 0.14094436168670654
Step: 2001, initial loss: 0.13856738805770874
Step: 2101, initial loss: 0.1372535675764084
Step: 2201, i

## Example 2
### A single iteration
Now run through a single iteration of the algorithm.
#### Fit triples
Fits rooted, strand-symmetric, continuous-time models to every taxon triple.

In [15]:
losses, fits = fit_triples(triples, cherries_share_matrices=False, verbose=True)

Fitting 30 triples
Step: 1, initial loss: 2.0117835998535156
Step: 101, initial loss: 1.698805809020996
Step: 201, initial loss: 1.186188817024231
Step: 301, initial loss: 0.459677129983902
Step: 401, initial loss: 0.3385312855243683
Step: 501, initial loss: 0.3154182434082031
Step: 601, initial loss: 0.31015241146087646
Step: 701, initial loss: 0.3008975386619568
Step: 801, initial loss: 0.27459800243377686
Step: 901, initial loss: 0.26621389389038086
Step: 1001, initial loss: 0.2488139271736145
Step: 1101, initial loss: 0.24297833442687988
Step: 1201, initial loss: 0.2217521369457245
Step: 1301, initial loss: 0.20997193455696106
Step: 1401, initial loss: 0.18395183980464935
Step: 1501, initial loss: 0.17843468487262726
Step: 1601, initial loss: 0.16511203348636627
Step: 1701, initial loss: 0.1607339084148407
Step: 1801, initial loss: 0.15662065148353577
Step: 1901, initial loss: 0.1522778570652008
Step: 2001, initial loss: 0.14635497331619263
Step: 2101, initial loss: 0.1456945240497

#### Create $S_\mathcal{T}\left/E^\text{max}_\mathcal{T}\right.$
Creates the edges in Semple and Steel's $S_\mathcal{T}\left/E^\text{max}_\mathcal{T}\right.$ graph, at least as I understand it.

In [16]:
edges = get_edges(triples, losses)
edges

Counter({frozenset({'Human', 'Mouse'}): 1.7177821099758148,
         frozenset({'Dog', 'Mouse'}): 1.866870939731598,
         frozenset({'Dog', 'Human'}): 2.342536687850952,
         frozenset({'Human', 'Opossum'}): 0.5797104686498642,
         frozenset({'Dog', 'Opossum'}): 0.840646892786026,
         frozenset({'Human', 'Platypus'}): 0.5245423316955566,
         frozenset({'Dog', 'Platypus'}): 0.23943248391151428,
         frozenset({'Mouse', 'Opossum'}): 0.5786677002906842,
         frozenset({'Mouse', 'Platypus'}): 0.19638927280904406,
         frozenset({'Opossum', 'Platypus'}): 1.1134211421012878})

#### Find the root by partitioning on the minimum cut
Perform the minimum cut to partition our tip set into two, one either side of the root.

In [17]:
G = nx.Graph()
for edge, weight in edges.items():
    G.add_edge(*edge, weight=weight)
cut_value, partition = nx.stoer_wagner(G)
print(f'Cut value: {cut_value}, Partition:\n{partition}')

Cut value: 2.073785230517403, Partition:
(['Platypus'], ['Mouse', 'Dog', 'Opossum', 'Human'])


The rest is left as an exercise for the reader (or just look in `termite` above) - the algorithm continues recursively.