# `pulsar`
The

Strand Symmetric Spectral

Method

In [2]:
from itertools import combinations
from collections import Counter, defaultdict

from cogent3 import load_aligned_seqs, PhyloNode
import numpy as np
import networkx as nx
from sklearn.cluster import SpectralClustering

In [3]:
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions
tfb = tfp.bijectors

tf.executing_eagerly()  # need to check whether this is the default for tensorflow > 2

2021-11-12 08:09:07.137318: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1


True

In [4]:
# this stops tensorflow from snaffling all of the GPU
# thanks https://stackoverflow.com/questions/34199233/how-to-prevent-tensorflow-from-allocating-the-totality-of-a-gpu-memory
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
  tf.config.experimental.set_memory_growth(gpu, True)

2021-11-12 08:09:09.714683: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2021-11-12 08:09:09.764534: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-12 08:09:09.764855: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 2080 with Max-Q Design computeCapability: 7.5
coreClock: 1.095GHz coreCount: 46 deviceMemorySize: 7.79GiB deviceMemoryBandwidth: 357.69GiB/s
2021-11-12 08:09:09.764877: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2021-11-12 08:09:09.766352: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10
2021-11-12 08:09:09.767675: I tensorflow/s

## Data import
Reads an alignments and creates a list of 4 x 4 x 4 joint frequencies tensors.

In [5]:
def get_triples(aln, nuc_order='ACGT', codon_position=None, verbose=False):
    if codon_position:
        aln = aln[codon_position - 1::3]
    aln = aln.no_degenerates()
    if verbose:
        print(f'Got {len(aln)} positions')
    assert len(aln) <= np.iinfo(np.int32).max
    triples = []
    nuc_map = {n:i for i, n in enumerate(nuc_order)}
    for triple in combinations(range(aln.num_seqs), 3):
        F = np.zeros([4, 4, 4], dtype=np.int32)
        subaln = aln.get_sub_alignment(seqs=triple)
        for a, b, c in subaln:
            F[nuc_map[a], nuc_map[b], nuc_map[c]] += 1
        triples.append([tuple(subaln.names), F])
    return triples

## Triple fitting functions
Collection of functions for concurrent fitting of many triples on CPUs and GPUs. Model is rooted, continuous-time, and strand-symmetric.

Also some functions for using Akaike-ish weights to build Semple and Steel-ish graphs.

In [435]:
@tf.function()
def transform_P_matrix(params, t_param):
    # params = tf.exp(params)
    t = tf.exp(t_param)
    Q0 = tf.concat([[-tf.reduce_sum(params[0])], params[0]],
                   axis=0)
    Q1 = tf.concat([[params[1,0]], [-tf.reduce_sum(params[1])], params[1,1:]],
                   axis=0)
    Q = tf.concat([[Q0], [Q1], [Q1[::-1]], [Q0[::-1]]], axis=0)
    return tf.linalg.expm(Q*t)

@tf.function()
def transform(params):
    zero = tf.constant(0, dtype=tf.float32)
    pi = tfb.SoftmaxCentered()(params[0])
    Q = tfb.SoftmaxCentered()(tf.reshape(params[1:3], [-1])[:-1])
    Q = tf.stack([Q[:3], Q[3:]])
    Pa = transform_P_matrix(Q, params[3,0])
    Pm = transform_P_matrix(Q, params[2,2])
    Pb = transform_P_matrix(Q, params[3,1])
    Pc = transform_P_matrix(Q, params[3,2])
    return pi, Pa, Pm, Pb, Pc
    
@tf.function()
def _loss(params_data):
    params, data = params_data
    pi, Pa, Pm, Pb, Pc = transform(params)
    J = tf.einsum('i,ij,ik,ku,kv', pi, Pa, Pm, Pb, Pc)
    loss = tf.reduce_sum(tf.keras.losses.KLDivergence()(J, data))
    return loss
    
@tf.function()
def loss(params, data):
    return tf.reduce_sum(tf.vectorized_map(_loss, (params, data)))

@tf.function()
def training_step(parameters, data, optimizer):
    with tf.GradientTape() as tape:
        loss_value = loss(parameters, data)
    gradients = tape.gradient(loss_value, parameters)
    return loss_value, gradients

# thanks https://github.com/mlgxmez/thelongrun_notebooks/blob/master/MLE_tutorial.ipynb
def mle_fit(data, loss, parameters, optimizer, steps=500, verbose=False):
    for i in range(steps):
        loss_value, gradients = training_step(parameters, data, optimizer)
        optimizer.apply_gradients([(gradients, parameters)])
        
        if i % 100 == 0:
            if verbose:
                iter_info = f"Step: {optimizer.iterations.numpy()}, initial loss: {loss_value.numpy()}"
                print(iter_info)

def fit_triples(triples, learning_rate=0.01, steps=3000, verbose=False):
    K = 3*len(triples)
    cherry_loc = {}
    data = []
    for names, F in triples:
        J = (F/F.sum()).astype(np.float32)
        for ix in [[0, 1, 2], [1, 2, 0], [2, 0, 1]]:
            data.append(J.transpose(ix))

    normal_initializer = tf.random_normal_initializer()
    parameters = tf.Variable(normal_initializer(shape=[K, 4, 3], dtype=tf.float32), name='params')
#    for i in range(K):
#        parameters[i, 3, 0].assign(np.random.normal(np.log(2)))
    data = tf.constant(data)
    
    if verbose:
        print(f'Fitting {data.shape[0]} triples')
    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)
    mle_fit(data, loss, parameters, optimizer, steps=steps, verbose=verbose)
    
    losses = tf.vectorized_map(_loss, (parameters, data)).numpy()
    losses = [losses[i:i+3] for i in range(0, len(losses), 3)]
    parameters = [parameters[i:i+3].numpy() for i in range(0, K, 3)]
#    lengths = [np.exp(parameters[i,3].numpy()) for i in range(0, K)]
#    lengths = [lengths[i:i+3] for i in range(0, len(lengths), 3)]
    return losses, parameters

def cherry_weights(ls, N):
    ls = N*ls
    delta = ls - ls.min()
    weights = np.exp(-delta)
    weights = weights/weights.sum()
    return weights
    h = (-weights*np.log(weights))[weights != 0.].sum()
#     print(ls)
#     print(weights)
#     print((ls == ls.min())/(h + 1))
#     print()
#    return (ls == ls.min())/(h + 1)
    return weights/(h + 1)  # this was my favourite
    if h < 1.2:
        return weights
        return (ls == ls.min())/(h+1)
    return np.zeros(3, dtype=bool)

def get_edges(triples, losses):
    edges = Counter()
    for losses, (names, F) in zip(losses, triples):
        weights = cherry_weights(losses, F.sum())
        for name, weight in zip(names, weights):
            edges[frozenset(names) - {name}] += weight
    # for edge in edges:
    #   edges[edge] = np.exp(edges[edge])
    return edges

def get_Ps(cherry_names, triples, fits):
    Ps = {}
    ixes = [[0, 1, 2], [1, 2, 0], [2, 0, 1]]
    for (names, _), triple_fit in zip(triples, fits):
        if set(cherry_names) < set(names):
            for name, ix, fit in zip(names, ixes, triple_fit):
                if name not in cherry_names:
                    if names[ix[1]] == cherry_names[0]:
                        return fit[-2], fit[-1]
                    return fit[-1], fit[-2]

## Tree building algorithm
Where the magic happens.

In [60]:
def pulsar(triples, learning_rate=0.01, steps=3000, verbose=False):
    losses, parameters = fit_triples(triples, learning_rate=learning_rate,
                         steps=steps, verbose=verbose)
    tree = pulsar_tree(triples, losses, verbose=verbose)
    return tree

def edges_to_graph(edges):
    G = nx.Graph()
    for edge, weight in edges.items():
        G.add_edge(*edge, weight=weight)
    return G

def normalised_cut(edges, verbose=False):
    tips = np.unique([t for p in edges.keys() for t in p])
    affinity = np.zeros((len(tips),)*2)
    for i, tipi in enumerate(tips):
        for j, tipj in enumerate(tips):
            if i == j:
                break
            affinity[i, j] = edges[frozenset((tipi, tipj))]
    affinity += affinity.T
    sc = SpectralClustering(2, affinity='precomputed',  # random_state=0,
                            assign_labels='discretize')
    ix = sc.fit_predict(affinity).astype(bool)
    partition = list(tips[ix]), list(tips[np.logical_not(ix)])
    if verbose:
        G = edges_to_graph(edges)
        cut_value = nx.cut_size(G, partition[0], weight='weight')
        print(f'Cut value: {cut_value}, Partition:\n{partition}')
    return partition
    
def min_cut(edges, verbose=False):
    G = edges_to_graph(edges)
    cut_value, partition = nx.stoer_wagner(G)
    if verbose:
        print(f'Cut value: {cut_value}, Partition:\n{partition}')
    return partition

def pulsar_tree(triples, losses, verbose=False):
    edges = get_edges(triples, losses)
    if verbose:
        print('Graph:')
        for edge, weight in edges.items():
            print(edge, weight)
    partition = normalised_cut(edges, verbose)
    assert len(partition) == 2, 'polytomy detected. bailing'
    this_node = PhyloNode()
    for part in partition:
        if len(part) <= 1:
            this_node.append(PhyloNode(part.pop()))
            continue
        elif len(part) == 2:
            child = PhyloNode()
            for grandchild in part:
                child.append(PhyloNode(grandchild))
            this_node.append(child)
            continue
    
        part = set(part)
        part_losses = []
        part_triples = []
        for losses_for_names, (names, F) in zip(losses, triples):
            if set(names) <= part:
                part_losses.append(losses_for_names)
                part_triples.append((names, F))
        this_node.append(pulsar_tree(part_triples, part_losses, verbose=verbose))
    return this_node

# Some examples
## Example 1
Fit a rooted phylogeny of 5 mammals.

In [38]:
# aln = load_aligned_seqs('/home/ben/Data/pentads/ENSG00000197102.fa.gz', moltype="dna")
aln = load_aligned_seqs('/home/ben/Data/pentads/ENSG00000131018.fa.gz', moltype="dna")
# aln = load_aligned_seqs('/home/ben/Data/pentads/ENSG00000179869.fa.gz', moltype="dna")
# aln = load_aligned_seqs('brca1.fasta', moltype='dna')

In [30]:
subaln = aln.take_seqs(['SpermWhale', 'HumpbackW', 'Rhino'])
subaln

0,1
,0
Rhino,TGTGGCACGAATACTCATGCCAGCTCATTGCAGCATGAGAACAGCAGTGTATTACTCACT
SpermWhale,........AG...................A........A.........T...........
HumpbackW,........AG...................A..A.....A.........T...........


In [27]:
subaln = aln.get_similar(aln.take_seqs(['Human']).seqs[0], min_similarity=0.82)
#                         min_similarity=0.84)
subaln

0,1
,0
Dog,TGTGGCACAAATACTCATGCCAGCTCATTACAGCATGAGAACAGCAGTTTATTACTCACT
FreeTaile,.........G.........................................C........
LittleBro,.........G.........................................C.......C
LeafNose,..........................T.....TT.....C....................
Horse,.............................G..............................
Rhino,........G....................G..................G...........
Pangolin,............................................................
Cat,....C...........G...........................................
Llama,.........G..................................................


### All at once
First run `pulsar` all the way through.

In [39]:
%%time
triples = get_triples(aln, codon_position=2, verbose=True)
tree = pulsar(triples, verbose=True)
print(tree.ascii_art())

Got 1379 positions
Fitting 30 triples




Step: 1, initial loss: 9.560052871704102
Step: 101, initial loss: 6.047533988952637
Step: 201, initial loss: 3.3438515663146973
Step: 301, initial loss: 2.337021589279175
Step: 401, initial loss: 2.0015220642089844
Step: 501, initial loss: 1.8608124256134033
Step: 601, initial loss: 1.717704176902771
Step: 701, initial loss: 1.5398657321929932
Step: 801, initial loss: 1.4150830507278442
Step: 901, initial loss: 1.2605291604995728
Step: 1001, initial loss: 1.0925577878952026
Step: 1101, initial loss: 0.9561669826507568
Step: 1201, initial loss: 0.8565823435783386
Step: 1301, initial loss: 0.8171669840812683
Step: 1401, initial loss: 0.7617971897125244
Step: 1501, initial loss: 0.726557731628418
Step: 1601, initial loss: 0.7041598558425903
Step: 1701, initial loss: 0.6948224306106567
Step: 1801, initial loss: 0.6895954012870789
Step: 1901, initial loss: 0.6852792501449585
Step: 2001, initial loss: 0.6800824403762817
Step: 2101, initial loss: 0.6314908862113953
Step: 2201, initial loss: 0



### Min Cut
```
          /-Orangutan
---------|
         |          /-HairyArma
          \--------|
                   |          /-Sloth
                    \--------|
                             |          /-Pangolin
                              \--------|
                                       |          /-Chimpanzee
                                        \--------|
                                                 |          /-Gorilla
                                                  \--------|
                                                           |          /-FlyingLem
                                                            \--------|
                                                                     |          /-Rhesus
                                                                      \--------|
                                                                               |          /-Human
                                                                                \--------|
                                                                                         |          /-Llama
                                                                                          \--------|
                                                                                                   |          /-HowlerMon
                                                                                                    \--------|
                                                                                                             |          /-HumpbackW
                                                                                                              \--------|
                                                                                                                       |          /-Horse
                                                                                                                        \--------|
                                                                                                                                 |          /-Rhino
                                                                                                                                  \--------|
                                                                                                                                            \-SpermWhale
```

### Normalised Cut
```
                              /-Pangolin
                    /--------|
                   |         |          /-Rhino
                   |          \--------|
                   |                   |          /-HumpbackW
          /--------|                    \--------|
         |         |                              \-SpermWhale
         |         |
         |         |          /-Horse
         |          \--------|
         |                    \-Llama
---------|
         |                    /-HairyArma
         |          /--------|
         |         |         |          /-FlyingLem
         |         |          \--------|
         |         |                    \-Sloth
          \--------|
                   |                    /-Gorilla
                   |          /--------|
                   |         |          \-HowlerMon
                   |         |
                    \--------|                    /-Human
                             |          /--------|
                             |         |          \-Orangutan
                              \--------|
                                       |          /-Chimpanzee
                                        \--------|
                                                  \-Rhesus
```

### More Tips
```
                              /-LittleBro
                    /--------|
                   |         |          /-FreeTaile
                   |          \--------|
                   |                    \-LeafNose
                   |
                   |                              /-Hippo
          /--------|                    /--------|
         |         |                   |          \-Pangolin
         |         |          /--------|
         |         |         |         |          /-Pig
         |         |         |          \--------|
         |         |         |                   |          /-HumpbackW
         |         |         |                    \--------|
         |          \--------|                              \-SpermWhale
         |                   |
         |                   |                    /-Cat
         |                   |          /--------|
         |                   |         |          \-Rhino
         |                    \--------|
         |                             |                    /-Horse
         |                             |          /--------|
         |                              \--------|          \-Llama
         |                                       |
         |                                        \-Dog
---------|
         |                                        /-Mole
         |                              /--------|
         |                             |         |          /-FlyingSqu
         |                             |          \--------|
         |                    /--------|                    \-Gorilla
         |                   |         |
         |                   |         |          /-FlyingLem
         |                   |          \--------|
         |                   |                    \-Galago
         |          /--------|
         |         |         |                    /-Orangutan
         |         |         |          /--------|
         |         |         |         |         |          /-Chimpanzee
         |         |         |         |          \--------|
         |         |          \--------|                    \-Human
         |         |                   |
         |         |                   |          /-HowlerMon
         |         |                    \--------|
          \--------|                              \-Rhesus
                   |
                   |                              /-AfricanEl
                   |                    /--------|
                   |                   |         |          /-Dugong
                   |                   |          \--------|
                   |          /--------|                    \-Manatee
                   |         |         |
                   |         |         |          /-Aardvark
                    \--------|          \--------|
                             |                    \-AsianElep
                             |
                             |          /-NineBande
                              \--------|
                                       |          /-Anteater
                                        \--------|
                                                 |          /-HairyArma
                                                  \--------|
                                                            \-Sloth
CPU times: user 10h 25min 10s, sys: 45min 20s, total: 11h 10min 31s
Wall time: 5h 45min 42s
```

## Example 2
### A single iteration
Now run through a single iteration of the algorithm.
#### Fit triples
Fits rooted, strand-symmetric, continuous-time models to every taxon triple.

In [131]:
triples = get_triples(aln, codon_position=2, verbose=True)
losses, params = fit_triples(triples, verbose=True)

Got 408 positions
Fitting 12 triples




Step: 1, initial loss: 4.6430792808532715
Step: 101, initial loss: 1.5350946187973022
Step: 201, initial loss: 0.7035847902297974
Step: 301, initial loss: 0.5439425110816956
Step: 401, initial loss: 0.48084330558776855
Step: 501, initial loss: 0.4436599910259247
Step: 601, initial loss: 0.41333478689193726
Step: 701, initial loss: 0.37753191590309143
Step: 801, initial loss: 0.3322867453098297
Step: 901, initial loss: 0.2810487747192383
Step: 1001, initial loss: 0.22879187762737274
Step: 1101, initial loss: 0.1813226342201233
Step: 1201, initial loss: 0.14140407741069794
Step: 1301, initial loss: 0.11189055442810059
Step: 1401, initial loss: 0.09268411993980408
Step: 1501, initial loss: 0.0805152952671051
Step: 1601, initial loss: 0.07250875979661942
Step: 1701, initial loss: 0.06695887446403503
Step: 1801, initial loss: 0.06292945891618729
Step: 1901, initial loss: 0.05989427492022514
Step: 2001, initial loss: 0.05754075571894646
Step: 2101, initial loss: 0.055672790855169296
Step: 22

In [151]:
transform(params[3][2])

(<tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.3047531 , 0.28164834, 0.23760423, 0.17599438], dtype=float32)>,
 <tf.Tensor: shape=(4, 4), dtype=float32, numpy=
 array([[9.8576844e-01, 7.2712726e-05, 1.4108302e-02, 5.0507126e-05],
        [7.6896969e-05, 9.9932408e-01, 5.2272822e-05, 5.4672622e-04],
        [5.4672628e-04, 5.2272822e-05, 9.9932414e-01, 7.6896962e-05],
        [5.0507129e-05, 1.4108303e-02, 7.2712741e-05, 9.8576844e-01]],
       dtype=float32)>,
 <tf.Tensor: shape=(4, 4), dtype=float32, numpy=
 array([[9.7904229e-01, 1.0758858e-04, 2.0775650e-02, 7.4532043e-05],
        [1.1325704e-04, 9.9900395e-01, 7.7625104e-05, 8.0510252e-04],
        [8.0510252e-04, 7.7625104e-05, 9.9900395e-01, 1.1325703e-04],
        [7.4532043e-05, 2.0775652e-02, 1.0758858e-04, 9.7904229e-01]],
       dtype=float32)>,
 <tf.Tensor: shape=(4, 4), dtype=float32, numpy=
 array([[9.9626046e-01, 1.8966186e-05, 3.7073039e-03, 1.3228843e-05],
        [2.0201114e-05, 9.9982250e-01, 1.3557465e-05, 

In [152]:
params[3][2]

array([[ 0.54904985,  0.4702072 ,  0.3001543 ],
       [-2.0274463 ,  3.2505777 , -2.3862395 ],
       [-1.96184   , -2.3652027 , -3.8079286 ],
       [-4.198524  , -5.5405207 , -3.6191287 ]], dtype=float32)

#### Create $S_\mathcal{T}\left/E^\text{max}_\mathcal{T}\right.$
Creates the edges in Semple and Steel's $S_\mathcal{T}\left/E^\text{max}_\mathcal{T}\right.$ graph, at least as I understand it.

In [132]:
edges = get_edges(triples, losses)
edges

Counter({frozenset({'Greater horseshoe bat', 'Microbat'}): 0.3357943147420883,
         frozenset({'Greater horseshoe bat', 'Horse'}): 0.33598607778549194,
         frozenset({'Horse', 'Microbat'}): 0.3068389445543289,
         frozenset({'Microbat', 'Pig'}): 0.3031523525714874,
         frozenset({'Horse', 'Pig'}): 0.29223328828811646,
         frozenset({'Greater horseshoe bat', 'Pig'}): 0.33450715243816376})

#### Find the root by partitioning on the minimum cut
Perform the minimum cut to partition our tip set into two, one either side of the root.

In [153]:
G = nx.Graph()
for edge, weight in edges.items():
    G.add_edge(*edge, weight=weight)
cut_value, partition = nx.stoer_wagner(G)
print(f'Cut value: {cut_value}, Partition:\n{partition}')

Cut value: 0.9298927932977676, Partition:
(['Pig'], ['Microbat', 'Greater horseshoe bat', 'Horse'])


In [154]:
from sklearn.cluster import SpectralClustering

tips = list(set(t for p in edges.keys() for t in p))
affinity = np.zeros((len(tips),)*2)
for i, tipi in enumerate(tips):
    for j, tipj in enumerate(tips):
        if i == j:
            break
        affinity[i, j] = edges[frozenset((tipi, tipj))]
affinity += affinity.T
affinity
sc = SpectralClustering(2, affinity='precomputed',
                        assign_labels='discretize')
print
print(sc.fit_predict(affinity))
print(tips)

[0 0 0 1]
['Microbat', 'Greater horseshoe bat', 'Horse', 'Pig']


The rest is left as an exercise for the reader (or just look in `pulsar` above) - the algorithm continues recursively.

In [155]:
w = np.ones(3)/3
(-w*np.log(w)).sum()

1.0986122886681096

## Example 3
### Let's get systematic
For future reference: joblib and tensorflow do not play nice together.

In [156]:
from cogent3.app import io

In [157]:
dstore = io.get_data_store("../data/horse_pig_bats-filtered.tinydb")
loader = io.load_db()
dstore.describe

record type,number
completed,878
incomplete,122
logs,1


In [428]:
%%time
num_alns = len(dstore)
all_triples = []
for aln_name in dstore:
    aln = loader(aln_name)
    triples = get_triples(aln, codon_position=2, verbose=False)
    all_triples.extend(triples)
all_losses, all_parameters = fit_triples(all_triples, steps=12000, verbose=True)

Fitting 10536 triples




Step: 1, initial loss: 4412.3935546875
Step: 101, initial loss: 1435.83935546875
Step: 201, initial loss: 670.4434814453125
Step: 301, initial loss: 507.23193359375
Step: 401, initial loss: 428.7957763671875
Step: 501, initial loss: 378.2879333496094
Step: 601, initial loss: 339.9563293457031
Step: 701, initial loss: 307.46343994140625
Step: 801, initial loss: 277.52020263671875
Step: 901, initial loss: 248.21096801757812
Step: 1001, initial loss: 218.67623901367188
Step: 1101, initial loss: 189.62188720703125
Step: 1201, initial loss: 162.75228881835938
Step: 1301, initial loss: 139.31051635742188
Step: 1401, initial loss: 119.49615478515625
Step: 1501, initial loss: 102.89000701904297
Step: 1601, initial loss: 88.91476440429688
Step: 1701, initial loss: 77.19363403320312
Step: 1801, initial loss: 67.60419464111328
Step: 1901, initial loss: 59.997230529785156
Step: 2001, initial loss: 54.08952331542969
Step: 2101, initial loss: 49.5388069152832
Step: 2201, initial loss: 46.01953887939

In [436]:
trees = []
for i in range(0, len(all_losses), 4):
    tree = pulsar_tree(all_triples[i:i+4], all_losses[i:i+4])
    trees.append(tree)

In [437]:
ghb_siblings = Counter()
for tree in trees:
    tree = tree.unrooted()
    for sibling in tree.get_node_matching_name('Greater horseshoe bat').parent.children:
        if sibling.name in ('Microbat', 'Pig', 'Horse'):
            ghb_siblings[sibling.name] += 1

In [438]:
ghb_siblings

Counter({'Microbat': 346, 'Horse': 268, 'Pig': 264})

In [439]:
assert sum(ghb_siblings.values()) == len(trees)
1 - ghb_siblings['Microbat']/len(trees)

0.6059225512528474

In [180]:
print(tree.ascii_art())

          /-Greater horseshoe bat
         |
-edge.0--|--Horse
         |
         |          /-Microbat
          \--------|
                    \-Pig


In [164]:
pulsar_tree(all_triples[0:4], losses[0:4], verbose=True)

Graph:
frozenset({'Microbat', 'Pig'}) 0.33219969272613525
frozenset({'Greater horseshoe bat', 'Pig'}) 0.33232176303863525
frozenset({'Microbat', 'Greater horseshoe bat'}) 0.30895714461803436
frozenset({'Microbat', 'Horse'}) 0.3057943135499954
frozenset({'Greater horseshoe bat', 'Horse'}) 0.29714664816856384
frozenset({'Horse', 'Pig'}) 0.3312150835990906
Cut value: 0.9384255558252335, Partition:
(['Greater horseshoe bat'], ['Horse', 'Microbat', 'Pig'])
Graph:
frozenset({'Horse', 'Pig'}) 0.16781701147556305
frozenset({'Microbat', 'Horse'}) 0.14003528654575348
frozenset({'Microbat', 'Pig'}) 0.16949054598808289
Cut value: 0.30785229802131653, Partition:
(['Microbat', 'Pig'], ['Horse'])


Tree("(Greater_horseshoe_bat,((Microbat,Pig),Horse));")

In [237]:
pulsar_tree(all_triples[0:4], losses[0:4], verbose=True)

Graph:
frozenset({'Pig', 'Microbat'}) 0.6592081189155579
frozenset({'Pig', 'Greater horseshoe bat'}) 0.6670434474945068
frozenset({'Greater horseshoe bat', 'Microbat'}) 0.8335337042808533
frozenset({'Horse', 'Microbat'}) 0.8486621677875519
frozenset({'Horse', 'Greater horseshoe bat'}) 0.33249066902624236
frozenset({'Pig', 'Horse'}) 0.6590619087219238
Cut value: 2.4842944009445773, Partition:
(['Greater horseshoe bat', 'Pig'], ['Horse', 'Microbat'])


Tree("((Greater_horseshoe_bat,Pig),(Horse,Microbat));")

In [165]:
len(trees)

878

In [173]:
tree.unrooted()

Tree("(Greater_horseshoe_bat,Horse,(Microbat,Pig));")

In [58]:
all_lengths

[[array([0.00947547, 0.08680928, 0.0295497 ], dtype=float32),
  array([0.01223142, 0.03364073, 0.03420453], dtype=float32),
  array([0.01593519, 0.0292052 , 0.06262722], dtype=float32)],
 [array([0.01430178, 0.0238518 , 0.02402304], dtype=float32),
  array([0.04306394, 0.03021891, 0.03949882], dtype=float32),
  array([0.00777646, 0.03517144, 0.0303967 ], dtype=float32)],
 [array([0.0316271 , 0.03725714, 0.16054668], dtype=float32),
  array([0.00905246, 0.3178534 , 0.0406809 ], dtype=float32),
  array([0.01028289, 0.03412111, 0.03439054], dtype=float32)],
 [array([0.05644648, 0.02766903, 0.02364084], dtype=float32),
  array([0.01474296, 0.02657158, 0.02611756], dtype=float32),
  array([0.00978957, 0.03517712, 0.04180461], dtype=float32)],
 [array([0.01247493, 0.02970215, 0.03043655], dtype=float32),
  array([0.05246733, 0.03821764, 0.02340573], dtype=float32),
  array([0.03411119, 0.02289377, 0.15011947], dtype=float32)],
 [array([0.10469571, 0.03243235, 0.07028604], dtype=float32),
  a

In [165]:
all_losses

[array([0.00191721, 0.00178754, 0.00200574], dtype=float32),
 array([0.00215732, 0.0020705 , 0.00155442], dtype=float32),
 array([0.00209625, 0.00198488, 0.00098731], dtype=float32),
 array([0.00249136, 0.00178582, 0.00235044], dtype=float32),
 array([0.00250065, 0.00201581, 0.00214428], dtype=float32),
 array([0.00271034, 0.00206783, 0.00222695], dtype=float32),
 array([0.00241017, 0.00203487, 0.00209344], dtype=float32),
 array([0.0015523 , 0.0024744 , 0.00235546], dtype=float32),
 array([0.00118315, 0.00075147, 0.00126201], dtype=float32),
 array([0.00125118, 0.00099299, 0.0012964 ], dtype=float32),
 array([0.00048616, 0.00050437, 0.00054138], dtype=float32),
 array([0.00086001, 0.00124025, 0.00088773], dtype=float32),
 array([0.00471398, 0.00465313, 0.0041287 ], dtype=float32),
 array([0.0033029 , 0.0033121 , 0.00348281], dtype=float32),
 array([0.00404998, 0.00360612, 0.00484333], dtype=float32),
 array([0.00416164, 0.00464752, 0.00430409], dtype=float32),
 array([0.00232247, 0.00

In [246]:
for i in range(100):
    print(cherry_weights(all_losses[i], all_triples[i][1].sum()), all_losses[i].argmin() == np.exp(all_parameters[i][:,2,2]).argmax())

[0.33169898 0.34606633 0.32223466] True
[0.3080029  0.31687272 0.37512442] True
[0.28783745 0.2985128  0.41364974] True
[0.30242023 0.3808976  0.31668213] True
[0.30087325 0.35755527 0.34157145] True
[0.29029328 0.36490086 0.34480587] True
[0.30653206 0.35034868 0.34311926] True
[0.40461448 0.29139143 0.30399418] True
[0.319236 0.370023 0.310741] False
[0.32499525 0.3549974  0.32000738] True
[0.3361252  0.33403802 0.32983685] False
[0.34859905 0.3060907  0.3453103 ] True
[0.2986716  0.30739373 0.3939347 ] True
[0.34316155 0.3416717  0.31516677] False
[0.34238097 0.42236495 0.23525405] False
[0.36636582 0.2911415  0.34249264] True
[0.1709815  0.4621346  0.36688393] False
[0.43246102 0.22034487 0.34719414] False
[0.43747777 0.26008427 0.30243805] True
[0.3888274  0.36216745 0.24900511] False
[0.2835595  0.38453934 0.33190116] True
[0.32142985 0.31107736 0.36749282] True
[0.41663557 0.3510583  0.23230614] True
[0.41577858 0.3898388  0.19438264] True
[0.24114734 0.38738477 0.37146786] True

In [248]:
for i in range(100):
    print(all_triples[i][0], all_triples[i][0][all_losses[i].argmin()], all_triples[i][0][all_parameters[i][:,2,2].argmax()])

('Greater horseshoe bat', 'Microbat', 'Pig') Microbat Microbat
('Greater horseshoe bat', 'Microbat', 'Horse') Horse Horse
('Greater horseshoe bat', 'Pig', 'Horse') Horse Horse
('Microbat', 'Pig', 'Horse') Pig Pig
('Greater horseshoe bat', 'Microbat', 'Pig') Microbat Microbat
('Greater horseshoe bat', 'Microbat', 'Horse') Microbat Microbat
('Greater horseshoe bat', 'Pig', 'Horse') Pig Pig
('Microbat', 'Pig', 'Horse') Microbat Microbat
('Greater horseshoe bat', 'Microbat', 'Pig') Microbat Pig
('Greater horseshoe bat', 'Microbat', 'Horse') Microbat Microbat
('Greater horseshoe bat', 'Pig', 'Horse') Greater horseshoe bat Horse
('Microbat', 'Pig', 'Horse') Microbat Microbat
('Greater horseshoe bat', 'Microbat', 'Pig') Pig Pig
('Greater horseshoe bat', 'Microbat', 'Horse') Greater horseshoe bat Microbat
('Greater horseshoe bat', 'Pig', 'Horse') Pig Greater horseshoe bat
('Microbat', 'Pig', 'Horse') Microbat Microbat
('Greater horseshoe bat', 'Microbat', 'Pig') Microbat Pig
('Greater horsesho

In [234]:
all_parameters[0]

array([[[ 0.82137185, -0.07678887, -0.36368397],
        [-0.88250035,  3.0789864 , -0.11927749],
        [-2.3833287 , -2.2603009 , -7.282843  ],
        [-7.100134  , -6.94011   , -6.2452784 ]],

       [[ 0.8683239 , -0.04682273, -0.33534768],
        [-2.0190694 ,  3.0666864 , -2.090981  ],
        [-2.4840689 , -2.392481  , -6.004034  ],
        [-7.6769686 , -6.1266294 , -5.9855475 ]],

       [[ 0.80737597, -0.11864586, -0.40513352],
        [-3.5434964 , -2.684245  , -2.6538594 ],
        [ 3.2172313 ,  3.023668  , -8.752315  ],
        [-6.157029  , -7.076424  , -7.1379848 ]]], dtype=float32)

In [None]:
transform(all_parameters[2][2])

In [125]:
np.log(4)

1.3862943611198906

In [109]:
np.log(np.linalg.det(all_triples[4][1].sum(axis=2)/all_triples[4][1].sum()))

-5.68789409498752

In [122]:
all_triples[4][1].sum(axis=2)

array([[101,   1,   3,   1],
       [  0,  73,   0,   1],
       [  3,   0,  83,   0],
       [  0,   1,   0,  89]])

### Let's try for a little bit of magic

In [440]:
all_lengths = [np.exp(p[:,2,2]) for p in all_parameters]

In [441]:
set(frozenset(t[0]) for t in all_triples)

{frozenset({'Horse', 'Microbat', 'Pig'}),
 frozenset({'Greater horseshoe bat', 'Horse', 'Pig'}),
 frozenset({'Greater horseshoe bat', 'Horse', 'Microbat'}),
 frozenset({'Greater horseshoe bat', 'Microbat', 'Pig'})}

In [442]:
correct_answers = {frozenset({'Horse', 'Microbat', 'Pig'}): 'Microbat',
                   frozenset({'Greater horseshoe bat', 'Horse', 'Pig'}): 'Greater horseshoe bat',
                   frozenset({'Greater horseshoe bat', 'Horse', 'Microbat'}): 'Horse',
                   frozenset({'Greater horseshoe bat', 'Microbat', 'Pig'}): 'Pig'}

In [483]:
X = []
bigX = []
lenX = []
y = []
for triple, losses, lengths, params in zip(all_triples, all_losses, all_lengths, all_parameters):
    ml = losses.argmin()
    _lenX = np.exp(params[ml,3])
    _lenX[0] += np.exp(params[ml,2,2])
    _lenX = _lenX[np.array([(0, 1, 2), (1, 2, 0), (2, 0, 1)][ml])]
    lenX.append(_lenX[ix])
    ix = np.random.choice(range(3), 3, replace=False)
    X.append(np.hstack([losses[ix]/np.linalg.norm(losses),
                        lengths[ix]/np.linalg.norm(lengths),
                        _lenX[ix]/np.linalg.norm(_lenX)]))
    bigX.append(params[ix].flatten())
    triple = [triple[0][i] for i in ix]
    y.append(triple.index(correct_answers[frozenset(triple)]))
X = np.array(X)
y = np.array(y)

In [454]:
sum(_X[:3].argmin() == _y for _X, _y in zip(X, y)) / len(y)

0.36161731207289294

In [455]:
sum(_X[3:6].argmax() == _y for _X, _y in zip(X, y)) / len(y)

0.3892369020501139

In [456]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

clf = make_pipeline(StandardScaler(), SVC())
clf.fit(X, y)
y_svc = clf.predict(X)

In [457]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVC, SVC, LinearSVC

clf = make_pipeline(StandardScaler(), LinearSVC(C=1, class_weight='balanced'))
clf.fit(X, y)
y_svc = clf.predict(X)



In [458]:
sum(y_svc == y) / len(y)

0.39863325740318906

In [355]:
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

X_train, X_test, y_train, y_test = train_test_split(
    X[:,:3], y, test_size=0.2)

X_train.shape, y_train.shape

X_test.shape, y_test.shape


clf = make_pipeline(StandardScaler(), SVC(class_weight='balanced')).fit(X_train, y_train)
clf.score(X_test, y_test)

0.34281650071123754

In [487]:
from sklearn.model_selection import cross_val_score

clf = make_pipeline(StandardScaler(), LinearSVC(class_weight='balanced'))
scores = cross_val_score(clf, X[:,3:6], y, cv=5)
scores.mean()

0.38923579449895246

In [482]:
X[0]

array([4.7419196e-01, 6.0957062e-01, 6.3526803e-01, 9.9696743e-01,
       7.7752531e-02, 3.2330311e-03, 3.6112129e-04, 8.9671528e-03,
       3.1671236e-05], dtype=float32)