In [1]:
from io import StringIO
from collections import Counter

from cogent3.app import io
from qiime2.plugins.phylogeny.actions import iqtree
from q2_phylogeny import iqtree
from qiime2 import Artifact
from skbio import TabularMSA, DNA, read, TreeNode
from joblib import Parallel, delayed

In [2]:
# `io` here is cogent3.app.io (imported in the first cell).
# Loader app used later to turn a data store member into an alignment object.
loader = io.load_db()
# Open the filtered alignment data store.
dstore = io.get_data_store("horse_pig_bats-filtered.tinydb")
# Last expression of the cell: display the record summary.
dstore.describe

record type,number
completed,878
incomplete,122
logs,1


In [44]:
def convert_and_fit(aln, model='STRSYM'):
    """Write every third column of one alignment to FASTA and fit a tree to it.

    Parameters
    ----------
    aln
        Data store member identifying the alignment. It is passed to the
        module-level ``loader`` and the text before its first ``"."`` is used
        as the output file stem.
    model
        Forwarded to ``iqtree`` as ``substitution_model``.

    Returns
    -------
    The ``iqtree`` result viewed as a ``TreeNode``.

    Notes
    -----
    Side effect: writes ``hpbb/<stem>.fasta`` (the directory is created if
    it does not exist yet).
    """
    import os  # local import: keeps the function self-contained for workers

    ffs = loader(aln)
    # grab the third codon position
    # NOTE(review): assumes the alignment starts in-frame -- confirm upstream.
    ffs = ffs[2::3]
    filename = f'hpbb/{aln.split(".")[0]}.fasta'
    # Create the output directory on demand so a fresh run does not fail with
    # FileNotFoundError; exist_ok tolerates several workers arriving at once.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w') as fh:
        fh.write(ffs.to_fasta())
    return iqtree(filename, substitution_model=model).view(TreeNode)

In [45]:
# One delayed convert_and_fit call per member of the data store, executed by
# 10 concurrent joblib workers; results are collected into a list.
fit_jobs = (delayed(convert_and_fit)(member) for member in dstore)
trees = Parallel(n_jobs=10, verbose=True)(fit_jobs)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   14.4s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:   56.9s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:  2.1min
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:  3.8min
[Parallel(n_jobs=10)]: Done 878 out of 878 | elapsed:  4.2min finished


In [46]:
# Number of trees returned by the parallel fit.
len(trees)

878

In [62]:
# Tally what sits next to the 'Greater' node in every fitted tree:
#   * a sibling named Microbat / Pig / Horse is counted under its own name;
#   * a sibling that is not a tip is counted as 'dud root'.
ghb_siblings = Counter()
named_taxa = ('Microbat', 'Pig', 'Horse')
for tree in trees:
    greater_node = tree.find('Greater')
    for sibling in greater_node.siblings():
        label = sibling.name
        if label in named_taxa:
            ghb_siblings[label] += 1
        # Deliberately a separate test (not elif), as in the original tally.
        if not sibling.is_tip():
            ghb_siblings['dud root'] += 1

In [63]:
# Sanity check: the tallies add up to exactly one count per tree.
assert len(trees) == sum(ghb_siblings.values())

In [64]:
# Display the sibling tallies.
ghb_siblings

Counter({'dud root': 369, 'Pig': 51, 'Microbat': 408, 'Horse': 50})

In [65]:
# 'Microbat' tally expressed as a fraction of the number of trees.
ghb_siblings['Microbat']/len(trees)

0.4646924829157175

In [98]:
# Classify each tree by topology.
# A tree is tallied under 'Microbat' when some node's children are named
# exactly {'Pig', 'Horse'} or exactly {'Microbat', 'Greater'}; every other
# tree goes under 'other'.
ghb_siblings = Counter()
completely_right = 0
target_child_sets = [{'Pig', 'Horse'}, {'Microbat', 'Greater'}]
for tree in trees:
    # any() stops at the first matching node, like the original break, and
    # always yields a defined bool (the flag used to be set inside the loop).
    is_microbat = any(
        {child.name for child in node.children} in target_child_sets
        for node in tree.postorder()
    )
    ghb_siblings['Microbat' if is_microbat else 'other'] += 1

    # Number of tips attached directly to the root; computed once and reused.
    root_tip_count = sum(child.is_tip() for child in tree.children)
    is_completely_right = is_microbat and root_tip_count == 0
    completely_right += int(is_completely_right)

    # Print every tree that is not "completely right" for inspection.
    if not is_completely_right:
        print(tree.ascii_art())
        print(is_microbat)
        print(root_tip_count)

                    /-Greater
          /--------|
         |         |          /-Pig
---------|          \--------|
         |                    \-Horse
         |
          \-Microbat
True
1
          /-Greater
---------|
         |          /-Microbat
          \--------|
                   |          /-Pig
                    \--------|
                              \-Horse
True
1
                              /-Greater
                    /--------|
          /--------|          \-Pig
         |         |
---------|          \-Horse
         |
          \-Microbat
False
1
                    /-Greater
          /--------|
         |         |          /-Pig
---------|          \--------|
         |                    \-Horse
         |
          \-Microbat
True
1
                              /-Greater
                    /--------|
          /--------|          \-Microbat
         |         |
---------|          \-Horse
         |
          \-Pig
True
1
                    /-Gr

In [99]:
# Sanity check: every tree was tallied exactly once.
assert len(trees) == sum(ghb_siblings.values())

In [100]:
# Display the topology tallies.
ghb_siblings

Counter({'Microbat': 682, 'other': 196})

In [101]:
# Fraction of trees tallied under 'Microbat' by the topology scan.
ghb_siblings['Microbat'] / len(trees)

0.7767653758542141

In [102]:
# Number of trees tallied under 'Microbat' whose root has no tip children.
completely_right

34

In [10]:
import json

# For every tree, in order, record the name of each sibling of 'Greater'
# that is one of the three named taxa; other siblings are skipped.
ordered_siblings = [
    neighbour.name
    for fitted_tree in trees
    for neighbour in fitted_tree.find('Greater').siblings()
    if neighbour.name in ('Microbat', 'Pig', 'Horse')
]

# Persist the ordered list as JSON.
with open('iq-tree-third.json', 'w') as out_file:
    json.dump(ordered_siblings, out_file)