In [1]:
from io import StringIO
from collections import Counter

from cogent3.app import io
from qiime2.plugins.phylogeny.actions import iqtree
from qiime2 import Artifact
from skbio import TabularMSA, DNA, read, TreeNode
from joblib import Parallel, delayed

In [2]:
# Open the TinyDB data store and create the loader app that convert_and_fit
# uses to read individual members from it.
# NOTE(review): members are presumably filtered codon alignments (going by
# the file name and the slicing in convert_and_fit) -- confirm.
dstore = io.get_data_store("horse_pig_bats-filtered.tinydb")
loader = io.load_db()
# Display the store summary (counts per record type).
dstore.describe

record type,number
completed,878
incomplete,122
logs,1


In [3]:
def convert_and_fit(aln, seed=None):
    """Fit an IQ-TREE phylogeny to the second codon positions of one alignment.

    The data-store member is loaded with the module-level ``loader``, reduced
    to every third column starting at index 1, round-tripped through FASTA
    text into a scikit-bio ``TabularMSA`` of ``DNA`` sequences, imported as a
    QIIME 2 ``FeatureData[AlignedSequence]`` artifact and passed to the
    QIIME 2 ``iqtree`` action.

    Parameters
    ----------
    aln
        A member of the data store, as accepted by ``loader``.
    seed : int, optional
        Random seed forwarded to ``iqtree``. The default ``None`` leaves the
        seed unspecified (the original behaviour); pass an integer to make
        the fitted tree reproducible between runs.

    Returns
    -------
    skbio.TreeNode
        The first result of the ``iqtree`` action viewed as a ``TreeNode``.
    """
    alignment = loader(aln)
    second_positions = alignment[1::3]  # grab the second codon position

    # cogent3 and scikit-bio share no in-memory format, so hand the alignment
    # over as FASTA text.
    fasta_handle = StringIO(second_positions.to_fasta())
    msa = TabularMSA([DNA(seq) for seq in read(fasta_handle, 'fasta')])
    artifact = Artifact.import_data('FeatureData[AlignedSequence]', msa)

    # Only forward the seed when one was given so the default call is
    # identical to the original ``iqtree(artifact)``.
    fit_kwargs = {} if seed is None else {'seed': seed}
    return iqtree(artifact, **fit_kwargs)[0].view(TreeNode)

In [4]:
# Fit one tree per data-store member using 10 concurrent joblib workers.
fit_one = delayed(convert_and_fit)
tasks = (fit_one(member) for member in dstore)
trees = Parallel(n_jobs=10, verbose=True)(tasks)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   11.8s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:   47.0s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:  1.8min
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:  3.3min
[Parallel(n_jobs=10)]: Done 878 out of 878 | elapsed:  3.8min finished


In [5]:
# Number of fitted trees; the joblib log above reports the same number of
# completed tasks.
len(trees)

878

In [6]:
# Tally, across all fitted trees, which of the three named taxa appears as a
# sibling of the 'Greater' tip. Siblings with any other name (e.g. unnamed
# internal nodes) are ignored.
# NOTE(review): 'Greater' is presumably the greater horseshoe bat -- confirm.
ghb_siblings = Counter(
    node.name
    for fitted in trees
    for node in fitted.find('Greater').siblings()
    if node.name in ('Microbat', 'Pig', 'Horse')
)

In [7]:
# Sanity check: the total number of counted siblings equals the number of
# trees, which is what we expect if each tree contributes exactly one of
# the named taxa as a sibling of 'Greater'.
assert sum(ghb_siblings.values()) == len(trees)

In [8]:
# Display the per-taxon sibling counts.
ghb_siblings

Counter({'Pig': 151, 'Microbat': 586, 'Horse': 141})

In [9]:
# Fraction of fitted trees in which 'Microbat' was counted as the sibling of
# 'Greater'.
ghb_siblings['Microbat']/len(trees)

0.6674259681093394