In [1]:
from io import StringIO
from collections import Counter

from cogent3.app import io
from qiime2.plugins.phylogeny.actions import iqtree
from qiime2 import Artifact
from skbio import TabularMSA, DNA, read, TreeNode
from joblib import Parallel, delayed

In [3]:
# Open the alignment data store and create the loader app that later cells use
# to read individual members of the store.
dstore = io.get_data_store("aligned.tinydb")
loader = io.load_db()
# Display a summary of the store (record types and their counts).
dstore.describe

record type,number
completed,662
incomplete,630
logs,1


In [26]:
# Sanity check: load the first member of the store and display it.
loader(dstore[0])

0,1
,0
Fugu,ATGGAGGGCGTCGGTGAGACCAACACCGTCCCGGAGGAAGGGACGCGGGACGTGAAACCA
Mouse,.......A.AAA..CA.......T.A.....AAA.T....CTC.AGA...G...CGG..C
Rat,.......A.AAA.ACA....G..T.A.....AAA.T....CTC.AGA...G...CGG..C
Platypus,.......A.................A....T..A.C..G......GA.CCG.....G..G
Guinea Pig,...A...A.AAG..CA....A....AT....A.A.T..G..CC.CGA...G...C.G..C
Human,.....A.ATAAAA.CA....A....A.....ATA.T.....CT.C.A...G..CC.G..C


In [15]:
def convert_and_fit(aln):
    """Fit an IQ-TREE tree to the third codon positions of one alignment.

    Parameters
    ----------
    aln
        A member of ``dstore``; it is read with the module-level ``loader``.

    Returns
    -------
    TreeNode
        The first result returned by ``iqtree``, viewed as a scikit-bio
        ``TreeNode``.

    Notes
    -----
    The alignment is round-tripped through FASTA text to hand it from cogent3
    to scikit-bio. NOTE(review): the tallies later in this notebook refer to
    'Guinea' rather than 'Guinea Pig', so the label appears to be cut at the
    first space somewhere in this conversion -- confirm against the skbio
    FASTA reader.
    """
    taxa = ['Rat', 'Mouse', 'Guinea Pig', 'Fugu']
    subset = loader(aln).take_seqs(taxa)

    # Offset 2 with stride 3 keeps columns 2, 5, 8, ... (0-based), i.e. the
    # THIRD position of each codon, assuming the alignment starts in frame.
    third_sites = subset[2::3]

    fasta_text = StringIO(third_sites.to_fasta())
    msa = TabularMSA([DNA(record) for record in read(fasta_text, 'fasta')])
    artifact = Artifact.import_data('FeatureData[AlignedSequence]', msa)

    results = iqtree(artifact)
    return results[0].view(TreeNode)

In [16]:
# Fit one tree per member of the data store, spread over 10 worker processes.
trees = Parallel(n_jobs=10, verbose=True)(
    delayed(convert_and_fit)(member) for member in dstore
)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   12.2s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:   52.1s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:  2.0min
[Parallel(n_jobs=10)]: Done 662 out of 662 | elapsed:  3.1min finished


In [17]:
# Number of fitted trees (one result per task submitted to Parallel).
len(trees)

662

In [21]:
# Tally, across all trees, which named tip appears among Rat's siblings.
# Siblings whose name is not one of the three listed taxa are ignored.
# The variable name is kept as-is because the cells below refer to it.
ghb_siblings = Counter(
    node.name
    for fitted in trees
    for node in fitted.find('Rat').siblings()
    if node.name in ('Mouse', 'Guinea', 'Fugu')
)

In [23]:
# Invariant: the tally should hold exactly one count per fitted tree.
assert sum(ghb_siblings.values()) == len(trees)

In [24]:
# Display the sibling tallies.
ghb_siblings

Counter({'Mouse': 562, 'Fugu': 60, 'Guinea': 40})

In [25]:
# Fraction of trees in which Mouse is the tip counted as Rat's sibling.
ghb_siblings['Mouse']/len(trees)

0.8489425981873112

In [10]:
import json

# Record, tree by tree, which named tip is Rat's sibling, so the per-tree order
# is preserved (a Counter only keeps totals).
#
# The focal taxon and the candidate names must be taxa that convert_and_fit
# keeps in the trees. This cell previously looked up 'Greater' and filtered on
# 'Microbat' / 'Pig' / 'Horse', none of which are in the taxa selected by
# convert_and_fit, so tree.find() could not succeed on a fresh run.
ordered_siblings = [
    sibling.name
    for tree in trees
    for sibling in tree.find('Rat').siblings()
    if sibling.name in ('Mouse', 'Guinea', 'Fugu')
]

# Guard against silently writing a list that is misaligned with `trees`.
assert len(ordered_siblings) == len(trees), \
    "expected exactly one named sibling of Rat per tree"

# The file name refers to the third codon position used when fitting the trees.
with open('iq-tree-third.json', 'w') as fh:
    json.dump(ordered_siblings, fh)