In [None]:
import pandas as pd
from qiime2 import Artifact
from Bio import Entrez, SeqIO
from gzip import open as gopen
from time import sleep
from ete3 import Tree, TextFace, CircleFace, NodeStyle, TreeStyle
from os import path
import glob

!mkdir phylogenies

Entrez.email = "amir@adssc.org"

taxa = Artifact.load('dada/taxonomy.qza').view(pd.DataFrame)
core = pd.read_csv('metadata/core_asvs.tsv',sep='\t')
summary = pd.read_csv('metadata/summary.tsv',sep='\t')

!wget https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz
!zcat SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz | makeblastdb -title silva -out silva -dbtype nucl
refs = SeqIO.to_dict(SeqIO.parse(gopen('SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz','rt'),'fasta'))

qza = 'dada/rep-seqs.qza'
a = !unzip $qza
digest = a[1].split('/')[0].replace('  inflating: ','')
inf = digest + '/data/dna-sequences.fasta'
ASVs = SeqIO.to_dict(SeqIO.parse(inf,'fasta'))
!rm -r $digest

# Short run label -> taxon name.  The taxon name is matched against the
# metadata tables to select ASVs; the label names that group's output files.
groups = dict(
    rhodo='Rhodobacterales',
    flavo='Flavobacteriales',
    sapro='Saprospirales',
    cyto='Cytophagales',
    sbr='SBR1031',
    cyan='Cyanobacteria',
    myxo='Myxococcota',
    planc='Planctomycetes',
    rhiz='Rhizobiales',
    alt='Alteromonadales',
)

for key, group in groups.items():

    rhodo = core.loc[core.Order==group].asv.tolist()
    rhodo += [j for j in taxa.index if j[:6] in [i[:6] for i in summary.loc[summary.order==group].feature.tolist()][1:]]
    rhodo = set(rhodo)
    queries = [ASVs[i] for i in rhodo]
    q = 'phylogenies/%s.fasta' % key
    SeqIO.write(queries,q,'fasta')
    blastout = 'phylogenies/%s.out' % key
    
    a = !blastn -query $q -out $blastout -outfmt 6 -max_hsps 1 -max_target_seqs 50 -db silva

    blast = pd.read_csv(blastout,sep='\t',header=None)
    blast =set([i for i in blast[1]])
    alnin = 'phylogenies/%s_with_refs.fasta' % key

    with open(alnin,'wt') as hndl:
        for i in blast:
            hndl.write(refs[i].format('fasta'))
        hndl.write(open(q,'r').read())

    alnout = 'phylogenies/%s_with_refs.aln.fasta' % key
    a = !mafft --maxiterate 1000 --localpair --thread 14 $alnin > $alnout

    trimout = 'phylogenies/%s_with_refs.trm.fasta' % key
    a = !trimal -in $alnout -out $trimout -gt 0.1
    a = !raxmlHPC-PTHREADS -s $trimout -f a -x 123 -N 100 -n $key -m GTRGAMMA -p 456 -T 10 -w /home/amir/Dropbox/DropboxOnBioPC91/SeaGrass/phylogenies


    with open('ref_seq_sources/%s_sources' % key,'wt') as hndl:
        for n in blast:
            handle = Entrez.efetch(db="nucleotide", id=n.split('.')[0], rettype="gb", retmode="text")
            sleep(1)
            r = SeqIO.read(handle,'gb')
            handle.close()
            if 'isolation_source' in r.features[0].qualifiers:
                source = r.features[0].qualifiers['isolation_source'][0]
                hndl.write("%s\t%s\n" % (n,source))


# Draw every RAxML bipartitions tree that has not been rendered yet, once as
# PNG and once as PDF.
for f in glob.glob('phylogenies/RAxML_bipartitions.*'):
    # The file extension is the RAxML run name.
    taxon = f.split('.')[-1]
    if path.exists('phylogenies/%s.png' % taxon):
        continue
    t = Tree(f)

    # Shared node style that hides the default node markers.
    ns = NodeStyle()
    ns['size'] = 0

    t.set_outgroup(t.get_midpoint_outgroup())

    # The sources file is tab separated: reference id, isolation source.
    # Values are kept as plain strings so entries such as "NA" are not turned
    # into NaN, which would break the label concatenation below.
    try:
        sources = pd.read_csv('ref_seq_sources/%s_sources' % taxon,sep='\t',index_col=0,header=None,
                              names=['accession','source'],dtype=str,keep_default_na=False)
    except pd.errors.EmptyDataError:
        # No reference of this group had an isolation source.
        sources = pd.DataFrame(columns=['source'])

    for n in t.traverse():
        n.set_style(ns)
        if n.is_leaf():
            if n.name in refs:
                # Reference leaf: id, then the last ';' field of its description.
                color = 'black'
                desc =   refs[n.name].description.split(';')[-1]
                # NOTE(review): taxon is the run name taken from the file
                # extension; confirm that comparing it with the start of the
                # description is the intended highlight rule.
                if desc.startswith(taxon):
                    color='blue'
                n.add_face(TextFace(n.name + ' ',fgcolor='black'),0,position='branch-right')
                n.add_face(TextFace(desc,fgcolor=color),1,position='branch-right')
                # Look the leaf up by name: the table is indexed by reference
                # id, not by node object.
                if n.name in sources.index:
                    n.add_face(TextFace(' '+sources.at[n.name,'source'],fgcolor='green'),2,position='branch-right')
            else:
                # Leaf that is not a reference: label it with the run name and
                # the first six characters of its name.
                n.add_face(TextFace(taxon+' '+n.name[:6],fgcolor='red'),0,position='branch-right')
        elif n.support > 70:
            # Mark internal nodes whose support value exceeds 70.
            n.add_face(CircleFace(4,'black'),0,position='float')

    ts = TreeStyle()
    ts.show_leaf_name = False
    ts.scale=2000

    t.render('phylogenies/%s.png' % taxon, tree_style=ts,dpi=600,w=1500)
    t.render('phylogenies/%s.pdf' % taxon, tree_style=ts,dpi=600,w=1500)