# Building and comparing trees
The main steps for building and comparing the trees are the following:
1) Retrieve the core genes reported by each tool and save each gene family (GF) to its own FASTA file
2) Align the core genes of each family (using mafft)
3) Concatenate the per-family alignments into one alignment per genome
4) Run maximum-likelihood tree building (FastTree)
5) Substitute genome names so that they correspond between all tools
6) Compare phylogenies using the ete3 normalized Robinson-Foulds distance (nRF).

In [48]:
import subprocess
import pandas as pd
import os

## PANPROVA
Ground truth trees are built starting from the core genes of the complete genomes of the synthetic benchmark, PANPROVA.

In [67]:

# Build the ground-truth tree of every PANPROVA species: collect the core gene
# families, align each family with MAFFT, concatenate the alignments per genome
# and run FastTree on the concatenation.
basedir = os.getcwd()
n_genomes = 10  # genomes per dataset; a core family has a member in each one

for species in ['synth_ecoli','synth_myco','synth_paeru']:
    ipanprova = basedir+'/PANPROVA_'+species+'/1/survival_families'

    CG_dir = basedir+'/'+species+'/1/Comparison/CG'
    os.makedirs(CG_dir,exist_ok=True)
    os.makedirs(CG_dir+'/fasta_prova',exist_ok=True)

    # Core families are kept in file order (list + "seen" set instead of a set
    # of tuples): set iteration order changes between Python sessions, which
    # would renumber fam_<i>.fa on every run while the alignment step below
    # skips files whose name already exists.
    gf_cg_prova = []
    seen_families = set()
    with open(ipanprova,'r') as fam_file:
        for line in fam_file:
            # the first space-separated token is skipped; the rest are genes
            cc = line.strip().split(' ')[1:]

            genomes = set()
            cc_filt = []
            for c in cc:
                # genome id = text before the first comma, without the leading '('
                genome = c.split(',')[0].lstrip('(')
                if genome not in genomes:
                    # keep only the first gene seen for each genome
                    genomes.add(genome)
                    cc_filt.append(c)

            family = tuple(cc_filt)
            if len(genomes) == n_genomes and family not in seen_families:
                seen_families.add(family)
                gf_cg_prova.append(family)

    # read all genes
    gene2seq = dict()
    ipanprova_blastdb = basedir+'/PANPROVA_'+species+'/1/blastdb/'
    for file in os.listdir(ipanprova_blastdb):
        if file.endswith('fna'):
            with open(ipanprova_blastdb+'/'+file) as f:
                gene_name = None
                for l in f:
                    if l.startswith('>'):
                        gene_name = l[1:].rstrip()
                        gene2seq[gene_name] = ''
                    elif gene_name is not None:
                        # append rather than overwrite, so wrapped records and
                        # trailing blank lines do not clobber the sequence
                        gene2seq[gene_name] += l.rstrip()

    # write to file: one FASTA per core family
    for i, family in enumerate(gf_cg_prova):
        with open(CG_dir+'/fasta_prova/fam_'+str(i)+'.fa','w') as f:
            for gene in family:
                f.write('>'+str(gene)+'\n')
                f.write(str(gene2seq[gene])+'\n')

    # RUN MAFFT
    cg_fa_prova = CG_dir+'/fasta_prova/'
    msa_prova = CG_dir+'/msa_prova/'
    os.makedirs(msa_prova,exist_ok=True)

    already_aligned = set(os.listdir(msa_prova))  # list the directory once
    for f in os.listdir(cg_fa_prova):
        if f not in already_aligned:
            ret = subprocess.call(['bash',basedir+"/run_mafft.sh",species,cg_fa_prova+f,msa_prova+f],stdout=subprocess.DEVNULL)
            if ret != 0:
                print('WARNING: run_mafft.sh exited with code', ret, 'for', cg_fa_prova+f)

    # Concatenate alignments
    # Only the per-family '.fa' alignments are read: concat.msa and concat.tree
    # are written into this same directory and must not be read back on a re-run.
    genome2parts = dict()
    for file in sorted(os.listdir(msa_prova)):
        if not file.endswith('.fa'):
            continue
        genome = None
        with open(msa_prova+file) as f:
            for l in f:
                if l.startswith('>'):
                    genome = l.split(',')[0].lstrip('>(')
                    genome2parts.setdefault(genome, [])
                elif genome is not None:
                    genome2parts[genome].append(l.strip())
    # join once per genome instead of re-copying a growing string per family
    genome2seqs = {g: ''.join(parts) for g, parts in genome2parts.items()}

    #check all went well (every genome should print the same length)
    for g in genome2seqs:
        print(len(genome2seqs[g]))

    #write to file
    with open(msa_prova+'concat.msa','w') as f:
        for g in genome2seqs:
            f.write('>'+g+'\n')
            f.write(str(genome2seqs[g])+'\n')

    # Build tree
    ret = subprocess.call(['bash',basedir+"/run_fasttree.sh",species,msa_prova+'concat.msa',msa_prova+'concat.tree'],stdout=subprocess.DEVNULL)

# PanDelos trees

In [None]:
# Build one tree per species/fragmentation level from the PanDelos gene
# families: collect the core families, align each with MAFFT, concatenate the
# alignments per genome, rename genomes and run FastTree.
import subprocess

basedir = os.getcwd()
n_genomes = 10  # genomes per dataset; a core family has a member in each one

for sp in ['synth_myco','synth_ecoli','synth_paeru']:
    for frag in ['0.5','0.8','1']:
        species = sp+'/'+frag
        delosdir = basedir+'/'+species+'/PanDelos'

        # PanDelos families: one space-separated family per line
        with open(delosdir+"/output/output.clus") as f:
            lines = f.readlines()

        # Kept in file order (list + "seen" set instead of a set of tuples):
        # set iteration order changes between Python sessions, which would
        # renumber fam_<i>.fa on every run while the alignment step below
        # skips files whose name already exists.
        cg_delos = []
        seen_families = set()
        for l in lines:
            genome_set = set()
            selected_genes = []
            for gene in l.rstrip().split(' '):
                genome = gene.split(':')[0]  # genome alias = text before the first ':'
                if genome not in genome_set:
                    # keep only the first gene seen for each genome
                    genome_set.add(genome)
                    selected_genes.append(gene)

            family = tuple(selected_genes)
            if len(selected_genes) == n_genomes and family not in seen_families:
                seen_families.add(family)
                cg_delos.append(family)

        # read every gene sequence: column 1 is the name, column 10 the sequence
        frag_dir = delosdir+'/output/fragmented_coordinates/'
        records_pan = {}
        for d in os.listdir(frag_dir):
            with open(frag_dir+d+'/'+"coordinates_frag.sam") as f:
                for l in f:
                    if l.startswith('@'):
                        continue  # SAM header line
                    fields = l.split('\t')
                    records_pan[fields[0]] = fields[9]

        comparisondir = basedir+'/'+species+'/Comparison'
        CG_dir = basedir+'/'+species+'/Comparison/GFs/CG'
        os.makedirs(CG_dir,exist_ok=True)
        os.makedirs(CG_dir+'/fasta_delos',exist_ok=True)

        # write to file: one FASTA per core family
        for i, family in enumerate(cg_delos):
            with open(CG_dir+'/fasta_delos/fam_'+str(i)+'.fa','w') as f:
                for gene in family:
                    f.write('>'+gene+'\n')
                    f.write(str(records_pan[gene])+'\n')

        # RUN MAFFT
        cg_fa_delos = CG_dir+'/fasta_delos/'
        msa_delos = CG_dir+'/msa_delos/'
        os.makedirs(msa_delos,exist_ok=True)

        already_aligned = set(os.listdir(msa_delos))  # list the directory once
        for f in os.listdir(cg_fa_delos):
            if f not in already_aligned:
                ret = subprocess.call(['bash',basedir+"/run_mafft.sh",species,cg_fa_delos+f,msa_delos+f],stdout=subprocess.DEVNULL)
                if ret != 0:
                    print('WARNING: run_mafft.sh exited with code', ret, 'for', cg_fa_delos+f)

        #CONCATENATE
        # Only the per-family '.fa' alignments are read: concat.msa and
        # concat_renamed.tree are written into this same directory, and reading
        # them back on a re-run adds bogus genome keys that break the renaming.
        genome2parts = dict()
        for file in sorted(os.listdir(msa_delos)):
            if not file.endswith('.fa'):
                continue
            genome = None
            with open(msa_delos+file) as f:
                for l in f:
                    if l.startswith('>'):
                        genome = l.split(':')[0]  # still carries the leading '>'
                        genome2parts.setdefault(genome, [])
                    elif genome is not None:
                        genome2parts[genome].append(l.strip())
        # join once per genome instead of re-copying a growing string per family
        genome2seqs = {g: ''.join(parts) for g, parts in genome2parts.items()}

        #check all went well (every genome should print the same length)
        for g in genome2seqs:
            print(len(genome2seqs[g]))

        # read pandelos files to associate pandelos aliases to genome names:
        # the alias is the first tab-separated field of the first line of the
        # '*predictedCDSs_filtered_only_genes.bed' file of each genome
        matchfile = 'predictedCDSs_filtered_only_genes.bed'
        delos2prova = dict()
        for g in os.listdir(delosdir+'/output/fragmented'):
            artifacts = delosdir+'/output/fragmented/' + g + '/artifacts'
            afile = [name for name in os.listdir(artifacts) if name.endswith(matchfile)][0]
            with open(artifacts + '/' + afile) as f:
                delos_name = f.readline().split('\t')[0]
            delos2prova[delos_name] = g

        #write to file, naming each genome by the second '_'-separated token
        #of its directory name
        with open(msa_delos+'concat.msa','w') as f:
            for g in genome2seqs:
                f.write('>'+delos2prova[g.lstrip('>')].split('_')[1]+'\n')
                f.write(str(genome2seqs[g])+'\n')

        # build tree
        ret = subprocess.call(['bash',basedir+"/run_fasttree.sh",species,msa_delos+'concat.msa',msa_delos+'concat_renamed.tree'],stdout=subprocess.DEVNULL)

# Roary Trees

In [62]:
# Roary already ships a core-gene alignment; the tree built from it only needs
# its leaf labels stripped down to the bare genome name.
for sp in ('synth_paeru', 'synth_myco', 'synth_ecoli'):
    for frag in ('0.5', '0.8', '1'):
        species = sp + '/' + frag
        alndir = basedir + '/' + species + '/Roary/output/'
        # The tree itself was produced beforehand with:
        #ret=subprocess.call(['bash',basedir+"/run_fasttree.sh",species,alndir+'core_gene_alignment.aln',alndir+'core_gene_alignment.tree'],stdout=subprocess.DEVNULL)

        tree_in = alndir + 'core_gene_alignment.tree'
        tree_out = alndir + 'core_gene_alignment_renamed.tree'
        with open(tree_in) as infile, open(tree_out, 'w') as outfile:
            # drop the 'prokka_genome_' prefix and the '_fr.fasta' suffix
            outfile.writelines(
                row.replace('prokka_genome_', '').replace('_fr.fasta', '')
                for row in infile
            )

# GenAPI Trees

In [37]:
import pandas as pd
import os
import numpy as np
import re
# Build one tree per fragmentation level from the GenAPI clusters: collect the
# single-copy core clusters, align each with MAFFT, concatenate the alignments
# per genome, run FastTree and rename the leaves.
import subprocess

basedir = os.getcwd()
n_genomes = 10  # genomes per dataset; a core cluster has one gene from each

for sp in ['synth_paeru']:
    for frag in ['0.5','0.8','1']:
        species = sp+'/'+frag

        # read in all gene sequences from the Prokka .ffn files; keys keep the
        # leading '>' of the FASTA header
        prokkadir = basedir+'/'+species+'/Prokka'
        gene2seq = dict()
        for file in os.listdir(prokkadir):
            all_files = os.listdir(prokkadir+'/'+file)
            ffn_file = [f for f in all_files if f.endswith('ffn')][0]
            with open(prokkadir+'/'+file+'/'+ffn_file) as f:
                gene = None
                parts = []
                for l in f:
                    if l.startswith('>'):
                        if gene is not None:
                            gene2seq[gene] = ''.join(parts)
                        gene = l.split(' ')[0]
                        parts = []
                    else:
                        parts.append(l.strip())
                if gene is not None:
                    gene2seq[gene] = ''.join(parts)

        # Parse the cluster file. A cluster is kept when it has exactly
        # n_genomes genes and no genome occurs twice. The check runs both when
        # the next cluster header is met and once more after the last line, so
        # the first and the last cluster of the file are evaluated as well.
        itool = basedir+'/'+species+'/GenAPI/clustered_genes_genapi.ffn.clstr'
        gf2genes = dict()
        with open(itool,'r') as f:
            GF_name = None
            GF_list = []
            genomes = set()
            CG = True
            for l in f:
                if not l.strip():
                    continue
                if l.startswith('>'):
                    if GF_name is not None and CG and len(GF_list) == n_genomes:
                        gf2genes[GF_name] = GF_list
                    GF_name = l.lstrip('>').split(' ')[1].rstrip()
                    CG = True
                    genomes = set()
                    GF_list = []
                elif GF_name is not None:
                    # gene id = text between the '>' and the first ':'
                    gene = l.split('>')[1].split(':')[0]
                    genome = gene.split('_')[0]
                    if genome in genomes:
                        CG = False  # second gene of the same genome: not single-copy
                    genomes.add(genome)
                    GF_list.append(gene)
            if GF_name is not None and CG and len(GF_list) == n_genomes:
                gf2genes[GF_name] = GF_list

        CG_dir = basedir+'/'+species+'/Comparison/GFs/CG'

        # write to file one fasta for each (single-copy) core gene family
        os.makedirs(CG_dir+'/fasta_genapi',exist_ok=True)
        for gf in gf2genes:
            genes = gf2genes[gf]

            with open(CG_dir+'/fasta_genapi/'+gf+'.fa','w') as f:
                for g in genes:
                    f.write('>'+g+'\n')
                    f.write(str(gene2seq['>'+g])+'\n')

        # RUN MAFFT
        cg_fa_genapi = CG_dir+'/fasta_genapi/'
        msa_genapi = CG_dir+'/msa_genapi/'
        os.makedirs(msa_genapi,exist_ok=True)

        for f in os.listdir(cg_fa_genapi):
            ret = subprocess.call(['bash',basedir+"/run_mafft.sh",species,cg_fa_genapi+f,msa_genapi+f],stdout=subprocess.DEVNULL)
            if ret != 0:
                print('WARNING: run_mafft.sh exited with code', ret, 'for', cg_fa_genapi+f)

        # Concatenate alignments
        # Only the per-family '.fa' alignments are read: concat.msa, concat.tree
        # and concat_renamed.tree are written into this same directory and must
        # not be read back on a re-run.
        genome2parts = dict()
        for file in sorted(os.listdir(msa_genapi)):
            if not file.endswith('.fa'):
                continue
            genome = None
            with open(msa_genapi+file) as f:
                for l in f:
                    if l.startswith('>'):
                        genome = l.split('_')[0]  # still carries the leading '>'
                        genome2parts.setdefault(genome, [])
                    elif genome is not None:
                        genome2parts[genome].append(l.strip())
        # join once per genome instead of re-copying a growing string per family
        genome2seqs = {g: ''.join(parts) for g, parts in genome2parts.items()}

        #check all went well (every genome should print the same length)
        for g in genome2seqs:
            print(len(genome2seqs[g]))

        #write to file (keys already start with '>')
        with open(msa_genapi+'concat.msa','w') as f:
            for g in genome2seqs:
                f.write(g+'\n')
                f.write(str(genome2seqs[g])+'\n')

        # Build tree
        ret = subprocess.call(['bash',basedir+"/run_fasttree.sh",species,msa_genapi+'concat.msa',msa_genapi+'concat.tree'],stdout=subprocess.DEVNULL)

        # Change names of genomes to correspond to PANPROVA genomes.
        # The gene-id prefix of each genome is read from the second line of its
        # Prokka .tsv file; the genome name is the third '_'-separated token of
        # the Prokka directory name.
        roary2genome = dict()
        for genome_full in os.listdir(prokkadir):
            genome = genome_full.split('_')[2]
            files = os.listdir(prokkadir+'/'+genome_full)
            tsv_file = [name for name in files if "tsv" in name][0]

            with open(prokkadir+'/'+genome_full+'/'+tsv_file) as f:
                code = f.readlines()[1].split('\t')[0].split('_')[0]

            roary2genome[code] = genome

        # Rename tree
        with open(msa_genapi+'concat.tree') as infile, open(msa_genapi+'concat_renamed.tree', 'w') as outfile:
            for line in infile:
                for src, target in roary2genome.items():
                    line = line.replace(src, target)
                outfile.write(line)

nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 10
done.

Constructing a UPGMA tree (efffree=0) ... 
    0 / 10
done.

Progressive alignment 1/2... 
STEP     9 / 9 
done.

Making a distance matrix from msa.. 
    0 / 10
done.

Constructing a UPGMA tree (efffree=1) ... 
    0 / 10
done.

Progressive alignment 2/2... 
STEP     9 / 9 
done.

disttbfast (nuc) Version 7.511
alg=A, model=DNA200 (2), 1.53 (4.59), -0.00 (-0.00), noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct).
It tends to insert more gaps into gap-rich regions than previous v

62440
62440
62440
62440
62440
62440
62440
62440
62440
62440


nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 10
done.

Constructing a UPGMA tree (efffree=0) ... 
    0 / 10
done.

Progressive alignment 1/2... 
STEP     9 / 9 
done.

Making a distance matrix from msa.. 
    0 / 10
done.

Constructing a UPGMA tree (efffree=1) ... 
    0 / 10
done.

Progressive alignment 2/2... 
STEP     9 / 9 
done.

disttbfast (nuc) Version 7.511
alg=A, model=DNA200 (2), 1.53 (4.59), -0.00 (-0.00), noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct).
It tends to insert more gaps into gap-rich regions than previous v

119930
119930
119930
119930
119930
119930
119930
119930
119930
119930


nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 10
done.

Constructing a UPGMA tree (efffree=0) ... 
    0 / 10
done.

Progressive alignment 1/2... 
STEP     3 / 9 
Reallocating..done. *alloclen = 3745
STEP     9 / 9 
done.

Making a distance matrix from msa.. 
    0 / 10
done.

Constructing a UPGMA tree (efffree=1) ... 
    0 / 10
done.

Progressive alignment 2/2... 
STEP     3 / 9 
Reallocating..done. *alloclen = 3742
STEP     9 / 9 
done.

disttbfast (nuc) Version 7.511
alg=A, model=DNA200 (2), 1.53 (4.59), -0.00 (-0.00), noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has be

211582
211582
211582
211582
211582
211582
211582
211582
211582
211582


# Panaroo trees

In [45]:
import pandas as pd
import os
import numpy as np
import re
# Build one tree per species/fragmentation level from the Panaroo gene
# families: collect the single-copy core families, align each with MAFFT,
# concatenate the alignments per genome, run FastTree and rename the leaves.
import subprocess

basedir = os.getcwd()
n_genomes = 10  # genomes per dataset; a core family has one gene from each

for sp in ['synth_myco','synth_ecoli','synth_paeru']:
    for frag in ['0.5','0.8','1']:
        # NOTE: unlike the other cells, 'species' ends with '/' here
        species = sp+'/'+frag+'/'

        comparisondir = basedir+'/'+species+'/Comparison'
        CG_dir = comparisondir+'/GFs/CG'
        cg_fa_panaroo = CG_dir+'/fasta_panaroo/'
        msa_panaroo = CG_dir+'/msa_panaroo/'
        os.makedirs(cg_fa_panaroo,exist_ok=True)
        os.makedirs(msa_panaroo,exist_ok=True)

        prokkadir = basedir+'/'+species+'/Prokka'
        panaroodir = basedir+'/'+species+'/Panaroo'

        # read in all gene sequences from the Prokka .ffn files; keys keep the
        # leading '>' of the FASTA header
        gene2seq = dict()
        for file in os.listdir(prokkadir):
            all_files = os.listdir(prokkadir+'/'+file)
            ffn_file = [f for f in all_files if f.endswith('ffn')][0]
            with open(prokkadir+'/'+file+'/'+ffn_file) as f:
                gene = None
                parts = []
                for l in f:
                    if l.startswith('>'):
                        if gene is not None:
                            gene2seq[gene] = ''.join(parts)
                        gene = l.split(' ')[0]
                        parts = []
                    else:
                        parts.append(l.strip())
                if gene is not None:
                    gene2seq[gene] = ''.join(parts)

        # create a dictionary mapping GFs to genes
        itool = panaroodir+'/gene_presence_absence.csv'
        gf2genes = dict()
        with open(itool,'r') as f:
            next(f, None)  # skip the header row
            for line in f:
                name = line.split(',')[0]
                # the first three columns are skipped; empty cells are dropped
                cc = [g for g in line.strip().split(',')[3:] if g != '']
                if len(cc) != n_genomes:
                    continue
                # reject cells holding several ';'-separated ids, and ids made
                # of more than two '_'-separated tokens
                if any(len(c.split(';')) > 1 or len(c.split('_')) > 2 for c in cc):
                    continue
                gf2genes[name] = cc

        # write to file one fasta for each (single-copy) core gene family
        for gf in gf2genes:
            genes = gf2genes[gf]

            # A family with a gene lacking a sequence is skipped as a whole:
            # writing only the genes that were found would leave some genomes
            # without this family and make the concatenation ragged.
            missing = [g for g in genes if '>'+g not in gene2seq]
            if missing:
                print('WARNING: skipping family', gf, '- no sequence for', missing)
                continue

            with open(cg_fa_panaroo+gf+'.fa','w') as f:
                for g in genes:
                    f.write('>'+g+'\n')
                    f.write(str(gene2seq['>'+g])+'\n')

        # Run Mafft
        for f in os.listdir(cg_fa_panaroo):
            ret = subprocess.call(['bash',basedir+"/run_mafft.sh",species,cg_fa_panaroo+f,msa_panaroo+f],stdout=subprocess.DEVNULL)
            if ret != 0:
                print('WARNING: run_mafft.sh exited with code', ret, 'for', cg_fa_panaroo+f)

        # Concatenate alignments
        # Only the per-family '.fa' alignments are read: concat.msa, concat.tree
        # and concat_renamed.tree are written into this same directory and must
        # not be read back on a re-run.
        genome2parts = dict()
        for file in sorted(os.listdir(msa_panaroo)):
            if not file.endswith('.fa'):
                continue
            genome = None
            with open(msa_panaroo+file) as f:
                for l in f:
                    if l.startswith('>'):
                        genome = l.split('_')[0]  # still carries the leading '>'
                        genome2parts.setdefault(genome, [])
                    elif genome is not None:
                        genome2parts[genome].append(l.strip())
        # join once per genome instead of re-copying a growing string per family
        genome2seqs = {g: ''.join(parts) for g, parts in genome2parts.items()}

        # check all went well (every genome should print the same length)
        for g in genome2seqs:
            print(len(genome2seqs[g]))

        # Write to file a single concatenated alignment (keys already start with '>')
        with open(msa_panaroo+'concat.msa','w') as f:
            for g in genome2seqs:
                f.write(g+'\n')
                f.write(str(genome2seqs[g])+'\n')

        # Build tree
        ret = subprocess.call(['bash',basedir+"/run_fasttree.sh",species,msa_panaroo+'concat.msa',msa_panaroo+'concat.tree'],stdout=subprocess.DEVNULL)

        # Change names of genomes to correspond to PANPROVA genomes.
        # The gene-id prefix of each genome is read from the second line of its
        # Prokka .tsv file; the genome name is the third '_'-separated token of
        # the Prokka directory name.
        roary2genome = dict()
        for genome_full in os.listdir(prokkadir):
            genome = genome_full.split('_')[2]
            files = os.listdir(prokkadir+'/'+genome_full)
            tsv_file = [name for name in files if "tsv" in name][0]

            with open(prokkadir+'/'+genome_full+'/'+tsv_file) as f:
                code = f.readlines()[1].split('\t')[0].split('_')[0]

            roary2genome[code] = genome

        # Rename tree
        with open(msa_panaroo+'concat.tree') as infile, open(msa_panaroo+'concat_renamed.tree', 'w') as outfile:
            for line in infile:
                for src, target in roary2genome.items():
                    line = line.replace(src, target)
                outfile.write(line)


synth_myco/0.5/


nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 10
done.

Constructing a UPGMA tree (efffree=0) ... 
    0 / 10
done.

Progressive alignment 1/2... 
STEP     9 / 9 
done.

Making a distance matrix from msa.. 
    0 / 10
done.

Constructing a UPGMA tree (efffree=1) ... 
    0 / 10
done.

Progressive alignment 2/2... 
STEP     9 / 9 
done.

disttbfast (nuc) Version 7.511
alg=A, model=DNA200 (2), 1.53 (4.59), -0.00 (-0.00), noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct).
It tends to insert more gaps into gap-rich regions than previous v

249
249
249
249
249
249
249
249
249
249
synth_myco/0.8/


nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 10
done.

Constructing a UPGMA tree (efffree=0) ... 
    0 / 10
done.

Progressive alignment 1/2... 
STEP     6 / 9 
Reallocating..done. *alloclen = 1912
STEP     9 / 9 
done.

Making a distance matrix from msa.. 
    0 / 10
done.

Constructing a UPGMA tree (efffree=1) ... 
    0 / 10
done.

Progressive alignment 2/2... 
STEP     5 / 9 
Reallocating..done. *alloclen = 1906
STEP     9 / 9 
done.

disttbfast (nuc) Version 7.511
alg=A, model=DNA200 (2), 1.53 (4.59), -0.00 (-0.00), noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has be

8121
8121
8121
8121
8121
8121
8121
8121
8121
8121
synth_myco/1/


nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 10
done.

Constructing a UPGMA tree (efffree=0) ... 
    0 / 10
done.

Progressive alignment 1/2... 
STEP     7 / 9 
Reallocating..done. *alloclen = 1849
STEP     9 / 9 
done.

Making a distance matrix from msa.. 
    0 / 10
done.

Constructing a UPGMA tree (efffree=1) ... 
    0 / 10
done.

Progressive alignment 2/2... 
STEP     7 / 9 
Reallocating..done. *alloclen = 1852
STEP     9 / 9 
done.

disttbfast (nuc) Version 7.511
alg=A, model=DNA200 (2), 1.53 (4.59), -0.00 (-0.00), noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has be

34245
34245
34245
34245
34245
34245
34245
34245
34245
34245
synth_ecoli/0.5/


nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 10
done.

Constructing a UPGMA tree (efffree=0) ... 
    0 / 10
done.

Progressive alignment 1/2... 
STEP     8 / 9 
Reallocating..done. *alloclen = 1321
STEP     9 / 9 
done.

Making a distance matrix from msa.. 
    0 / 10
done.

Constructing a UPGMA tree (efffree=1) ... 
    0 / 10
done.

Progressive alignment 2/2... 
STEP     8 / 9 
Reallocating..done. *alloclen = 1321
STEP     9 / 9 
done.

disttbfast (nuc) Version 7.511
alg=A, model=DNA200 (2), 1.53 (4.59), -0.00 (-0.00), noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has be

19046
19046
19046
19046
19046
19046
19046
19046
19046
19046
synth_ecoli/0.8/


nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 10
done.

Constructing a UPGMA tree (efffree=0) ... 
    0 / 10
done.

Progressive alignment 1/2... 
STEP     9 / 9 
done.

Making a distance matrix from msa.. 
    0 / 10
done.

Constructing a UPGMA tree (efffree=1) ... 
    0 / 10
done.

Progressive alignment 2/2... 
STEP     9 / 9 
done.

disttbfast (nuc) Version 7.511
alg=A, model=DNA200 (2), 1.53 (4.59), -0.00 (-0.00), noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct).
It tends to insert more gaps into gap-rich regions than previous v

168773
168773
168773
168773
168773
168773
168773
168773
168773
168773
synth_ecoli/1/


nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 10
done.

Constructing a UPGMA tree (efffree=0) ... 
    0 / 10
done.

Progressive alignment 1/2... 
STEP     6 / 9 
Reallocating..done. *alloclen = 2632
STEP     9 / 9 
done.

Making a distance matrix from msa.. 
    0 / 10
done.

Constructing a UPGMA tree (efffree=1) ... 
    0 / 10
done.

Progressive alignment 2/2... 
STEP     8 / 9 
Reallocating..done. *alloclen = 2656
STEP     9 / 9 
done.

disttbfast (nuc) Version 7.511
alg=A, model=DNA200 (2), 1.53 (4.59), -0.00 (-0.00), noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has be

557464
557464
557464
557464
557464
557464
557464
557464
557464
557464
synth_paeru/0.5/


ppenalty_ex = 0
stacksize: 8192 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 10
done.

Constructing a UPGMA tree (efffree=0) ... 
    0 / 10
done.

Progressive alignment 1/2... 
STEP     5 / 9 
Reallocating..done. *alloclen = 2749
STEP     9 / 9 
done.

Making a distance matrix from msa.. 
    0 / 10
done.

Constructing a UPGMA tree (efffree=1) ... 
    0 / 10
done.

Progressive alignment 2/2... 
STEP     5 / 9 
Reallocating..done. *alloclen = 2749
STEP     9 / 9 
done.

disttbfast (nuc) Version 7.511
alg=A, model=DNA200 (2), 1.53 (4.59), -0.00 (-0.00), noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct).
It

125410
125410
125410
125410
125410
125410
125410
125410
125410
125410
synth_paeru/0.8/


nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 10
done.

Constructing a UPGMA tree (efffree=0) ... 
    0 / 10
done.

Progressive alignment 1/2... 
STEP     6 / 9 
Reallocating..done. *alloclen = 1900
STEP     9 / 9 
done.

Making a distance matrix from msa.. 
    0 / 10
done.

Constructing a UPGMA tree (efffree=1) ... 
    0 / 10
done.

Progressive alignment 2/2... 
STEP     7 / 9 
Reallocating..done. *alloclen = 1906
STEP     9 / 9 
done.

disttbfast (nuc) Version 7.511
alg=A, model=DNA200 (2), 1.53 (4.59), -0.00 (-0.00), noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has be

319988
319988
319988
319988
319988
319988
319988
319988
319988
319988
synth_paeru/1/


nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 10
done.

Constructing a UPGMA tree (efffree=0) ... 
    0 / 10
done.

Progressive alignment 1/2... 
STEP     8 / 9 
Reallocating..done. *alloclen = 2962
STEP     9 / 9 
done.

Making a distance matrix from msa.. 
    0 / 10
done.

Constructing a UPGMA tree (efffree=1) ... 
    0 / 10
done.

Progressive alignment 2/2... 
STEP     8 / 9 
Reallocating..done. *alloclen = 2962
STEP     9 / 9 
done.

disttbfast (nuc) Version 7.511
alg=A, model=DNA200 (2), 1.53 (4.59), -0.00 (-0.00), noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has be

652086
652086
652086
652086
652086
652086
652086
652086
652086
652086
