In [148]:
import subprocess
import os
import glob
import toytree
import tqdm
import pandas as pd
from subprocess import PIPE, Popen
import shlex
from Bio import SeqIO
import random
import itertools
import numpy as np


def remove_seeds( alnfile):
    """
    this function removes seeds from an alignment
    """
    aln = SeqIO.parse(alnfile, 'fasta')
    sequences = []
    for s in aln:
        sequences.append(s)
    #write new aln
    
    with open(alnfile, 'w') as f:
        for s in sequences:
            f.write('>' + str(s.id).replace('seed','').replace('_' , '') + '\n')
            f.write(str(s.seq) + '\n')
    
    return alnfile

def Fident(str1,str2 , verbose = False):
    #minlen= min( (len(str1),len(str2))  )
    #str1 = str1[:minlen]
    #str2 = str2[:minlen]
    str1 = np.array(list(str1))
    str2 = np.array(list(str2))            
    return len(np.where( (str1 == str2 ) & (str1 != '-' ) & (str2 != '-')  )[0]) / len(str1)

def copyaln( aln, seq):
    seqiter = iter(seq)
    newaln = ''
    for i,char in enumerate(aln):
        if char == '-':
            newaln += '-'
        else:
            newaln+=next(seqiter)
    return newaln

def read_dbfiles3di(  AADB , threeDidb):
    #find positions 
    threeDiseq = [ l.strip().replace('\x00','') for l in open(threeDidb)]
    lookup = AADB+'.lookup'
    ids = [ l.split()[1].strip() for l in open(lookup)]
    AAs = [ l.strip().replace('\x00','') for l in open(AADB)]

    mapper3di = dict(zip(ids,threeDiseq))
    mapperAA = dict(zip(ids,AAs))
    
    return mapper3di, mapperAA

def calc_fident_crossaln(row , verbose = False):
    #amino acid representations of alns using AAand3di or just 3di
    qaln_2, taln_2 = row.qaln , row.taln
    #start and stop of aln
    
    qstart_2, qend_2, tstart_2 , tend_2 = row.qstart, row.qend , row.tstart , row.tend
    #indexing starts at 1...
    
    #3di of the query and target
    structQ, structT = row['3diq'], row['3dit']
    AAq, AAt = row['AAq'], row['AAt']

    #add gaps
    t3diAA_newgaps = copyaln(taln_2, structT[tstart_2-1:tend_2]) 
    q3diAA_newgaps = copyaln(qaln_2, structQ[qstart_2-1:qend_2])
    row = pd.Series( { '3di_qaln_mode2':q3diAA_newgaps , '3di_taln_mode2':t3diAA_newgaps })
    #return columns
    return row

def get_leafset( treenode ):
    """
    this function returns the leafset of a node
    """
    if treenode.is_leaf():
        return [treenode.name]
    else:
        return treenode.get_leaf_names()


def mafft_profile(aln1,aln2, outprofile , submat = None):
    """
    this function aligns two alignments using MAFFT
    """
    #make profile
    if submat:
        cmd = 'mafft --textmatrix {} --seed {} {} > {}'.format(submat, aln1,aln2, outprofile)
    else:
        cmd = 'mafft --seed {} {} > {}'.format(aln1,aln2, outprofile)

    print(cmd)
    subprocess.run(cmd , shell=True)
    return outprofile

def mafft_addfull(aln1,aln2, outprofile , submat = None):
    """
    this function aligns two alignments using MAFFT
    """
    #make profile
    profile = aln1 + '.profile'
    if submat:
        cmd = 'mafft --textmatrix {} --addfull {} --keeplength {} > {}'.format(submat, aln1,aln2, outprofile)
    else:
        cmd = 'mafft --addfull  {} --keeplength {} > {}'.format(aln1,aln2, outprofile)
    print(cmd)

    subprocess.run(cmd , shell = True)
    return outprofile


def sub2fasta( sub, outfile , fastacol1='qaln' , fastacol2='taln' ):
    with open(outfile, 'w') as f:
        f.write('>' + sub['query'] + '\n')
        f.write(sub[fastacol1] + '\n')
        f.write('>' + sub['target'] + '\n')
        f.write(sub[fastacol2] + '\n')    
    return outfile

def retalns(allvall, leafname1,leafname2):
    sub = allvall[allvall['query'].isin( leafname1)]
    sub = sub[sub['target'].isin(leafname2)]
    sub = sub[sub['query'] != sub['target']]
    #get max prot lenght aligned
    sub['alnlen'] = sub.apply(lambda x: max(x['qend'] - x['qstart'] , x['tend'] - x['tstart']) , axis = 1)
    sub = sub[sub['alnlen'] == sub['alnlen'].max()]
    if len(sub)==0:
        print(leafname1, leafname2)
        raise Exception('no sub')
    return sub.iloc[0]

def get_fasta_leafset(fasta):
    """
    this function returns the leafset of a fasta file
    """
    aln = SeqIO.parse(fasta, 'fasta')
    leafset = []
    for s in aln:
        leafset.append(s.id)
    return leafset

#traverse tree from root to leaves recursively
def traverse_tree_merge_mafft( treenode, topleafset, allvall , alnfolder , submat = None , verbose = False):
    """
    this function traverses a tree from root to leaves recursively
    it returns a dictionary with the iteratively built alignment
    """
    if verbose == True:
        print('traverse', treenode.name , treenode.is_leaf() , treenode.leafset)
    
    if treenode.is_leaf():
        topleafset.remove(treenode.name)
        #if the node is a leaf, then we need to add it to the alignment with one of the pivots in the current leafset
        #select the alignment of the leaf with itself
        sub = allvall[allvall['query'].isin( [treenode.name] )]
        sub = sub[sub['target'].isin([treenode.name])]
        sub = sub.iloc[0]
        print('leaf',sub)
        with open(alnfolder + treenode.name + '_inter.fasta', 'w') as f:
            f.write('>' + sub['query'] + '\n')
            f.write(sub['qaln'] + '\n')
        with open(alnfolder + treenode.name + '_inter.3di.fasta', 'w') as f:
            f.write('>' + sub['query'] + '\n')
            f.write(sub['3di_qaln_mode2'] + '\n')
        treenode.aln = alnfolder + treenode.name + '_inter.fasta'
        treenode.aln3di = alnfolder + treenode.name + '_inter.3di.fasta'
        return treenode.aln, treenode.aln3di
    
    else:
        childalns3di = {}
        childalnsAA = {}
       
        #treenode.leafset = get_leafset(treenode)
        #get the intersection of the child leafsets
        treenode.leafset = get_leafset(treenode)
        children = treenode.get_children()
        
        if len(children) == 2 and children[0].is_leaf() and children[1].is_leaf():
            #treat the case of a cherry
            print('cherry', children[0].name , children[1].name)
            treenode.aln = sub2fasta( retalns(allvall, [children[0].name] , [children[1].name]) , alnfolder + treenode.name + '_inter.fasta')
            treenode.aln3di = sub2fasta( retalns(allvall, [children[0].name] , [children[1].name]) , alnfolder + treenode.name + '_inter.3di.fasta' , fastacol1='3di_qaln_mode2' , fastacol2='3di_taln_mode2')
            return treenode.aln, treenode.aln3di
        
        else:
            #not a cherry. one or both sides is a subtree
            print('not cherry', treenode.name  )
            print( 'children', [c.name for c in children])
            for c in treenode.get_children():
                #make sub aln for each child
                if verbose == True:
                    print('traverse', c.name , c.is_leaf() , c.leafset)
                if not c.aln:
                    c.aln,c.aln3di = traverse_tree_merge_mafft(c , treenode.leafset , allvall, alnfolder , verbose = verbose)
                childalnsAA[c] = { 'fasta': c.aln  }
                childalns3di[c] = { 'fasta': c.aln3di  }

            c1,c2 = children
            if c1.is_leaf():
                treenode.aln = mafft_addfull(childalnsAA[c1]['fasta'], childalnsAA[c2]['fasta'], alnfolder + treenode.name + '_inter.fasta' )
                treenode.aln3di = mafft_addfull(childalns3di[c1]['fasta'], childalns3di[c2]['fasta'], alnfolder + treenode.name + '_inter3di.fasta' , submat =submat )

            elif c2.is_leaf():
                treenode.aln = mafft_addfull(childalnsAA[c2]['fasta'], childalnsAA[c1]['fasta'], alnfolder + treenode.name + '_inter.fasta' )
                treenode.aln3di = mafft_addfull(childalns3di[c1]['fasta'], childalns3di[c2]['fasta'], alnfolder + treenode.name + '_inter3di.fasta' , submat = submat)

            else:                
                treenode.aln = mafft_profile(childalnsAA[c1]['fasta'], childalnsAA[c2]['fasta'], alnfolder + treenode.name + '_inter.fasta' )
                treenode.aln3di = mafft_profile(childalns3di[c1]['fasta'], childalns3di[c2]['fasta'], alnfolder + treenode.name + '_inter3di.fasta' , submat =  submat)

            if verbose == True:
                #check if node is root  
                if treenode.up == None:
                    print('final aln')
                    print('childalnsAA', childalnsAA)
                    print('childalns3di', childalns3di)
            return treenode.aln, treenode.aln3di


def remove_redundant( alignment ):
    """
    this function removes redundant sequences from an alignment
    """
    aln = SeqIO.parse(alignment, 'fasta')
    seqs = []
    ids = []
    for s in aln:
        if s.id not in ids:
            seqs.append(s)
            ids.append(s.id)
    
    with open(alignment, 'w') as f:
        for s in seqs:
            f.write('>' + s.id + '\n')
            f.write(str(s.seq) + '\n')
    return alignment

#remove all alns except the final merged one
def cleanup( filedir ):
    """
    this function removes all alns except the final merged one
    """
    for f in glob.glob(filedir + '*inter.fasta'):
        os.remove(f)

In [149]:

#traverse tree from root to leaves recursively
def traverse_tree_merge( treenode, topleafset, allvall , alnfolder , verbose = False):
    """
    this function traverses a tree from root to leaves recursively
    it returns a dictionary with the iteratively built alignment
    """
    if verbose == True:
        print('traverse', treenode.name , treenode.is_leaf() , treenode.leafset)
    
    if treenode.is_leaf():
        topleafset.remove(treenode.name)
        #if the node is a leaf, then we need to add it to the alignment with one of the pivots in the current leafset
        sub = retalns(allvall, [treenode.name] , topleafset)  
        treenode.aln = sub2fasta(sub, alnfolder + treenode.name + '_inter.fasta')
        treenode.aln3di = sub2fasta(sub, alnfolder + treenode.name + '_inter.3di.fasta' , fastacol1='3di_qaln_mode2' , fastacol2='3di_taln_mode2')
        return treenode.aln, treenode.aln3di
    
    else:
        childalns3di = {}
        childalnsAA = {}
        bridges3di = {}
        bridgesAA = {}
        #treenode.leafset = get_leafset(treenode)
        #get the intersection of the child leafsets
        treenode.leafset = get_leafset(treenode)
        children = treenode.get_children()
        
        if len(children) == 2 and children[0].is_leaf() and children[1].is_leaf():
            #treat the case of a cherry
            print('cherry', children[0].name , children[1].name)
            treenode.aln = sub2fasta( retalns(allvall, [children[0].name] , [children[1].name]) , alnfolder + treenode.name + '_inter.fasta')
            treenode.aln3di = sub2fasta( retalns(allvall, [children[0].name] , [children[1].name]) , alnfolder + treenode.name + '_inter.3di.fasta' , fastacol1='3di_qaln_mode2' , fastacol2='3di_taln_mode2')
            return treenode.aln, treenode.aln3di
        
        else:
            #not a cherry. one or both sides is a subtree
            print('not cherry', treenode.name  )
            print( 'children', [c.name for c in children])
            for c in treenode.get_children():
                #make sub aln for each child
                if verbose == True:
                    print('traverse', c.name , c.is_leaf() , c.leafset)
                if not c.aln:
                    c.aln,c.aln3di = traverse_tree_merge(c , treenode.leafset , allvall, alnfolder , verbose = verbose)
                childalnsAA[c] = { 'fasta': c.aln , 'protset':set(get_fasta_leafset(c.aln) ) }
                childalns3di[c] = { 'fasta': c.aln3di , 'protset':set(get_fasta_leafset(c.aln3di) ) }
            
            """
            for c1,c2 in itertools.combinations(treenode.get_children(),2):
                bridge = retalns(allvall, childalnsAA[c1]['protset'] , childalnsAA[c2]['protset'] )
                bridgesAA[(c1,c2)] = { 'fasta': sub2fasta(bridge, alnfolder + treenode.name + '_bridge.fasta') , 'protset':set([bridge.query , bridge.target]) }
                bridges3di[(c1,c2)] = { 'fasta' : sub2fasta(bridge, alnfolder + treenode.name + '_bridge.3di.fasta' , fastacol1='3di_qaln_mode2' , fastacol2='3di_taln_mode2') , 'protset':set([bridge.query, bridge.target]) }
            
            #successively merge the alignments of the children
            for i, c in enumerate(itertools.combinations(treenode.get_children(),2)):
                c1,c2 = c
                if verbose == True:
                    print('merge', c1.name , c2.name)
                if i == 0:
                    #first merge
                    try:
                        print('first merge')
                        
                        alnAA = mergealns( childalnsAA[c1]['fasta'], bridgesAA[(c1,c2)]['fasta'] ,alnfolder + treenode.name + '_inter.fasta' , verbose=verbose)
                        aln3di = mergealns( childalns3di[c1]['fasta'], bridges3di[(c1,c2)]['fasta'] ,alnfolder + treenode.name + '_inter3di.fasta', verbose=verbose)

                        print('2 merge')

                        alnAA = mergealns( childalnsAA[c2]['fasta'], alnAA , alnfolder + treenode.name + '_inter.fasta' , verbose=verbose)
                        aln3di = mergealns( childalns3di[c2]['fasta'], aln3di ,alnfolder + treenode.name + '_inter3di.fasta', verbose=verbose)

                    except:
                        print( treenode , childalnsAA , childalns3di , bridgesAA , bridges3di)
                        raise Exception('merge error 1')
                else:
                    try:
                        print('3 merge')

                        alnAA = mergealns( childalnsAA[c1]['fasta'], bridgesAA[(c1,c2)]['fasta'] , alnfolder + treenode.name + '_inter.fasta' , verbose=verbose)
                        alnAA = mergealns( childalnsAA[c1]['fasta'], alnAA , alnfolder + treenode.name + '_inter.fasta' , verbose=verbose)

                        print('4 merge')

                        aln3di = mergealns( childalns3di[c2]['fasta'], bridges3di[(c1,c2)]['fasta'] , alnfolder + treenode.name + '_inter3di.fasta'   , verbose=verbose)
                        aln3di = mergealns( childalns3di[c2]['fasta'], aln3di , alnfolder + treenode.name + '_inter3di.fasta' , verbose=verbose)

                        
                    except:
                        print( treenode )
                        print( childalnsAA , childalns3di , bridgesAA , bridges3di)
                        raise Exception('merge error 2') 
            """

            treenode.aln = alnAA
            treenode.aln3di = aln3di
            if verbose == True:
                #check if node is root  
                if treenode.up == None:
                    print('final aln')
                    print('childalnsAA', childalnsAA)
                    print('childalns3di', childalns3di)
            return treenode.aln, treenode.aln3di


In [150]:

def aln_mapping( s1 , s2, maxaln = 0 , coidx1= 0 , coidx2 = 0 , start1 = 0 , start2=0 ,  verbose = False):
    #build a dictionary of the positions of the characters in the string
    #convolve the strings
    maxaln, maxcount = convolve_strings(s1,s2)

    if start1 != 0 or start2 != 0:
        print('start', start1, start2)
    if verbose == True:
        print('maxaln', maxaln, maxcount)
    
    #find starting points
    if len(s1) < len(s2):
        if maxaln < 0:
            coidx1 = np.abs(maxaln)
            coidx2 = 0
        else:
            coidx1 = 0
            coidx2 = maxaln
    else:
        if maxaln < 0:
            coidx1 = 0
            coidx2 = np.abs(maxaln)
        else:
            coidx1 = maxaln
            coidx2 = 0
    
    substr1 = s1[coidx1:]
    substr2 = s2[coidx2:]

    print('substr1', substr1)
    print('substr2', substr2)
    print('coidx1', coidx1)
    print('coidx2', coidx2)
    
    #find equivalent positions in the strings
    maps1 = {}
    maps2 = {}

    oppositemap1 = {}
    oppositemap2 = {}

    for i, char in enumerate(substr1):
        if substr2[i] == char:
            maps1[i+coidx1] = i + start1
            maps2[i+coidx2] = i + start2
            
            oppositemap1[i+coidx1] = i + start2
            oppositemap2[i+coidx2] = i + start1
        else:
            #if there is mismatch convolve the remaining strings
            print('mismatch')
            sub1,sub2 , om1 , om2 = aln_mapping( substr1[i:] , substr2[i:], maxaln = 0 , start1= start1+coidx1+i , start2 = start2+coidx2+i , verbose = False)
            maps1.update( sub1 )
            maps2.update( sub2 )
            oppositemap1.update( om1 )
            oppositemap2.update( om2 )
            break
    
    return maps1, maps2 , oppositemap1, oppositemap2

def aln_mapping_full( s1 , s2, maxaln = 0 , coidx1= 0 , coidx2 = 0 , verbose = False):
    maps1, maps2 , om1 , om2 = aln_mapping( s1 , s2, maxaln = 0 , coidx1= 0 , coidx2 = 0 , verbose = False)
    #add the reverse mapping
    revmap1 = { v:k for k,v in maps1.items()}
    revmap2 = { v:k for k,v in maps2.items()}

    revmapom1 = { v:k for k,v in om1.items()}   
    revmapom2 = { v:k for k,v in om2.items()}
    
    return maps1, maps2, om1, om2 , revmap1, revmap2 , revmapom1, revmapom2

In [151]:


def convolve_strings(str1, str2):
    # Determine the lengths of the strings
    len1, len2 = len(str1), len(str2)

    if len(str1) < len(str2):
        str1, str2 = str2, str1
        len1, len2 = len2, len1
    max_alignment = 0
    max_count = 0
    # Slide str2 over str1, starting with one character overlap
    # and continue until str2 is again overlapping by just one character
    for i in range(-len2 + 1, len1):
        count = 0
        for j in range(len2):
            if 0 <= i + j < len1 and str1[i + j] == str2[j]:
                count += 1
        if count > max_count:
            max_count = count
            max_alignment = i
    return max_alignment, max_count


def alnchop(s1,s2,rawaln1,rawaln2,aln1,aln2,maxaln = 0):

    #align the two sequences
    coidx1 = 0
    coidx2 = 0

    if len(s1) < len(s2):
        if maxaln < 0:
            coidx1 = np.abs(maxaln)
            coidx2 = 0
        else:
            coidx1 = 0
            coidx2 = maxaln
    else:
        if maxaln < 0:
            coidx1 = 0
            coidx2 = np.abs(maxaln)
        else:
            coidx1 = maxaln
            coidx2 = 0
    discardcount = 0 
    rawaln1 = iter(rawaln1)
    while coidx1 > 0:
        rchar1 = next(rawaln1)
        discardcount += 1
        if rchar1 != '-':
            coidx1 -= 1
    aln1 = aln1[discardcount:]


    discardcount = 0 
    rawaln2 = iter(rawaln2)
    while coidx1 > 0:
        rchar2 = next(rawaln2)
        discardcount += 1
        if rchar2 != '-':
            coidx2 -= 1
    aln2 = aln2[discardcount:]


    rawaln1 = ''.join([ s for s in iter(rawaln1)])
    rawaln2 = ''.join([ s for s in iter(rawaln2)])

    return aln1, aln2, rawaln1, rawaln2


def mergealns( aln1f, aln2f, outfile , verbose = False):
    if set(get_fasta_leafset(aln1f)) == set(get_fasta_leafset(aln2f)):
        print('identical')
        return aln1f

    #find sequences in common between the two alignments
    aln1 = SeqIO.parse(aln1f, 'fasta')
    aln2 = SeqIO.parse(aln2f, 'fasta')
    ids1 = {s.id:str(s.seq) for s in aln1}
    ids2 = {s.id:str(s.seq) for s in aln2}
    aln1 = SeqIO.parse(aln1f, 'fasta')
    aln2 = SeqIO.parse(aln2f, 'fasta')
    idlist = [ s.id for s in aln1] + [ s.id for s in aln2]
    commonids = set(ids1.keys()).intersection(set(ids2.keys()))
    try:
        assert len(commonids) > 0
    except:
        print('no common ids')
        print('ids1', ids1)
        print('ids2', ids2)
        raise Exception('no common ids')
    #transform both alignments into numpy matrices
    aln1 = SeqIO.parse(aln1f, 'fasta')
    aln2 = SeqIO.parse(aln2f, 'fasta')
    aln1 = np.array([ list(str(s.seq)) for s in aln1])
    aln2 = np.array([ list(str(s.seq)) for s in aln2])
    nrows1 = aln1.shape[0]
    nrows2 = aln2.shape[0]

    #generate a list of column arrays
    aln1 = [ aln1[:,i] for i in range(aln1.shape[1])]
    aln2 = [ aln2[:,i] for i in range(aln2.shape[1])]
    #find the best common sequence
    maxconv = 0
    maxaln = 0

    print(ids1)
    print(ids2)
    
    for commonid in commonids:
        s1t = ids1[commonid]
        s2t = ids2[commonid]
        s1t = s1t.replace('-','')
        s2t = s2t.replace('-','')
        #if the common subsequence is not found start by removing the first character of the common sequence
        #convolution of the two sequences
        aln, count = convolve_strings(s1t,s2t)
        if count > maxconv:
            maxconv = count
            maxaln = aln
            ID = commonid
            s1 = s1t
            s2 = s2t

    rawaln1 = ids1[ID]
    rawaln2 = ids2[ID]
    print('pivot' , ID)
    print('s1',s1)
    print('s2',s2)

    print('rawaln1', rawaln1)
    print('rawaln2', rawaln2)
    
    print('maxaln', maxaln)
    #use the sequence convolution to align the two alignment arrays
    if len(s1) < len(s2):
        if maxaln < 0:
            coidx1 = np.abs(maxaln)
            coidx2 = 0
        else:
            coidx1 = 0
            coidx2 = maxaln
    else:
        if maxaln < 0:
            coidx1 = 0
            coidx2 = np.abs(maxaln)
        else:
            coidx1 = maxaln
            coidx2 = 0
    print('coidx1', coidx1)
    print('coidx2', coidx2)


    #remove the leading gaps
    for i in range(len(rawaln1)):
        if rawaln1 != '-':
            break
    aln1 = aln1[i:]
    rawaln1 = rawaln1[i:]
    for i in range(len(rawaln1)):
        if rawaln1 != '-':
            break
    aln2 = aln2[i:]
    rawaln2 = rawaln2[i:]

    #remove the trailing gaps
    for i in range(len(rawaln1)):
        if rawaln1[-i] != '-':
            break
    aln1 = aln1[:len(rawaln1)-i]
    rawaln1 = rawaln1[:len(rawaln1)-i]
    for i in range(len(rawaln2)):
        if rawaln2[-i] != '-':
            break
    aln2 = aln2[:len(rawaln2)-i]
    rawaln2 = rawaln2[:len(rawaln2)-i]
    

    #construct alignment with common sequence

    rawaln1 = iter(rawaln1)
    rawaln2 = iter(rawaln2)    
    char1 = next(rawaln1)
    char2 = next(rawaln2)
    i = 1
    j = 1 
    pchar1 = char1
    pchar2 = char2
    newaln1 = []
    newaln2 = []
    convolved = False
    while True:
        try:
            if pchar1 == '-' and char1 != '-':
                print('end insertion1')

            if pchar2 == '-' and char2 != '-':
                print('end insertion2')

            if char1 == '-' and char2 != '-':
                print('insertion1')
                newaln2.append(['-']*nrows2)
                newaln1.append(aln1[i])
                pchar1 = char1
                char1 = next(rawaln1)
                i +=1

            elif char2 == '-' and char1 != '-':
                print('insertion2')
                newaln2.append(aln2[j])
                newaln1.append(['-']*nrows1)
                pchar2 = char2
                char2 = next(rawaln2)
                j +=1

            elif char1 == char2 and char1 != '-' and char2 != '-':
                char1 = next(rawaln1)
                char2 = next(rawaln2)
                newaln2.append(aln2[j])
                newaln1.append(aln1[i])
                pchar1 = char1
                pchar2 = char2
                j+= 1
                i+= 1

            elif char1 != '-' and char2 != '-' and char1 != char2:
                convolved = True
                print('mismatch')
                #mismatch reconvolve remaining strings
                rawaln1 = ''.join([ s for s in iter(rawaln1)])
                rawaln2 = ''.join([ s for s in iter(rawaln2)])
                s1 = rawaln1.replace('-','')
                s2 = rawaln2.replace('-','')

                maxaln, count = convolve_strings(s1,s2)
                if len(s1) < len(s2):
                    if maxaln < 0:
                        coidx1 = np.abs(maxaln)
                        coidx2 = 0
                    else:
                        coidx1 = 0
                        coidx2 = maxaln
                else:
                    if maxaln < 0:
                        coidx1 = 0
                        coidx2 = np.abs(maxaln)
                    else:
                        coidx1 = maxaln
                        coidx2 = 0
                
                rawaln1 = iter(rawaln1)
                discardcount1 = 0
                count1 = 0
                while count1 < coidx1:
                    discardcount1 += 1
                    char1 = next(rawaln1)
                    if char1 != '-':
                        count1 += 1
                aln1 = aln1[discardcount1:]
                rawaln2 = iter(rawaln2) 
                discardcount2 = 0
                count2 = 0
                while count2 < coidx2:
                    discardcount2 += 1
                    char2 = next(rawaln2)
                    if char2 != '-':
                        count2 += 1
                aln2 = aln2[discardcount2:]

                char1 = next(rawaln1)
                char2 = next(rawaln2)
                

                i = 0 
                j = 0
                print('char1', char1)
                print('char2', char2)

                print( 'newaln1' , newaln1)

                print( 'newaln2' , newaln2)
                
            else:
                print('end')
                break
        
        except StopIteration:
            break
    try:
        newaln1 = np.vstack(newaln1).T
        newaln2 = np.vstack(newaln2).T
        newaln = np.concatenate((newaln1, newaln2), axis = 0)
    except:
        print( 'aln err')
        print( 'char1', char1 )
        print( 'char2' , char2 )
        print( 'aln1', aln1 ) 
        print( 'aln2' , aln2 )
        print( 'maxaln'  , maxaln)
        raise Exception('aln err: newaln not created')
    #write out the new alignment
    with open(outfile, 'w') as f:
        for i in range(newaln.shape[0]):
            #print('>' + idlist[i] + '\n' + ''.join(list(newaln[i,:])) + '\n')
            f.write('>' + idlist[i] + '\n')
            f.write(''.join(list(newaln[i,:])) + '\n')
    remove_redundant( outfile )
    with open(outfile) as out:
        print(out.read())
    return outfile  

In [152]:

import toytree
import os
import pandas as pd
import glob


alndf = pd.read_table('../testdata/HOG113/allvall_1.csv', header = None)

infolder = '../testdata/HOG113/allvall_1.csv'.split('/')[:-1]
infolder = ''.join( [i + '/' for i in infolder])
mapper3di, mapperAA = read_dbfiles3di( "../testdata/HOG113/outdb" , "../testdata/HOG113/outdb_ss")
#add the 3di alignment to the dataframe
columns = 'query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,lddt,qaln,taln,cigar,lntmscore'.split(',')
alndf.columns = columns

alndf['3diq']= alndf['query'].map(mapper3di)
alndf['3dit']= alndf['target'].map(mapper3di)
alndf['AAq']= alndf['query'].map(mapperAA)
alndf['AAt']= alndf['target'].map(mapperAA)

#output a fasta with the 3di sequences
res = alndf.apply(calc_fident_crossaln , axis = 1)
alndf = pd.concat([alndf,res] , axis = 1)

with open('../testdata/HOG113/3diseqs.fasta' , 'w') as out:
    for seq in alndf['query'].unique():
        out.write('>'+seq.replace('.pdb', '' )+'\n')
        out.write(mapper3di[seq]+'\n')


alndf['query'] = alndf['query'].map(lambda x :x.replace('.pdb', ''))
alndf['target'] = alndf['target'].map(lambda x :x.replace('.pdb', ''))



In [153]:
#prepare tree attributes
tre = toytree.tree('../testdata/HOG113/fident_1_raw_struct_tree.PP.nwk.rooted'  )
for i,n in enumerate(tre.treenode.traverse()):
    n.aln = None
    n.aln3di = None
    n.leafset = None
    if len(n.name) == 0:
        n.name = 'internal_'+str(i)

alnfolder = infolder+'alnscratch/'
if not os.path.exists(alnfolder):
    os.mkdir(infolder+'alnscratch/')

#clear all files in aln scratch
cleanup(alnfolder)
submat = '../mafftmat/3diHEXmat.txt'
finalaln, finalaln3di = traverse_tree_merge_mafft( tre.treenode.get_tree_root(), get_leafset(tre.treenode.get_tree_root()) , alndf , infolder+'alnscratch/' , submat = submat , verbose = True ) 
print('finalaln',finalaln)
#print the final alignments
print('nsequences' , len(tre.get_tip_labels()))
finalaln = remove_redundant(finalaln)
finalaln3di = remove_redundant(finalaln3di)



traverse 76 False None
not cherry 76
children ['75', '74']
traverse 75 False None
traverse 75 False None
not cherry 75
children ['Q74CJ3', '73']
traverse Q74CJ3 True None
traverse Q74CJ3 True None
leaf query                                                        Q74CJ3
target                                                       Q74CJ3
fident                                                          1.0
alnlen                                                          143
mismatch                                                          0
gapopen                                                           0
qstart                                                            1
qend                                                            143
tstart                                                            1
tend                                                            143
evalue                                                          0.0
bits                                              

nadd = 1
rescale = 1
dndpre (aa) Version 7.520
alg=X, model=BLOSUM62, 2.00, -0.10, +0.10, noshift, amax=0.0
0 thread(s)

rescale = 1
All-to-all alignment.
    1 / 2

##### writing hat3
pairlocalalign (aa) Version 7.520
alg=Y, model=BLOSUM62, 2.00, -0.10, +0.10, noshift, amax=0.0
0 thread(s)

nadd = 1
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 2
njobc = 3
Loading 'hat3' ... 
done.
rescale = 1
Loading 'hat2n' (aligned sequences - new sequences) ... done.
Loading 'hat2i' (aligned sequences) ... done.
cTEP 0 / 1                    

Combining ..
   done.                      

   done.                      

addsingle (aa) Version 7.520
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


To keep the alignment length, 3 letters were DELETED.
To know the positions of deleted letters, rerun the same command with the --mapout option.

Strategy:
 Multi-INS-full (Not tested.)
 ?

If unsure which option to use, try 'mafft --auto input > output'.
For m

mafft --seed ../testdata/HOG113/alnscratch/65_inter.3di.fasta ../testdata/HOG113/alnscratch/64_inter.3di.fasta > ../testdata/HOG113/alnscratch/69_inter3di.fasta
mafft --seed ../testdata/HOG113/alnscratch/70_inter.fasta ../testdata/HOG113/alnscratch/69_inter.fasta > ../testdata/HOG113/alnscratch/73_inter.fasta
mafft --seed ../testdata/HOG113/alnscratch/70_inter3di.fasta ../testdata/HOG113/alnscratch/69_inter3di.fasta > ../testdata/HOG113/alnscratch/73_inter3di.fasta


nhomologs = 2
seedoffset = 0
seed = seed1
rescale = 1
Gap Penalty = -1.53, +0.00, -0.12
tsuyosa = 400.000000

adding 0-1
multi2hat3s (aa) Version 7.520
alg=A, model=BLOSUM62, 1.53, +0.12, -0.00, noshift, amax=0.0
1 thread(s)

nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 4
done.

Constructing a UPGMA tree (efffree=1) ... 
    0 / 4
done.

Progressive alignment 1/1... 
STEP     3 / 3 
done.

disttbfast (aa) Version 7.520
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)

treein = 0
compacttree = 0
stacksize: 8192 kb
Loading 'hat3' ... 
done.
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00
tbutree = 0, compacttree = 0
Making a distance matrix from msa .. 
done.                                           
Constructing a UPGMA tree ... 
    0 / 4
    0 / 2
done.

Progressive alignment ... 
STEP     3 /3 
done.
tbfast (aa) Version 7.520
alg=A, model=BLOSUM6

mafft --addfull  ../testdata/HOG113/alnscratch/Q74CJ3_inter.fasta --keeplength ../testdata/HOG113/alnscratch/73_inter.fasta > ../testdata/HOG113/alnscratch/75_inter.fasta
mafft --addfull  ../testdata/HOG113/alnscratch/Q74CJ3_inter.3di.fasta --keeplength ../testdata/HOG113/alnscratch/73_inter3di.fasta > ../testdata/HOG113/alnscratch/75_inter3di.fasta
traverse 74 False None
traverse 74 False None
not cherry 74
children ['72', '71']
traverse 72 False None
traverse 72 False None
cherry B8E097 Q8TU10
traverse 71 False None
traverse 71 False None
not cherry 71
children ['68', '67']
traverse 68 False None
traverse 68 False None
not cherry 68
children ['A2F1C4', '63']
traverse A2F1C4 True None
traverse A2F1C4 True None
leaf query                                                        A2F1C4
target                                                       A2F1C4
fident                                                          1.0
alnlen                                                          145
mi

nadd = 1
rescale = 1
dndpre (aa) Version 7.520
alg=X, model=BLOSUM62, 2.00, -0.10, +0.10, noshift, amax=0.0
0 thread(s)

rescale = 1
All-to-all alignment.
    6 / 7

##### writing hat3
pairlocalalign (aa) Version 7.520
alg=Y, model=BLOSUM62, 2.00, -0.10, +0.10, noshift, amax=0.0
0 thread(s)

nadd = 1
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 7
njobc = 8
Loading 'hat3' ... 
done.
rescale = 1
Loading 'hat2n' (aligned sequences - new sequences) ... done.
Loading 'hat2i' (aligned sequences) ... done.
cTEP 0 / 1                    

Combining ..
   done.                      

   done.                      

addsingle (aa) Version 7.520
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


To keep the alignment length, 2 letters were DELETED.
To know the positions of deleted letters, rerun the same command with the --mapout option.

Strategy:
 Multi-INS-full (Not tested.)
 ?

If unsure which option to use, try 'mafft --auto input > output'.
For m

mafft --addfull  ../testdata/HOG113/alnscratch/A2F1C4_inter.3di.fasta --keeplength ../testdata/HOG113/alnscratch/63_inter.3di.fasta > ../testdata/HOG113/alnscratch/68_inter3di.fasta
traverse 67 False None
traverse 67 False None
not cherry 67
children ['62', '61']
traverse 62 False None
traverse 62 False None
cherry Q54BU9 A9UTI8
traverse 61 False None
traverse 61 False None
not cherry 61
children ['60', '59']
traverse 60 False None
traverse 60 False None
not cherry 60
children ['58', '57']
traverse 58 False None
traverse 58 False None
not cherry 58
children ['Q7PYJ8', '55']
traverse Q7PYJ8 True None
traverse Q7PYJ8 True None
leaf query                                                        Q7PYJ8
target                                                       Q7PYJ8
fident                                                          1.0
alnlen                                                          157
mismatch                                                          0
gapopen               

nadd = 1
rescale = 1
dndpre (aa) Version 7.520
alg=X, model=BLOSUM62, 2.00, -0.10, +0.10, noshift, amax=0.0
0 thread(s)

rescale = 1
All-to-all alignment.
    1 / 2

##### writing hat3
pairlocalalign (aa) Version 7.520
alg=Y, model=BLOSUM62, 2.00, -0.10, +0.10, noshift, amax=0.0
0 thread(s)

nadd = 1
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 2
njobc = 3
Loading 'hat3' ... 
done.
rescale = 1
Loading 'hat2n' (aligned sequences - new sequences) ... done.
Loading 'hat2i' (aligned sequences) ... done.
cTEP 0 / 1                    

Combining ..
   done.                      

   done.                      

addsingle (aa) Version 7.520
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


To keep the alignment length, 9 letters were DELETED.
To know the positions of deleted letters, rerun the same command with the --mapout option.

Strategy:
 Multi-INS-full (Not tested.)
 ?

If unsure which option to use, try 'mafft --auto input > output'.
For m

mafft --addfull  ../testdata/HOG113/alnscratch/C3Y972_inter.3di.fasta --keeplength ../testdata/HOG113/alnscratch/54_inter.3di.fasta > ../testdata/HOG113/alnscratch/57_inter3di.fasta
mafft --seed ../testdata/HOG113/alnscratch/58_inter.fasta ../testdata/HOG113/alnscratch/57_inter.fasta > ../testdata/HOG113/alnscratch/60_inter.fasta
mafft --seed ../testdata/HOG113/alnscratch/58_inter3di.fasta ../testdata/HOG113/alnscratch/57_inter3di.fasta > ../testdata/HOG113/alnscratch/60_inter3di.fasta


Loading 'hat3' ... done.
rescale = 1

    0 / 6
Segment   1/  1    1- 157
done 002-001-1  rejected..   
dvtditr (aa) Version 7.520
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-INS-i (Not tested.)
 ?

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct).
It tends to insert more gaps into gap-rich regions than previous versions.
To disable this change, add the --leavegappyregion option.

nadd = 1
rescale = 1
dndpre (aa) Version 7.520
alg=X, model=BLOSUM62, 2.00, -0.10, +0.10, noshift, amax=0.0
0 thread(s)

rescale = 1
All-to-all alignment.
    1 / 2

##### writing hat3
pairlocalalign (aa) Version 7.520
alg=Y, model=BLOSUM62, 2.00, -0.10, +0.10, noshift, amax=0.0
0 thread(s)

nadd = 1
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 2
njobc = 3
Loading 'hat3' ... 


traverse 59 False None
traverse 59 False None
not cherry 59
children ['F7AW07', '56']
traverse F7AW07 True None
traverse F7AW07 True None
leaf query                                                        F7AW07
target                                                       F7AW07
fident                                                          1.0
alnlen                                                          160
mismatch                                                          0
gapopen                                                           0
qstart                                                            1
qend                                                            160
tstart                                                            1
tend                                                            160
evalue                                                          0.0
bits                                                           1325
lddt                                     

nadd = 1
rescale = 1
dndpre (aa) Version 7.520
alg=X, model=BLOSUM62, 2.00, -0.10, +0.10, noshift, amax=0.0
0 thread(s)

rescale = 1
All-to-all alignment.
    1 / 2

##### writing hat3
pairlocalalign (aa) Version 7.520
alg=Y, model=BLOSUM62, 2.00, -0.10, +0.10, noshift, amax=0.0
0 thread(s)

nadd = 1
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 2
njobc = 3
Loading 'hat3' ... 
done.
rescale = 1
Loading 'hat2n' (aligned sequences - new sequences) ... done.
Loading 'hat2i' (aligned sequences) ... done.
cTEP 0 / 1                    

Combining ..
   done.                      

   done.                      

addsingle (aa) Version 7.520
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


Strategy:
 Multi-INS-full (Not tested.)
 ?

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct

traverse 43 False None
traverse 43 False None
not cherry 43
children ['41', '40']
traverse 41 False None
traverse 41 False None
cherry M0R5Y9 Q505B7
traverse 40 False None
traverse 40 False None
not cherry 40
children ['A0A2I2Y839', '39']
traverse A0A2I2Y839 True None
traverse A0A2I2Y839 True None
leaf query                                                    A0A2I2Y839
target                                                   A0A2I2Y839
fident                                                          1.0
alnlen                                                          137
mismatch                                                          0
gapopen                                                           0
qstart                                                            1
qend                                                            137
tstart                                                            1
tend                                                            137
evalue          

############################################################################
#   Progressive alignment method is incompatible with the --seed option.
#   Automatically switched to the iterative refinement method.
#   
# Also consider using the '--add' option, which is compatible with
#   the progressive method and FASTER than the '--seed' option.
#   Usage is:
#   % mafft --add newSequences existingAlignment > output
############################################################################
nhomologs = 3
seedoffset = 0
seed = seed1
rescale = 1
Gap Penalty = -1.53, +0.00, -0.12
tsuyosa = 900.000000

adding 0-1
multi2hat3s (aa) Version 7.520
alg=A, model=BLOSUM62, 1.53, +0.12, -0.00, noshift, amax=0.0
1 thread(s)

nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 5
done.

Constructing a UPGMA tree (efffree=1) ... 
    0 / 5
done.

Progressive alignment 1/1... 
STEP     4 / 4 

mafft --seed ../testdata/HOG113/alnscratch/44_inter.fasta ../testdata/HOG113/alnscratch/43_inter.fasta > ../testdata/HOG113/alnscratch/45_inter.fasta
mafft --seed ../testdata/HOG113/alnscratch/44_inter3di.fasta ../testdata/HOG113/alnscratch/43_inter3di.fasta > ../testdata/HOG113/alnscratch/45_inter3di.fasta
mafft --addfull  ../testdata/HOG113/alnscratch/Q2YDE7_inter.fasta --keeplength ../testdata/HOG113/alnscratch/45_inter.fasta > ../testdata/HOG113/alnscratch/46_inter.fasta


############################################################################
#   Progressive alignment method is incompatible with the --seed option.
#   Automatically switched to the iterative refinement method.
#   
# Also consider using the '--add' option, which is compatible with
#   the progressive method and FASTER than the '--seed' option.
#   Usage is:
#   % mafft --add newSequences existingAlignment > output
############################################################################
nhomologs = 5
seedoffset = 0
seed = seed1
rescale = 1
Gap Penalty = -1.53, +0.00, -0.12
tsuyosa = 2500.000000

adding 1-2
multi2hat3s (aa) Version 7.520
alg=A, model=BLOSUM62, 1.53, +0.12, -0.00, noshift, amax=0.0
1 thread(s)

nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 8
done.

Constructing a UPGMA tree (efffree=1) ... 
    0 / 8
done.

Progressive alignment 1/1... 
STEP     7 / 7

mafft --addfull  ../testdata/HOG113/alnscratch/Q2YDE7_inter.3di.fasta --keeplength ../testdata/HOG113/alnscratch/45_inter3di.fasta > ../testdata/HOG113/alnscratch/46_inter3di.fasta
mafft --seed ../testdata/HOG113/alnscratch/47_inter.fasta ../testdata/HOG113/alnscratch/46_inter.fasta > ../testdata/HOG113/alnscratch/48_inter.fasta
mafft --seed ../testdata/HOG113/alnscratch/47_inter.3di.fasta ../testdata/HOG113/alnscratch/46_inter3di.fasta > ../testdata/HOG113/alnscratch/48_inter3di.fasta


nhomologs = 9
seedoffset = 0
seed = seed1
rescale = 1
Gap Penalty = -1.53, +0.00, -0.12
tsuyosa = 8100.000000

adding 0-1
multi2hat3s (aa) Version 7.520
alg=A, model=BLOSUM62, 1.53, +0.12, -0.00, noshift, amax=0.0
1 thread(s)

nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 11
done.

Constructing a UPGMA tree (efffree=1) ... 
    0 / 11
done.

Progressive alignment 1/1... 
STEP    10 / 10 
done.

disttbfast (aa) Version 7.520
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)

treein = 0
compacttree = 0
stacksize: 8192 kb
Loading 'hat3' ... 
done.
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00
tbutree = 0, compacttree = 0
Making a distance matrix from msa .. 
done.                                           
Constructing a UPGMA tree ... 
    0 / 11
    0 / 2
done.

Progressive alignment ... 
STEP    10 /10 
done.
tbfast (aa) Version 7.520
alg=A, model=B

mafft --seed ../testdata/HOG113/alnscratch/49_inter.fasta ../testdata/HOG113/alnscratch/48_inter.fasta > ../testdata/HOG113/alnscratch/50_inter.fasta
mafft --seed ../testdata/HOG113/alnscratch/49_inter.3di.fasta ../testdata/HOG113/alnscratch/48_inter3di.fasta > ../testdata/HOG113/alnscratch/50_inter3di.fasta
mafft --addfull  ../testdata/HOG113/alnscratch/F6ZCC9_inter.fasta --keeplength ../testdata/HOG113/alnscratch/50_inter.fasta > ../testdata/HOG113/alnscratch/52_inter.fasta


done 002-001-1  identical.   
dvtditr (aa) Version 7.520
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-INS-i (Not tested.)
 ?

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct).
It tends to insert more gaps into gap-rich regions than previous versions.
To disable this change, add the --leavegappyregion option.

############################################################################
#   Progressive alignment method is incompatible with the --seed option.
#   Automatically switched to the iterative refinement method.
#   
# Also consider using the '--add' option, which is compatible with
#   the progressive method and FASTER than the '--seed' option.
#   Usage is:
#   % mafft --add newSequences existingAlignment > output
#####################################################

mafft --addfull  ../testdata/HOG113/alnscratch/F6ZCC9_inter.3di.fasta --keeplength ../testdata/HOG113/alnscratch/50_inter3di.fasta > ../testdata/HOG113/alnscratch/52_inter3di.fasta
mafft --seed ../testdata/HOG113/alnscratch/53_inter.fasta ../testdata/HOG113/alnscratch/52_inter.fasta > ../testdata/HOG113/alnscratch/56_inter.fasta
mafft --seed ../testdata/HOG113/alnscratch/53_inter3di.fasta ../testdata/HOG113/alnscratch/52_inter3di.fasta > ../testdata/HOG113/alnscratch/56_inter3di.fasta


nadd = 1
rescale = 1
dndpre (aa) Version 7.520
alg=X, model=BLOSUM62, 2.00, -0.10, +0.10, noshift, amax=0.0
0 thread(s)

rescale = 1
All-to-all alignment.
   12 / 13

##### writing hat3
pairlocalalign (aa) Version 7.520
alg=Y, model=BLOSUM62, 2.00, -0.10, +0.10, noshift, amax=0.0
0 thread(s)

nadd = 1
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 13
njobc = 14
Loading 'hat3' ... 
done.
rescale = 1
Loading 'hat2n' (aligned sequences - new sequences) ... done.
Loading 'hat2i' (aligned sequences) ... done.
cTEP 0 / 1                    

Combining ..
   done.                      

   done.                      

addsingle (aa) Version 7.520
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


To keep the alignment length, 7 letters were DELETED.
To know the positions of deleted letters, rerun the same command with the --mapout option.

Strategy:
 Multi-INS-full (Not tested.)
 ?

If unsure which option to use, try 'mafft --auto input > output'.
Fo

mafft --addfull  ../testdata/HOG113/alnscratch/F7AW07_inter.fasta --keeplength ../testdata/HOG113/alnscratch/56_inter.fasta > ../testdata/HOG113/alnscratch/59_inter.fasta
mafft --addfull  ../testdata/HOG113/alnscratch/F7AW07_inter.3di.fasta --keeplength ../testdata/HOG113/alnscratch/56_inter3di.fasta > ../testdata/HOG113/alnscratch/59_inter3di.fasta
mafft --seed ../testdata/HOG113/alnscratch/60_inter.fasta ../testdata/HOG113/alnscratch/59_inter.fasta > ../testdata/HOG113/alnscratch/61_inter.fasta


STEP    23 /23 
done.
tbfast (aa) Version 7.520
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
1 thread(s)

rescale = 1
dndpre (aa) Version 7.520
alg=X, model=BLOSUM62, 1.53, +0.12, -0.00, noshift, amax=0.0
0 thread(s)

minimumweight = 0.000010
autosubalignment = 0.000000
nthread = 0
randomseed = 0
blosum 62 / kimura 200
poffset = 0
niter = 2
sueff_global = 0.100000
nadd = 2
Loading 'hat3' ... done.
rescale = 1

   20 / 24
Segment   1/  1    1- 186
done 002-001-1  identical.   
dvtditr (aa) Version 7.520
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-INS-i (Not tested.)
 ?

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct).
It tends to insert more gaps into gap-rich regions than previous versions.
To disable this change, add the --leavegappyregion option.

#######

mafft --seed ../testdata/HOG113/alnscratch/60_inter3di.fasta ../testdata/HOG113/alnscratch/59_inter3di.fasta > ../testdata/HOG113/alnscratch/61_inter3di.fasta
mafft --seed ../testdata/HOG113/alnscratch/62_inter.fasta ../testdata/HOG113/alnscratch/61_inter.fasta > ../testdata/HOG113/alnscratch/67_inter.fasta


############################################################################
#   Progressive alignment method is incompatible with the --seed option.
#   Automatically switched to the iterative refinement method.
#   
# Also consider using the '--add' option, which is compatible with
#   the progressive method and FASTER than the '--seed' option.
#   Usage is:
#   % mafft --add newSequences existingAlignment > output
############################################################################
nhomologs = 24
seedoffset = 0
seed = seed1
rescale = 1
Gap Penalty = -1.53, +0.00, -0.12
tsuyosa = 57600.000000

adding 0-1
multi2hat3s (aa) Version 7.520
alg=A, model=BLOSUM62, 1.53, +0.12, -0.00, noshift, amax=0.0
1 thread(s)

nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 26
done.

Constructing a UPGMA tree (efffree=1) ... 
   20 / 26
done.

Progressive alignment 1/1... 
STEP    25

mafft --seed ../testdata/HOG113/alnscratch/62_inter.3di.fasta ../testdata/HOG113/alnscratch/61_inter3di.fasta > ../testdata/HOG113/alnscratch/67_inter3di.fasta
mafft --seed ../testdata/HOG113/alnscratch/68_inter.fasta ../testdata/HOG113/alnscratch/67_inter.fasta > ../testdata/HOG113/alnscratch/71_inter.fasta


############################################################################
#   Progressive alignment method is incompatible with the --seed option.
#   Automatically switched to the iterative refinement method.
#   
# Also consider using the '--add' option, which is compatible with
#   the progressive method and FASTER than the '--seed' option.
#   Usage is:
#   % mafft --add newSequences existingAlignment > output
############################################################################
nhomologs = 26
seedoffset = 0
seed = seed1
rescale = 1
Gap Penalty = -1.53, +0.00, -0.12
tsuyosa = 67600.000000

adding 1-2
multi2hat3s (aa) Version 7.520
alg=A, model=BLOSUM62, 1.53, +0.12, -0.00, noshift, amax=0.0
1 thread(s)

nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 29
done.

Constructing a UPGMA tree (efffree=1) ... 
   20 / 29
done.

Progressive alignment 1/1... 
STEP    28

mafft --seed ../testdata/HOG113/alnscratch/68_inter3di.fasta ../testdata/HOG113/alnscratch/67_inter3di.fasta > ../testdata/HOG113/alnscratch/71_inter3di.fasta
mafft --seed ../testdata/HOG113/alnscratch/72_inter.fasta ../testdata/HOG113/alnscratch/71_inter.fasta > ../testdata/HOG113/alnscratch/74_inter.fasta


done 002-001-1  identical.   
dvtditr (aa) Version 7.520
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-INS-i (Not tested.)
 ?

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct).
It tends to insert more gaps into gap-rich regions than previous versions.
To disable this change, add the --leavegappyregion option.

############################################################################
#   Progressive alignment method is incompatible with the --seed option.
#   Automatically switched to the iterative refinement method.
#   
# Also consider using the '--add' option, which is compatible with
#   the progressive method and FASTER than the '--seed' option.
#   Usage is:
#   % mafft --add newSequences existingAlignment > output
#####################################################

mafft --seed ../testdata/HOG113/alnscratch/72_inter.3di.fasta ../testdata/HOG113/alnscratch/71_inter3di.fasta > ../testdata/HOG113/alnscratch/74_inter3di.fasta
mafft --seed ../testdata/HOG113/alnscratch/75_inter.fasta ../testdata/HOG113/alnscratch/74_inter.fasta > ../testdata/HOG113/alnscratch/76_inter.fasta


############################################################################
#   Progressive alignment method is incompatible with the --seed option.
#   Automatically switched to the iterative refinement method.
#   
# Also consider using the '--add' option, which is compatible with
#   the progressive method and FASTER than the '--seed' option.
#   Usage is:
#   % mafft --add newSequences existingAlignment > output
############################################################################
nhomologs = 29
seedoffset = 0
seed = seed1
rescale = 1
Gap Penalty = -1.53, +0.00, -0.12
tsuyosa = 84100.000000

adding 0-1
multi2hat3s (aa) Version 7.520
alg=A, model=BLOSUM62, 1.53, +0.12, -0.00, noshift, amax=0.0
1 thread(s)

nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 31
done.

Constructing a UPGMA tree (efffree=1) ... 
   20 / 31
done.

Progressive alignment 1/1... 
STEP    30

mafft --textmatrix ../mafftmat/3diHEXmat.txt --seed ../testdata/HOG113/alnscratch/75_inter3di.fasta ../testdata/HOG113/alnscratch/74_inter3di.fasta > ../testdata/HOG113/alnscratch/76_inter3di.fasta


STEP    38 / 38  h
done.

disttbfast (text) Version 7.520
alg=A, model=Extended, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)

treein = 0
compacttree = 0
stacksize: 8192 kb
Loading 'hat3' ... 
done.
nalphabets = 256
nused=
Gap Penalty = -1.53, +0.00, +0.00
tbutree = 0, compacttree = 0
Making a distance matrix from msa .. 
done.                                           
Constructing a UPGMA tree ... 
   30 / 39
    0 / 8
done.

Progressive alignment ... 
STEP    38 /38 
done.
tbfast (text) Version 7.520
alg=A, model=Extended, 1.53, -0.00, -0.00, noshift, amax=0.0
1 thread(s)

nalphabets = 256
nused=
dndpre (text) Version 7.520
alg=X, model=Extended, 1.53, +0.12, -0.00, noshift, amax=0.0
0 thread(s)

minimumweight = 0.000010
autosubalignment = 0.000000
nthread = 0
randomseed = 0
blosum -2 / kimura 200
poffset = 0
niter = 2
sueff_global = 0.100000
nadd = 2
Loading 'hat3' ... done.
nalphabets = 256
nused=

   30 / 39
Segment   1/  1    1- 197
STEP 002-013-0  rejected..    identical. 

final aln
childalnsAA {<toytree.TreeNode.TreeNode object at 0x7f905d25e410>: {'fasta': '../testdata/HOG113/alnscratch/75_inter.fasta'}, <toytree.TreeNode.TreeNode object at 0x7f905b1b3cd0>: {'fasta': '../testdata/HOG113/alnscratch/74_inter.fasta'}}
childalns3di {<toytree.TreeNode.TreeNode object at 0x7f905d25e410>: {'fasta': '../testdata/HOG113/alnscratch/75_inter3di.fasta'}, <toytree.TreeNode.TreeNode object at 0x7f905b1b3cd0>: {'fasta': '../testdata/HOG113/alnscratch/74_inter3di.fasta'}}
finalaln ../testdata/HOG113/alnscratch/76_inter.fasta
>_seed__seed_O67888
----------------------------------------LDYEPVYDITADAGIRVRAK
TL-EELFCHAILATFNEITDIDKVEP------KEEYEIQAQND-MPFLLADIINEALVLH
ESK-HFVASECEVLELKEDF----VKVKLKGEKFDPKRHPSKLVIKAATYHRLRVEKKNE
H--W-EA-EVIFDI
>_seed__seed_B5YH50
----------------------------------------MKYK-VIDVAGDVGIRAEGA
SL-EECFINSAFGLYSLITDLAQIEP------AEEIEIIINEDNLENMLVSFLNELIFQF
DTY-GFLGKAISI-EIKDNY----LTARIKGEKFNPEKHERKLLVKAATYHNLVLKQEDS
F--W-IA-EIIFDI
>_seed__seed_A9

STEP 002-004-1  identical.   
Converged.

done
dvtditr (text) Version 7.520
alg=A, model=Extended, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


Strategy:
 NW-INS-i (Not tested.)
 ?

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct).
It tends to insert more gaps into gap-rich regions than previous versions.
To disable this change, add the --leavegappyregion option.



In [155]:

finalaln = remove_seeds(finalaln)
finalaln3di = remove_seeds(finalaln3di)
#print number of sequences in the final alignment   
with open(finalaln) as f:
    print( f.read().count('>'))
with open(finalaln3di) as f:
    print( f.read().count('>'))

with open(finalaln) as out:
    print(out.read())

print('finalaln3di',finalaln3di)
with open(finalaln3di) as out:
    print(out.read())



39
39
>O67888
----------------------------------------LDYEPVYDITADAGIRVRAKTL-EELFCHAILATFNEITDIDKVEP------KEEYEIQAQND-MPFLLADIINEALVLHESK-HFVASECEVLELKEDF----VKVKLKGEKFDPKRHPSKLVIKAATYHRLRVEKKNEH--W-EA-EVIFDI
>B5YH50
----------------------------------------MKYK-VIDVAGDVGIRAEGASL-EECFINSAFGLYSLITDLAQIEP------AEEIEIIINEDNLENMLVSFLNELIFQFDTY-GFLGKAISI-EIKDNY----LTARIKGEKFNPEKHERKLLVKAATYHNLVLKQEDSF--W-IA-EIIFDI
>A9A300
----------------------------------------MSYK-FVDHATDAIIEVTAKDL-QEAFSVTADAVINLTLDQDKVEE------KEQRKFVAEGKDLRYLLFSWLEEIPFLLITE-GFAIKRIEF-DITQNQ----INATAYGEPLDFKKHNFKVEIKAPTFYDMEIKQNGGV--F-MR-FLL-DL
>Q5JCY8
-------------------------------------------E-HYEHTADIGVRGYGSTL-EEAFEAVALGLFDVMVNVKKVEP------KECREVEVEEEDLEALLYSFLEELLVLHDME-GLVFGDVKV-RIEKTENGYKLKAKACGEVLNPEKHEPKEEVKAITYHDMKIEKLPDGR-W-MA-QFVPDL
>Q60334
-------------------------------------------N-YFETTADLGVEAKGKSL-EEAFKEGAKGLYNIMVDIDKVDK------KEKIEFEITGEDLEELLYNFLNELLFYTDVE-NLVFNDFDV-KIEKNDNGYRLKCTAYGEKINKEKHNIKEEVKAVTYHKME