In [46]:
import subprocess
import os
import glob
import toytree
import tqdm
import pandas as pd
from subprocess import PIPE, Popen
import shlex
from Bio import SeqIO
import random
import numpy as np

def Fident(str1,str2 , verbose = False):
    #minlen= min( (len(str1),len(str2))  )
    #str1 = str1[:minlen]
    #str2 = str2[:minlen]
    str1 = np.array(list(str1))
    str2 = np.array(list(str2))            
    return len(np.where( (str1 == str2 ) & (str1 != '-' ) & (str2 != '-')  )[0]) / len(str1)

def copyaln( aln, seq):
    seqiter = iter(seq)
    newaln = ''
    for i,char in enumerate(aln):
        if char == '-':
            newaln += '-'
        else:
            newaln+=next(seqiter)
    return newaln

def read_dbfiles3di(  AADB , threeDidb):
    #find positions 
    threeDiseq = [ l.strip().replace('\x00','') for l in open(threeDidb)]
    lookup = AADB+'.lookup'
    ids = [ l.split()[1].strip() for l in open(lookup)]
    AAs = [ l.strip().replace('\x00','') for l in open(AADB)]

    mapper3di = dict(zip(ids,threeDiseq))
    mapperAA = dict(zip(ids,AAs))
    
    return mapper3di, mapperAA

def calc_fident_crossaln(row , verbose = False):
    #amino acid representations of alns using AAand3di or just 3di
    qaln_2, taln_2 = row.qaln , row.taln
    #start and stop of aln
    
    qstart_2, qend_2, tstart_2 , tend_2 = row.qstart, row.qend , row.tstart , row.tend
    #indexing starts at 1...
    
    #3di of the query and target
    structQ, structT = row['3diq'], row['3dit']
    AAq, AAt = row['AAq'], row['AAt']

    #add gaps
    t3diAA_newgaps = copyaln(taln_2, structT[tstart_2-1:tend_2]) 
    q3diAA_newgaps = copyaln(qaln_2, structQ[qstart_2-1:qend_2])
    row = pd.Series( { '3di_qaln_mode2':q3diAA_newgaps , '3di_taln_mode2':t3diAA_newgaps })
    #return columns
    return row

def get_leafset( treenode ):
    """
    this function returns the leafset of a node
    """
    if treenode.is_leaf():
        return [treenode.name]
    else:
        return treenode.get_leaf_names()

def mergeAlign( fasta1, fasta2, outfile):
    print(fasta1,fasta2)
    #args =  'clustalo --p1 ' + fasta1 + ' --p2 ' + fasta2 + ' --force -o ' + outfile
    print('merge' , fasta1 , fasta2 )
    args = 'mafft --add ' + fasta1 + ' ' + fasta2 
    args = shlex.split(args)
    p = Popen( args, stdin=PIPE, stdout=PIPE)
    aln = p.communicate()[0].decode()
    print('mafft aln',aln)
    with open( outfile , 'w') as alnout:
        alnout.write(aln)
    return outfile


def convolve_strings(str1, str2):
    # Determine the lengths of the strings
    len1, len2 = len(str1), len(str2)

    max_alignment = 0
    max_count = 0

    # Slide str2 over str1, starting with one character overlap
    # and continue until str2 is again overlapping by just one character
    for i in range(-len2 + 1, len1):
        count = 0
        for j in range(len2):
            if 0 <= i + j < len1 and str1[i + j] == str2[j]:
                count += 1

        if count > max_count:
            max_count = count
            max_alignment = i

    return max_alignment, max_count

def mergealns( aln1f, aln2f, outfile):
    #find sequences in common between the two alignments
    aln1 = SeqIO.parse(aln1f, 'fasta')
    aln2 = SeqIO.parse(aln2f, 'fasta')
    ids1d = {}
    ids2d = {}

    for s in aln1:
        if s.id not in ids1d:
            ids1d[s.id] = s
    for s in aln2:
        if s.id not in ids2d:
            ids2d[s.id]= s
    
    #find the intersection of the two sets
    ids1 = set(ids1d.keys())
    ids2 = set(ids2d.keys())
    commonids = ids1.intersection(ids2)
    #select one common sequence
    coidx1 = -1
    coidx2 = -1
    #find the start of the common sequence in each alignment
    #select a random common sequence
    commonid = random.choice(list(commonids))
    #create an index of the sequence ids in each alignment
    ids1 = {}
    ids2 = {}
    aln1 = SeqIO.parse(aln1f, 'fasta')
    aln2 = SeqIO.parse(aln2f, 'fasta')
    idlist = []
    for i,s in enumerate(aln1):
        ids1[s.id] = i
        idlist.append(s.id)    
    for i,s in enumerate(aln2):
        ids2[s.id] = i
        idlist.append(s.id)
    #transform both alignments into numpy matrices
    aln1 = SeqIO.parse(aln1f, 'fasta')
    aln2 = SeqIO.parse(aln2f, 'fasta')

    aln1 = np.array([ list(str(s.seq)) for s in aln1])
    aln2 = np.array([ list(str(s.seq)) for s in aln2])
    print('aln1', aln1.shape , aln1)
    print('aln2', aln2.shape , aln2)
    #generate a list of columns
    aln1 = iter([ aln1[:,i] for i in range(aln1.shape[1])])
    aln2 = iter([ aln2[:,i] for i in range(aln2.shape[1])])
    
    s1 = ids1d[commonid]
    s2 = ids2d[commonid]
    print('s1', s1.seq)
    print('s2', s2.seq)

    s1raw = iter(str(s1.seq))
    s2raw = iter(str(s2.seq))
    s1 = str(s1.seq).replace('-','')
    s2 = str(s2.seq).replace('-','')

    #if the common subsequence is not found start by removing the first character of the common sequence
    char1 = None
    char2 = None
    
    #convolution of the two sequences
    maxaln, maxcount = convolve_strings(s1,s2)
    if len(s1) > len(s2):
        if maxaln<0:
            coidx1 = 0
            coidx2 = np.abs(maxaln)
        else:
            coidx1 = maxaln
            coidx2 = 0
    else:
        if maxaln<0:
            coidx1 = np.abs(maxaln)
            coidx2 = 0
        else:
            coidx1 = 0
            coidx2 = maxaln
    print('convolution', maxaln, maxcount)
    print('coidx1', coidx1)
    print('coidx2', coidx2)

    i = 0
    while i < coidx1:
        char = next(s1raw)
        print('char', char)
        if char == '-':
            pass
        else:
            i +=1
        discard = next(aln1)
    
    i = 0
    while i < coidx2:
        char = next(s2raw)
        print('char', char)
        if char == '-':
            pass
        else:
            i +=1
        discard = next(aln2)

    print('coidx1', coidx1)
    print('coidx2', coidx2)
    newaln1 = []
    newaln2 = []

    if char1 == None:
        char1 = next(s1raw)
    if char2 == None:
        char2 = next(s2raw)
    
    while True:
        try:
            if char1 == '-' and char2 == '-':
                #no pivot information, skip
                char1 = next(s1raw)
                char2 = next(s2raw)
            elif char2 == '-' and char1 != '-':
                char2 = next(s2raw)
                #create a column of gaps
                gaps = np.array(['-'] * len(ids2))
                newaln1.append(gaps)
                n2.append(next(aln2))
            elif char1 == '-' and char2 != '-':
                char1 = next(s1raw)
                #create a column of gaps
                gaps = np.array(['-'] * len(ids1))
                newaln1.append(gaps)
                newaln2.append(next(aln2))
            elif char1 == char2:
                #match. append both columns
                char1 = next(s1raw)
                char2 = next(s2raw)
                newaln1.append(next(aln1))
                newaln2.append(next(aln2))
        except StopIteration:
            #if one of the sequences is finished, then break
            break
    
    newaln1 = np.array(newaln1).T
    newaln2 = np.array(newaln2).T
    newaln = np.concatenate((newaln1, newaln2), axis = 0)
    print('newaln', newaln.shape , newaln)
    #write out the new alignment
    with open(outfile, 'w') as f:
        for i in range(newaln.shape[0]):
            print('>' + idlist[i] + '\n' + ''.join(list(newaln[i,:])) + '\n')
            f.write('>' + idlist[i] + '\n')
            f.write(''.join(list(newaln[i,:])) + '\n')
    return outfile  

def sub2fasta( sub, outfile , fastacol1='qaln' , fastacol2='taln' ):
    with open(outfile, 'w') as f:
        f.write('>' + sub['query'] + '\n')
        f.write(sub[fastacol1] + '\n')
        f.write('>' + sub['target'] + '\n')
        f.write(sub[fastacol2] + '\n')    
    return outfile

def retalns(allvall, leafname,leafset):
    sub = allvall[allvall['query'] == leafname]
    sub = sub[sub['target'].isin(leafset)]
    sub = sub[sub['query'] != sub['target']]
    return sub.iloc[0]

#traverse tree from root to leaves recursively
def traverse_tree_merge( treenode, topleafset, allvall , alnfolder ):
    """
    this function traverses a tree from root to leaves recursively
    it returns a dictionary with the iteratively built alignment
    """
    
    if treenode.is_leaf():  
        #if the node is a leaf, then we need to add it to the alignment with one of the pivots in the current leafset
        sub = retalns(allvall, treenode.name , topleafset )
        treenode.aln = sub2fasta(sub, alnfolder + treenode.name + '_inter.fasta')
        treenode.aln3di = sub2fasta(sub, alnfolder + treenode.name + '_inter.3di.fasta' , fastacol1='3di_qaln_mode2' , fastacol2='3di_taln_mode2')
    else:
        childalns3di = []
        childalnsAA = []
        treenode.leafset = get_leafset(treenode) 
        for c in treenode.get_children():
            print('traverse', c.name , c.is_leaf() , c.leafset)
            if not c.aln:
                c.aln,c.aln3di = traverse_tree_merge(c , treenode.leafset , allvall, alnfolder)
            childalnsAA.append(c.aln)
            childalns3di.append(c.aln3di)
        treenode.aln = mergealns( childalnsAA[0], childalnsAA[1] ,  alnfolder + treenode.name + '_inter.fasta')            
        treenode.aln3di = mergealns( childalns3di[0], childalns3di[1] , alnfolder + treenode.name + '_inter.3di.fasta' )
    return treenode.aln, treenode.aln3di

def remove_redundant( alignment ):
    """
    this function removes redundant sequences from an alignment
    """
    aln = SeqIO.parse(alignment, 'fasta')
    seqs = []
    ids = []
    for s in aln:
        if s.id not in ids:
            seqs.append(s)
            ids.append(s.id)
    
    with open(alignment, 'w') as f:
        for s in seqs:
            f.write('>' + s.id + '\n')
            f.write(str(s.seq) + '\n')
    return alignment

#remove all alns except the final merged one
def cleanup( filedir ):
    """
    this function removes all alns except the final merged one
    """
    for f in glob.glob(filedir + '*inter.fasta'):
        os.remove(f)



In [47]:

import toytree
import os
import pandas as pd
import glob


alndf = pd.read_table('../testdata/allvall_1.csv', header = None)
tre = toytree.tree('../testdata/foldtree_struct_tree.PP.nwk.rooted.final'  ) 

infolder = '../testdata/allvall_1.csv'.split('/')[:-1]
infolder = ''.join( [i + '/' for i in infolder])
mapper3di, mapperAA = read_dbfiles3di( "../testdata/outdb" , "../testdata/outdb_ss")
#add the 3di alignment to the dataframe
columns = 'query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,lddt,qaln,taln,cigar,lntmscore'.split(',')
alndf.columns = columns

alndf['3diq']= alndf['query'].map(mapper3di)
alndf['3dit']= alndf['target'].map(mapper3di)
alndf['AAq']= alndf['query'].map(mapperAA)
alndf['AAt']= alndf['target'].map(mapperAA)

#output a fasta with the 3di sequences
res = alndf.apply(calc_fident_crossaln , axis = 1)
alndf = pd.concat([alndf,res] , axis = 1)

with open('../testdata/3diseqs.fasta' , 'w') as out:
    for seq in alndf['query'].unique():
        out.write('>'+seq.replace('.pdb', '' )+'\n')
        out.write(mapper3di[seq]+'\n')


alndf['query'] = alndf['query'].map(lambda x :x.replace('.pdb', ''))
alndf['target'] = alndf['target'].map(lambda x :x.replace('.pdb', ''))



In [48]:

#prepare tree attributes
for i,n in enumerate(tre.treenode.traverse()):
    n.aln = None
    n.aln3di = None
    n.leafset = None
    if len(n.name) == 0:
        n.name = 'internal_'+str(i)

alnfolder = infolder+'alnscratch/'
if not os.path.exists(alnfolder):
    os.mkdir(infolder+'alnscratch/')

finalaln, finalaln3di = traverse_tree_merge( tre.treenode.get_tree_root(), get_leafset(tre.treenode.get_tree_root()) , alndf , infolder+'alnscratch/')


print('finalaln',finalaln)
print('finalaln3di',finalaln3di)



traverse A0A3B3BIE0 True None
traverse 58 False None
traverse 56 False None
traverse A0A3Q2ZTT6 True None
traverse A0A1A8U1R0 True None
aln1 (2, 933) [['M' 'S' 'L' ... 'P' 'T' 'H']
 ['M' 'S' 'L' ... 'P' 'T' 'H']]
aln2 (2, 933) [['M' 'S' 'L' ... 'P' 'T' 'H']
 ['M' 'S' 'L' ... 'P' 'T' 'H']]
s1 MSLSIGDKIEDFKVLTLLGKGSFACVYRAKSVKTGVEVAIKMIDKKAMHKAGMVQRVANEVEIHCRLKHPSILELYNYFEDSNYVYLVLEMCHNGEMSRYLKERKVPFSEDEARHFMHQIIKGMLYLHTHGILHRDLTLSNLLLTSNMNIKIADFGLATQLKLPNEKHFTMCGTPNYISPEVATRSAHGLESDVWSLGCMFYAFLMGRPPFDTDTVKHTLSKVVLGDYEMPTHVSLEAQDLIHQLLQKDPAQRPSLSAVLDHPFMTQSLLVRTKELRLGDDGSMDSGIATISTACTSSTSASSGTRLQRRTKHMIGSALPNRMMP--SLLHQPNSACFEEGDQRHQQYAPDSYSRDGRSRAVYGGEGGQPHSRFLRRAHSSDRCSSAAAGQTSSQAELGRCHSEETLASVGLPVFPTSSTPHTFSEHGRLPSPPVKQSANSGYLSATEMIHPSNLQFQDMEGVSNWLNNEALGQMHTDGSTHSSSGSFHSSRGPLGVHSSWSEPLGRGAHPHHNQHHPHNQLPSSADSYRENSPGADFQPFHSRELKLPASKPSTDKKKKTLRDTVPPLCASRLKPIRQKTKNAVVSILDSGEVCMELLKSQTGQERVKEVLRISCDGSMVTIYQPNGGKGFPVLDCPPAPPEDILICSYEDLPEKYWKKYQYASKFVQLVKSKTPKVTLYTKYAKVMLMENSPNADLEVCFYDGAK

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 881 and the array at index 1 has size 880