In [30]:
#read the afdb clusters file
import pandas as pd
import numpy as np
import glob
import os
#autoreload
%load_ext autoreload
%autoreload 2

from src import AFDB_tools


In [10]:

#load the AFDB cluster table; the file has no header row, so the three
#column names (entryId, repId, taxId) are supplied here
reps = pd.read_table(
    'afdbclusters/1-AFDBClusters-entryId_repId_taxId.tsv',
    header=None,
    names=['entryId', 'repId', 'taxId'],
)
#quick look at the first rows
print('reps', reps.head())


In [11]:

def struct_name_from_path(path):
    """Return the structure name for a .pdb path: the file name without its directory and final extension.

    Only the last extension is removed, so a name that itself contains dots
    (e.g. 'a.b.pdb' -> 'a.b') is kept intact, and the directory part is
    stripped with os.path so the platform's path separators are handled.
    """
    return os.path.splitext(os.path.basename(path))[0]


#names (no directory, no .pdb extension) of every structure found in structs/
structs = [struct_name_from_path(p) for p in glob.glob('structs/*.pdb')]
#show the count and a small sample rather than dumping the whole list
print(len(structs), structs[:5])

In [12]:
#keep only the rows whose cluster representative has a structure file in structs
has_structure = reps['repId'].isin(structs)
reps = reps.loc[has_structure]
#report how many rows survive, plus a preview
print(len(reps), reps.head())


In [13]:
#make the structure alignment directory (created even when reps is empty)
os.makedirs('struct_align', exist_ok=True)

#make struct_align/<rep>/structs/ for each cluster representative.
#os.makedirs creates the intermediate struct_align/<rep> folder as well, and
#exist_ok=True makes the call safe to re-run without a separate existence check.
#unique() avoids repeating the call once per member row of the same cluster.
for rep in reps['repId'].unique():
    os.makedirs('struct_align/' + rep + '/structs/', exist_ok=True)
        

In [31]:
#download up to n struct members for each cluster
import tqdm
n = 5
for rep in tqdm.tqdm(reps.repId.unique() ):
    subdf = reps[ reps['repId'] == rep ]
    #cap the sample size for THIS cluster only; n itself must stay untouched,
    #otherwise one small cluster would shrink the sample for every later cluster
    n_members = min( n , len(subdf) )
    #random pick of members; no random_state is set, so the selection differs between runs
    subdf = subdf.sample( n = n_members )
    #download the structures into the cluster's own folder
    for uniID in subdf['entryId']:
        AFDB_tools.grab_struct(uniID , structfolder='struct_align/' + rep  + '/structs/')

100%|███████████████████████████████████████| 1400/1400 [06:38<00:00,  3.51it/s]


In [34]:
#run a foldseek all-vs-all search inside every cluster folder under struct_align
from src import foldseek2tree
import tqdm

for rep in tqdm.tqdm(reps.repId.unique() ):
    rep_dir = 'struct_align/' + rep
    #input: the cluster's downloaded structures; output: allvall.csv next to them
    foldseek2tree.runFoldseek_allvall_EZsearch(
        infolder=rep_dir + '/structs/',
        outpath=rep_dir + '/allvall.csv',
    )


  0%|                                                  | 0/1400 [00:00<?, ?it/s]

100%|███████████████████████████████████████| 1400/1400 [06:41<00:00,  3.49it/s]


In [31]:
#derive embeddings for all structures in the struct_align folder
#derive characters for each k-means alphabet size: 10, 20, 40, 80, 128, 256 and 512 clusters
charsets = [
    10,
    20,
    40,
    80,
    128,
    256,
    512,
]

In [35]:
#one square count matrix of zeros per alphabet size
submats = { size: np.zeros( ( size , size ) ) for size in charsets }
#state number -> character: state i is written as chr(i)
#NOTE(review): chr(0)..chr(31) are control characters and chr(45) is '-', which the
#counting cell also uses as the alignment gap symbol -- confirm this matches how the
#embeddings were encoded before relying on alphabets larger than 45 states
colmap = { size: dict( enumerate( map( chr , range( size ) ) ) ) for size in charsets }
#character -> state number (inverse of colmap)
revcolmap = { size: { symbol: state for state, symbol in colmap[size].items() } for size in charsets }
print( colmap )

{10: {0: '\x00', 1: '\x01', 2: '\x02', 3: '\x03', 4: '\x04', 5: '\x05', 6: '\x06', 7: '\x07', 8: '\x08', 9: '\t'}, 20: {0: '\x00', 1: '\x01', 2: '\x02', 3: '\x03', 4: '\x04', 5: '\x05', 6: '\x06', 7: '\x07', 8: '\x08', 9: '\t', 10: '\n', 11: '\x0b', 12: '\x0c', 13: '\r', 14: '\x0e', 15: '\x0f', 16: '\x10', 17: '\x11', 18: '\x12', 19: '\x13'}, 40: {0: '\x00', 1: '\x01', 2: '\x02', 3: '\x03', 4: '\x04', 5: '\x05', 6: '\x06', 7: '\x07', 8: '\x08', 9: '\t', 10: '\n', 11: '\x0b', 12: '\x0c', 13: '\r', 14: '\x0e', 15: '\x0f', 16: '\x10', 17: '\x11', 18: '\x12', 19: '\x13', 20: '\x14', 21: '\x15', 22: '\x16', 23: '\x17', 24: '\x18', 25: '\x19', 26: '\x1a', 27: '\x1b', 28: '\x1c', 29: '\x1d', 30: '\x1e', 31: '\x1f', 32: ' ', 33: '!', 34: '"', 35: '#', 36: '$', 37: '%', 38: '&', 39: "'"}, 80: {0: '\x00', 1: '\x01', 2: '\x02', 3: '\x03', 4: '\x04', 5: '\x05', 6: '\x06', 7: '\x07', 8: '\x08', 9: '\t', 10: '\n', 11: '\x0b', 12: '\x0c', 13: '\r', 14: '\x0e', 15: '\x0f', 16: '\x10', 17: '\x11', 18: 

In [None]:
import h5py


def count_aligned_states(qaln, taln, q_states, t_states, state_index, counts, gap='-'):
    """Add the aligned state pairs of one pairwise alignment to a count matrix.

    Parameters
    ----------
    qaln, taln : str
        The two rows of the alignment; `gap` marks a gap column.
    q_states, t_states : str
        One state character per residue of the query / target, in residue order.
    state_index : dict
        Maps a state character to its row/column index in `counts`.
    counts : 2-D array
        Updated in place. Each column where both rows hold a residue adds 1 to
        counts[i, j] and 1 to counts[j, i] (so 2 on the diagonal when i == j).
    gap : str
        Gap symbol used in qaln / taln.

    Raises
    ------
    IndexError
        If a state string is shorter than the number of residues in its alignment row.
    KeyError
        If a state character is missing from `state_index`.

    Gaps are detected on the alignment rows themselves, never on the state
    characters, so a state encoded as '-' (chr(45)) is not mistaken for a gap.
    """
    q_pos = 0
    t_pos = 0
    for q_col, t_col in zip(qaln, taln):
        q_has_residue = q_col != gap
        t_has_residue = t_col != gap
        if q_has_residue and t_has_residue:
            i = state_index[q_states[q_pos]]
            j = state_index[t_states[t_pos]]
            counts[i, j] += 1
            counts[j, i] += 1
        #a residue consumes one state character; a gap consumes none
        q_pos += q_has_residue
        t_pos += t_has_residue


for rep in tqdm.tqdm(reps.repId.unique() ):
    #load the all vs all aln (pandas has no read_tsv; read_table reads tab-separated text)
    #NOTE(review): assumes allvall.csv is tab-separated with a header row naming the
    #s1, s2, qaln and taln columns -- confirm against foldseek2tree's output
    aln_df = pd.read_table('struct_align/' + rep + '/allvall.csv')
    #self hits carry no substitution information
    aln_df = aln_df[ aln_df['s1'] != aln_df['s2'] ]

    #open the embedding file once per cluster instead of once per pair and alphabet
    with h5py.File('aln_embeds/' + rep + '.h5' , 'r') as hf:
        for q, t, qaln, taln in zip( aln_df['s1'] , aln_df['s2'] , aln_df['qaln'] , aln_df['taln'] ):
            #NOTE(review): assumes qaln/taln start at the first residue of each structure;
            #if these are local alignments the state strings must be offset by the
            #alignment start positions -- TODO confirm
            for charset in charsets:
                #state strings of the query and of the target (the target is read from hf[t])
                #NOTE(review): the hf[name][charset].decode() access is kept as originally
                #written; the layout of the .h5 file is not visible here -- verify
                q_embeds = hf[q][charset].decode()
                t_embeds = hf[t][charset].decode()

                #transfer the alignment to the embeddings and count the aligned state pairs;
                #revcolmap gives the character -> matrix index lookup
                count_aligned_states( qaln , taln , q_embeds , t_embeds , revcolmap[charset] , submats[charset] )


In [None]:

def save_submat(stem, mat):
    """Write one matrix to <stem>.npy (numpy binary) and <stem>.txt (tab-separated text).

    The text file holds one matrix row per line, each value formatted with str().
    """
    np.save( stem + '.npy' , mat )
    with open( stem + '.txt' , 'w' ) as f:
        for row in mat:
            f.write( '\t'.join( [ str(value) for value in row ] ) + '\n' )


#the output folder must exist before np.save / open can write into it
os.makedirs( 'submats' , exist_ok=True )

#save the submats in raw form
for charset in charsets:
    save_submat( 'submats/' + str(charset) , submats[charset] )

#normalize the submats: every cell is divided by its row total and by its column total,
#i.e. counts[i,j] / ( rowsum[i] * colsum[j] ). This is what the original per-index loop
#computed; note that rows and columns do not in general sum to 1 afterwards.
#cells whose row or column total is zero (a state that was never observed) are set to 0
#instead of producing NaN/inf.
#NOTE: submats is overwritten with the normalized values, so running this cell twice
#normalizes twice -- re-run the counting cell first.
for charset in charsets:
    raw = submats[charset]
    rowsums = raw.sum( axis=1 )
    colsums = raw.sum( axis=0 )
    denom = np.outer( rowsums , colsums )
    submats[charset] = np.divide( raw , denom , out=np.zeros_like( raw ) , where=denom != 0 )

#save the submats in normalized form
for charset in charsets:
    save_submat( 'submats/' + str(charset) + '_norm' , submats[charset] )
    

