In [19]:
%load_ext autoreload
%autoreload 2
#read the afdb clusters file
import pandas as pd
import numpy as np
import glob
import os
#autoreload
import pickle
from src import AFDB_tools
import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
datadir = '../../datasets/foldtree2/'

In [10]:

#read the afdb rep file
reps = pd.read_table( datadir + 'afdbclusters/1-AFDBClusters-entryId_repId_taxId.tsv', header=None, names=['entryId', 'repId', 'taxId'] )
print( 'reps' , reps.head() )


reps       entryId       repId    taxId
0  A0A009E921  A0A009E921  1310605
1  A0A009F5K6  A0A009E921  1310605
2  A0A009E9H3  A0A009E9H3  1310605
3  A0A484ZLT0  A0A009E9H3    82979
4  A0A009ECR5  A0A009ECR5  1310605


In [11]:

structs = glob.glob( datadir+'structs/*.pdb' )
#remove the .pdb extension
structs = [ s.split( '/' )[-1].split( '.' )[0] for s in structs ]
print(structs[0:10])

['A0A3D4Y7U3', 'A0A182QEX7', 'A0A1B6MIL6', 'A0A2X0NIQ0', 'A0A495JSP3', 'A0A3C0CY76', 'A0A1Z9J3Y7', 'F4IL37', 'A0A6M1NDP9', 'A0A368W5L6']


In [12]:
#select the reps that have structures
reps = reps[ reps['repId'].isin( structs ) ]
print(  len(reps)  , reps.head() )

34870           entryId       repId    taxId
7176   A0A011Q6F0  A0A011Q6F0  1454005
7177   A0A838GEN5  A0A011Q6F0  2448782
14470  A0A015KR17  A0A015KR17  1432141
14471  A0A2N0NP60  A0A015KR17   588596
14472  A0A2Z6S933  A0A015KR17    94130


In [13]:
#make a structure alignment directory
if not os.path.exists( 'struct_align' ):
    os.makedirs( 'struct_align' )

#make a directory for each cluster representative
for rep in reps['repId']:
    if not os.path.exists( datadir +'struct_align/' + rep  ):
        os.makedirs(datadir + 'struct_align/' + rep  )
    if not os.path.exists( datadir+ 'struct_align/' + rep  + '/structs/'):
        os.makedirs( datadir+ 'struct_align/' + rep + '/structs/' )
        

In [None]:
#download n struct members for each cluster

n = 5
for rep in tqdm.tqdm(reps.repId.unique() ):
    subdf = reps[ reps['repId'] == rep ]
    if len(subdf) < n:
        n = len(subdf)
    subdf = subdf.sample( n = n  )
    subdf = subdf.head( n )
    #download the structures
    for uniID in subdf['entryId']:
        AFDB_tools.grab_struct(uniID , structfolder=datadir+'struct_align/' + rep  + '/structs/')

In [None]:
#for each folder in struct_align, align the structures with all vs all using foldseek
from src import foldseek2tree

for rep in tqdm.tqdm(reps.repId.unique() ):
    #align the structures
    foldseek2tree.runFoldseek_allvall_EZsearch( infolder= datadir + 'struct_align/' + rep  + '/structs/', outpath=datadir+'struct_align/' + rep + '/allvall.csv' )


In [44]:
#derive embeddings for all structures in the struct_align folder
#derive charatcters for 10,20,40,80,128,256,512 kmeans clusters
charsets = [20,50,80,128,256,512]

In [9]:
submats = { c: np.zeros( ( c , c ) ) for c in charsets }
#change the character number to an ascii character
colmap = { c:{ i: chr(i) for i in range( c ) } for c in charsets }
revcolmap = { c:{ chr(i): i for i in range( c ) } for c in charsets }
print( colmap )

In [14]:
pdbfiles_structalign = glob.glob('./struct_align/*/structs/*.pdb')
import h5py
filename = 'structs_structalign_encoded.h5'
zstack = []
with h5py.File(filename, 'r') as f:
    for s in tqdm.tqdm(f['struct_align']):
        for s2 in f['struct_align'][s]['structs']:
            zstack.append(np.array(f['struct_align'][s]['structs'][s2]['z']))
print(len(zstack) )
zstack = np.vstack(zstack)

100%|███████████████████████████████████████████████████████████████████████████| 1400/1400 [00:01<00:00, 1057.70it/s]


4323


In [15]:
print(zstack.shape)

(1251738, 5)


In [25]:
from sklearn.cluster import KMeans,MiniBatchKMeans
from sklearn.metrics import silhouette_score

kmeans_dir = 'kmeans/'
scores = []

kmeans_dict = {}
for nclusters in tqdm.tqdm([ 20, 50 , 80,  100, 200, 256 , 512 ]):
    print( 'fitting ' , nclusters ) 
    kmeans = MiniBatchKMeans(n_clusters=nclusters, random_state=0).fit( zstack )
    centers = kmeans.cluster_centers_
    labels = kmeans.labels_
    
    kmeans_dict[nclusters] = kmeans
    print( 'done' )
    print('silhouette score' ) 
    
    score = silhouette_score(zstack[0:10000], kmeans.predict(zstack[0:10000]))
    scores.append(score)

    print(score )
    
    #save each of the kmeans 
    if not os.path.exists(kmeans_dir):
        os.makedirs(kmeans_dir)
    with open(kmeans_dir + str(nclusters)+'_kmeans.pkl', 'wb') as f:
        pickle.dump(kmeans, f)

    print( 'find mean and cov of each cluster ' ) 
    #get the covariance within clusters
    cov = np.zeros((nclusters, zstack.shape[1], zstack.shape[1]))
    for i in range(nclusters):
        cov[i] = np.cov(zstack[labels == i].T)
    print(cov.shape)

    #get the means within clusters
    means = np.zeros((nclusters, zstack.shape[1]))
    for i in range(nclusters):
        means[i] = np.mean(zstack[labels == i], axis=0)
    print(means.shape)

    #save the mean and covariance for clusters in .npy
    np.save(kmeans_dir + str(nclusters)+'_means.npy', means)
    np.save(kmeans_dir + str(nclusters)+'_cov.npy', cov)
    print('done' )


  super()._check_params_vs_input(X, default_n_init=3)


fitting  20
done
silhouette score
0.18854639
find mean and cov of each cluster 
(20, 5, 5)


 14%|███████████▊                                                                       | 1/7 [00:01<00:08,  1.48s/it]

(20, 5)
done
fitting  50


  super()._check_params_vs_input(X, default_n_init=3)


done
silhouette score
0.16989867
find mean and cov of each cluster 


 29%|███████████████████████▋                                                           | 2/7 [00:03<00:08,  1.62s/it]

(50, 5, 5)
(50, 5)
done
fitting  80


  super()._check_params_vs_input(X, default_n_init=3)


done
silhouette score
0.15675747
find mean and cov of each cluster 
(80, 5, 5)


 43%|███████████████████████████████████▌                                               | 3/7 [00:05<00:06,  1.72s/it]

(80, 5)
done
fitting  100


  super()._check_params_vs_input(X, default_n_init=3)


done
silhouette score
0.16445886
find mean and cov of each cluster 
(100, 5, 5)


 57%|███████████████████████████████████████████████▍                                   | 4/7 [00:07<00:05,  1.89s/it]

(100, 5)
done
fitting  200


  super()._check_params_vs_input(X, default_n_init=3)


done
silhouette score
0.15605712
find mean and cov of each cluster 
(200, 5, 5)


 71%|███████████████████████████████████████████████████████████▎                       | 5/7 [00:10<00:04,  2.36s/it]

(200, 5)
done
fitting  256


  super()._check_params_vs_input(X, default_n_init=3)


done
silhouette score
0.152111
find mean and cov of each cluster 
(256, 5, 5)


 86%|███████████████████████████████████████████████████████████████████████▏           | 6/7 [00:13<00:02,  2.77s/it]

(256, 5)
done
fitting  512


  super()._check_params_vs_input(X, default_n_init=3)


done
silhouette score
0.13222493
find mean and cov of each cluster 
(512, 5, 5)


100%|███████████████████████████████████████████████████████████████████████████████████| 7/7 [00:19<00:00,  2.84s/it]

(512, 5)
done





In [27]:
print(kmeans_dict)

{20: MiniBatchKMeans(n_clusters=20, random_state=0), 50: MiniBatchKMeans(n_clusters=50, random_state=0), 80: MiniBatchKMeans(n_clusters=80, random_state=0), 100: MiniBatchKMeans(n_clusters=100, random_state=0), 200: MiniBatchKMeans(n_clusters=200, random_state=0), 256: MiniBatchKMeans(n_clusters=256, random_state=0), 512: MiniBatchKMeans(n_clusters=512, random_state=0)}


In [45]:
import h5py
alnfiles = glob.glob(datadir + 'struct_align/*/allvall.csv')
cols = 'query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qaln,taln'.split(',')
submats = { charset: np.zeros((charset,charset)) for charset in charsets }

with h5py.File('./structs_structalign.h5' , 'r') as hf:
    for rep in tqdm.tqdm(alnfiles ):
        #load the all vs all aln
        aln_df = pd.read_table(rep)
        aln_df.columns = cols
        #load the embedding of the structures
        q = aln_df['query'].unique()
        t = aln_df['target'].unique()
        representative = rep.split('/')[-2]
        
        for q in aln_df['query'].unique():
            for t in aln_df['target'].unique():
                if q != t:
                    #align the structures
                    aln = aln_df[ (aln_df['query'] == q) & (aln_df['target'] == t) ]
                    qaln = aln.qaln
                    taln = aln.taln
                    print(representative, q,t , qaln, taln)
                    qz = np.array(hf['struct_align'][representative]['structs'][q])
                    tz = np.array(hf['struct_align'][representative]['structs'][t])
                    
                    for charset in charsets:
                        #derive the embeddings
                        q_embeds = list(kmeans_dict[charset].predict(qz))
                        t_embeds = list(kmeans_dict[charset].predict(tz))
                        
                        #transfer the alignments to the embeddings
                        qaln_ft2 = [ next(q_embeds) if x != '-' else None for x in qaln ]
                        taln_ft2 = [ next(t_embeds) if x != '-' else None for x in taln ]    
                        alnzip = zip( qaln_ft2 , taln_ft2 )
                        alnzip = [ [a[0] , a[1]] for a in alnzip if a[0] is not None and a[1] is not None]
                        submats[charset][alnzip] += 1


  0%|                                                                                        | 0/1400 [00:00<?, ?it/s]


A0A090ZGQ5 A0A090ZGQ5.pdb A0A7W6LMN5.pdb 0    MDGIVISIAEAKSDKDCESCVNALKALAKNVPQ--GVRKYAEEVCQ...
Name: qaln, dtype: object 0    PGSLMTEMDKATTNAEATKVLKKIRKQFDDCDKEVAWQPHLGRFLA...
Name: taln, dtype: object


ValueError: could not convert string to float: 'edge'

In [None]:

#save the submats in raw form
for charset in charsets:
    np.save( 'submats/' + str(charset) + '.npy' , submats[charset] )
    with open( 'submats/' + str(charset) + '.txt' , 'w' ) as f:
        for i in range( charset ):
            f.write( '\t'.join( [ str(submats[charset][i,j]) for j in range( charset ) ] ) + '\n' )

#normalize the submats rows and columns to sum to 1
for charset in charsets:
    rowsums = submats[charset].sum( axis=1 )
    colsums = submats[charset].sum( axis=0 )
    for i in range( charset ):
        submats[charset][i,:] = submats[charset][i,:] / rowsums[i]
        submats[charset][:,i] = submats[charset][:,i] / colsums[i]

#save the submats in normalized form
for charset in charsets:
    np.save( 'submats/' + str(charset) + '_norm.npy' , submats[charset] )
    with open( 'submats/' + str(charset) + '_norm.txt' , 'w' ) as f:
        for i in range( charset ):
            f.write( '\t'.join( [ str(submats[charset][i,j]) for j in range( charset ) ] ) + '\n' )
    

