In [13]:
%load_ext autoreload
%autoreload 2
#read the afdb clusters file
import pandas as pd
import numpy as np
import glob
import os
#autoreload
from src import AFDB_tools


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:

# Load the AFDB cluster table: the file has no header row, so the three
# columns are named here (entry, its cluster representative, taxon id).
cluster_table_path = 'afdbclusters/1-AFDBClusters-entryId_repId_taxId.tsv'
reps = pd.read_table(
    cluster_table_path,
    header=None,
    names=['entryId', 'repId', 'taxId'],
)
print('reps', reps.head())


reps       entryId       repId    taxId
0  A0A009E921  A0A009E921  1310605
1  A0A009F5K6  A0A009E921  1310605
2  A0A009E9H3  A0A009E9H3  1310605
3  A0A484ZLT0  A0A009E9H3    82979
4  A0A009ECR5  A0A009ECR5  1310605


In [15]:

# List the structure files on disk and reduce each path to its ID:
# the file name (text after the last '/') up to its first '.'.
pdb_paths = glob.glob('structs/*.pdb')
structs = [path.rsplit('/', 1)[-1].partition('.')[0] for path in pdb_paths]
print(structs[:10])

['A0A182QEX7', 'A0A1B6MIL6', 'A0A2X0NIQ0', 'A0A3C0CY76', 'A0A1Z9J3Y7', 'F4IL37', 'A0A093ZM93', 'A0A662CHN5', 'A0A1Z4SDS6', 'A0A5B7XUB5']


In [16]:
# Keep only the rows whose cluster representative has a structure file.
has_structure = reps['repId'].isin(structs)
reps = reps.loc[has_structure]
print(len(reps), reps.head())


24081           entryId       repId    taxId
7176   A0A011Q6F0  A0A011Q6F0  1454005
7177   A0A838GEN5  A0A011Q6F0  2448782
14470  A0A015KR17  A0A015KR17  1432141
14471  A0A2N0NP60  A0A015KR17   588596
14472  A0A2Z6S933  A0A015KR17    94130


In [None]:
#make a structure alignment directory
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of a separate os.path.exists test.
os.makedirs('struct_align', exist_ok=True)

#make a directory (with its structs/ subfolder) for each cluster representative
# reps holds one row per cluster member, so the same repId appears many times;
# iterate the unique values instead of repeating the work per member.
# os.makedirs also creates the intermediate struct_align/{rep} directory.
for rep in reps['repId'].unique():
    os.makedirs(os.path.join('struct_align', rep, 'structs'), exist_ok=True)
        

In [None]:
#download up to n struct members for each cluster
import tqdm
n = 5
# fixed seed so a re-run draws the same members for every cluster
seed = 0
# group once instead of re-filtering the whole table for every representative;
# sort=False keeps the representatives in order of first appearance
clusters = reps.groupby('repId', sort=False)
for rep, members in tqdm.tqdm(clusters, total=clusters.ngroups):
    # cap the draw per cluster in a separate variable: n itself must stay
    # constant, otherwise one small cluster would shrink the sample size
    # of every cluster processed after it
    n_draw = min(n, len(members))
    picked = members.sample(n=n_draw, random_state=seed)
    #download the structures
    for uniID in picked['entryId']:
        AFDB_tools.grab_struct(uniID, structfolder='struct_align/' + rep + '/structs/')

  0%|                                                                                                                                                                                                                | 0/1400 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1400/1400 [03:33<00:00,  6.55it/s]


In [None]:
# For every cluster representative, run the all-vs-all foldseek helper on the
# structures in that representative's structs/ folder and write the result
# next to it as allvall.csv.
import tqdm
from src import foldseek2tree

for rep in tqdm.tqdm(reps.repId.unique()):
    rep_dir = 'struct_align/' + rep
    foldseek2tree.runFoldseek_allvall_EZsearch(
        infolder=rep_dir + '/structs/',
        outpath=rep_dir + '/allvall.csv',
    )


In [None]:
# Embeddings are derived for all structures in the struct_align folder and
# turned into characters using k-means with each of these cluster counts.
charsets = [10, 20, 40, 80, 128, 256, 512]

In [None]:
# One square zero matrix per alphabet size, used to accumulate pair counts.
submats = {}
# colmap[size]: state index -> character; revcolmap[size]: character -> index.
# NOTE(review): chr(45) is '-', the character the alignment cell tests for as
# a gap, so state 45 is indistinguishable from a gap by character alone.
colmap = {}
revcolmap = {}
for size in charsets:
    submats[size] = np.zeros((size, size))
    colmap[size] = {}
    revcolmap[size] = {}
    for idx in range(size):
        symbol = chr(idx)
        colmap[size][idx] = symbol
        revcolmap[size][symbol] = idx
print(colmap)

NameError: name 'charsets' is not defined

In [None]:
import pickle

import torch
import torch.nn.functional as F
from torch_geometric.nn import VGAE
from torch.optim import Adam
from torch_geometric.data import DataLoader
#load the pickled encoder/decoder and restore their saved weights if present

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device( 'cpu')
print(device)


encoder_save = 'encoder_mk2_aa_10'
decoder_save = 'decoder_mk2_aa_20'
# NOTE(review): the encoder name ends in _10 and the decoder name in _20 —
# confirm that these two files are meant to be used together.

#load the pickled (blank) encoder and decoder objects
# SECURITY: unpickling runs arbitrary code stored in the file; only load
# .pkl files that come from a trusted source.
with open(encoder_save + '.pkl', 'rb') as encoder_file:
    encoder = pickle.load(encoder_file)
with open(decoder_save + '.pkl', 'rb') as decoder_file:
    decoder = pickle.load(decoder_file)

#overwrite the blank weights with the saved state dicts when both exist
if os.path.exists(encoder_save + '.pth') and os.path.exists(decoder_save + '.pth'):
    # map_location lets weights that were saved on a GPU load on a CPU-only machine
    encoder.load_state_dict(torch.load(encoder_save + '.pth', map_location=device))
    decoder.load_state_dict(torch.load(decoder_save + '.pth', map_location=device))

#put encoder and decoder on the device
encoder = encoder.to(device)
decoder = decoder.to(device)

In [None]:
import h5py


def read_state_string(hf, struct_id, charset):
    """Return the state-character string of one structure for one alphabet size.

    NOTE(review): assumes the file layout hf[struct_id][str(charset)] holding a
    single (bytes or str) string with one character per residue — the writer of
    aln_embeds/*.h5 is not visible here, confirm against it.
    """
    raw = hf[struct_id][str(charset)][()]
    return raw.decode() if isinstance(raw, bytes) else str(raw)


def count_aligned_states(qaln, taln, q_chars, t_chars, char_index, counts):
    """Add the aligned state pairs of one pairwise alignment to ``counts``.

    qaln, taln  -- the two gapped alignment rows; '-' marks a gap.
    q_chars, t_chars -- state characters of the query / target, one per residue.
    char_index  -- dict mapping a state character to its row/column index.
    counts      -- square numpy array, updated in place and symmetrically.

    Raises ValueError when an alignment row contains more residues than the
    corresponding state string has characters.

    NOTE(review): assumes each alignment row starts at the first residue of its
    structure. If the alignments are local, the positions must be offset by the
    alignment start coordinates — confirm against the foldseek output columns.
    """
    qpos = 0
    tpos = 0
    for qres, tres in zip(qaln, taln):
        q_is_residue = qres != '-'
        t_is_residue = tres != '-'
        # gaps are detected on the alignment rows, never on the state
        # characters, so a state character equal to '-' cannot be mistaken
        # for a gap
        if q_is_residue and t_is_residue:
            if qpos >= len(q_chars) or tpos >= len(t_chars):
                raise ValueError('alignment row is longer than the state string')
            i = char_index[q_chars[qpos]]
            j = char_index[t_chars[tpos]]
            counts[i, j] += 1
            counts[j, i] += 1
        # only residue columns consume a state character of that structure
        if q_is_residue:
            qpos += 1
        if t_is_residue:
            tpos += 1


for rep in tqdm.tqdm(reps.repId.unique() ):
    #load the all vs all aln
    # NOTE(review): assumes a tab-separated file with a header row providing the
    # columns s1, s2, qaln and taln — confirm against runFoldseek_allvall_EZsearch.
    aln_df = pd.read_table('struct_align/' + rep + '/allvall.csv')
    #open the embedding file once per cluster instead of once per pair and charset
    with h5py.File('aln_embeds/' + rep + '.h5' , 'r') as hf:
        state_cache = {}
        for row in aln_df.itertuples(index=False):
            #skip self hits
            if row.s1 == row.s2:
                continue
            #skip rows without alignment strings (e.g. missing values)
            if not isinstance(row.qaln, str) or not isinstance(row.taln, str):
                continue
            for charset in charsets:
                for struct_id in (row.s1, row.s2):
                    if (struct_id, charset) not in state_cache:
                        state_cache[(struct_id, charset)] = read_state_string(hf, struct_id, charset)
                #revcolmap maps character -> index, which is what indexing submats needs
                count_aligned_states(
                    row.qaln,
                    row.taln,
                    state_cache[(row.s1, charset)],
                    state_cache[(row.s2, charset)],
                    revcolmap[charset],
                    submats[charset],
                )


In [None]:

def save_matrix(stem, matrix):
    """Write ``matrix`` to stem + '.npy' and, tab-separated, to stem + '.txt'."""
    np.save(stem + '.npy', matrix)
    with open(stem + '.txt', 'w') as f:
        for row in matrix:
            f.write('\t'.join(str(value) for value in row) + '\n')


#np.save and open() do not create missing directories
os.makedirs('submats', exist_ok=True)

#save the submats in raw form
for charset in charsets:
    save_matrix('submats/' + str(charset), submats[charset])

#scale every count by (row sum * column sum)
# The result goes into a new dict so submats keeps the raw counts; scaling in
# place would make a re-run of this cell save already-scaled values as "raw".
# NOTE(review): dividing row i by its row sum and column i by its column sum
# for every i, as this cell has always done, equals count / (rowsum * colsum).
# That does not make rows or columns sum to 1 — confirm which scaling is wanted.
submats_norm = {}
for charset in charsets:
    counts = submats[charset]
    rowsums = counts.sum( axis=1 )
    colsums = counts.sum( axis=0 )
    denom = np.outer( rowsums , colsums )
    # states that were never observed have a zero sum; leave those cells at 0
    # instead of producing NaN from 0/0
    submats_norm[charset] = np.divide( counts , denom , out=np.zeros_like( counts ) , where=denom > 0 )

#save the submats in normalized form
for charset in charsets:
    save_matrix('submats/' + str(charset) + '_norm', submats_norm[charset])
    

