In [2]:
#Cluster generated poses
#Inputs:
    #dock/poses/tropb_2_prot
    #dock/scores/tropb_2_prot.csv
#Outputs:
    #dock/cluster/tropb_2_prot.xlsx
    
import os
import sys
import re
import pandas as pd
import numpy as np

In [3]:
# Run configuration: which protein/ligand docking run is being clustered.
protein, ligand = 'tropb', '2'

# Directory with the docked pose PDB files for this run.
dockdir = f'../poses/{protein}_{ligand}_prot'

# Per-pose energy table: first CSV row is the header, first column is the index.
energy_df = pd.read_csv(
    f'../scores/{protein}_{ligand}_prot.csv',
    header=0,
    index_col=0,
)

In [4]:
#Run MMTSB toolkit cluster.pl
def run_clusterpl(protein, ligand, dockdir, radius = 1):
    """Run the MMTSB ``cluster.pl`` script on the docked poses and return its output.

    Parameters
    ----------
    protein, ligand : str
        Name components of the pose files (``{protein}_{ligand}_*.pdb``).
    dockdir : str
        Directory containing the pose PDB files.
    radius : int or float, optional
        Value passed to ``cluster.pl -radius`` (default 1).

    Returns
    -------
    str
        Everything the command wrote to standard output.
    """
    # The file pattern is deliberately left unquoted: the shell started by
    # os.popen expands the ``*`` glob into the list of pose files.
    cmd = f'cluster.pl -kclust -nolsqfit -radius {radius} -selmode heavy {dockdir}/{protein}_{ligand}_*.pdb'
    # Use the pipe as a context manager so it is always closed, even if
    # reading raises, instead of relying on garbage collection.
    with os.popen(cmd) as pipe:
        return pipe.read()

# Cluster the docked poses using the default radius and show the raw
# cluster.pl output for inspection.
cluster_out = run_clusterpl(protein, ligand, dockdir)
print(cluster_out)

/export/apps/CentOS7/mmtsb/src/feig/toolset//bin/kclust -pdb -centroid -cdist -radius 1 -iterate -maxerr 0.01 -mode rmsd -heavy


# cluster file
# automatically generated on: Fri Apr  7 13:29:34 2023
# mode: rmsd, filetype: pdb, lsqfit: 0, selmode: heavy
@cluster t has 500 elements, 99 subclusters
1 ../poses/tropb_2_prot/tropb_2_100.pdb
2 ../poses/tropb_2_prot/tropb_2_101.pdb
3 ../poses/tropb_2_prot/tropb_2_102.pdb
4 ../poses/tropb_2_prot/tropb_2_103.pdb
5 ../poses/tropb_2_prot/tropb_2_104.pdb
6 ../poses/tropb_2_prot/tropb_2_105.pdb
7 ../poses/tropb_2_prot/tropb_2_106.pdb
8 ../poses/tropb_2_prot/tropb_2_107.pdb
9 ../poses/tropb_2_prot/tropb_2_108.pdb
10 ../poses/tropb_2_prot/tropb_2_109.pdb
11 ../poses/tropb_2_prot/tropb_2_10.pdb
12 ../poses/tropb_2_prot/tropb_2_110.pdb
13 ../poses/tropb_2_prot/tropb_2_111.pdb
14 ../poses/tropb_2_prot/tropb_2_112.pdb
15 ../poses/tropb_2_prot/tropb_2_113.pdb
16 ../poses/tropb_2_prot/tropb_2_114.pdb
17 ../poses/tropb_2_prot/tropb_2_115.pdb
18 ../poses/tropb_2_prot/tropb_2_116.pdb
19 ../poses/tropb_2_prot/tropb_2_117.pdb
20 ../poses/tropb_2_prot/tropb_2_118.pdb
21 ../poses/tropb_2_p

In [5]:
#Read in cluster output and return list of clusters
def read_clusters(protein, ligand, dockdir, cluster_out):
    """Parse ``cluster.pl`` output into a list of clusters of pose identifiers.

    A new cluster begins at every line containing ``'cluster t.'``; anything
    before the first such line is ignored. Each following line contributes
    the identifier captured from ``{dockdir}/{protein}_{ligand}_<id>.pdb``.
    Lines without such a path (blank lines, comments, truncated output) are
    skipped instead of raising ``IndexError``.

    Parameters
    ----------
    protein, ligand : str
        Name components of the pose files.
    dockdir : str
        Directory prefix of the pose files as it appears in the output.
    cluster_out : str
        Text produced by ``cluster.pl``.

    Returns
    -------
    list[list[str]]
        Non-empty clusters sorted by size, largest first. Clusters of equal
        size keep the order in which they appeared in the output.
    """
    # Compile once, outside the loop. The path components are escaped so
    # that characters such as '.' match literally rather than as wildcards.
    pose_re = re.compile(
        r'{}/{}_{}_(\S+)\.pdb'.format(
            re.escape(dockdir), re.escape(protein), re.escape(ligand)
        )
    )

    clusters = []
    cluster = []
    start = False

    for line in cluster_out.splitlines():
        if 'cluster t.' in line:
            # Header of the next cluster: flush the one collected so far.
            if cluster:
                clusters.append(cluster)
            cluster = []
            start = True
        elif start:
            match = pose_re.search(line)
            if match:
                cluster.append(match.group(1))
    if cluster:
        clusters.append(cluster)

    #sort clusters by size: largest -> smallest
    clusters.sort(key = len, reverse=True)

    return clusters

# Parse the cluster.pl output into lists of pose identifiers, largest cluster first.
clusters = read_clusters(protein, ligand, dockdir, cluster_out)

In [6]:
# Display the per-pose energy table loaded from the scores CSV.
energy_df

Unnamed: 0_level_0,ENER,GRMS,DELTA,BOND,ANGL,UREY,DIHE,IMPR,VDW,ELEC
pose,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
initial,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,409.927006,0.0,-409.927006,63.757223,43.658180,4.186255,44.828103,0.052957,270.071338,-16.627051
2,60.467168,0.0,-60.467168,0.663227,16.382443,0.265626,37.866743,3.049540,43.717623,-41.478034
3,90.652818,0.0,-90.652818,1.120727,17.079300,1.775320,30.911927,0.189024,74.942027,-35.365506
4,21.912341,0.0,-21.912341,-0.297733,4.580528,1.051028,29.194538,0.339459,20.032997,-32.988476
...,...,...,...,...,...,...,...,...,...,...
496,102.044965,0.0,-102.044965,0.483484,18.876614,0.625559,47.123805,0.350049,43.921505,-9.336051
497,1978.064210,0.0,-1978.064210,1232.716860,75.881415,8.056661,50.273644,0.532089,643.021897,-32.418356
498,64.736308,0.0,-64.736308,1.829885,9.765224,0.383633,42.225142,0.491825,33.108472,-23.067872
499,28.932880,0.0,-28.932880,-0.497582,3.272231,0.329460,43.351592,0.091146,6.550821,-24.164789


In [8]:
#Calculate stats for each cluster
def cluster_stats(cluster, energy_df):
    """Summarise the energies of one cluster of poses.

    Parameters
    ----------
    cluster : list
        Pose identifiers; each must be a label in ``energy_df``'s index.
    energy_df : pandas.DataFrame
        Table with an ``'ENER'`` column, indexed by pose identifier.

    Returns
    -------
    dict
        Keys, in order: ``'cluster'`` (the input list), ``'size'``,
        ``'min_ener'`` (lowest energy, rounded to 2 decimals),
        ``'min_index'`` (identifier of the lowest-energy pose; the first one
        wins on ties), ``'average energy'`` and ``'std energy'`` (both
        rounded to 2 decimals). For an empty cluster ``'min_ener'`` is ``''``,
        ``'min_index'`` is ``-1`` and the average/std are NaN.
    """
    members = list(cluster)
    # Look the column up once instead of on every iteration.
    ener_col = energy_df['ENER']
    # Raw (unrounded) energies, in cluster order.
    energies = [float(ener_col[index]) for index in members]

    min_index = -1
    min_ener = ''
    if energies:
        # Find the minimum on the raw values and round only for reporting.
        # Comparing against an already rounded running minimum, or testing
        # its truthiness, can select the wrong pose (e.g. when the minimum
        # is exactly 0.0 or two energies differ below the rounding step).
        best = min(range(len(energies)), key=energies.__getitem__)
        min_index = members[best]
        min_ener = round(energies[best], 2)
        average = round(np.average(energies), 2)
        std = round(np.std(energies), 2)
    else:
        # No poses: report NaN directly instead of triggering numpy warnings.
        average = std = float('nan')

    cluster_dict = dict()
    cluster_dict['cluster'] = cluster
    cluster_dict['size'] = len(cluster)
    cluster_dict['min_ener'] = min_ener
    cluster_dict['min_index'] = min_index
    cluster_dict['average energy'] = average
    cluster_dict['std energy'] = std
    return cluster_dict
# One summary dict per cluster, in the same order as `clusters`.
cluster_dicts = [cluster_stats(cluster, energy_df) for cluster in clusters]

In [13]:
# Build the per-cluster summary table, ordered by each cluster's lowest
# energy, with a fresh 0..n-1 index.
cluster_df = (
    pd.DataFrame.from_records(cluster_dicts)
    .sort_values(by=['min_ener'])
    .reset_index(drop=True)
)

# Save the full table, then preview the first 15 rows.
cluster_df.to_excel(f'../cluster/{protein}_{ligand}_prot.xlsx')
cluster_df.head(15)

Unnamed: 0,cluster,size,min_ener,min_index,average energy,std energy
0,"[116, 121, 160, 162, 166, 185, 194, 199, 202, ...",31,-52.23,340,-26.32,15.43
1,"[105, 106, 132, 146, 153, 176, 196, 207, 222, ...",18,-50.55,132,-22.22,24.09
2,"[123, 228, 241, 246, 284, 297, 306, 371, 375, ...",16,-42.0,420,-29.0,10.68
3,"[131, 140, 147, 164, 173, 188, 280, 304, 336, ...",23,-40.47,344,-13.77,21.18
4,"[315, 376]",2,-35.26,376,-25.86,9.39
5,"[268, 281, 335, 424, 427, 481]",6,-34.92,424,-23.15,11.25
6,"[155, 163, 181, 215, 273, 323, 59, 91]",8,-31.49,91,-9.22,10.89
7,[374],1,-30.64,374,-30.64,0.0
8,"[247, 307, 358]",3,-29.54,247,-23.1,6.76
9,"[110, 125, 168, 209, 251, 270, 294, 326, 381, ...",15,-24.17,62,4.82,19.84
