In [1]:
#Cluster generated poses
#Inputs:
    #dock/poses/tropb_2_prot
    #dock/scores/tropb_2_prot.csv
#Outputs:
    #dock/cluster/tropb_2_prot.xlsx
    
import os
import sys
import re
import pandas as pd
import numpy as np

In [2]:
#Arguments
#Identifiers of the docking run; they select the pose directory and the score table below
protein = 'tropb'
ligand = '2'
#Directory whose {protein}_{ligand}_*.pdb pose files are handed to cluster.pl
dockdir = f'../poses/{protein}_{ligand}_prot'
#Per-pose energy table; the first CSV column becomes the index, and the 'ENER' column is read by cluster_stats
energy_df = pd.read_csv(f'../scores/{protein}_{ligand}_prot.csv', header = 0, index_col = 0)

In [3]:
#Run MMTSB toolkit cluster.pl
def run_clusterpl(protein, ligand, dockdir, radius = 1):
    """Run ``cluster.pl -kclust`` over the docked poses and return its stdout.

    Parameters
    ----------
    protein, ligand :
        Name parts used to build the pose file glob ``{protein}_{ligand}_*.pdb``.
    dockdir :
        Directory that contains the pose PDB files.
    radius : optional
        Value passed to the ``-radius`` option of cluster.pl (default 1).

    Returns
    -------
    str
        Everything the command wrote to standard output ('' if nothing).

    Notes
    -----
    The command still goes through the shell because the ``*.pdb`` glob is
    expanded by the shell; the order of that expansion determines the pose
    numbering in the cluster file, so it is deliberately left untouched.
    A non-zero exit status is reported with a ``RuntimeWarning`` instead of
    being silently dropped; the captured output is returned either way.
    """
    import warnings

    cmd = f'cluster.pl -kclust -nolsqfit -radius {radius} -selmode heavy {dockdir}/{protein}_{ligand}_*.pdb'
    pipe = os.popen(cmd)
    try:
        cluster_out = pipe.read()
    finally:
        #close() releases the pipe and returns None on success, otherwise the wait status
        status = pipe.close()
    if status is not None:
        warnings.warn(
            f'cluster.pl command finished with non-zero status {status}: {cmd}',
            RuntimeWarning,
        )
    return cluster_out

#Cluster the poses with the default radius and keep the raw text for read_clusters
cluster_out = run_clusterpl(protein, ligand, dockdir)
#Echo the raw cluster.pl output for inspection
print(cluster_out)

/export/apps/CentOS7/mmtsb/src/feig/toolset//bin/kclust -pdb -centroid -cdist -radius 1 -iterate -maxerr 0.01 -mode rmsd -heavy


# cluster file
# automatically generated on: Thu Jun 22 21:51:20 2023
# mode: rmsd, filetype: pdb, lsqfit: 0, selmode: heavy
@cluster t has 500 elements, 116 subclusters
1 ../poses/tropb_2_prot/tropb_2_100.pdb
2 ../poses/tropb_2_prot/tropb_2_101.pdb
3 ../poses/tropb_2_prot/tropb_2_102.pdb
4 ../poses/tropb_2_prot/tropb_2_103.pdb
5 ../poses/tropb_2_prot/tropb_2_104.pdb
6 ../poses/tropb_2_prot/tropb_2_105.pdb
7 ../poses/tropb_2_prot/tropb_2_106.pdb
8 ../poses/tropb_2_prot/tropb_2_107.pdb
9 ../poses/tropb_2_prot/tropb_2_108.pdb
10 ../poses/tropb_2_prot/tropb_2_109.pdb
11 ../poses/tropb_2_prot/tropb_2_10.pdb
12 ../poses/tropb_2_prot/tropb_2_110.pdb
13 ../poses/tropb_2_prot/tropb_2_111.pdb
14 ../poses/tropb_2_prot/tropb_2_112.pdb
15 ../poses/tropb_2_prot/tropb_2_113.pdb
16 ../poses/tropb_2_prot/tropb_2_114.pdb
17 ../poses/tropb_2_prot/tropb_2_115.pdb
18 ../poses/tropb_2_prot/tropb_2_116.pdb
19 ../poses/tropb_2_prot/tropb_2_117.pdb
20 ../poses/tropb_2_prot/tropb_2_118.pdb
21 ../poses/tropb_2_

In [4]:
#Read in cluster output and return list of clusters
def read_clusters(protein, ligand, dockdir, cluster_out):
    """Parse cluster.pl text output into lists of pose indices.

    Only subcluster sections are collected: a section starts at a line
    containing ``'cluster t.'``, and every following line that holds a path
    of the form ``{dockdir}/{protein}_{ligand}_<index>.pdb`` contributes
    ``<index>`` (kept as a string) to that section. Lines before the first
    subcluster header, and lines that contain no such path (blank lines,
    comments), are ignored.

    Parameters
    ----------
    protein, ligand, dockdir :
        The same values that were used to build the pose file names.
    cluster_out : str
        Raw text produced by cluster.pl.

    Returns
    -------
    list[list[str]]
        Non-empty clusters, sorted by size from largest to smallest.
    """
    #Escape the interpolated parts so '.' in e.g. '../poses' or '.pdb' is matched literally
    pose_re = re.compile(
        r'{}/{}_{}_(\S+)\.pdb'.format(
            re.escape(str(dockdir)), re.escape(str(protein)), re.escape(str(ligand))
        )
    )

    clusters = []
    cluster = None  #stays None until the first subcluster header is seen

    for line in cluster_out.splitlines():
        if 'cluster t.' in line:
            if cluster:
                clusters.append(cluster)
            cluster = []
        elif cluster is not None:
            match = pose_re.search(line)
            #skip lines without a pose path instead of failing on them
            if match:
                cluster.append(match.group(1))
    if cluster:
        clusters.append(cluster)

    #sort clusters by size: largest -> smallest
    clusters.sort(key = len, reverse=True)

    return clusters

#Parse the raw cluster.pl text into lists of pose indices, largest cluster first
clusters = read_clusters(protein, ligand, dockdir, cluster_out)

In [5]:
#Display the per-pose energy table loaded from the scores CSV
energy_df

Unnamed: 0_level_0,ENER,GRMS,DELTA,BOND,ANGL,UREY,DIHE,IMPR,VDW,ELEC
pose,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
initial,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,110.334803,0.0,-110.334803,2.565495,20.807015,1.685246,59.051966,-0.001171,50.207066,-23.980814
2,410.109705,0.0,-410.109705,85.704473,49.917563,6.909689,39.870986,0.042047,278.806066,-51.141120
3,43.292300,0.0,-43.292300,0.183304,11.619941,0.506324,60.933153,0.058452,0.726978,-30.735852
4,13.415328,0.0,-13.415328,0.071456,3.070431,0.372032,12.584316,0.412622,20.128590,-23.224120
...,...,...,...,...,...,...,...,...,...,...
496,21.692887,0.0,-21.692887,0.122927,11.846750,1.157740,30.918001,0.018682,1.909733,-24.280947
497,39.686371,0.0,-39.686371,1.609343,10.479834,0.896336,19.900373,0.230711,18.983897,-12.414123
498,31.512848,0.0,-31.512848,0.391562,7.213033,0.337679,46.372097,0.169744,10.136205,-33.107472
499,-0.574759,0.0,0.574759,1.341998,13.009375,0.282245,3.790035,0.026389,35.233404,-54.258205


In [6]:
#Calculate stats for each cluster
def cluster_stats(cluster, energy_df):
    """Summarise the 'ENER' values of the poses in one cluster.

    Parameters
    ----------
    cluster : list
        Index labels of the poses in the cluster; each is looked up in
        ``energy_df``'s index.
    energy_df : pandas.DataFrame
        Table with an 'ENER' column, indexed by pose label.

    Returns
    -------
    dict
        'cluster'        : the input list, unchanged
        'size'           : number of poses in the cluster
        'min_ener'       : lowest 'ENER' value, rounded to 2 decimals
        'min_index'      : label of the pose with that lowest value
                           (first one on ties; -1 for an empty cluster)
        'average energy' : np.average of the energies, rounded to 2 decimals
        'std energy'     : np.std of the energies, rounded to 2 decimals
        For an empty cluster the three energy statistics are NaN.
    """
    cluster_dict = dict()
    cluster_dict['cluster'] = cluster
    cluster_dict['size'] = len(cluster)

    ener_col = energy_df['ENER']
    energies = []
    min_index = -1
    lowest = None  #unrounded running minimum; None means "nothing seen yet"
    for index in cluster:
        index_energy = ener_col.loc[index]
        energies.append(float(index_energy))
        #Compare against the exact minimum: a rounded value (or a minimum of 0.0
        #tested for truthiness) would let a higher energy replace the true minimum
        if lowest is None or index_energy < lowest:
            lowest = index_energy
            min_index = index

    if energies:
        min_ener = round(lowest, 2)
        average = round(np.average(energies), 2)
        std = round(np.std(energies), 2)
    else:
        #No poses: report NaN so the column stays numeric and sortable
        min_ener = average = std = float('nan')

    cluster_dict['min_ener'] = min_ener
    cluster_dict['min_index'] = min_index
    cluster_dict['average energy'] = average
    cluster_dict['std energy'] = std
    return cluster_dict
#One stats dict per cluster, in the same order as `clusters`
cluster_dicts = [cluster_stats(cluster, energy_df) for cluster in clusters]

In [7]:
#Tabulate the per-cluster stats ordered by each cluster's lowest energy (ascending)
#and renumber the rows from 0
cluster_df = (
    pd.DataFrame.from_records(cluster_dicts)
    .sort_values(by=['min_ener'])
    .reset_index(drop=True)
)
#Save the full table, then show its first 15 rows as the cell output
cluster_df.to_excel(f'../cluster/{protein}_{ligand}_prot.xlsx')
cluster_df.head(15)

Unnamed: 0,cluster,size,min_ener,min_index,average energy,std energy
0,"[111, 163, 166, 184, 195, 204, 209, 231, 268, ...",24,-48.82,393,-21.16,15.3
1,"[126, 12, 143, 145, 162, 164, 167, 168, 169, 1...",31,-46.75,126,-14.53,20.26
2,"[312, 313, 315, 332, 369, 394, 415, 453, 485]",9,-34.46,332,-13.83,10.85
3,"[336, 439, 475, 494, 4]",5,-34.36,439,0.03,26.77
4,"[176, 196, 211, 242, 260, 305, 329, 359, 368, ...",13,-34.1,43,-9.24,17.13
5,"[102, 108, 132, 135, 152, 172, 212, 259, 284, ...",21,-32.16,102,15.62,20.96
6,[205],1,-28.56,205,-28.56,0.0
7,"[114, 144, 150, 227, 290, 327, 354, 445, 45, 9...",11,-21.08,445,-2.62,10.04
8,"[129, 298, 334, 337, 412, 441, 497]",7,-20.45,441,23.28,27.93
9,"[100, 109, 110, 112, 118, 11, 127, 139, 153, 1...",50,-18.8,56,23.24,19.32
