**Import required libraries and scripts**

In [12]:
#Import required libraries and scripts
from scripts.library_preparation import *
from scripts.utilities import *
from scripts.docking_functions import *
from scripts.clustering_functions import *
from scripts.rescoring_functions import *
from scripts.ranking_functions import *
from scripts.performance_calculation import *
import numpy as np
import os

# --- User configuration ---------------------------------------------------
# The two root folders are defined once; every path below is derived from them,
# so moving the project or switching dataset means editing a single line.
project_root = '/home/tony/CADD22'
dataset_dir = f'{project_root}/wocondock_performance_ace_test'

# Folder handed to the library-preparation, docking and rescoring calls.
software = f'{project_root}/software'
# Input structures: receptor, reference ligand and the library to be docked.
protein_file = f'{dataset_dir}/receptor_protoss_prepared.pdb'
ref_file = f'{dataset_dir}/crystal_ligand_protoss.sdf'
docking_library = f'{dataset_dir}/merged_actives_decoys.sdf'

# Programs, clustering metrics and rescoring functions iterated over by the cells below.
docking_programs = ['GNINA', 'SMINA', 'PLANTS']
clustering_metrics = ['RMSD', 'spyRMSD', 'espsim', '3DScore', 'bestpose', 'bestpose_GNINA']
rescoring_functions = ['gnina', 'AD4', 'chemplp', 'rfscorevs']

# Name of the identifier column, passed to prepare_library().
id_column = 'ID'
# Docking settings forwarded unchanged to the docking calls.
n_poses = 10
exhaustiveness = 4

# Initialise variables and create a temporary folder.
# The receptor path is made absolute before taking its directory: os.path.dirname()
# returns '' for a bare file name, which would turn every w_dir+'/temp...' path used
# in this notebook into a path directly under the filesystem root.
w_dir = os.path.dirname(os.path.abspath(protein_file))
print('The working directory has been set to:', w_dir)
create_temp_folder(w_dir+'/temp')

The working directory has been set to: /home/tony/CADD22/wocondock_performance_ace_test
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp already exists


In [None]:
# Derive the binding-site definition from the receptor file; judging by the helper's name
# this relies on DoGSiteScorer, with method='volume' choosing the pocket — TODO confirm.
# NOTE(review): pocket_definition is not passed to any later call shown in this notebook.
pocket_definition = binding_site_coordinates_dogsitescorer(protein_file, w_dir, method='volume')

In [None]:
# Prepare the docking library, identified by id_column, with the 'pkasolver' option
# (presumably the protonation tool to use — confirm in scripts.library_preparation).
# NOTE(review): cleaned_pkasolver_df is not referenced by any later cell shown here.
cleaned_pkasolver_df = prepare_library(docking_library, id_column, software, 'pkasolver')

In [None]:
# Run docking() for the programs in docking_programs with the configured exhaustiveness
# and n_poses, keeping the result in all_poses.
# NOTE(review): all_poses is reassigned further down when temp/allposes.sdf is loaded from disk.
all_poses = docking(protein_file, ref_file, software, docking_programs, exhaustiveness, n_poses)

In [None]:
# Docking variant that also receives w_dir; presumably it works on a pre-split library — confirm
# in scripts.docking_functions. NOTE(review): takes the same docking arguments as docking() and
# docking_splitted_futures(), so these look like alternatives; verify only one needs to be run.
docking_splitted(w_dir, protein_file, ref_file, software, docking_programs, exhaustiveness, n_poses)

In [None]:
# Same arguments as docking_splitted(); the "_futures" suffix suggests a parallel implementation
# (assumption — confirm in scripts.docking_functions before running both).
docking_splitted_futures(w_dir, protein_file, ref_file, software, docking_programs, exhaustiveness, n_poses)

In [None]:
# Gather the poses belonging to the split library files under temp/split_final_library
# (presumably merging them into a single pose set — confirm in the scripts package).
fetch_poses_splitted(w_dir, n_poses, split_files_folder=w_dir+'/temp/split_final_library')

In [8]:
# Read the combined pose file from the temp folder into a DataFrame and report how long it took.
# NOTE(review): `time` and `PandasTools` are not imported explicitly in this notebook; they
# presumably arrive through the star imports at the top — confirm.
print('Loading all poses SDF file...')
tic = time.perf_counter()
all_poses = PandasTools.LoadSDF(
    w_dir+'/temp/allposes.sdf',
    idName='Pose ID',
    molColName='Molecule',
    includeFingerprints=False,
    strictParsing=True,
)
toc = time.perf_counter()
print(f'Finished loading all poses SDF in {toc-tic:0.4f}!...')

Loading all poses SDF file...
Finished loading all poses SDF in 0.7534!...


In [10]:
# Cluster the loaded poses once per configured metric, always with the 'KMedoids' method.
for metric in clustering_metrics:
    # metric is already a string, so it is passed through directly.
    cluster_futures(metric, 'KMedoids', w_dir, protein_file, all_poses)

The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/clustering/ already exists
Clustering using RMSD already done, moving to next metric...
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/clustering/ already exists
Clustering using spyRMSD already done, moving to next metric...
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/clustering/ already exists
Clustering using espsim already done, moving to next metric...
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/clustering/ already exists
Clustering using 3DScore already done, moving to next metric...
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/clustering/ already exists
Clustering using bestpose already done, moving to next metric...
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/clustering/ already exists
Clustering using bestpose_GNINA already done, moving to next metric...
The folder: /home/tony/CADD22/wocondock_performance_ac

**Rescoring**

The file containing all the cluster centers is then rescored using all available scoring functions (GNINA, Vina, AutoDock4, PLP, CHEMPLP, RF-Score-VS). The rescored output is returned as a dataframe.

In [7]:
# Rescore the clustered poses of each metric with every entry of rescoring_functions.
# NOTE(review): the saved output of this cell also processes 'bestpose_SMINA', which is not in
# the current clustering_metrics list, and ends in KeyError: 'ID' — the output looks stale;
# re-run on a fresh kernel to confirm.
for metric in clustering_metrics:
    clustered_sdf = w_dir+f'/temp/clustering/{metric}_clustered.sdf'
    # The meaning of the trailing positional 1 is not visible here — TODO confirm and name it.
    rescore_all(w_dir, protein_file, ref_file, software, clustered_sdf, rescoring_functions, 1)


The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_RMSD_clustered was created
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_RMSD_clustered/gnina_rescoring/ was created
Splitting RMSD_clustered.sdf...
Rescoring with GNINA
Splitting SDF file RMSD_clustered.sdf ...
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_RMSD_clustered/gnina_rescoring/split_RMSD_clustered was created
Split docking library into 6 files each containing 41 compounds


100%|██████████| 6/6 [00:00<00:00, 137.39it/s]
100%|██████████| 6/6 [01:21<00:00, 13.56s/it]


Rescoring with GNINA complete in 82.2731!
Rescoring with AD4
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_RMSD_clustered/AD4_rescoring/ was created
Rescoring with AD4 complete in 1.7405!
Rescoring with CHEMPLP
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_RMSD_clustered/chemplp_rescoring/ was created


  COLLECTION blocks are not currently implemented and their contents are ignored.
245 molecules converted


Rescoring with CHEMPLP complete in 5.3007!
Rescoring with RFScoreVS
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_RMSD_clustered/rfscorevs_rescoring was created
Rescoring with RF-Score-VS complete in 11.5759!
Combining all score for /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_RMSD_clustered


100%|██████████| 3/3 [00:00<00:00, 317.60it/s]

Rescoring complete in 100.9727!
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_spyRMSD_clustered was created
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_spyRMSD_clustered/gnina_rescoring/ was created
Splitting spyRMSD_clustered.sdf...
Rescoring with GNINA
Splitting SDF file spyRMSD_clustered.sdf ...
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_spyRMSD_clustered/gnina_rescoring/split_spyRMSD_clustered was created





Split docking library into 7 files each containing 41 compounds


100%|██████████| 7/7 [00:00<00:00, 171.14it/s]
100%|██████████| 7/7 [01:58<00:00, 16.87s/it]


Rescoring with GNINA complete in 119.3190!
Rescoring with AD4
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_spyRMSD_clustered/AD4_rescoring/ was created
Rescoring with AD4 complete in 1.5814!
Rescoring with CHEMPLP
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_spyRMSD_clustered/chemplp_rescoring/ was created


  COLLECTION blocks are not currently implemented and their contents are ignored.
247 molecules converted


Rescoring with CHEMPLP complete in 3.7098!
Rescoring with RFScoreVS
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_spyRMSD_clustered/rfscorevs_rescoring was created
Rescoring with RF-Score-VS complete in 9.0561!
Combining all score for /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_spyRMSD_clustered


100%|██████████| 3/3 [00:00<00:00, 373.47it/s]

Rescoring complete in 133.7255!
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_espsim_clustered was created
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_espsim_clustered/gnina_rescoring/ was created
Splitting espsim_clustered.sdf...
Rescoring with GNINA
Splitting SDF file espsim_clustered.sdf ...
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_espsim_clustered/gnina_rescoring/split_espsim_clustered was created





Split docking library into 6 files each containing 46 compounds


100%|██████████| 6/6 [00:00<00:00, 146.91it/s]
100%|██████████| 6/6 [02:02<00:00, 20.42s/it]


Rescoring with GNINA complete in 123.4971!
Rescoring with AD4
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_espsim_clustered/AD4_rescoring/ was created
Rescoring with AD4 complete in 1.6810!
Rescoring with CHEMPLP
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_espsim_clustered/chemplp_rescoring/ was created


  COLLECTION blocks are not currently implemented and their contents are ignored.
275 molecules converted


Rescoring with CHEMPLP complete in 3.9469!
Rescoring with RFScoreVS
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_espsim_clustered/rfscorevs_rescoring was created
Rescoring with RF-Score-VS complete in 9.2364!
Combining all score for /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_espsim_clustered


100%|██████████| 3/3 [00:00<00:00, 340.70it/s]


Rescoring complete in 138.4145!
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_3DScore_clustered was created
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_3DScore_clustered/gnina_rescoring/ was created
Splitting 3DScore_clustered.sdf...
Rescoring with GNINA
Splitting SDF file 3DScore_clustered.sdf ...
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_3DScore_clustered/gnina_rescoring/split_3DScore_clustered was created
Split docking library into 6 files each containing 17 compounds


100%|██████████| 6/6 [00:00<00:00, 113.79it/s]
100%|██████████| 6/6 [00:49<00:00,  8.26s/it]


Rescoring with GNINA complete in 50.0222!
Rescoring with AD4
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_3DScore_clustered/AD4_rescoring/ was created
Rescoring with AD4 complete in 1.2029!
Rescoring with CHEMPLP
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_3DScore_clustered/chemplp_rescoring/ was created


  COLLECTION blocks are not currently implemented and their contents are ignored.
100 molecules converted


Rescoring with CHEMPLP complete in 2.8128!
Rescoring with RFScoreVS
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_3DScore_clustered/rfscorevs_rescoring was created
Rescoring with RF-Score-VS complete in 7.5131!
Combining all score for /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_3DScore_clustered


100%|██████████| 3/3 [00:00<00:00, 472.56it/s]


Rescoring complete in 61.5839!
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_bestpose_clustered was created
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_bestpose_clustered/gnina_rescoring/ was created
Splitting bestpose_clustered.sdf...
Rescoring with GNINA
Splitting SDF file bestpose_clustered.sdf ...
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_bestpose_clustered/gnina_rescoring/split_bestpose_clustered was created
Split docking library into 6 files each containing 17 compounds


100%|██████████| 6/6 [00:00<00:00, 142.11it/s]
100%|██████████| 6/6 [00:46<00:00,  7.71s/it]


Rescoring with GNINA complete in 46.6869!
Rescoring with AD4
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_bestpose_clustered/AD4_rescoring/ was created
Rescoring with AD4 complete in 1.2048!
Rescoring with CHEMPLP
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_bestpose_clustered/chemplp_rescoring/ was created


  COLLECTION blocks are not currently implemented and their contents are ignored.
100 molecules converted


Rescoring with CHEMPLP complete in 2.7812!
Rescoring with RFScoreVS
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_bestpose_clustered/rfscorevs_rescoring was created
Rescoring with RF-Score-VS complete in 7.4140!
Combining all score for /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_bestpose_clustered


100%|██████████| 3/3 [00:00<00:00, 365.26it/s]


Rescoring complete in 58.1213!
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_bestpose_GNINA_clustered was created
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_bestpose_GNINA_clustered/gnina_rescoring/ was created
Splitting bestpose_GNINA_clustered.sdf...
Rescoring with GNINA
Splitting SDF file bestpose_GNINA_clustered.sdf ...
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_bestpose_GNINA_clustered/gnina_rescoring/split_bestpose_GNINA_clustered was created
Split docking library into 6 files each containing 17 compounds


100%|██████████| 6/6 [00:00<00:00, 98.89it/s]
100%|██████████| 6/6 [00:45<00:00,  7.65s/it]


Rescoring with GNINA complete in 46.3489!
Rescoring with AD4
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_bestpose_GNINA_clustered/AD4_rescoring/ was created
Rescoring with AD4 complete in 1.2478!
Rescoring with CHEMPLP
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_bestpose_GNINA_clustered/chemplp_rescoring/ was created


  COLLECTION blocks are not currently implemented and their contents are ignored.
100 molecules converted


Rescoring with CHEMPLP complete in 2.7982!
Rescoring with RFScoreVS
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_bestpose_GNINA_clustered/rfscorevs_rescoring was created
Rescoring with RF-Score-VS complete in 7.2370!
Combining all score for /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_bestpose_GNINA_clustered


100%|██████████| 3/3 [00:00<00:00, 454.72it/s]

Rescoring complete in 57.6616!
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_bestpose_SMINA_clustered was created
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_bestpose_SMINA_clustered/gnina_rescoring/ was created
Splitting bestpose_SMINA_clustered.sdf...
Rescoring with GNINA
Splitting SDF file bestpose_SMINA_clustered.sdf ...
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/rescoring_bestpose_SMINA_clustered/gnina_rescoring/split_bestpose_SMINA_clustered was created





KeyError: 'ID'

**Final ranking methods**

This code calculates the final ranking of compounds using various methods.
*Method 1* : Calculates ECR value for each cluster center, then outputs the top ranked center.
*Method 2* : Calculates ECR value for each cluster center, then outputs the average ECR value for each compound.
*Method 3* : Calculates the average rank of each compound, then outputs the corresponding ECR value for each compound.
*Method 6* : Calculates Z-score for each cluster center, then outputs the top ranked center.
*Method 7* : Calculates Z-score for each cluster center, then outputs the average Z-score for each compound.

All methods are then combined into a single dataframe for comparison purposes.

In [13]:
# Apply the consensus ranking methods for every metric in clustering_metrics.
# The saved output mentions temp/ranking and temp/consensus under w_dir, so results are
# presumably read from / written to those folders — confirm in scripts.ranking_functions.
apply_consensus_methods(w_dir, clustering_metrics)

The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/ranking already exists
The folder: /home/tony/CADD22/wocondock_performance_ace_test/temp/consensus was created


In [14]:
# Compute the EFs (presumably enrichment factors) for the results under w_dir; docking_library
# is passed in as well, presumably to tell actives from decoys — TODO confirm in
# scripts.performance_calculation.
calculate_EFs(w_dir, docking_library)