**Import required libraries and scripts**

In [None]:
#Import required libraries and scripts
from scripts.library_preparation import *
from scripts.utilities import *
from scripts.docking_functions import *
from scripts.clustering_functions import *
from scripts.rescoring_functions import *
from scripts.ranking_functions import *
from scripts.performance_calculation import *
from scripts.dogsitescorer import *
from scripts.get_pocket import *
import numpy as np
import os

# --- Run configuration -------------------------------------------------------
# NOTE(review): the paths below are absolute and user-specific; edit them
# before running this notebook on another machine.
software = '/home/mario/DockM8/software'
protein_file = '/home/mario/holiday/igf1r/receptor_protoss_prepared.pdb'
ref_file = '/home/mario/holiday/igf1r/crystal_ligand_protoss.sdf'
docking_library = '/home/mario/holiday/igf1r/merged_actives_decoys.sdf'
# Programs handed to the docking cells.
docking_programs = ['GNINA', 'SMINA', 'PLANTS']
# Each metric is clustered and then rescored in its own loop iteration below.
clustering_metrics = ['RMSD', 'spyRMSD', 'espsim', '3DScore', 'bestpose', 'bestpose_GNINA', 'bestpose_SMINA', 'bestpose_PLANTS']
# Scoring functions handed to rescore_all.
rescoring_functions = ['gnina', 'AD4', 'chemplp', 'rfscorevs', 'LinF9']
id_column = 'ID'
n_poses = 10
exhaustiveness = 4
# Use half of the available cores, but never fewer than one worker.
# os.cpu_count() may return None when the count cannot be determined, and
# halving a single core would otherwise yield 0.
ncpus = max(1, (os.cpu_count() or 1) // 2)
#Initialise variables and create a temporary folder
# abspath() keeps w_dir non-empty for a bare relative protein_file, so that
# w_dir+'/temp' can never resolve to '/temp' at the filesystem root.
w_dir = os.path.dirname(os.path.abspath(protein_file))
print('The working directory has been set to:', w_dir)
create_temp_folder(w_dir+'/temp')

In [None]:
# Derive a binding-site definition from the receptor via the DoGSiteScorer helper.
# method='volume' presumably selects the pocket by volume — TODO confirm in scripts.dogsitescorer.
# NOTE(review): pocket_definition is not referenced by any later cell visible in this notebook.
pocket_definition = binding_site_coordinates_dogsitescorer(protein_file, w_dir, method='volume')

In [None]:
# Build a pocket from the reference ligand and the receptor.
# The literal 8 is presumably a distance cutoff/radius around the ligand — TODO confirm in scripts.get_pocket.
# NOTE(review): pocket is not referenced by any later cell visible in this notebook.
pocket = GetPocket(ref_file, protein_file, 8)

In [None]:
# Prepare the docking library, identifying compounds by id_column and using ncpus workers.
# 'pkasolver' is presumably the protonation backend — TODO confirm in scripts.library_preparation.
# NOTE(review): cleaned_pkasolver_df is not referenced by any later cell visible in this notebook.
cleaned_pkasolver_df = prepare_library(docking_library, id_column, software, 'pkasolver', ncpus)

In [None]:
# Dock with every program in docking_programs using the configured exhaustiveness and n_poses.
# NOTE(review): all_poses is reassigned by the LoadSDF cell further down, so this return value
# is overwritten if both cells are run.
all_poses = docking(protein_file, ref_file, software, docking_programs, exhaustiveness, n_poses)

In [None]:
# Presumably the split-library variant of docking() — TODO confirm in scripts.docking_functions
# and whether it is meant as an alternative to the cell above rather than an additional step.
docking_splitted(w_dir, protein_file, ref_file, software, docking_programs, exhaustiveness, n_poses)

In [None]:
# Gather the poses produced from the split library files in temp/split_final_library.
# Presumably this writes temp/allposes.sdf, which the next cell loads — TODO confirm.
fetch_poses_splitted(w_dir, n_poses, split_files_folder=w_dir+'/temp/split_final_library')

In [None]:
# Read the combined pose file back into a dataframe (pose IDs in 'Pose ID',
# molecules in 'Molecule') and report how long the load took.
print('Loading all poses SDF file...')
tic = time.perf_counter()
all_poses = PandasTools.LoadSDF(
    f'{w_dir}/temp/allposes.sdf',
    idName='Pose ID',
    molColName='Molecule',
    includeFingerprints=False,
    strictParsing=True,
)
toc = time.perf_counter()
print('Finished loading all poses SDF in {:0.4f}!...'.format(toc - tic))

In [None]:
# Cluster the loaded poses once per configured metric, passing 'KMedoids'
# as the clustering method argument each time.
for metric in clustering_metrics:
    cluster_futures(
        str(metric),
        'KMedoids',
        w_dir,
        protein_file,
        all_poses,
        ncpus,
    )

**Rescoring**

The file containing all the cluster centers is then rescored using all scoring functions available (GNINA, Vina, AutoDock4, PLP, CHEMPLP, RF-Score-VS). The rescored output is returned as a dataframe.

In [4]:
# Rescore the clustered poses of every metric with all configured scoring
# functions; each metric reads its own temp/clustering/<metric>_clustered.sdf.
for metric in clustering_metrics:
    rescore_all(
        w_dir,
        protein_file,
        ref_file,
        software,
        f'{w_dir}/temp/clustering/{metric}_clustered.sdf',
        rescoring_functions,
        1,  # NOTE(review): meaning of this positional argument is not visible here — confirm against rescore_all
        ncpus,
    )


[2023-Mar-14 08:33:49]: Rescoring with CHEMPLP complete in 73.3911!

[2023-Mar-14 08:33:49]: Rescoring with RFScoreVS

[2023-Mar-14 08:35:29]: Rescoring with RF-Score-VS complete in 100.8085!
Splitting SDF file spyRMSD_clustered.sdf ...


Splitting files: 100%|██████████| 24/24 [00:03<00:00,  7.95it/s]


Split docking library into 24 files each containing 1066 compounds


Submitting LinF9 rescoring jobs: 100%|██████████| 24/24 [00:00<00:00, 30.47file/s]
Rescoring with LinF9: 100%|██████████| 24/24 [00:05<00:00,  4.25file/s]



[2023-Mar-14 08:35:46]: Rescoring with LinF9 complete in 16.9255!

[2023-Mar-14 08:35:46]: Combining all score for /home/mario/holiday/igf1r/temp/rescoring_spyRMSD_clustered


Combining scores: 100%|██████████| 4/4 [00:00<00:00, 197.84files/s]



[2023-Mar-14 08:35:47]: Rescoring complete in 526.6447!
The folder: /home/mario/holiday/igf1r/temp/rescoring_espsim_clustered was created
Splitting SDF file espsim_clustered.sdf ...


Splitting files: 100%|██████████| 24/24 [00:02<00:00,  9.70it/s]


Split docking library into 24 files each containing 861 compounds

[2023-Mar-14 08:35:53]: Rescoring with GNINA


Submitting GNINA rescoring jobs: 100%|██████████| 24/24 [00:00<00:00, 30.24file/s]
Rescoring with GNINA: 100%|██████████| 24/24 [03:27<00:00,  8.66s/file]



[2023-Mar-14 08:39:24]: Rescoring with GNINA complete in 217.1961!

[2023-Mar-14 08:39:24]: Rescoring with AD4

[2023-Mar-14 08:39:48]: Rescoring with AD4 complete in 23.9731!

[2023-Mar-14 08:39:48]: Rescoring with CHEMPLP


20656 molecules converted



[2023-Mar-14 08:40:48]: Rescoring with CHEMPLP complete in 60.5650!

[2023-Mar-14 08:40:48]: Rescoring with RFScoreVS

[2023-Mar-14 08:42:09]: Rescoring with RF-Score-VS complete in 80.4083!
Splitting SDF file espsim_clustered.sdf ...


Splitting files: 100%|██████████| 24/24 [00:02<00:00,  9.69it/s]


Split docking library into 24 files each containing 861 compounds


Submitting LinF9 rescoring jobs: 100%|██████████| 24/24 [00:00<00:00, 30.46file/s]
Rescoring with LinF9: 100%|██████████| 24/24 [00:04<00:00,  5.15file/s]



[2023-Mar-14 08:42:23]: Rescoring with LinF9 complete in 14.0742!

[2023-Mar-14 08:42:23]: Combining all score for /home/mario/holiday/igf1r/temp/rescoring_espsim_clustered


Combining scores: 100%|██████████| 4/4 [00:00<00:00, 246.83files/s]


[2023-Mar-14 08:42:23]: Rescoring complete in 396.4125!
The folder: /home/mario/holiday/igf1r/temp/rescoring_3DScore_clustered was created
Splitting SDF file 3DScore_clustered.sdf ...



Splitting files: 100%|██████████| 24/24 [00:01<00:00, 21.08it/s]


Split docking library into 24 files each containing 392 compounds

[2023-Mar-14 08:42:26]: Rescoring with GNINA


Submitting GNINA rescoring jobs: 100%|██████████| 24/24 [00:00<00:00, 30.44file/s]
Rescoring with GNINA: 100%|██████████| 24/24 [01:38<00:00,  4.10s/file]



[2023-Mar-14 08:44:06]: Rescoring with GNINA complete in 103.2180!

[2023-Mar-14 08:44:06]: Rescoring with AD4

[2023-Mar-14 08:44:18]: Rescoring with AD4 complete in 11.7862!

[2023-Mar-14 08:44:18]: Rescoring with CHEMPLP


9407 molecules converted



[2023-Mar-14 08:44:45]: Rescoring with CHEMPLP complete in 27.4437!

[2023-Mar-14 08:44:45]: Rescoring with RFScoreVS

[2023-Mar-14 08:45:24]: Rescoring with RF-Score-VS complete in 38.4563!
Splitting SDF file 3DScore_clustered.sdf ...


Splitting files: 100%|██████████| 24/24 [00:01<00:00, 20.97it/s]


Split docking library into 24 files each containing 392 compounds


Submitting LinF9 rescoring jobs: 100%|██████████| 24/24 [00:00<00:00, 30.69file/s]
Rescoring with LinF9: 100%|██████████| 24/24 [00:02<00:00,  8.87file/s]



[2023-Mar-14 08:45:31]: Rescoring with LinF9 complete in 7.5099!

[2023-Mar-14 08:45:31]: Combining all score for /home/mario/holiday/igf1r/temp/rescoring_3DScore_clustered


Combining scores: 100%|██████████| 4/4 [00:00<00:00, 380.78files/s]


[2023-Mar-14 08:45:31]: Rescoring complete in 188.5140!
The folder: /home/mario/holiday/igf1r/temp/rescoring_bestpose_clustered was created
Splitting SDF file bestpose_clustered.sdf ...



Splitting files: 100%|██████████| 24/24 [00:03<00:00,  7.29it/s]


Split docking library into 24 files each containing 1150 compounds

[2023-Mar-14 08:45:41]: Rescoring with GNINA


Submitting GNINA rescoring jobs: 100%|██████████| 24/24 [00:00<00:00, 30.62file/s]
Rescoring with GNINA: 100%|██████████| 24/24 [04:31<00:00, 11.31s/file]



[2023-Mar-14 08:50:15]: Rescoring with GNINA complete in 283.9661!

[2023-Mar-14 08:50:15]: Rescoring with AD4

[2023-Mar-14 08:50:47]: Rescoring with AD4 complete in 31.5125!

[2023-Mar-14 08:50:47]: Rescoring with CHEMPLP


27600 molecules converted



[2023-Mar-14 08:52:06]: Rescoring with CHEMPLP complete in 79.2698!

[2023-Mar-14 08:52:06]: Rescoring with RFScoreVS

[2023-Mar-14 08:53:53]: Rescoring with RF-Score-VS complete in 106.3255!
Splitting SDF file bestpose_clustered.sdf ...


Splitting files: 100%|██████████| 24/24 [00:03<00:00,  7.34it/s]


Split docking library into 24 files each containing 1150 compounds


Submitting LinF9 rescoring jobs: 100%|██████████| 24/24 [00:00<00:00, 30.58file/s]
Rescoring with LinF9: 100%|██████████| 24/24 [00:05<00:00,  4.11file/s]



[2023-Mar-14 08:54:11]: Rescoring with LinF9 complete in 18.0955!

[2023-Mar-14 08:54:11]: Combining all score for /home/mario/holiday/igf1r/temp/rescoring_bestpose_clustered


Combining scores: 100%|██████████| 4/4 [00:00<00:00, 175.49files/s]



[2023-Mar-14 08:54:11]: Rescoring complete in 519.4264!
The folder: /home/mario/holiday/igf1r/temp/rescoring_bestpose_GNINA_clustered was created
Splitting SDF file bestpose_GNINA_clustered.sdf ...


Splitting files: 100%|██████████| 25/25 [00:01<00:00, 22.31it/s]


Split docking library into 25 files each containing 389 compounds

[2023-Mar-14 08:54:14]: Rescoring with GNINA


Submitting GNINA rescoring jobs: 100%|██████████| 25/25 [00:00<00:00, 31.87file/s]
Rescoring with GNINA: 100%|██████████| 25/25 [01:36<00:00,  3.88s/file]



[2023-Mar-14 08:55:53]: Rescoring with GNINA complete in 101.7704!

[2023-Mar-14 08:55:53]: Rescoring with AD4

[2023-Mar-14 08:56:04]: Rescoring with AD4 complete in 11.7403!

[2023-Mar-14 08:56:04]: Rescoring with CHEMPLP


9343 molecules converted



[2023-Mar-14 08:56:32]: Rescoring with CHEMPLP complete in 27.3490!

[2023-Mar-14 08:56:32]: Rescoring with RFScoreVS

[2023-Mar-14 08:57:10]: Rescoring with RF-Score-VS complete in 38.0546!
Splitting SDF file bestpose_GNINA_clustered.sdf ...


Splitting files: 100%|██████████| 25/25 [00:01<00:00, 22.45it/s]


Split docking library into 25 files each containing 389 compounds


Submitting LinF9 rescoring jobs: 100%|██████████| 25/25 [00:00<00:00, 31.75file/s]
Rescoring with LinF9: 100%|██████████| 25/25 [00:02<00:00,  8.34file/s]



[2023-Mar-14 08:57:18]: Rescoring with LinF9 complete in 7.7469!

[2023-Mar-14 08:57:18]: Combining all score for /home/mario/holiday/igf1r/temp/rescoring_bestpose_GNINA_clustered


Combining scores: 100%|██████████| 4/4 [00:00<00:00, 459.96files/s]


[2023-Mar-14 08:57:18]: Rescoring complete in 186.7548!
The folder: /home/mario/holiday/igf1r/temp/rescoring_bestpose_SMINA_clustered was created
Splitting SDF file bestpose_SMINA_clustered.sdf ...



Splitting files: 100%|██████████| 25/25 [00:01<00:00, 22.28it/s]


Split docking library into 25 files each containing 389 compounds

[2023-Mar-14 08:57:21]: Rescoring with GNINA


Submitting GNINA rescoring jobs: 100%|██████████| 25/25 [00:00<00:00, 31.14file/s]
Rescoring with GNINA: 100%|██████████| 25/25 [01:37<00:00,  3.91s/file]



[2023-Mar-14 08:59:00]: Rescoring with GNINA complete in 102.6211!

[2023-Mar-14 08:59:00]: Rescoring with AD4

[2023-Mar-14 08:59:12]: Rescoring with AD4 complete in 11.6865!

[2023-Mar-14 08:59:12]: Rescoring with CHEMPLP


9343 molecules converted



[2023-Mar-14 08:59:39]: Rescoring with CHEMPLP complete in 27.2043!

[2023-Mar-14 08:59:39]: Rescoring with RFScoreVS

[2023-Mar-14 09:00:18]: Rescoring with RF-Score-VS complete in 38.3560!
Splitting SDF file bestpose_SMINA_clustered.sdf ...


Splitting files: 100%|██████████| 25/25 [00:01<00:00, 22.46it/s]


Split docking library into 25 files each containing 389 compounds


Submitting LinF9 rescoring jobs: 100%|██████████| 25/25 [00:00<00:00, 31.12file/s]
Rescoring with LinF9: 100%|██████████| 25/25 [00:02<00:00,  9.02file/s]



[2023-Mar-14 09:00:25]: Rescoring with LinF9 complete in 7.5598!

[2023-Mar-14 09:00:25]: Combining all score for /home/mario/holiday/igf1r/temp/rescoring_bestpose_SMINA_clustered


Combining scores: 100%|██████████| 4/4 [00:00<00:00, 455.05files/s]


[2023-Mar-14 09:00:25]: Rescoring complete in 187.5238!
The folder: /home/mario/holiday/igf1r/temp/rescoring_bestpose_PLANTS_clustered was created
Splitting SDF file bestpose_PLANTS_clustered.sdf ...



Splitting files: 100%|██████████| 25/25 [00:01<00:00, 23.57it/s]


Split docking library into 25 files each containing 371 compounds

[2023-Mar-14 09:00:28]: Rescoring with GNINA


Submitting GNINA rescoring jobs: 100%|██████████| 25/25 [00:00<00:00, 31.75file/s]
Rescoring with GNINA: 100%|██████████| 25/25 [01:32<00:00,  3.70s/file]



[2023-Mar-14 09:02:02]: Rescoring with GNINA complete in 97.1404!

[2023-Mar-14 09:02:02]: Rescoring with AD4

[2023-Mar-14 09:02:14]: Rescoring with AD4 complete in 11.1857!

[2023-Mar-14 09:02:14]: Rescoring with CHEMPLP


8914 molecules converted



[2023-Mar-14 09:02:40]: Rescoring with CHEMPLP complete in 26.3090!

[2023-Mar-14 09:02:40]: Rescoring with RFScoreVS

[2023-Mar-14 09:03:16]: Rescoring with RF-Score-VS complete in 36.4663!
Splitting SDF file bestpose_PLANTS_clustered.sdf ...


Splitting files: 100%|██████████| 25/25 [00:01<00:00, 23.28it/s]


Split docking library into 25 files each containing 371 compounds


Submitting LinF9 rescoring jobs: 100%|██████████| 25/25 [00:00<00:00, 31.47file/s]
Rescoring with LinF9: 100%|██████████| 25/25 [00:02<00:00,  9.07file/s]



[2023-Mar-14 09:03:24]: Rescoring with LinF9 complete in 7.3387!

[2023-Mar-14 09:03:24]: Combining all score for /home/mario/holiday/igf1r/temp/rescoring_bestpose_PLANTS_clustered


Combining scores: 100%|██████████| 4/4 [00:00<00:00, 453.39files/s]


[2023-Mar-14 09:03:24]: Rescoring complete in 178.5324!





**Final ranking methods**

This code calculates the final ranking of compounds using various methods.
*Method 1* : Calculates ECR value for each cluster center, then outputs the top ranked center.
*Method 2* : Calculates ECR value for each cluster center, then outputs the average ECR value for each compound.
*Method 3* : Calculates the average rank of each compound, then outputs the corresponding ECR value for each compound.
*Method 6* : Calculates Z-score for each cluster center, then outputs the top ranked center.
*Method 7* : Calculates Z-score for each cluster center, then outputs the average Z-score for each compound.

All methods are then combined into a single dataframe for comparison purposes.

In [5]:
# Run the consensus ranking methods for every clustering metric.
# The recorded output below shows temp/ranking and temp/consensus being created under w_dir.
apply_consensus_methods(w_dir, clustering_metrics)

The folder: /home/mario/holiday/igf1r/temp/ranking was created
The folder: /home/mario/holiday/igf1r/temp/consensus was created


In [6]:
# Compute enrichment factors for this run from the docking library in docking_library.
# Presumably writes 'enrichement_factors.csv' under temp/consensus, which the
# aggregation cell below searches for — TODO confirm in scripts.performance_calculation.
calculate_EFs(w_dir, docking_library)

In [None]:
# Collect the EF1% column of every 'enrichement_factors.csv' found below
# results_root into a single table (one EF1% column per source directory,
# one row per method), display it and save it as results.csv.
#
# The walk root, the prefix stripped from the column labels and the output
# location all derive from one variable. Previously the walk used
# /home/mario/... while the label clean-up and the CSV export used
# /home/tony/..., so the prefix was never stripped and the export targeted a
# different user's home directory.
results_root = '/home/mario/CADD22/'

dfs = []
for root, _dirs, files in os.walk(results_root):
    # NOTE(review): 'enrichement' is spelled this way on purpose here; it
    # presumably matches the file name written upstream — confirm before renaming.
    if 'enrichement_factors.csv' not in files:
        continue
    print(root)
    # Column label = directory path relative to results_root with the
    # boilerplate path fragments removed.
    label = (
        root.replace(results_root, '')
            .replace('wocondock_performance_', '')
            .replace('/temp/consensus', '')
    )
    df = pd.read_csv(os.path.join(root, 'enrichement_factors.csv'))
    df = (
        df.rename(columns={df.columns[0]: 'Method'})
          .rename(columns={'EF1%': label})
    )
    dfs.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that names the directory that was searched instead.
if not dfs:
    raise FileNotFoundError(
        f"No 'enrichement_factors.csv' files were found under {results_root}"
    )

merged_df = pd.concat(dfs, axis=0, ignore_index=True)
# Rows coming from different files carry NaN in each other's label columns;
# summing per method (NaN is skipped) collapses them into one row per method.
# NOTE(review): any column shared by all files is summed across directories too.
merged_df = merged_df.groupby('Method', as_index=False).sum()
display(merged_df)
merged_df.to_csv(os.path.join(results_root, 'results.csv'), index=False)