**Import required libraries and scripts**

In [7]:
#Import required libraries and scripts
from scripts.library_preparation import *
from scripts.utilities import *
from scripts.docking_functions import *
from scripts.clustering_functions import *
from scripts.rescoring_functions import *
from scripts.ranking_functions import *
from scripts.performance_calculation import *
from scripts.dogsitescorer import *
from scripts.get_pocket import *
import numpy as np
import os

# --- Run configuration ---------------------------------------------------
# NOTE(review): these are absolute, machine-specific paths; edit them before
# running the notebook on another machine.
# Passed as the `software` argument to the library-preparation, docking and rescoring calls.
software = '/home/mario/DockM8/software'
# Receptor structure; its parent directory becomes the working directory.
protein_file = '/home/mario/holiday/gria2/receptor_protoss_prepared.pdb'
# Reference (crystal) ligand handed to the pocket, docking and rescoring helpers.
ref_file = '/home/mario/holiday/gria2/crystal_ligand_protoss.sdf'
# Compound library handed to prepare_library() and calculate_EFs().
docking_library = '/home/mario/holiday/gria2/merged_actives_decoys.sdf'
# Program names handed to the docking helpers.
docking_programs = ['GNINA', 'SMINA', 'PLANTS']
# Metrics iterated by the clustering and rescoring loops and passed to the
# consensus step.
clustering_metrics = ['RMSD', 'spyRMSD', 'espsim', '3DScore', 'bestpose', 'bestpose_GNINA', 'bestpose_SMINA', 'bestpose_PLANTS']
# Scoring-function identifiers handed to rescore_all().
rescoring_functions = ['gnina', 'AD4', 'chemplp', 'rfscorevs', 'LinF9']
# Identifier column name handed to prepare_library().
id_column = 'ID'
n_poses = 10
exhaustiveness = 4
# Use half of the available CPUs, but never fewer than one worker:
# os.cpu_count() may return None when the count cannot be determined, and
# halving a single-core count would otherwise give 0.
ncpus = max(1, (os.cpu_count() or 1) // 2)
#Initialise variables and create a temporary folder
w_dir = os.path.dirname(protein_file)
print('The working directory has been set to:', w_dir)
create_temp_folder(w_dir+'/temp')

The working directory has been set to: /home/mario/holiday/gria2
The folder: /home/mario/holiday/gria2/temp already exists


In [None]:
# Ask the DoGSiteScorer helper for a binding-site definition of the receptor,
# using the 'volume' selection method.
# NOTE(review): presumably this picks the pocket by volume - confirm against
# scripts/dogsitescorer. `pocket_definition` is not consumed by any cell
# visible in this notebook.
pocket_definition = binding_site_coordinates_dogsitescorer(protein_file, w_dir, method='volume')

In [None]:
# Derive a pocket from the reference ligand and the receptor structure.
# NOTE(review): the literal 8 is unexplained here - presumably a distance
# cutoff around the ligand; confirm against scripts/get_pocket. `pocket` is
# not consumed by any cell visible in this notebook.
pocket = GetPocket(ref_file, protein_file, 8)

In [None]:
# Prepare the docking library with the 'pkasolver' option, using `ncpus` workers.
# NOTE(review): 'pkasolver' presumably selects the protonation backend - confirm
# against scripts/library_preparation. The returned value is not used by any
# later cell visible in this notebook.
cleaned_pkasolver_df = prepare_library(docking_library, id_column, software, 'pkasolver', ncpus)

In [None]:
# Run docking with every program in `docking_programs`.
# NOTE(review): unlike docking_splitted(), this call is not given `w_dir` -
# verify this matches the current docking() signature.
# `all_poses` is rebound later when the poses are reloaded from temp/allposes.sdf.
all_poses = docking(protein_file, ref_file, software, docking_programs, exhaustiveness, n_poses)

In [None]:
# Variant of the docking step that also receives the working directory.
# NOTE(review): presumably docks a library that was split into several files -
# confirm against scripts/docking_functions. Any return value is discarded.
docking_splitted(w_dir, protein_file, ref_file, software, docking_programs, exhaustiveness, n_poses)

In [None]:
# Collect poses from the split-library folder under <w_dir>/temp.
# NOTE(review): presumably this produces the temp/allposes.sdf file loaded in
# the next cell - confirm against scripts/docking_functions.
fetch_poses_splitted(w_dir, n_poses, split_files_folder=w_dir+'/temp/split_final_library')

In [8]:
# Reload the pose table from the SDF in the temp folder and report the wall time.
# `time` and `PandasTools` are not imported explicitly in this notebook; they
# are expected to arrive through the star imports in the first cell.
print('Loading all poses SDF file...')
tic = time.perf_counter()
all_poses = PandasTools.LoadSDF(
    f'{w_dir}/temp/allposes.sdf',
    idName='Pose ID',
    molColName='Molecule',
    includeFingerprints=False,
    strictParsing=True,
)
toc = time.perf_counter()
elapsed = toc - tic
print(f'Finished loading all poses SDF in {elapsed:0.4f}!...')

Loading all poses SDF file...
Finished loading all poses SDF in 63.9150!...


In [9]:
# Cluster the loaded poses once per configured metric, always with 'KMedoids'.
for clustering_metric in clustering_metrics:
    cluster_futures(clustering_metric, 'KMedoids', w_dir, protein_file, all_poses, ncpus)

The folder: /home/mario/holiday/gria2/temp/clustering/ already exists

[2023-Mar-12 16:35:57]: *Calculating RMSD metrics and clustering*

[2023-Mar-12 16:35:57]: Submitting parallel jobs...


Submitting parallel jobs...: 100%|██████████| 11838/11838 [05:33<00:00, 35.51IDs/s]



[2023-Mar-12 16:41:30]: Finished submitting jobs in 333.3502, now running jobs...


Running clustering jobs...: 100%|██████████| 11838/11838 [24:03<00:00,  8.20jobs/s]  


The folder: /home/mario/holiday/gria2/temp/clustering/ already exists

[2023-Mar-12 17:05:38]: *Calculating spyRMSD metrics and clustering*

[2023-Mar-12 17:05:38]: Submitting parallel jobs...


Submitting parallel jobs...: 100%|██████████| 11838/11838 [04:48<00:00, 41.09IDs/s]



[2023-Mar-12 17:10:27]: Finished submitting jobs in 288.1305, now running jobs...


Running clustering jobs...: 100%|██████████| 11838/11838 [29:18<00:00,  6.73jobs/s] 


The folder: /home/mario/holiday/gria2/temp/clustering/ already exists

[2023-Mar-12 17:39:52]: *Calculating espsim metrics and clustering*

[2023-Mar-12 17:39:52]: Submitting parallel jobs...


Submitting parallel jobs...: 100%|██████████| 11838/11838 [04:41<00:00, 42.05IDs/s]



[2023-Mar-12 17:44:34]: Finished submitting jobs in 281.5084, now running jobs...


Running clustering jobs...: 100%|██████████| 11838/11838 [01:38<00:00, 120.63jobs/s] 


The folder: /home/mario/holiday/gria2/temp/clustering/ already exists

[2023-Mar-12 17:46:18]: *Calculating 3DScore metrics and clustering*

[2023-Mar-12 17:46:18]: Submitting parallel jobs...


Submitting parallel jobs...: 100%|██████████| 11838/11838 [04:43<00:00, 41.81IDs/s]



[2023-Mar-12 17:51:01]: Finished submitting jobs in 283.1293, now running jobs...


Running clustering jobs...: 100%|██████████| 11838/11838 [29:05<00:00,  6.78jobs/s] 


The folder: /home/mario/holiday/gria2/temp/clustering/ already exists

[2023-Mar-12 18:20:10]: *Calculating bestpose metrics and clustering*
The folder: /home/mario/holiday/gria2/temp/clustering/ already exists

[2023-Mar-12 18:20:16]: *Calculating bestpose_GNINA metrics and clustering*
The folder: /home/mario/holiday/gria2/temp/clustering/ already exists

[2023-Mar-12 18:20:18]: *Calculating bestpose_SMINA metrics and clustering*
The folder: /home/mario/holiday/gria2/temp/clustering/ already exists

[2023-Mar-12 18:20:20]: *Calculating bestpose_PLANTS metrics and clustering*


**Rescoring**

The file containing all the cluster centers is then rescored using all scoring functions available (GNINA, Vina, AutoDock4, PLP, CHEMPLP, RF-Score-VS). The rescored output is returned as a dataframe.

In [10]:
# Rescore the clustered-pose SDF written for each clustering metric.
# NOTE(review): the meaning of the literal 1 passed to rescore_all() is not
# visible here - confirm against scripts/rescoring_functions.
for metric in clustering_metrics:
    clustered_sdf = f'{w_dir}/temp/clustering/{metric}_clustered.sdf'
    rescore_all(w_dir, protein_file, ref_file, software, clustered_sdf, rescoring_functions, 1, ncpus)

The folder: /home/mario/holiday/gria2/temp/rescoring_RMSD_clustered was created
Splitting SDF file RMSD_clustered.sdf ...


Splitting files: 100%|██████████| 25/25 [00:03<00:00,  6.27it/s]


Split docking library into 25 files each containing 1352 compounds

[2023-Mar-12 18:20:33]: Rescoring with GNINA


Submitting GNINA rescoring jobs: 100%|██████████| 25/25 [00:00<00:00, 35.63file/s]
Rescoring with GNINA: 100%|██████████| 25/25 [07:10<00:00, 17.21s/file]



[2023-Mar-12 18:27:47]: Rescoring with GNINA complete in 445.0534!

[2023-Mar-12 18:27:47]: Rescoring with AD4

[2023-Mar-12 18:28:28]: Rescoring with AD4 complete in 41.1607!

[2023-Mar-12 18:28:28]: Rescoring with CHEMPLP


32458 molecules converted



[2023-Mar-12 18:30:24]: Rescoring with CHEMPLP complete in 115.1741!

[2023-Mar-12 18:30:24]: Rescoring with RFScoreVS

[2023-Mar-12 18:33:45]: Rescoring with RF-Score-VS complete in 201.0181!
Splitting SDF file RMSD_clustered.sdf ...


Splitting files: 100%|██████████| 25/25 [00:04<00:00,  5.52it/s]


Split docking library into 25 files each containing 1352 compounds


Submitting LinF9 rescoring jobs: 100%|██████████| 25/25 [00:01<00:00, 23.88file/s]
Rescoring with LinF9: 100%|██████████| 25/25 [00:08<00:00,  2.96file/s]



[2023-Mar-12 18:34:10]: Rescoring with LinF9 complete in 25.5444!

[2023-Mar-12 18:34:10]: Combining all score for /home/mario/holiday/gria2/temp/rescoring_RMSD_clustered


Combining scores: 100%|██████████| 4/4 [00:00<00:00, 160.39files/s]



[2023-Mar-12 18:34:10]: Rescoring complete in 828.3340!
The folder: /home/mario/holiday/gria2/temp/rescoring_spyRMSD_clustered was created
Splitting SDF file spyRMSD_clustered.sdf ...


Splitting files: 100%|██████████| 24/24 [00:04<00:00,  5.09it/s]


Split docking library into 24 files each containing 1362 compounds

[2023-Mar-12 18:34:22]: Rescoring with GNINA


Submitting GNINA rescoring jobs: 100%|██████████| 24/24 [00:01<00:00, 23.55file/s]
Rescoring with GNINA: 100%|██████████| 24/24 [07:15<00:00, 18.13s/file]  



[2023-Mar-12 18:41:43]: Rescoring with GNINA complete in 452.4569!

[2023-Mar-12 18:41:43]: Rescoring with AD4

[2023-Mar-12 18:42:24]: Rescoring with AD4 complete in 40.6330!

[2023-Mar-12 18:42:24]: Rescoring with CHEMPLP


32686 molecules converted



[2023-Mar-12 18:43:53]: Rescoring with CHEMPLP complete in 89.5879!

[2023-Mar-12 18:43:53]: Rescoring with RFScoreVS

[2023-Mar-12 18:47:14]: Rescoring with RF-Score-VS complete in 200.4909!
Splitting SDF file spyRMSD_clustered.sdf ...


Splitting files: 100%|██████████| 24/24 [00:05<00:00,  4.69it/s]


Split docking library into 24 files each containing 1362 compounds


Submitting LinF9 rescoring jobs: 100%|██████████| 24/24 [00:00<00:00, 30.42file/s]
Rescoring with LinF9: 100%|██████████| 24/24 [00:08<00:00,  2.86file/s]



[2023-Mar-12 18:47:42]: Rescoring with LinF9 complete in 28.7621!

[2023-Mar-12 18:47:42]: Combining all score for /home/mario/holiday/gria2/temp/rescoring_spyRMSD_clustered


Combining scores: 100%|██████████| 4/4 [00:00<00:00, 86.00files/s]



[2023-Mar-12 18:47:43]: Rescoring complete in 812.4387!
The folder: /home/mario/holiday/gria2/temp/rescoring_espsim_clustered was created
Splitting SDF file espsim_clustered.sdf ...


Splitting files: 100%|██████████| 24/24 [00:03<00:00,  7.76it/s]


Split docking library into 24 files each containing 1082 compounds

[2023-Mar-12 18:47:53]: Rescoring with GNINA


Submitting GNINA rescoring jobs: 100%|██████████| 24/24 [00:00<00:00, 24.08file/s]
Rescoring with GNINA: 100%|██████████| 24/24 [05:58<00:00, 14.93s/file]  



[2023-Mar-12 18:53:55]: Rescoring with GNINA complete in 372.5606!

[2023-Mar-12 18:53:55]: Rescoring with AD4

[2023-Mar-12 18:54:31]: Rescoring with AD4 complete in 35.6774!

[2023-Mar-12 18:54:31]: Rescoring with CHEMPLP


25957 molecules converted



[2023-Mar-12 18:56:13]: Rescoring with CHEMPLP complete in 101.8768!

[2023-Mar-12 18:56:13]: Rescoring with RFScoreVS


**Final ranking methods**

This code calculates the final ranking of compounds using various methods.
*Method 1* : Calculates ECR value for each cluster center, then outputs the top ranked center.
*Method 2* : Calculates ECR value for each cluster center, then outputs the average ECR value for each compound.
*Method 3* : Calculates the average rank of each compound, then outputs the corresponding ECR value for each compound.
*Method 6* : Calculates Z-score for each cluster center, then outputs the top ranked center.
*Method 7* : Calculates Z-score for each cluster center, then outputs the average Z-score for each compound.

All methods are then combined into a single dataframe for comparison purposes.

In [None]:
# Apply the consensus ranking methods for every clustering metric.
# NOTE(review): presumably reads the rescoring results under <w_dir>/temp -
# confirm against scripts/ranking_functions. Any return value is discarded.
apply_consensus_methods(w_dir, clustering_metrics)

In [None]:
# Evaluate performance against the original actives/decoys library.
# NOTE(review): "EFs" presumably stands for enrichment factors - confirm
# against scripts/performance_calculation. Any return value is discarded.
calculate_EFs(w_dir, docking_library)

In [None]:
# DISABLED CELL - kept for reference only; nothing below is executed.
# If re-enabled, it walks /home/tony/CADD22/, reads every 'enrichement_factors.csv'
# it finds, renames the first column to 'Method' and the 'EF1%' column to a label
# derived from the containing folder, concatenates the frames, sums them per
# 'Method', displays the result and writes it to /home/tony/CADD22/results.csv.
# NOTE(review): the paths are hard-coded to another user's home directory, and
# `pd` / `display` are not imported explicitly in this notebook.
# dfs = []
# for root, dirs, files in os.walk('/home/tony/CADD22/'):
#     for file in files:
#         if file == 'enrichement_factors.csv':
#             file_path = os.path.join(root, file)
#             print(root)
#             df = pd.read_csv(file_path)
#             df = df.rename(columns={df.columns[0]: 'Method'})
#             df = df.rename(columns={'EF1%': root.replace('/home/tony/CADD22/', '').replace('wocondock_performance_', '').replace('/temp/consensus', '')})
#             dfs.append(df)
# merged_df = pd.concat(dfs, axis=0, ignore_index=True)
# merged_df = merged_df.groupby('Method', as_index=False).sum()
# display(merged_df)
# merged_df.to_csv('/home/tony/CADD22/results.csv', index=None)