**Import required libraries and scripts**

In [20]:
#Import required libraries and scripts
from scripts.library_preparation import *
from scripts.utilities import *
from scripts.docking_functions import *
from scripts.clustering_functions import *
from scripts.rescoring_functions import *
from scripts.ranking_functions import *
from scripts.performance_calculation import *
from scripts.dogsitescorer import *
from scripts.get_pocket import *
import numpy as np
import os

software = '/home/tony/CADD22/software'
protein_file = '/home/tony/CADD22/wocondock_performance_aldr/receptor_protoss_prepared.pdb'
ref_file = '/home/tony/CADD22/wocondock_performance_aldr/crystal_ligand_protoss.sdf'
docking_library = '/home/tony/CADD22/wocondock_performance_aldr/merged_actives_decoys.sdf'
docking_programs = ['GNINA', 'SMINA', 'PLANTS']
clustering_metrics = ['RMSD', 'spyRMSD', 'espsim', '3DScore', 'bestpose', 'bestpose_GNINA', 'bestpose_SMINA', 'bestpose_PLANTS']
rescoring_functions = ['gnina', 'AD4', 'chemplp', 'rfscorevs', 'LinF9']
id_column = 'ID'
n_poses = 10
exhaustiveness = 4
ncpus = int(os.cpu_count()/2)
#Initialise variables and create a temporary folder
w_dir = os.path.dirname(protein_file)
print('The working directory has been set to:', w_dir)
create_temp_folder(w_dir+'/temp')

The working directory has been set to: /home/tony/CADD22/wocondock_performance_aldr
The folder: /home/tony/CADD22/wocondock_performance_aldr/temp already exists


In [None]:
pocket_definition = binding_site_coordinates_dogsitescorer(protein_file, w_dir, method='volume')

In [None]:
pocket = GetPocket(ref_file, protein_file, 8)

In [None]:
cleaned_pkasolver_df = prepare_library(docking_library, id_column, software, 'pkasolver')

In [None]:
all_poses = docking(protein_file, ref_file, software, docking_programs, exhaustiveness, n_poses)

In [None]:
docking_splitted(w_dir, protein_file, ref_file, software, docking_programs, exhaustiveness, n_poses)

In [None]:
fetch_poses_splitted(w_dir, n_poses, split_files_folder=w_dir+'/temp/split_final_library')

In [None]:
print('Loading all poses SDF file...')
tic = time.perf_counter()
all_poses = PandasTools.LoadSDF(w_dir+'/temp/allposes.sdf', idName='Pose ID', molColName='Molecule', includeFingerprints=False, strictParsing=True)
toc = time.perf_counter()
print(f'Finished loading all poses SDF in {toc-tic:0.4f}!...')

In [None]:
for metric in clustering_metrics:
    cluster_futures(f'{metric}', 'KMedoids', w_dir, protein_file, all_poses, ncpus)

**Rescoring**

The file containing all the cluster centers is then rescored using all scoring functions available (GNINA, Vina, AutoDock4, PLP, CHEMPLP, RF-Score-VS). The rescored output is return as a dataframe.

In [21]:
for metric in clustering_metrics:
    rescore_all(w_dir, protein_file, ref_file, software, w_dir+f'/temp/clustering/{metric}_clustered.sdf', rescoring_functions, 1, ncpus)

The folder: /home/tony/CADD22/wocondock_performance_aldr/temp/rescoring_RMSD_clustered already exists
/gnina_rescoring folder already exists, skipping gnina rescoring
/AD4_rescoring folder already exists, skipping AD4 rescoring
/chemplp_rescoring folder already exists, skipping chemplp rescoring
/rfscorevs_rescoring folder already exists, skipping rfscorevs rescoring
/LinF9_rescoring folder already exists, skipping LinF9 rescoring
Combining all score for /home/tony/CADD22/wocondock_performance_aldr/temp/rescoring_RMSD_clustered


Combining scores: 100%|██████████| 4/4 [00:00<00:00, 51.08files/s]


Rescoring complete in 0.7230!
The folder: /home/tony/CADD22/wocondock_performance_aldr/temp/rescoring_spyRMSD_clustered already exists
/gnina_rescoring folder already exists, skipping gnina rescoring
/AD4_rescoring folder already exists, skipping AD4 rescoring
/chemplp_rescoring folder already exists, skipping chemplp rescoring
/rfscorevs_rescoring folder already exists, skipping rfscorevs rescoring
/LinF9_rescoring folder already exists, skipping LinF9 rescoring
Combining all score for /home/tony/CADD22/wocondock_performance_aldr/temp/rescoring_spyRMSD_clustered


Combining scores: 100%|██████████| 4/4 [00:00<00:00, 40.59files/s]


Rescoring complete in 0.7531!
The folder: /home/tony/CADD22/wocondock_performance_aldr/temp/rescoring_espsim_clustered already exists
/gnina_rescoring folder already exists, skipping gnina rescoring
/AD4_rescoring folder already exists, skipping AD4 rescoring
/chemplp_rescoring folder already exists, skipping chemplp rescoring
/rfscorevs_rescoring folder already exists, skipping rfscorevs rescoring
/LinF9_rescoring folder already exists, skipping LinF9 rescoring
Combining all score for /home/tony/CADD22/wocondock_performance_aldr/temp/rescoring_espsim_clustered


Combining scores: 100%|██████████| 4/4 [00:00<00:00, 73.06files/s]


Rescoring complete in 0.5849!
The folder: /home/tony/CADD22/wocondock_performance_aldr/temp/rescoring_3DScore_clustered already exists
/gnina_rescoring folder already exists, skipping gnina rescoring
/AD4_rescoring folder already exists, skipping AD4 rescoring
/chemplp_rescoring folder already exists, skipping chemplp rescoring
/rfscorevs_rescoring folder already exists, skipping rfscorevs rescoring
/LinF9_rescoring folder already exists, skipping LinF9 rescoring
Combining all score for /home/tony/CADD22/wocondock_performance_aldr/temp/rescoring_3DScore_clustered


Combining scores: 100%|██████████| 4/4 [00:00<00:00, 181.37files/s]


Rescoring complete in 0.2522!
The folder: /home/tony/CADD22/wocondock_performance_aldr/temp/rescoring_bestpose_clustered already exists
/gnina_rescoring folder already exists, skipping gnina rescoring
/AD4_rescoring folder already exists, skipping AD4 rescoring
/chemplp_rescoring folder already exists, skipping chemplp rescoring
/rfscorevs_rescoring folder already exists, skipping rfscorevs rescoring
/LinF9_rescoring folder already exists, skipping LinF9 rescoring
Combining all score for /home/tony/CADD22/wocondock_performance_aldr/temp/rescoring_bestpose_clustered


Combining scores: 100%|██████████| 4/4 [00:00<00:00, 47.08files/s]


Rescoring complete in 0.7408!
The folder: /home/tony/CADD22/wocondock_performance_aldr/temp/rescoring_bestpose_GNINA_clustered already exists
/gnina_rescoring folder already exists, skipping gnina rescoring
/AD4_rescoring folder already exists, skipping AD4 rescoring
/chemplp_rescoring folder already exists, skipping chemplp rescoring
/rfscorevs_rescoring folder already exists, skipping rfscorevs rescoring
/LinF9_rescoring folder already exists, skipping LinF9 rescoring
Combining all score for /home/tony/CADD22/wocondock_performance_aldr/temp/rescoring_bestpose_GNINA_clustered


Combining scores: 100%|██████████| 4/4 [00:00<00:00, 165.62files/s]


Rescoring complete in 0.2785!
The folder: /home/tony/CADD22/wocondock_performance_aldr/temp/rescoring_bestpose_SMINA_clustered already exists
/gnina_rescoring folder already exists, skipping gnina rescoring
/AD4_rescoring folder already exists, skipping AD4 rescoring
/chemplp_rescoring folder already exists, skipping chemplp rescoring
/rfscorevs_rescoring folder already exists, skipping rfscorevs rescoring
/LinF9_rescoring folder already exists, skipping LinF9 rescoring
Combining all score for /home/tony/CADD22/wocondock_performance_aldr/temp/rescoring_bestpose_SMINA_clustered


Combining scores: 100%|██████████| 4/4 [00:00<00:00, 108.31files/s]


Rescoring complete in 0.3494!
The folder: /home/tony/CADD22/wocondock_performance_aldr/temp/rescoring_bestpose_PLANTS_clustered already exists
/gnina_rescoring folder already exists, skipping gnina rescoring
/AD4_rescoring folder already exists, skipping AD4 rescoring
/chemplp_rescoring folder already exists, skipping chemplp rescoring
/rfscorevs_rescoring folder already exists, skipping rfscorevs rescoring
/LinF9_rescoring folder already exists, skipping LinF9 rescoring
Combining all score for /home/tony/CADD22/wocondock_performance_aldr/temp/rescoring_bestpose_PLANTS_clustered


Combining scores: 100%|██████████| 4/4 [00:00<00:00, 156.49files/s]


Rescoring complete in 0.3214!


**Final ranking methods**

This code calculates the final ranking of compounds using various methods.
*Method 1* : Calculates ECR value for each cluster center, then outputs the top ranked center.
*Method 2* : Calculates ECR value for each cluster center, then outputs the average ECR value for each compound.
*Method 3* : Calculates the average rank of each compound, then ouputs the corresponding ECR value for each compound.
*Method 6* : Calculates Z-score for each cluster center, then ouputs the top ranked center.
*Method 7* : Calculates Z-score for each cluster center, then ouputs the average Z-score for each compound.

All methods are then combined into a single dataframe for comparison purposes.

In [22]:
apply_consensus_methods(w_dir, clustering_metrics)

The folder: /home/tony/CADD22/wocondock_performance_aldr/temp/ranking already exists
The folder: /home/tony/CADD22/wocondock_performance_aldr/temp/consensus already exists


In [23]:
calculate_EFs(w_dir, docking_library)

In [24]:
dfs = []
for root, dirs, files in os.walk('/home/tony/CADD22/'):
    for file in files:
        if file == 'enrichement_factors.csv':
            file_path = os.path.join(root, file)
            print(root)
            df = pd.read_csv(file_path)
            df = df.rename(columns={df.columns[0]: 'Method'})
            df = df.rename(columns={'EF1%': root.replace('/home/tony/CADD22/', '').replace('wocondock_performance_', '').replace('/temp/consensus', '')})
            dfs.append(df)
merged_df = pd.concat(dfs, axis=0, ignore_index=True)
merged_df = merged_df.groupby('Method', as_index=False).sum()
display(merged_df)
merged_df.to_csv('/home/tony/CADD22/results.csv', index=None)

/home/tony/CADD22/wocondock_performance_cdk2/temp/consensus
/home/tony/CADD22/wocondock_performance_ace/temp/consensus
/home/tony/CADD22/wocondock_performance_aldr/temp/consensus
/home/tony/CADD22/xiap/temp/consensus
/home/tony/CADD22/wocondock_performance_andr/temp/consensus


Unnamed: 0,Method,EF10%,cdk2,ace,aldr,xiap,andr
0,AAScore_S_3DScore,2.20,0.00,4.61,0.00,0.00,0.00
1,AAScore_S_RMSD,1.45,0.00,2.84,0.00,0.00,0.00
2,AAScore_S_bestpose,1.70,0.00,3.55,0.00,0.00,0.00
3,AAScore_S_bestpose_GNINA,1.91,0.00,2.83,0.00,0.00,0.00
4,AAScore_S_bestpose_PLANTS,1.35,0.00,3.92,0.00,0.00,0.00
...,...,...,...,...,...,...,...
115,RFScoreVS_S_bestpose_GNINA,22.65,12.08,13.09,17.68,1.01,15.70
116,RFScoreVS_S_bestpose_PLANTS,28.45,10.76,11.75,30.96,3.40,36.83
117,RFScoreVS_S_bestpose_SMINA,31.96,11.68,17.33,15.15,43.78,29.65
118,RFScoreVS_S_espsim,30.21,14.33,10.29,15.81,21.46,27.23
