**Import required libraries and scripts**

In [1]:
#Import required libraries and scripts
from scripts.library_preparation import *
from scripts.utilities import *
from scripts.docking.docking import *
from scripts.clustering_functions import *
from scripts.rescoring_functions import *
from scripts.consensus_methods import *
from scripts.performance_calculation import *
from scripts.pocket_finding.main import pocket_finder
from scripts.postprocessing import *
from scripts.protein_preparation import *

**Set up**
- **software**: The path to the software folder. In most cases this is where the DockM8 repository was downloaded to.
- **receptor**: The path to the protein file (.pdb).
- **prepare_protein**: Whether or not the protein file should be prepared using Protoss (True or False).
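- **pocket**: The method to use for pocket determination. Must be one of 'reference', 'RoG' or 'dogsitescorer' (the example configuration below passes 'Reference', so check the capitalization expected by `pocket_finder`).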
- **ref_file**: The path to the reference ligand used to define the binding pocket (.sdf file).
- **docking_library**: The path to the docking library file (.sdf).
- **id_column**: The unique identifier column used in the docking library.
- **conformers**: The method to use for conformer generation, must be one of 'GypsumDL', 'MMFF' or 'RDKit' (RDKit and MMFF are equivalent). 
- **protonation**: The method to use for compound protonation. Must be one of 'GypsumDL', 'None'.
- **docking_programs**: The method(s) to use for docking. Must be one or more of 'GNINA', 'SMINA', 'QVINA2', 'QVINAW' or 'PLANTS'.
- **n_poses**: The number of poses to generate for each docking software. Default=10
- **exhaustiveness**: The precision used if docking with SMINA/GNINA. Default=8
- **pose_selection**: The method(s) to use for pose clustering. Must be one or more of 'RMSD', 'spyRMSD', 'espsim', 'USRCAT', '3DScore', 'bestpose', 'bestpose_GNINA', 'bestpose_SMINA', 'bestpose_QVINA2', 'bestpose_QVINAW' or 'bestpose_PLANTS'. You can also specify any of the scoring functions to select the poses.
- **clustering_method**: Which algorithm to use for clustering. Must be one of 'KMedoids', 'Aff_prop'. Only valid for the descriptor based pose_selection methods (RMSD, spyRMSD, espsim, USRCAT)
- **rescoring_functions**: A list of scoring functions to use for rescoring. Must be one or more of 'GNINA-Affinity','CNN-Score','CNN-Affinity', 'AD4', 'CHEMPLP', 'RFScoreVS', 'LinF9', 'SCORCH', 'Vinardo', 'PLECScore', 'NNScore', 'KORP-PL', 'ConvexPLR', 'RTMScore', 'AAScore'.
- **consensus**: Which consensus method to use. Must be one of: 'ECR_best', 'ECR_avg', 'avg_ECR', 'RbR', 'RbV', 'Zscore_best', 'Zscore_avg'.
We recommend using the command line or GUI versions of DockM8 to generate decoys.

In [3]:
# --- Input paths, resolved relative to the directory the notebook is run from ---
CWD = os.getcwd()
software = Path(CWD+'/software')
receptor = Path(CWD+'/dockm8_testing/1fvv_p.pdb')
prepare_protein = True
pocket = 'Reference'
ref_file = Path(CWD+'/dockm8_testing/1fvv_l.sdf')

# --- Compound library and its preparation ---
docking_library = Path(CWD+'/dockm8_testing/library.sdf')
id_column = 'ID'
conformers = 'GypsumDL'
protonation = 'GypsumDL'

# --- Docking ---
docking_programs = ['GNINA', 'PLANTS', 'SMINA', 'QVINA2', 'QVINAW']
n_poses = 10
exhaustiveness = 8

# --- Pose selection, rescoring and consensus ---
pose_selection = ['bestpose_GNINA']
clustering_method = 'KMedoids'
rescoring_functions = ['GNINA-Affinity','CNN-Score','CNN-Affinity', 'AD4']
consensus = 'ECR_avg'

# --- Resources ---
# Use roughly 90% of the available cores, but never fewer than one:
# os.cpu_count() may return None, and int(1 * 0.9) would otherwise give 0
# on a single-core machine.
ncpus = max(1, int((os.cpu_count() or 1) * 0.9))

# Create log.txt in the current working directory, or empty it if it already exists.
open('log.txt', 'w').close()

**Pocket Extraction**  

This cell will extract the pocket based on the method specified in the 'pocket' variable. Using 'reference' or 'RoG' will use the reference ligand to define the pocket. Using 'dogsitescorer' will query the dogsitescorer server and use the pocket with the largest volume.

In [4]:
# Run the receptor through Protoss when requested; otherwise use the input file unchanged.
prepared_receptor = prepare_protein_protoss(receptor) if prepare_protein else receptor

# The working directory sits next to the (prepared) receptor and is named after its file stem.
# Later cells read and write their files under w_dir.
w_dir = prepared_receptor.parent / prepared_receptor.stem
print('The working directory has been set to:', w_dir)
w_dir.mkdir(exist_ok=True)

# NOTE(review): the trailing 10 is presumably the pocket radius — confirm against pocket_finder.
pocket_definition = pocket_finder(
    pocket,
    w_dir,
    prepared_receptor,
    ref_file,
    10,
)

[2024-May-02 14:18:44]: Preparing protein with ProtoSS ...

Job d4f19fdc-b852-4e13-b607-e7f632322cad completed with success




The working directory has been set to: /home/tony/DockM8/dockm8_testing/1fvv_p_protoss
[2024-May-02 14:18:56]: Extracting pocket from 1fvv_p_protoss using 1fvv_l as reference ligand

[2024-May-02 14:19:32]: Finished extracting pocket from 1fvv_p_protoss using 1fvv_l as reference ligand



**Library preparation**  
This cell will prepare the compounds library (conformer generation and protonation).

In [4]:
# Only prepare the library when final_library.sdf is not already present in the working directory.
if not os.path.isfile(w_dir / 'final_library.sdf'):
    prepare_library(
        docking_library,
        w_dir,
        id_column,
        conformers,
        protonation,
        software,
        ncpus,
    )

**Docking**

This cell will dock all compounds in the receptor.
The docking algorithms specified in the 'docking_programs' variable will be used.
All the poses will then be concatenated to the allposes.sdf file.

In [5]:
# Dock the prepared library with every program listed in docking_programs.
# NOTE(review): "concurrent_process" presumably selects the parallelisation mode — confirm against docking().
docking(
    w_dir, prepared_receptor, pocket_definition, software,
    docking_programs, exhaustiveness, n_poses, ncpus,
    "concurrent_process",
)

# Combine the poses from all docking programs; pose busting is switched off here.
concat_all_poses(
    w_dir,
    docking_programs,
    prepared_receptor,
    ncpus,
    bust_poses=False,
)

: 

All poses are then loaded into memory for clustering.

In [None]:
# Read every docked pose from allposes.sdf into a DataFrame and report how long it took.
print('Loading all poses SDF file...')
tic = time.perf_counter()
all_poses = PandasTools.LoadSDF(
    str(w_dir / 'allposes.sdf'),
    idName='Pose ID',          # column that receives each record's identifier
    molColName='Molecule',     # column that receives the molecule objects
    includeFingerprints=False,
    strictParsing=True,
)
print(f'Loaded {len(all_poses)} poses.')
# The timer is stopped after the count is printed, so the reported time includes that print.
toc = time.perf_counter()
print(f'Finished loading all poses SDF in {toc-tic:0.4f}!...')

**Clustering**

This cell will perform the clustering according to the values of the 'pose_selection' and the 'clustering_method' variables if a descriptor-based method is selected. If it detects that the clustering file for that metric has already been generated, it will skip it.

In [None]:
for method in pose_selection:
    # Skip any selection method whose clustered output file already exists.
    if os.path.isfile(w_dir / f'clustering/{method}_clustered.sdf'):
        continue
    select_poses(
        method,
        clustering_method,
        w_dir,
        prepared_receptor,
        pocket_definition,
        software,
        all_poses,
        ncpus,
    )

**Rescoring**

This cell will rescore all the clustered .sdf files according to the specified scoring functions.

In [None]:
for method in pose_selection:
    # Rescore the clustered poses written for this selection method.
    clustered_sdf_path = str(w_dir / f'clustering/{method}_clustered.sdf')
    rescore_poses(
        w_dir,
        prepared_receptor,
        pocket_definition,
        software,
        clustered_sdf_path,
        rescoring_functions,
        ncpus,
    )

**Final ranking methods**

This cell applies the selected consensus methods and writes the results to the 'consensus' folder.

In [None]:
# Apply the configured consensus method once per pose-selection method.
# NOTE(review): 'min_max' presumably names the score normalisation — confirm against apply_consensus_methods().
for method in pose_selection:
    apply_consensus_methods(
        w_dir,
        method,
        consensus,
        rescoring_functions,
        'min_max',
    )