**Import required libraries and scripts**

In [1]:
#Import required libraries and scripts
from scripts.library_preparation import *
from scripts.utilities import *
from scripts.docking_functions import *
from scripts.clustering_functions import *
from scripts.rescoring_functions import *
from scripts.ranking_functions import *
from scripts.performance_calculation import *
import numpy as np
import os

software = '/home/tony/CADD22/software'
protein_file = '/home/tony/CADD22/wocondock_main/2o1x_A_apo_protoss.pdb'
ref_file = '/home/tony/CADD22/wocondock_main/2o1x_A_lig_protoss.sdf'
docking_library = '/home/tony/CADD22/wocondock_main/500_of_FCHGroup_LeadLike.sdf'
docking_programs = ['SMINA','GNINA','PLANTS']
id_column = 'ID'
n_poses = 10
exhaustiveness = 4

#Initialise variables and create a temporary folder
w_dir = os.path.dirname(protein_file)
print('The working directory has been set to:', w_dir)
create_temp_folder(w_dir+'/temp')

[00:52:24] Initializing Normalizer


The working directory has been set to: /home/tony/CADD22/wocondock_main
The folder: /home/tony/CADD22/wocondock_main/temp already exists


In [None]:
cleaned_pkasolver_df = prepare_library(docking_library, id_column, software, 'pkasolver')

**Docking**

This function will dock all compounds in the receptor, using the reference ligand as a way to define the binding site. The docking results are written to the temporary folder. 

In [5]:
print(f'Splitting docking library into {str(multiprocessing.cpu_count()-2)} files')

Splitting docking library into 6 files


In [4]:
docking_splitted(w_dir, protein_file, ref_file, software, docking_programs, exhaustiveness, n_poses)

The folder: /home/tony/CADD22/wocondock_main/temp/split_files was created
The folder: /home/tony/CADD22/wocondock_main/temp/plants was created
Converting protein file to .mol2 format for PLANTS docking...
Converting reference file from .sdf to .mol2 format for PLANTS docking...
Determining binding site coordinates using PLANTS...
Docking with PLANTS complete in 73.2442!
Docking with SMINA complete in 52.2637!
Docking with GNINA complete in 87.3236!


In [16]:
def fetch_poses_splitted(w_dir, n_poses):
    '''
    This function is used to fetch the poses from different docking results (SMINA, GNINA, PLANTS) and create a new dataframe with the poses and their corresponding scores.
    It takes two input parameters:

    w_dir: the path of the working_directory
    n_poses: number of poses to be fetched
    The function uses the PandasTools library to load the SDF files and creates a new dataframe with the poses and scores. It also renames the columns and modifies the ID column to include the source of the pose (SMINA, GNINA, PLANTS). In case of an error, the function will print an error message.
    '''
    tic = time.perf_counter()
    #Fetch PLANTS poses
    plants_dataframes = []
    results_folders = [item for item in os.listdir(w_dir+'/temp/plants')]
    for item in results_folders:
        if item.startswith('results'):
            file_path = os.path.join(w_dir+'/temp/plants', item, 'docked_ligands.mol2')
            if os.path.isfile(file_path):
                try:
                    obabel_command = f'obabel -imol2 {file_path} -O {file_path.replace(".mol2",".sdf")}'
                    subprocess.call(obabel_command, shell=True, stdout=DEVNULL, stderr=STDOUT)
                    plants_poses = PandasTools.LoadSDF(file_path.replace('.mol2','.sdf'), idName='ID', molColName='Molecule',includeFingerprints=False, embedProps=False, removeHs=False, strictParsing=True)
                    plants_scores = pd.read_csv(file_path.replace('docked_ligands.mol2','ranking.csv')).rename(columns={'LIGAND_ENTRY':'ID', 'TOTAL_SCORE':'CHEMPLP'})[['ID', 'CHEMPLP']]
                    plants_df = pd.merge(plants_scores, plants_poses, on='ID')
                    for i, row in plants_df.iterrows():
                        split = row['ID'].split("_")
                        conformer_id = str(split[4])
                        plants_df.loc[i, ['Pose ID']] = split[0]+"_PLANTS_"+conformer_id
                        plants_df.loc[i, ['ID']] = split[0]
                    plants_dataframes.append(plants_df)
                except Exception as e:
                    print('ERROR: Failed to convert PLANTS docking results file to .sdf!')
                    print(e)
        elif item in ['protein.mol2', 'ref.mol2']:
            pass
        else:
            Path(os.path.join(w_dir+'/temp/plants', item)).unlink(missing_ok=True)
    plants_df = pd.concat(plants_dataframes)
    #Fetch SMINA poses
    try:
        smina_dataframes = [PandasTools.LoadSDF(w_dir+'/temp/smina/'+file, idName='ID', molColName='Molecule',includeFingerprints=False, embedProps=False, removeHs=False, strictParsing=True) for file in os.listdir(w_dir+'/temp/smina/') if file.endswith('.sdf')]
        smina_df = pd.concat(smina_dataframes)
        smina_df['number'] = [*range(1, int(n_poses)+1, 1)] * int(len(smina_df)/int(n_poses)) + [*range(1, len(smina_df)%int(n_poses)+1, 1)]
        smina_df['Pose ID'] = smina_df['ID']+'_SMINA_'+smina_df['number'].apply(str)
        smina_df.drop('number', axis=1, inplace=True)
        smina_df = smina_df.rename(columns={'minimizedAffinity':'SMINA_Affinity'})
    except Exception as e:
        print('ERROR: Failed to Load SMINA poses SDF file!')
        print(e)

    #Fetch GNINA poses
    gnina_dataframes = []
    for file in os.listdir(w_dir+'/temp/gnina/'):
        if file.endswith('.sdf'):
            try:
                gnina_df = PandasTools.LoadSDF(w_dir+'/temp/gnina/'+file, idName='ID', molColName='Molecule',includeFingerprints=False, embedProps=False, removeHs=False, strictParsing=True)
                list_ = [*range(1, int(n_poses)+1, 1)]
                ser = list_ * int(len(gnina_df)/len(list_))
                gnina_df['number'] = ser + list_[:len(gnina_df)-len(ser)]
                for i, row in gnina_df.iterrows():
                    gnina_df.loc[i, ['Pose ID']] = row['ID']+'_GNINA_'+str(row['number'])
                gnina_df.drop('number', axis=1, inplace=True)
                gnina_df = gnina_df.rename(columns={'minimizedAffinity':'GNINA_Affinity'})
                gnina_dataframes.append(gnina_df)
            except Exception as e:
                print('ERROR: Failed to Load GNINA poses SDF file!')
                print(e)
    gnina_df = pd.concat(gnina_dataframes)
    all_poses = pd.concat([plants_df, smina_df, gnina_df]) 
    PandasTools.WriteSDF(all_poses, w_dir+'/temp/allposes_splitted.sdf', molColName='Molecule', idName='Pose ID', properties=list(all_poses.columns))
    toc = time.perf_counter()
    print(f'Combined all docking poses in {toc-tic:0.4f}!')
    return all_poses

all_poses = fetch_poses_splitted(w_dir, n_poses)

display(all_poses)

['ref.mol2', 'results_split_0', 'config_split_1.config', 'results_split_6', 'split_0.mol2', 'config_split_3.config', 'split_6.mol2', 'results_split_1', 'results_split_4', 'split_7.mol2', 'config_split_5.config', 'config_split_4.config', 'results_split_7', 'results_split_2', 'results_split_8', 'results_split_3', 'results_split_9', 'config_split_7.config', 'split_5.mol2', 'split_1.mol2', 'config_split_2.config', 'config_split_0.config', 'protein.mol2', 'split_2.mol2', 'split_8.mol2', 'split_4.mol2', 'results_split_5']
results_split_0
Will delete:config_split_1.config
results_split_6
Will delete:split_0.mol2
Will delete:config_split_3.config
Will delete:split_6.mol2
results_split_1
results_split_4
Will delete:split_7.mol2
Will delete:config_split_5.config
Will delete:config_split_4.config
results_split_7
results_split_2
results_split_8
results_split_3
results_split_9
Will delete:config_split_7.config
Will delete:split_5.mol2
Will delete:split_1.mol2
Will delete:config_split_2.config
Will 

ValueError: No objects to concatenate

In [3]:
all_poses = docking(protein_file, ref_file, software, docking_programs, exhaustiveness, n_poses)

Docking with SMINA complete in 96.0203!
Docking with GNINA complete in 139.5086!
Converting protein file to .mol2 format for PLANTS docking...
Converting reference file from .sdf to .mol2 format for PLANTS docking...
Determining binding site coordinates using PLANTS...
Writing PLANTS config file...
Starting PLANTS docking...


1 molecule converted
10 molecules converted


Converting PLANTS poses to .sdf format...
Docking with PLANTS complete in 212.2481!
Combined all docking poses in 0.6044!
Finished docking in 448.3839!


**Clustering**

We will first load all the poses generated from the docking run. The cluster() function performs the calculation of the clustering metrics (for now simpleRMSD and electroshape similarity), then performs the clustering using the k-medoids clustering algorithm with the number of clusters optimised using silhouette score. Finally, all cluster centers are collected and written to a file in the temporary directory (/temp/clustering/) (one file per clustering metric).

In [None]:
cluster_dask('RMSD', w_dir, protein_file)

In [None]:
cluster('bestpose', w_dir, protein_file)

In [None]:
cluster('espsim', w_dir, protein_file)
cluster('spyRMSD', w_dir, protein_file)
cluster('USRCAT', w_dir, protein_file)
cluster('RMSD', w_dir, protein_file)
cluster('3DScore', w_dir, protein_file)

**Rescoring**

The file containing all the cluster centers is then rescored using all scoring functions available (GNINA, Vina, AutoDock4, PLP, CHEMPLP, RF-Score-VS). The rescored output is return as a dataframe.

In [None]:
RMSD_rescored = rescore_all(w_dir, protein_file, ref_file, software, w_dir+'/temp/clustering/RMSD_clustered.sdf')
espsim_rescored = rescore_all(w_dir, protein_file, ref_file, software, w_dir+'/temp/clustering/espsim_clustered.sdf')
spyRMSD_rescored = rescore_all(w_dir, protein_file, ref_file, software, w_dir+'/temp/clustering/spyRMSD_clustered.sdf')
USRCAT_rescored = rescore_all(w_dir, protein_file, ref_file, software, w_dir+'/temp/clustering/USRCAT_clustered.sdf')
DScore_rescored = rescore_all(w_dir, protein_file, ref_file, software, w_dir+'/temp/clustering/3DScore_clustered.sdf')
bestpose_rescored = rescore_all(w_dir, protein_file, ref_file, software, w_dir+'/temp/clustering/bestpose_clustered.sdf')



**Final ranking methods**

This code calculates the final ranking of compounds using various methods.
*Method 1* : Calculates ECR value for each cluster center, then outputs the top ranked center.
*Method 2* : Calculates ECR value for each cluster center, then outputs the average ECR value for each compound.
*Method 3* : Calculates the average rank of each compound, then ouputs the corresponding ECR value for each compound.
*Method 6* : Calculates Z-score for each cluster center, then ouputs the top ranked center.
*Method 7* : Calculates Z-score for each cluster center, then ouputs the average Z-score for each compound.

All methods are then combined into a single dataframe for comparison purposes.

In [None]:
apply_ranking_methods_simplified(w_dir)

In [None]:
test_df = pd.read_csv('/home/tony/CADD22/wocondock_refactored_chatgpt/temp/ranking/ranking_results.csv')
def show_correlation(dataframe):
    matrix = dataframe.corr().round(2)
    mask = np.triu(np.ones_like(matrix, dtype=bool))
    sns.heatmap(matrix, mask = mask, annot=False, vmax=1, vmin=-1, center=0, linewidths=.5, cmap='coolwarm')
    plt.show()

show_correlation(test_df)

In [None]:
calculate_EFs(w_dir, docking_library)