**Import required libraries and scripts**

In [1]:
#Import required libraries and scripts
from scripts.library_preparation import *
from scripts.utilities import *
from scripts.docking_functions import *
from scripts.clustering_functions import *
from scripts.rescoring_functions import *
from scripts.ranking_functions import *
from scripts.performance_calculation import *
import numpy as np
import os

# --- Run configuration -------------------------------------------------------
# NOTE(review): all paths below are absolute and machine-specific; adjust them
# before running on another machine.
# Directory handed to prepare_library/docking/rescore_all as `software`
# (presumably where the external tools live — confirm).
software = '/home/tony/CADD22/software'
# Protein structure (.pdb) and reference ligand (.sdf) passed to docking and rescoring.
protein_file = '/home/tony/CADD22/wocondock_main/2o1x_A_apo_protoss.pdb'
ref_file = '/home/tony/CADD22/wocondock_main/2o1x_A_lig_protoss.sdf'
# Compound library (.sdf) passed to prepare_library and calculate_EFs.
docking_library = '/home/tony/CADD22/wocondock_main/500_of_FCHGroup_LeadLike.sdf'
# Program names passed to docking()/docking_splitted().
docking_programs = ['SMINA','GNINA','PLANTS']
# Column name passed to prepare_library as the compound identifier column.
id_column = 'ID'
# Passed to the docking functions; presumably poses kept per compound and
# search effort respectively — confirm against scripts.docking_functions.
n_poses = 10
exhaustiveness = 4

#Initialise variables and create a temporary folder
# The working directory is the folder that contains the protein file.
w_dir = os.path.dirname(protein_file)
print('The working directory has been set to:', w_dir)
create_temp_folder(w_dir+'/temp')

[16:12:07] Initializing Normalizer


The working directory has been set to: /home/tony/CADD22/wocondock_main
The folder: /home/tony/CADD22/wocondock_main/temp already exists


In [None]:
# Prepare the docking library with the 'pkasolver' option of prepare_library
# (imported from the project scripts); the result is kept for inspection.
cleaned_pkasolver_df = prepare_library(docking_library, id_column, software, 'pkasolver')

In [None]:
# Load every pose from the combined SDF: the record name goes to 'Pose ID',
# the parsed molecule object to 'Molecule'.
all_poses = PandasTools.LoadSDF(w_dir+'/temp/allposes.sdf', idName='Pose ID', molColName='Molecule', includeFingerprints=False, strictParsing=True)

# Keep only the poses of one compound for this experiment. The row index is
# NOT reset, so the remaining labels are not guaranteed to be 0..n-1.
all_poses = all_poses[all_poses['ID'] == 'FCG1390566']

def generate_matrix(df):
    """Build the symmetric pairwise RMSD matrix for the poses in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'Molecule' column (objects accepted by
        ``simpleRMSD_calc``) and a 'Pose ID' column. Any row index is
        accepted, including the non-contiguous one left by a boolean filter.

    Returns
    -------
    pandas.DataFrame
        Square frame of floats in the row order of ``df``; rows are labelled
        from 'Pose ID' and columns from 'Molecule'. The diagonal is 0.0 and
        NaN results are stored as 0.
    """
    molecules = list(df['Molecule'])
    # Labels are built exactly as before (including the list wrapper around
    # the 'Pose ID' series) so consumers see the same frame layout.
    matrix = pd.DataFrame(0.0, index=[df['Pose ID']], columns=df['Molecule'])
    # Pair up *positions*, not df.index labels: .iloc is positional, and a
    # filtered frame keeps its original labels, which previously sent writes
    # out of bounds or into the wrong cells.
    position_pairs = itertools.combinations(enumerate(molecules), 2)
    for (row, mol_a), (col, mol_b) in tqdm(position_pairs):
        result = simpleRMSD_calc(mol_a, mol_b)
        value = 0 if np.isnan(result) else result
        matrix.iloc[row, col] = value
        matrix.iloc[col, row] = value
    return matrix

#####THIS IS FOR MARIO#####

def matrix_calculation_numpy(df):
    """Compute the symmetric pairwise RMSD matrix of ``df['Molecule']`` with NumPy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'Molecule' column whose entries are accepted by
        ``simpleRMSD_calc``. Only that column is read.

    Returns
    -------
    numpy.ndarray
        ``(n, n)`` float array in the row order of ``df`` with a zero
        diagonal. NaN results from ``simpleRMSD_calc`` are stored unchanged
        (unlike ``generate_matrix``, which writes 0 for them).
    """
    molecules = list(df['Molecule'])
    n_mols = len(molecules)
    matrix = np.zeros((n_mols, n_mols))
    # Work with positional index pairs directly: no per-molecule lookup table
    # is needed, and molecules no longer have to be hashable or unique.
    pairs = list(itertools.combinations(range(n_mols), 2))
    if not pairs:
        # Zero or one pose: there is nothing to compare. The previous version
        # raised IndexError here when slicing an empty pair array.
        return matrix
    results = np.array([simpleRMSD_calc(molecules[i], molecules[j]) for i, j in tqdm(pairs)])
    rows, cols = np.array(pairs).T
    # Mirror each value so the matrix is symmetric.
    matrix[rows, cols] = results
    matrix[cols, rows] = results
    return matrix

#####THIS IS FOR MARIO#####

# Run both implementations on the same poses so their results can be compared:
# matrix_test is the NumPy array variant, matrix_test2 the labelled DataFrame variant.
matrix_test=matrix_calculation_numpy(all_poses)
matrix_test2 = generate_matrix(all_poses)


In [None]:
def apply_to_group(group):
    """Return the pairwise matrix for one group of poses via matrix_calculation_numpy."""
    return matrix_calculation_numpy(group)

# NOTE(review): `ddf` is not defined in any cell visible here — presumably a
# Dask DataFrame built from the poses; confirm it exists before running.
ddf.groupby('ID').apply(apply_to_group).compute()

In [None]:
#dictionary = {ele : object for ele in ddf['Pose ID']}
# Same computation as the previous cell, passing matrix_calculation_numpy
# directly instead of going through a wrapper.
# NOTE(review): `ddf` is not defined in any cell visible here — confirm.
ddf.groupby('ID').apply(matrix_calculation_numpy).compute()

In [None]:
# `import dask.dataframe as dd` alone binds only `dd`; importing dask.bag
# explicitly guarantees that `dask.bag.from_sequence` below resolves.
import dask.bag
import dask.dataframe as dd
import pandas as pd

# Load molecules from SDF file and store in a Pandas dataframe
all_poses = PandasTools.LoadSDF(w_dir+'/temp/allposes.sdf', idName='Pose ID', molColName='Molecule', includeFingerprints=False, strictParsing=True)

# (compound ID, poses-of-that-compound) pairs, one per distinct 'ID'.
splits = list(all_poses.groupby("ID"))

# Keep only the per-compound frames. A comprehension is used so that no loop
# variable named `id` leaks into the notebook namespace and shadows the builtin.
new_splits = [split for _, split in splits]

# One bag element per compound, so each matrix can be computed independently.
dskbag = dask.bag.from_sequence(new_splits)

def generate_matrix(df):
    """Build the symmetric pairwise RMSD matrix for the poses in ``df``.

    NOTE(review): this redefines the ``generate_matrix`` from an earlier cell;
    whichever cell ran last wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'Molecule' column (objects accepted by
        ``simpleRMSD_calc``) and a 'Pose ID' column. Any row index is
        accepted, including the non-contiguous one of a groupby split.

    Returns
    -------
    pandas.DataFrame
        Square frame of floats in the row order of ``df``; rows are labelled
        from 'Pose ID' and columns from 'Molecule'. The diagonal is 0.0 and
        NaN results are stored as 0.
    """
    molecules = list(df['Molecule'])
    matrix = pd.DataFrame(0.0, index=[df['Pose ID']], columns=df['Molecule'])
    # Use positions, not df.index labels: .iloc is positional, and every
    # groupby split keeps the labels of the full frame, which previously sent
    # writes out of bounds or into the wrong cells.
    for (row, mol_a), (col, mol_b) in itertools.combinations(enumerate(molecules), 2):
        result = simpleRMSD_calc(mol_a, mol_b)
        value = 0 if np.isnan(result) else result
        matrix.iloc[row, col] = value
        matrix.iloc[col, row] = value
    # The per-pair display(matrix) debugging call was removed: it rendered the
    # whole matrix once for every pair. Display the returned frame instead.
    return matrix

# Compute one pairwise matrix per bag element (one per compound) and collect
# the results into a list.
matrices = dskbag.map(generate_matrix).compute()

In [None]:
# Cluster the poses with the 'RMSD' metric using cluster(), which is not
# defined in this notebook (presumably from scripts.clustering_functions).
cluster('RMSD', w_dir, protein_file)

In [None]:
from tqdm import tqdm
def cluster_numpy(method, w_dir, protein_file):
    """Select representative poses per compound and write them to an SDF file.

    Reads ``<w_dir>/temp/allposes.sdf``, selects poses for every distinct 'ID'
    according to ``method`` and writes the selection to
    ``<w_dir>/temp/clustering/<method>_clustered.sdf``.

    Parameters
    ----------
    method : str
        * ``'bestpose'`` - per compound, keep the pose with the highest
          'CNNaffinity', the lowest 'SMINA_Affinity' and the lowest 'CHEMPLP'.
        * ``'3DScore'`` - per compound, keep the pose whose summed pairwise
          RMSD to all other poses is lowest (spyRMSD, falling back to the
          simple RMSD when spyRMSD raises).
        * ``'RMSD'``, ``'spyRMSD'``, ``'espsim'``, ``'USRCAT'``, ``'SPLIF'`` -
          per compound, build the pairwise matrix with that metric and pass
          it to ``kmedoids_S_clustering``.
    w_dir : str
        Working directory containing ``temp/allposes.sdf``.
    protein_file : str
        Forwarded as third argument to every pairwise metric function.

    Returns
    -------
    None
        The result is written to disk.

    Raises
    ------
    KeyError
        If ``method`` is not one of the names listed above.
    """
    def pairwise_matrix(molecules, pair_metric):
        """Return the symmetric matrix of pair_metric(a, b) over all pairs of molecules."""
        n_mols = len(molecules)
        matrix = np.zeros((n_mols, n_mols))
        # Positional pairs also cover compounds with fewer than two poses,
        # which then simply yield an all-zero matrix.
        for row, col in itertools.combinations(range(n_mols), 2):
            value = pair_metric(molecules[row], molecules[col])
            matrix[row, col] = value
            matrix[col, row] = value
        return matrix

    def matrix_calculation_and_clustering(method, df, id_list, protein_file):
        """Return a frame of the selected 'Pose ID' values for all compounds."""
        clustered_dataframes = []
        print("*Calculating {} metrics and clustering*".format(method))
        methods = {'RMSD': simpleRMSD_calc, 'spyRMSD': spyRMSD_calc, 'espsim': espsim_calc, 'USRCAT': USRCAT_calc, 'SPLIF': SPLIF_calc, '3DScore': '3DScore', 'bestpose': 'bestpose'}

        # NOTE(review): metrics are called as f(mol_a, mol_b, protein_file),
        # as the generic branch always did; other cells of this notebook call
        # simpleRMSD_calc with two arguments - confirm the signature.
        def rmsd_with_fallback(mol_a, mol_b):
            """spyRMSD of a pose pair, or the simple RMSD when spyRMSD fails."""
            try:
                return methods['spyRMSD'](mol_a, mol_b, protein_file)
            except Exception:
                # Narrower than the former bare `except:` so that
                # KeyboardInterrupt still stops the run.
                return methods['RMSD'](mol_a, mol_b, protein_file)

        for compound_id in tqdm(id_list):
            df_filtered = df[df['ID'] == compound_id]
            if method == 'bestpose':
                # Work on a copy: assigning columns on a filtered slice is
                # chained assignment and may not take effect.
                df_name = df_filtered.copy()
                score_columns = ['CHEMPLP', 'SMINA_Affinity', 'CNNaffinity']
                df_name[score_columns] = df_name[score_columns].apply(pd.to_numeric, errors='coerce')
                best_row_CHEMPLP = df_name.loc[df_name.groupby(['ID'])['CHEMPLP'].idxmin()]
                best_row_SMINA = df_name.loc[df_name.groupby(['ID'])['SMINA_Affinity'].idxmin()]
                best_row_GNINA = df_name.loc[df_name.groupby(['ID'])['CNNaffinity'].idxmax()]
                table = pd.concat([best_row_GNINA, best_row_SMINA, best_row_CHEMPLP])
                # The old per-branch str.replace(..., regex=False) never matched
                # anything; IDs are cleaned once for all branches further down.
                table = pd.DataFrame({'Pose ID': table['Pose ID'].astype(str).to_numpy()})
                clustered_dataframes.append(table)
            elif method == '3DScore':
                matrix = pairwise_matrix(list(df_filtered['Molecule']), rmsd_with_fallback)
                # 3DScore of a pose = sum of its RMSDs to every other pose
                # (NaNs ignored); the lowest score wins.
                scores = pd.Series(np.nansum(matrix, axis=1), index=df_filtered['Pose ID'].to_numpy())
                best_pose = pd.DataFrame({'Pose ID': [str(scores.idxmin())]})
                clustered_dataframes.append(best_pose)
            else:
                pair_metric = methods[method]
                matrix = pairwise_matrix(
                    list(df_filtered['Molecule']),
                    lambda mol_a, mol_b: pair_metric(mol_a, mol_b, protein_file),
                )
                matrix_df = pd.DataFrame(matrix)
                # Labels are set exactly as before (list-wrapped series) so
                # kmedoids_S_clustering receives the layout it was written for.
                matrix_df.columns = [df_filtered['Pose ID']]
                matrix_df.index = [df_filtered['Pose ID']]
                clust_df = kmedoids_S_clustering(matrix_df)
                clust_df = clust_df['Pose ID']
                clust_df.index.name = 'index'
                clustered_dataframes.append(clust_df)
        full_df = functools.reduce(lambda left, right: pd.merge(left, right, on=['Pose ID'], how='outer'), clustered_dataframes)
        # Strip tuple formatting characters that the list-wrapped labels can
        # leave in the string form of the pose IDs.
        full_df['Pose ID'] = full_df['Pose ID'].astype(str).replace('[()\',]','', regex=True)
        return full_df

    print('Loading all poses SDF file...')
    all_poses = PandasTools.LoadSDF(w_dir+'/temp/allposes.sdf', idName='Pose ID', molColName='Molecule', includeFingerprints=False, strictParsing=True)
    print('Finished loading all poses SDF file...')
    id_list = np.unique(np.array(all_poses['ID']))
    create_clustering_folder(w_dir+'/temp/clustering/')
    clustered_poses = matrix_calculation_and_clustering(method, all_poses, id_list, protein_file)
    # Join back onto the loaded poses to recover the molecule objects.
    clustered_poses = pd.merge(all_poses, clustered_poses, on='Pose ID')
    # keep only the necessary columns
    clustered_poses = clustered_poses[['Pose ID', 'Molecule', 'ID']]
    save_path = w_dir + '/temp/clustering/' + method + '_clustered.sdf'
    PandasTools.WriteSDF(clustered_poses, save_path, molColName='Molecule', idName='Pose ID')
    return

In [None]:
# Run the NumPy-based clustering defined in this notebook with the 'RMSD' metric.
cluster_numpy('RMSD', w_dir, protein_file)

In [None]:
# Run the imported cluster() helper (not defined in this notebook) with the
# 'bestpose' method.
cluster('bestpose', w_dir, protein_file)

In [None]:
# Run the imported cluster() helper once per method. The rescoring cell reads
# '<method>_clustered.sdf' files under temp/clustering, which these calls are
# presumably expected to produce — confirm against scripts.clustering_functions.
cluster('espsim', w_dir, protein_file)
cluster('spyRMSD', w_dir, protein_file)
cluster('USRCAT', w_dir, protein_file)
cluster('RMSD', w_dir, protein_file)
cluster('3DScore', w_dir, protein_file)

In [3]:
# Dock with every program in docking_programs and keep the combined result.
# NOTE(review): the execution counts show this cell ran as In [3], i.e. after
# the docking_splitted cell (In [2]) and out of top-to-bottom order.
all_poses = docking(protein_file, ref_file, software, docking_programs, exhaustiveness, n_poses)

SMINA folder already exists
Docking with SMINA complete in 79.8990!
GNINA folder already exists
Docking with GNINA complete in 136.1025!
Plants docking folder already exists
Converting protein file to .mol2 format for PLANTS docking...
Converting reference file from .sdf to .mol2 format for PLANTS docking...
Determining binding site coordinates using PLANTS...
Writing PLANTS config file...
Starting PLANTS docking...


1 molecule converted
10 molecules converted


Converting PLANTS poses to .sdf format...
Docking with PLANTS complete in 212.3593!
Combined all docking poses in 0.7529!
Finished docking in 429.1147!


In [2]:
# Alternative docking run that first splits the library into several files
# (see the cell output); the return value is not kept.
docking_splitted(w_dir, protein_file, ref_file, software, docking_programs, exhaustiveness, n_poses)

Splitting docking library...
The folder: /home/tony/CADD22/wocondock_main/temp/split_files was created
Split docking library into 5 files each containing 2 compounds
The folder: /home/tony/CADD22/wocondock_main/temp/plants already exists
Converting protein file to .mol2 format for PLANTS docking...
Converting reference file from .sdf to .mol2 format for PLANTS docking...
Determining binding site coordinates using PLANTS...
Docking split files using PLANTS...
Docking with PLANTS complete in 158.4660!
Docking split files using SMINA...
Docking with SMINA complete in 58.6538!
Docking split files using GNINA...
Docking with GNINA complete in 141.9001!
Combined all docking poses in 0.7383!


**Rescoring**

The file containing all the cluster centers is then rescored using all available scoring functions (GNINA, Vina, AutoDock4, PLP, CHEMPLP, RF-Score-VS). The rescored output is returned as a dataframe.

In [None]:
# Rescore the clustered poses of each clustering method; every call reads
# '<w_dir>/temp/clustering/<method>_clustered.sdf' and keeps the returned result.
RMSD_rescored = rescore_all(w_dir, protein_file, ref_file, software, w_dir+'/temp/clustering/RMSD_clustered.sdf')
espsim_rescored = rescore_all(w_dir, protein_file, ref_file, software, w_dir+'/temp/clustering/espsim_clustered.sdf')
spyRMSD_rescored = rescore_all(w_dir, protein_file, ref_file, software, w_dir+'/temp/clustering/spyRMSD_clustered.sdf')
USRCAT_rescored = rescore_all(w_dir, protein_file, ref_file, software, w_dir+'/temp/clustering/USRCAT_clustered.sdf')
# Holds the '3DScore' result; the name drops the leading '3' because a Python
# identifier cannot start with a digit.
DScore_rescored = rescore_all(w_dir, protein_file, ref_file, software, w_dir+'/temp/clustering/3DScore_clustered.sdf')
bestpose_rescored = rescore_all(w_dir, protein_file, ref_file, software, w_dir+'/temp/clustering/bestpose_clustered.sdf')



**Final ranking methods**

This code calculates the final ranking of compounds using various methods.
*Method 1* : Calculates ECR value for each cluster center, then outputs the top ranked center.
*Method 2* : Calculates ECR value for each cluster center, then outputs the average ECR value for each compound.
*Method 3* : Calculates the average rank of each compound, then outputs the corresponding ECR value for each compound.
*Method 6* : Calculates Z-score for each cluster center, then outputs the top ranked center.
*Method 7* : Calculates Z-score for each cluster center, then outputs the average Z-score for each compound.

All methods are then combined into a single dataframe for comparison purposes.

In [None]:
# Apply the ranking methods to the data under w_dir; the function is imported
# from the project scripts and its return value is not kept.
apply_ranking_methods_simplified(w_dir)

In [None]:
# Load the ranking results for the correlation plot.
# NOTE(review): this absolute path points into 'wocondock_refactored_chatgpt',
# not the w_dir configured at the top of the notebook — confirm it is the
# intended results file.
test_df = pd.read_csv('/home/tony/CADD22/wocondock_refactored_chatgpt/temp/ranking/ranking_results.csv')
def show_correlation(dataframe):
    """Plot the correlation matrix of the numeric columns as a heatmap.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose numeric (and boolean) columns are correlated pairwise.
        Other columns, such as string identifiers, are ignored.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # pandas >= 2.0 raises when corr() meets non-numeric columns, while older
    # versions dropped them silently; selecting them explicitly behaves the
    # same on both.
    numeric_columns = dataframe.select_dtypes(include=['number', 'bool'])
    matrix = numeric_columns.corr().round(2)
    # Mask the upper triangle including the diagonal so each pair is drawn once.
    mask = np.triu(np.ones_like(matrix, dtype=bool))
    sns.heatmap(matrix, mask = mask, annot=False, vmax=1, vmin=-1, center=0, linewidths=.5, cmap='coolwarm')
    plt.show()

# Draw the correlation heatmap for the loaded ranking results.
show_correlation(test_df)

In [None]:
# Call the imported calculate_EFs helper with the working directory and the
# docking library (presumably computes enrichment factors — confirm against
# scripts.performance_calculation).
calculate_EFs(w_dir, docking_library)