**Import required libraries and scripts**

In [1]:
#Import required libraries and scripts
from scripts.library_preparation import *
from scripts.utilities import *
from scripts.docking_functions import *
from scripts.clustering_functions import *
from scripts.rescoring_functions import *
from scripts.consensus_methods import *
from scripts.performance_calculation import *
from scripts.dogsitescorer import *
from scripts.get_pocket import *

# Run configuration. All paths below are absolute and machine-specific; adjust them before running elsewhere.
# Directory handed to prepare_library(), docking() and rescore_all() as the `software` argument.
software = '/home/tony/DockM8/software'
# Receptor structure; its parent directory becomes the working directory (see w_dir below).
protein_file = '/home/tony/Downloads/ECF/47mol/ecfpanthit1_protoss.pdb'
# Reference ligand passed to the pocket-extraction, docking and rescoring calls.
ref_file = '/home/tony/Downloads/ECF/47mol/ref_ligand.pdb'
# Compound library passed to prepare_library().
docking_library = '/home/tony/Downloads/ECF/47mol/IC50_mol_only.sdf'
# Programs passed to docking().
docking_programs = ['GNINA', 'SMINA', 'PLANTS']
# One clustering run and one rescoring run are performed per entry of this list.
clustering_metrics = ['RMSD', 'spyRMSD', 'espsim', '3DScore', 'bestpose', 'bestpose_GNINA', 'bestpose_SMINA', 'bestpose_PLANTS']
# Scoring functions passed to rescore_all().
rescoring_functions = ['gnina', 'AD4', 'chemplp', 'rfscorevs', 'LinF9', 'RTMScore', 'SCORCH', 'vinardo']
# Passed to prepare_library() as the identifier column of the library.
id_column = 'ID'
# n_poses and exhaustiveness are forwarded to docking().
n_poses = 10
exhaustiveness = 8
# Protonation setting forwarded to prepare_library().
protonation = 'pkasolver'
# Alternative: derive the worker count from the machine instead of hard-coding it.
# ncpus = int(os.cpu_count()-2)
ncpus = 7
# NOTE: the pocket-definition cell below reassigns `pocket`, so this value is not the one actually used.
pocket = 'reference'
# Create a temporary folder (<protein directory>/temp) for all further calculations
w_dir = os.path.dirname(protein_file)
print('The working directory has been set to:', w_dir)
create_temp_folder(w_dir+'/temp')

[15:04:14] Initializing Normalizer
  MIN_CHEMFILES_VERSION = LooseVersion("0.9")
[TRJ.py:171 - <module>()] netCDF4 is not available. Writing AMBER ncdf files will be slow.
  class NCDFPicklable(scipy.io.netcdf.netcdf_file):


The working directory has been set to: /home/tony/Downloads/ECF/47mol
The folder: /home/tony/Downloads/ECF/47mol/temp already exists


In [2]:
# Pocket-definition strategy used for this run.
# NOTE: this assignment overrides the `pocket` value set in the configuration cell.
pocket = 'RoG'

# Exactly one branch must produce `pocket_definition`; an unknown strategy is
# rejected here instead of surfacing later as a NameError in the docking cell.
if pocket == 'reference':
    # Pocket built around the reference ligand; 8 is the third argument of get_pocket.
    pocket_definition = get_pocket(ref_file, protein_file, 8)
elif pocket == 'RoG':
    # Pocket derived from the reference ligand via get_pocket_RoG.
    pocket_definition = get_pocket_RoG(ref_file, protein_file)
elif pocket == 'dogsitescorer':
    # Pocket predicted by DoGSiteScorer, selected with method='volume'.
    pocket_definition = binding_site_coordinates_dogsitescorer(protein_file, w_dir, method='volume')
else:
    raise ValueError(f"Unknown pocket option {pocket!r}; expected 'reference', 'RoG' or 'dogsitescorer'")
print(pocket_definition)


[2023-Jun-06 15:04:17]: Extracting pocket from /home/tony/Downloads/ECF/47mol/ecfpanthit1_protoss.pdb using /home/tony/Downloads/ECF/47mol/ref_ligand.pdb as reference ligand

[2023-Jun-06 15:04:17]: Radius of Gyration of reference ligand is: 4.132692647261068

[2023-Jun-06 15:04:35]: Finished extracting pocket from /home/tony/Downloads/ECF/47mol/ecfpanthit1_protoss.pdb using /home/tony/Downloads/ECF/47mol/ref_ligand.pdb as reference ligand
{'center': [67.84, 79.11, 89.67], 'size': [11.81, 11.81, 11.81]}


In [None]:
# Standalone dict literal: running this cell only echoes the value, it is NOT assigned to
# `pocket_definition`. Presumably a pocket definition from an earlier run kept for reference —
# TODO confirm, then either assign it explicitly or delete the cell.
{'center': [63.49, 76.77, 51.57], 'size': [31.64, 31.64, 31.64]}

In [None]:
# Prepare the compound library only when the prepared SDF is not already on disk,
# so re-running the notebook does not repeat the preparation step.
if not os.path.isfile(w_dir+'/temp/final_library.sdf'):
    prepare_library(
        docking_library,
        id_column,
        software,
        protonation,
        ncpus,
    )

In [None]:
# Dock the prepared library with every configured docking program into the selected pocket.
docking(
    w_dir,
    protein_file,
    ref_file,
    software,
    docking_programs,
    exhaustiveness,
    n_poses,
    ncpus,
    pocket_definition,
)


In [None]:
# Load every docked pose into a single DataFrame (`all_poses`) and report how long it took.
print('Loading all poses SDF file...')
tic = time.perf_counter()
all_poses = PandasTools.LoadSDF(
    w_dir+'/temp/allposes.sdf',
    idName='Pose ID',
    molColName='Molecule',
    includeFingerprints=False,
    strictParsing=True,
)
toc = time.perf_counter()
print(f'Finished loading all poses SDF in {toc-tic:0.4f}!...')


In [None]:
# Cluster the poses once per metric; metrics whose clustered SDF already exists are skipped.
for metric in clustering_metrics:
    if not os.path.isfile(w_dir+f'/temp/clustering/{metric}_clustered.sdf'):
        cluster_pebble(metric, 'KMedoids', w_dir, protein_file, all_poses, ncpus)

In [None]:
# Rescore the clustered poses of every clustering metric with all configured rescoring functions.
# NOTE(review): the last positional argument is hard-coded to 20 whereas the other pipeline
# steps receive `ncpus` — confirm this value is intended.
for metric in clustering_metrics:
    rescore_all(
        w_dir,
        protein_file,
        ref_file,
        software,
        w_dir+f'/temp/clustering/{metric}_clustered.sdf',
        rescoring_functions,
        20,
    )


In [None]:
#Calculates correlation to activity for each scoring function

def calculate_EF_single_functions(w_dir, docking_library, clustering_metrics):
    """Correlate each individual scoring function with the measured activity.

    Despite its historical name, this function does not compute enrichment
    factors: it computes the Pearson correlation between every score column
    and the 'Activity' property of the library.

    Workflow:
      1. Obtain standardised and ranked score tables from ``process_dataframes``
         and write each to ``<w_dir>/temp/ranking/<name>.csv`` with an 'ID'
         column (the part of 'Pose ID' before the first underscore).
      2. Load 'ID' and 'Activity' from ``docking_library``.
      3. For every ``*_standardised.csv`` found in the ranking folder, average
         the numeric score columns per 'ID', merge with the activities and
         compute one Pearson correlation per score column.
      4. Write the result table to
         ``<w_dir>/temp/consensus/Corr_single_functions.csv``.

    Parameters
    ----------
    w_dir : str
        Working directory containing the ``temp`` folder.
    docking_library : str
        Path to an SDF file whose molecules carry an 'Activity' property.
    clustering_metrics : iterable of str
        Clustering metrics whose ``rescoring_<metric>_clustered`` results are analysed.

    Returns
    -------
    None
        Results are written to disk.
    """
    ranking_dir = w_dir + '/temp/ranking'
    create_temp_folder(ranking_dir)
    rescoring_folders = {metric: f'rescoring_{metric}_clustered' for metric in clustering_metrics}
    standardised_dataframes, ranked_dataframes = process_dataframes(w_dir, rescoring_folders)
    for df_dict in (standardised_dataframes, ranked_dataframes):
        for df_name, df in df_dict.items():
            df['ID'] = df['Pose ID'].str.split('_').str[0]
            df.to_csv(ranking_dir + f'/{df_name}.csv', index=False)

    original_df = PandasTools.LoadSDF(docking_library, molColName='Molecule', idName='ID')
    # Work on an explicit copy so the column assignments below do not touch a slice.
    original_df = original_df[['ID', 'Activity']].copy()
    original_df['ID'] = original_df['ID'].astype(str)
    original_df['Activity'] = pd.to_numeric(original_df['Activity'])

    records = []
    # Sorted so the row order of the output does not depend on directory listing order.
    # Note that every *_standardised.csv present in the folder is analysed, including
    # files left over from earlier runs.
    for file in sorted(os.listdir(ranking_dir)):
        if not file.endswith('_standardised.csv'):
            continue
        clustering_metric = file.replace('_standardised.csv', '')
        # Force 'ID' to text: purely numeric identifiers would otherwise be parsed as
        # integers, be picked up as a score column, and fail to merge with the string
        # IDs loaded from the SDF.
        std_df = pd.read_csv(ranking_dir + '/' + file, dtype={'ID': str})
        score_cols = [col for col in std_df.select_dtypes(include='number').columns if col != 'ID']
        std_df_grouped = std_df.groupby('ID')[score_cols].mean().reset_index()
        merged_df = pd.merge(std_df_grouped, original_df, on='ID')
        for col in score_cols:
            # Only compounds with both a score and an activity can contribute.
            pair = merged_df[[col, 'Activity']].dropna()
            if len(pair) < 2:
                corr = float('nan')  # correlation is undefined for fewer than two points
            else:
                corr, _ = pearsonr(pair[col], pair['Activity'])
            records.append([col, clustering_metric, corr])

    results = pd.DataFrame(records, columns=['Scoring Function', 'Clustering Metric', 'Corr'])
    create_temp_folder(w_dir+'/temp/consensus')
    results.to_csv(w_dir+'/temp/consensus/Corr_single_functions.csv')

# Run the single-function correlation analysis.
# NOTE: IC50_mol_only_STD.sdf is written by the "Standardise Activity using minmax" cell at the
# end of this notebook, so that cell must have been executed at least once before this one.
calculate_EF_single_functions(w_dir, '/home/tony/Downloads/ECF/47mol/IC50_mol_only_STD.sdf', clustering_metrics)

In [None]:
#Calculates correlation to activity for each consensus method and set of scoring function

def process_combination(combination, w_dir, name, standardised_df, ranked_df, column_mapping, rank_methods, score_methods, docking_library, original_df):
    """Apply every consensus method to one combination of score columns.

    For the given combination, each rank-based method is run on the ranked
    columns and each score-based method on the standardised columns. The first
    output column of every method is then correlated (Pearson) with 'Activity'.

    Parameters
    ----------
    combination : iterable of str
        Score column names (as found in ``standardised_df``) to combine.
    w_dir : str
        Unused; kept so existing callers that pass it positionally keep working.
    name : str
        Clustering label forwarded to the consensus methods and stored in the result.
    standardised_df, ranked_df : pandas.DataFrame
        Tables containing an 'ID' column plus the score / rank columns.
    column_mapping : dict
        Maps each standardised column name to its column name in ``ranked_df``.
    rank_methods, score_methods : dict
        ``{method_name: callable(df, name, columns)}``; each callable returns a
        DataFrame with an 'ID' column and at least one score column.
    docking_library : str
        Unused; kept for interface compatibility.
    original_df : pandas.DataFrame
        Table with 'ID' and numeric 'Activity' columns.

    Returns
    -------
    dict
        ``{method_name: one-row DataFrame}`` with columns 'clustering_method',
        'method_name', 'selected_columns' and 'corr'.
    """
    selected_columns = list(combination)
    ranked_selected_columns = [column_mapping[col] for col in selected_columns]

    # Build a compact label for the combination from the joined column names.
    subset_name = '_'.join(selected_columns)
    for old, new in (('_R_', ''), ('_S_', '_')):
        subset_name = subset_name.replace(old, new)

    standardised_subset = standardised_df[['ID'] + selected_columns]
    ranked_subset = ranked_df[['ID'] + ranked_selected_columns]

    analysed_dataframes = {method: rank_methods[method](ranked_subset, name, ranked_selected_columns) for method in rank_methods}
    analysed_dataframes.update({method: score_methods[method](standardised_subset, name, selected_columns) for method in score_methods})

    def calculate_correlation(df, original_df):
        """Pearson correlation between the first consensus column and 'Activity'."""
        df = df.merge(original_df, on='ID')
        score_columns = [col for col in df.columns if col not in ['ID', 'Activity']]
        correlation, _ = pearsonr(df[score_columns[0]], df['Activity'])
        return correlation

    result_dict = {}
    for method, df in analysed_dataframes.items():
        # 'Pose ID' is not a score and must not be mistaken for the consensus column.
        df = df.drop(columns="Pose ID", errors='ignore')
        correlation = calculate_correlation(df, original_df)
        # One summary row per method: clustering label, method, column combination, correlation.
        result_dict[method] = pd.DataFrame({
            'clustering_method': [name],
            'method_name': [method],
            'selected_columns': [subset_name],
            'corr': [correlation]
        })
    return result_dict

def process_combination_wrapper(args):
    """Call ``process_combination`` with a single tuple of positional arguments.

    Used as the callable dispatched by the parallel loop in
    ``apply_consensus_methods_combinations``, which builds one argument tuple per
    column combination.
    """
    return process_combination(*args)

def apply_consensus_methods_combinations(w_dir, docking_library, clustering_metrics):
    """Correlate every consensus method / score combination with the activity.

    For each clustering metric, every combination of score columns produced by
    the loop below is passed through all consensus methods (via
    ``process_combination``) and the resulting correlations are collected into
    ``<w_dir>/temp/consensus/consensus_summary.csv``.

    Parameters
    ----------
    w_dir : str
        Working directory containing the ``temp`` folder.
    docking_library : str
        Path to an SDF file whose molecules carry 'ID' and 'Activity'.
    clustering_metrics : iterable of str
        Clustering metrics whose ``rescoring_<metric>_clustered`` results are analysed.

    Returns
    -------
    None
        The summary is displayed (first rows) and written to disk.
    """
    create_temp_folder(w_dir+'/temp/ranking')
    rescoring_folders = {metric: f'rescoring_{metric}_clustered' for metric in clustering_metrics}
    standardised_dataframes, ranked_dataframes = process_dataframes(w_dir, rescoring_folders)
    for df_dict in (standardised_dataframes, ranked_dataframes):
        for df_name, df in df_dict.items():
            df['ID'] = df['Pose ID'].str.split('_').str[0]
            df.to_csv(w_dir + f'/temp/ranking/{df_name}.csv', index=False)
    create_temp_folder(w_dir+'/temp/consensus')
    rank_methods = {'method1':method1_ECR_best, 'method2':method2_ECR_average, 'method3':method3_avg_ECR, 'method4':method4_RbR}
    score_methods = {'method5':method5_RbV, 'method6':method6_Zscore_best, 'method7':method7_Zscore_avg}

    # Use the library the caller asked for rather than a hard-coded path.
    original_df = PandasTools.LoadSDF(docking_library, molColName=None, idName='ID')
    original_df = original_df[['ID', 'Activity']].copy()
    original_df['Activity'] = pd.to_numeric(original_df['Activity'])

    # Half of the available cores, but never fewer than one worker
    # (os.cpu_count() may return None, and a single-core machine would yield 0).
    n_jobs = max(1, (os.cpu_count() or 2) // 2)
    parallel = Parallel(n_jobs=n_jobs, backend='multiprocessing')

    df_list = []
    printlog('Calculating consensus methods for every possible score combination...')
    for name in tqdm(rescoring_folders, total=len(rescoring_folders)):
        standardised_df = standardised_dataframes[name+'_standardised']
        ranked_df = ranked_dataframes[name+'_ranked']
        calc_columns = [col for col in standardised_df.columns if col not in ['Pose ID', 'ID']]
        # Ranked columns get an '_R' suffix so they cannot clash with the standardised ones.
        column_mapping = {col: f"{col}_R" for col in calc_columns}
        ranked_df = ranked_df.rename(columns=column_mapping)
        # NOTE(review): the upper bound excludes the combination of ALL columns
        # (sizes 2 .. len-1 only) — confirm whether that is intended.
        for L in range(2, len(calc_columns)):
            combinations = list(itertools.combinations(calc_columns, L))
            args = [(subset, w_dir, name, standardised_df, ranked_df, column_mapping, rank_methods, score_methods, docking_library, original_df) for subset in combinations]
            results = parallel(delayed(process_combination_wrapper)(arg) for arg in args)
            for result_dict in results:
                df_list.extend(result_dict.values())

    # Concatenate once at the end; fall back to an empty table when no combination
    # was evaluated (pd.concat raises on an empty list).
    if df_list:
        consensus_summary = pd.concat(df_list, ignore_index=True)
    else:
        consensus_summary = pd.DataFrame(columns=['clustering_method', 'method_name', 'selected_columns', 'corr'])
    display(consensus_summary.head())
    consensus_summary.to_csv(w_dir + '/temp/consensus/consensus_summary.csv', index=False)
    
# Run the combination analysis.
# NOTE: IC50_mol_only_STD.sdf is written by the "Standardise Activity using minmax" cell at the
# end of this notebook, so that cell must have been executed at least once before this one.
apply_consensus_methods_combinations(w_dir, '/home/tony/Downloads/ECF/47mol/IC50_mol_only_STD.sdf', clustering_metrics)

In [None]:
#Use if you want to output the scores for one consensus method

def apply_consensus_methods(w_dir, clustering_metric, method, rescoring_functions):
    """Compute one consensus method for one clustering metric and save its scores.

    The result is written to
    ``<w_dir>/temp/consensus/<clustering_metric>_<method>_results.csv``.

    Parameters
    ----------
    w_dir : str
        Working directory containing the ``temp`` folder.
    clustering_metric : str
        Clustering metric whose ``rescoring_<metric>_clustered`` results are used.
    method : str
        One of 'method1' .. 'method7'.
    rescoring_functions : list of str
        Names of rescoring functions (keys of ``col_dict`` below). They only
        select the columns of the standardised table that is printed as a
        preview; the consensus itself uses every score column returned by
        ``process_dataframes``.

    Raises
    ------
    ValueError
        If ``method`` is not a known consensus method.
    """
    create_temp_folder(w_dir+'/temp/ranking')
    rescoring_folder = f'rescoring_{clustering_metric}_clustered'
    rescored_dataframe = pd.read_csv(w_dir + f'/temp/{rescoring_folder}/allposes_rescored.csv')
    standardised_dataframe = standardize_scores(rescored_dataframe)
    # Maps rescoring function names to the score column they produce.
    col_dict = {
    'gnina': 'GNINA_Affinity', 'cnn-score': 'CNN-Score', 'cnn-affinity': 'CNN-Affinity', 
    'vinardo': 'Vinardo', 'AD4': 'AD4', 'LinF9': 'LinF9', 'rfscorevs': 'RFScoreVS', 
    'plp': 'PLP', 'chemplp': 'CHEMPLP', 'NNScore': 'NNScore', 'PLECnn': 'PLECnn', 
    'AAScore': 'AAScore', 'ECIF': 'ECIF', 'SCORCH': 'SCORCH', 'RTMScore': 'RTMScore'
    }
    # Unknown names used to be dropped silently; report them so typos are noticed.
    unknown_functions = [function for function in rescoring_functions if function not in col_dict]
    if unknown_functions:
        print(f'Warning: ignoring unrecognised rescoring function name(s) {unknown_functions}; valid names are {sorted(col_dict)}')
    col_list = ['Pose ID'] + [col_dict[function] for function in rescoring_functions if function in col_dict]

    filtered_dataframe = standardised_dataframe[col_list]
    
    print(filtered_dataframe)
    #show_correlation(filtered_dataframe)
    standardised_dataframes, ranked_dataframes = process_dataframes(w_dir, {clustering_metric: rescoring_folder})
    for df_dict in (standardised_dataframes, ranked_dataframes):
        for df_name, df in df_dict.items():
            df['ID'] = df['Pose ID'].str.split('_').str[0]
            df.to_csv(w_dir + f'/temp/ranking/{df_name}.csv', index=False)

    create_temp_folder(w_dir+'/temp/consensus')
    rank_methods = {'method1': method1_ECR_best, 'method2': method2_ECR_average, 'method3': method3_avg_ECR, 'method4': method4_RbR}
    score_methods = {'method5': method5_RbV, 'method6': method6_Zscore_best, 'method7': method7_Zscore_avg}

    # Rank-based methods consume the ranked table, score-based methods the standardised one.
    if method in rank_methods:
        method_function = rank_methods[method]
        source_dataframe = ranked_dataframes[clustering_metric+'_ranked']
    elif method in score_methods:
        method_function = score_methods[method]
        source_dataframe = standardised_dataframes[clustering_metric+'_standardised']
    else:
        raise ValueError(f"Invalid method: {method}")
    score_columns = [col for col in source_dataframe if col not in ['Pose ID', 'ID']]
    analysed_dataframe = method_function(source_dataframe, clustering_metric, score_columns)

    print(analysed_dataframe)
    analysed_dataframe = analysed_dataframe.drop(columns="Pose ID", errors='ignore')
    analysed_dataframe.to_csv(w_dir+f'/temp/consensus/{clustering_metric}_{method}_results.csv', index=False)

# Z-score consensus (method6) on the PLANTS best poses.
# 'rfscorevs' is the name used in the configuration cell and in the function's column map;
# the previous spelling 'rf-score-vs' matched no entry and was silently ignored.
apply_consensus_methods(w_dir, 'bestpose_PLANTS', 'method6', ['SCORCH', 'AD4', 'RTMScore', 'gnina', 'cnn-affinity', 'rfscorevs'])


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Load the measured activities and the consensus scores written by apply_consensus_methods.
activity_df = PandasTools.LoadSDF('/home/tony/Downloads/ECF/47mol/IC50_mol_only_STD.sdf', idName='ID', molColName=None)
# Convert the activity to numbers (as the other cells do after LoadSDF); otherwise
# matplotlib treats the values as categories and the x axis is not a numeric scale.
activity_df['Activity'] = pd.to_numeric(activity_df['Activity'])
# Read IDs as text so they merge with the string IDs loaded from the SDF.
consensus_df = pd.read_csv('/home/tony/Downloads/ECF/47mol/temp/consensus/bestpose_PLANTS_method6_results.csv', dtype={'ID': str})
merged_df = pd.merge(activity_df, consensus_df, on='ID')
# A bare `merged_df.head()` in the middle of a cell shows nothing; display it explicitly.
display(merged_df.head())

# Scatter plot of consensus score against activity, coloured by activity.
fig, ax = plt.subplots()
points = ax.scatter(
    merged_df['Activity'],
    merged_df['Method6_Zscore_bestpose_PLANTS'],
    c=merged_df['Activity'],  # cmap only takes effect when colour values are supplied
    cmap='viridis',
)
fig.colorbar(points, ax=ax, label='Activity')
ax.set_xlabel('Activity')
ax.set_ylabel('Method6_Zscore_bestpose_PLANTS')
ax.set_title('Scatter Plot')
ax.tick_params(axis='x', labelrotation=45)  # Rotating x-axis labels for better readability
plt.show()


In [None]:
#Standardise Activity using minmax

import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools

# NOTE: the *_STD.sdf file written at the end of this cell is the input of the correlation,
# consensus and plotting cells above, so this cell has to be run before them.

# Load SDF file into a Pandas DataFrame
sdf_file = '/home/tony/Downloads/ECF/47mol/IC50_mol_only.sdf'
df = PandasTools.LoadSDF(sdf_file, idName='ID', molColName='Molecule')

# Convert 'Activity' to numbers and determine its range
df['Activity'] = pd.to_numeric(df['Activity'])
min_activity = df['Activity'].min()
max_activity = df['Activity'].max()
activity_range = max_activity - min_activity

# Min-max scaling is undefined when all activities are identical (or missing); fail loudly
# instead of silently writing NaN activities to the output file.
if not activity_range > 0:
    raise ValueError(f"Cannot min-max scale 'Activity' in {sdf_file}: min={min_activity}, max={max_activity}")

# Standardize and map 'Activity' column to a score between 0 and 1
df['Activity'] = (df['Activity'] - min_activity) / activity_range

# Sort the DataFrame by the 'Activity' column in descending order
df = df.sort_values(by='Activity', ascending=False)

# Print the resulting DataFrame
print(df)
PandasTools.WriteSDF(df, '/home/tony/Downloads/ECF/47mol/IC50_mol_only_STD.sdf', molColName='Molecule', idName='ID', properties=list(df.columns))
