1. Import required librairies

In [1]:
import os
import numpy as np
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools
from rdkit.Chem import rdFMCS
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
import chembl_structure_pipeline as pipeline
from chembl_structure_pipeline import checker
from chembl_structure_pipeline import standardizer
import subprocess
import shutil
from pymol import cmd
import pathlib
import math
from tqdm import tqdm
from spyrmsd import io, rmsd
from espsim import GetEspSim
from sklearn_extra.cluster import KMedoids
from sklearn.cluster import KMeans, Birch, DBSCAN, SpectralClustering
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
from math import sqrt
from kneed import KneeLocator
from molvs import *
import concurrent.futures
import multiprocessing
from functools import lru_cache

2. Initial Variables

In [2]:
software = '/media/drive/Software/'

In [3]:
protein_file = '/home/tony/Documents/consensus_docking_python/2o1x_A_apo_protoss.pdb'
ref_file = '/home/tony/Documents/consensus_docking_python/2o1x_A_lig_protoss.pdb'
lig_file = '/home/tony/Documents/consensus_docking_python/Selection_of_FCHGroup_LeadLike.sdf'
id_column = 'ID'
n_poses = 10
exhaustivness = 8

#Initialise variables
root_dir = os.path.dirname(protein_file)
print(root_dir)

/home/tony/Documents/consensus_docking_python


3. Create temp folder

In [None]:
#Create temporary folder for all subsequent work. Temp folder is created in folder containing .pdb file.
def create_temp_folder():
    if os.path.isdir(root_dir+'/temp') == True:
        print('Temp folder already exists')
    else:
        os.mkdir(root_dir+'/temp')

create_temp_folder()

4. Clean library sdf file

In [None]:
def clean_library(sdf):
    #Load Library SDF into Pandas
    try:
        df = PandasTools.LoadSDF(lig_file, idName=id_column, molColName='Molecule',includeFingerprints=False, embedProps=True, removeHs=True, strictParsing=True, smilesName='SMILES')
    except:
        print('ERROR: Failed to Load library SDF file!')
    #Standardize molecules using ChemBL Pipeline
    df['Molecule'] = [standardizer.standardize_mol(mol) for mol in df['Molecule']]
    df['Molecule'] = [standardizer.get_parent_mol(mol) for mol in df['Molecule']]
    df[['Molecule', 'flag']]=pd.DataFrame(df['Molecule'].tolist(), index=df.index)
    df=df.drop(columns='flag')
    std_sdf = root_dir+'/standardized_sdf.sdf'
    #Write standardized molecules to temporary SDF file
    try:
        rdkit.Chem.PandasTools.WriteSDF(df, std_sdf, molColName='Molecule', idName=id_column, properties=list(df.columns), allNumeric=True)
    except:
        print('ERROR: Failed to write standardized library SDF file!')
    # Protonate and Generate 3D conformers using Gypsum-DL
    gypsum_sdf = root_dir+'/gypsum_dl_success.sdf'
    try:
        gypsum_dl_command = 'python '+software+'/gypsum_dl-1.1.9/run_gypsum_dl.py -s '+std_sdf+' --job_manager multiprocessing -p -1 -m 1 -t 10 --min_ph 7 --max_ph 8 --pka_precision 0.5 --skip_enumerate_chiral_mol --skip_enumerate_double_bonds'
        os.system(gypsum_dl_command)
    except:
        print('ERROR: Failed to generate protomers and conformers!')
    #Load final library into Pandas and clean excess columns
    cleaned_library_df = PandasTools.LoadSDF(gypsum_sdf, idName=id_column, molColName='Molecule',includeFingerprints=False, embedProps=False, removeHs=False, strictParsing=True)
    cleaned_library_df = cleaned_library_df[['Molecule', id_column]]
    cleaned_library_df = cleaned_library_df.iloc[1: , :]
    #Write final library to SDF file
    final_sdf = root_dir+'/final_library.sdf'
    try:
        rdkit.Chem.PandasTools.WriteSDF(cleaned_library_df, final_sdf, molColName='Molecule', idName=id_column)
    except:
        print('ERROR: Failed to write final library SDF file!')
    #Remove temporary files
    try:
        os.remove(gypsum_sdf)
        os.remove(gypsum_sdf.replace('_success.sdf', '_failed.smi'))
    except:
        print('ERROR: Could not remove gypsum files!')
    try:
        os.remove(root_dir+'/standardized_sdf.sdf')
    except:
        print('ERROR: Could not remove temporary sdf files!')
    return cleaned_library_df

cleaned_library_df = clean_library(lig_file)

5. SMINA docking

In [None]:
def smina_docking():
    library = root_dir+'/final_library.sdf'
    smina_folder = root_dir+'/temp/smina/'
    try:
        os.mkdir(smina_folder, mode = 0o777)
    except:
        print('Smina folder already exists')
    results = smina_folder+'/docked.sdf'
    log = smina_folder+'log.txt'
    smina_cmd = 'gnina -r '+protein_file+' -l '+library+' --autobox_ligand '+ref_file+' -o '+results+' --exhaustiveness ' +str(exhaustivness)+' --num_modes '+str(n_poses)+' --cnn_scoring none'+' --log '+log
    docker_cmd = 'sudo docker run --name smina --rm -v /home:/home gnina/gnina ' + smina_cmd
    subprocess.call(docker_cmd, shell=True)
    smina_poses = PandasTools.LoadSDF(results, idName=id_column, molColName='Molecule',includeFingerprints=False, embedProps=False, removeHs=False, strictParsing=True)
    return smina_poses

smina_poses = smina_docking()

6. GNINA docking

In [None]:
def gnina_docking():
    library = root_dir+'/final_library.sdf'
    gnina_folder = root_dir+'/temp/gnina/'
    try:
        os.mkdir(gnina_folder, mode = 0o777)
    except:
        print('Gnina folder already exists')
    results = gnina_folder+'/docked.sdf'
    log = gnina_folder+'log.txt'
    gnina_cmd = 'gnina -r '+protein_file+' -l '+library+' --autobox_ligand '+ref_file+' -o '+results+' --exhaustiveness ' +str(exhaustivness)+' --num_modes '+str(n_poses)+' --cnn_scoring rescore --cnn crossdock_default2018 '+' --log '+log
    docker_cmd = 'sudo docker run --name gnina --rm -v /home:/home gnina/gnina ' + gnina_cmd
    subprocess.call(docker_cmd, shell=True)
    gnina_poses = PandasTools.LoadSDF(results, idName=id_column, molColName='Molecule',includeFingerprints=False, embedProps=False, removeHs=False, strictParsing=True)
    return gnina_poses

gnina_poses = gnina_docking()

7. PLANTS docking

In [4]:
plants_search_speed = "speed1"
ants = "20"
plants_docking_scoring = "chemplp"
plants_docking_dir = root_dir+"/temp/plants"
plants_docking_results_dir = root_dir+"/temp/plants/results"

In [5]:
def plants_docking():
    #Create plants docking folder
    if os.path.isdir(plants_docking_dir) == True:
        print('Plants docking folder already exists')
    else:
        os.mkdir(plants_docking_dir)
    #Convert protein file to .mol2 using open babel
    plants_protein_mol2 = root_dir+"/temp/plants/protein.mol2"
    try:
        obabel_command = 'obabel -ipdb '+protein_file+' -O '+plants_protein_mol2
        os.system(obabel_command)
    except:
        print('ERROR: Failed to convert protein file to .mol2!')
    #Convert protein file to .mol2 using open babel
    plants_ref_mol2 = root_dir+"/temp/plants/ref.mol2"
    if ref_file.endswith(".mol2"):
        shutil.copy(ref_file, plants_docking_dir)
        os.rename(plants_docking_dir+"/"+os.path.basename(ref_file), plants_ref_mol2)
    if ref_file.endswith(".sdf"):
        try:
            obabel_command = 'obabel -isdf '+ref_file+' -O '+plants_ref_mol2
            os.system(obabel_command)
        except:
            print('ERROR: Failed to convert reference ligand file to .mol2!')
    if ref_file.endswith(".pdb"):
        try:
            obabel_command = 'obabel -ipdb '+ref_file+' -O '+plants_ref_mol2
            os.system(obabel_command)
        except:
            print('ERROR: Failed to convert reference ligand file to .mol2!')
    else:
        print('ERROR: Reference ligand file not in a readable format!')
    #Convert prepared ligand file to .mol2 using open babel
    final_library = root_dir+"/final_library.sdf"
    plants_ligands_mol2 = root_dir+"/temp/plants/ligands.mol2"
    try:
        obabel_command = 'obabel -isdf '+final_library+' -O '+plants_ligands_mol2
        os.system(obabel_command)
    except:
        print('ERROR: Failed to convert library file to .mol2!')
    #Determine binding site coordinates
    plants_binding_site_command = "cd "+software+" && ./PLANTS --mode bind "+plants_ref_mol2+" 6"
    run_plants_binding_site = os.popen(plants_binding_site_command)
    output_plants_binding_site = run_plants_binding_site.readlines()
    keep = []
    for l in output_plants_binding_site:
        if l.startswith("binding"):
            keep.append(l)
        else:
            pass
    binding_site_center = keep[0].split()
    binding_site_radius = keep[1].split()
    binding_site_radius = binding_site_radius[1]
    binding_site_x = binding_site_center[1]
    binding_site_y = binding_site_center[2]
    binding_site_z = binding_site_center[3]
    #Generate plants config file
    plants_docking_config_path_txt = plants_docking_dir+"/config.txt"
    plants_config = ['# search algorithm\n',
    'search_speed '+plants_search_speed+'\n',
    'aco_ants '+ants+'\n',
    'flip_amide_bonds 0\n',
    'flip_planar_n 1\n',
    'force_flipped_bonds_planarity 0\n',
    'force_planar_bond_rotation 1\n',
    'rescore_mode simplex\n',
    'flip_ring_corners 0\n',
    '# scoring functions\n',
    '# Intermolecular (protein-ligand interaction scoring)\n',
    'scoring_function '+plants_docking_scoring+'\n',
    'outside_binding_site_penalty 50.0\n',
    'enable_sulphur_acceptors 1\n',
    '# Intramolecular ligand scoring\n',
    'ligand_intra_score clash2\n',
    'chemplp_clash_include_14 1\n',
    'chemplp_clash_include_HH 0\n',

    '# input\n',
    'protein_file '+plants_protein_mol2+'\n',
    'ligand_file '+plants_ligands_mol2+'\n',

    '# output\n',
    'output_dir '+plants_docking_results_dir+'\n',

    '# write single mol2 files (e.g. for RMSD calculation)\n',
    'write_multi_mol2 1\n',

    '# binding site definition\n',
    'bindingsite_center '+binding_site_x+' '+binding_site_y+' '+binding_site_z+'+\n',
    'bindingsite_radius '+binding_site_radius+'\n',

    '# cluster algorithm\n',
    'cluster_structures 10\n',
    'cluster_rmsd 2.0\n',

    '# write\n',
    'write_ranking_links 0\n',
    'write_protein_bindingsite 1\n',
    'write_protein_conformations 1\n',
    'write_protein_splitted 1\n',
    'write_merged_protein 0\n',
    '####\n']
    #Write config file
    plants_docking_config_path_config = plants_docking_config_path_txt.replace(".txt", ".config")
    with open(plants_docking_config_path_config, 'w') as configwriter:
        configwriter.writelines(plants_config)
    configwriter.close()
    # os.rename(plants_docking_config_path_txt, plants_docking_config_path_config)
    #Run PLANTS docking
    plants_docking_command = "cd "+software+" && ./PLANTS --mode screen "+plants_docking_config_path_config
    os.system(plants_docking_command)
    return

plants_docking_command, binding_site_x, binding_site_y, binding_site_z, binding_site_radius = plants_docking()

  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /home/tony/Documents/consensus_docking_python/2o1x_A_apo_protoss.pdb)

1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /home/tony/Documents/consensus_docking_python/2o1x_A_lig_protoss.pdb)

1 molecule converted
10 molecules converted




                  PLANTS
        Protein-Ligand ANT System
               version 1.2


author: Oliver Korb

scientific contributors: T.E. Exner, T. Stuetzle

contact: Oliver.Korb@uni-konstanz.de


run PLANTS: PLANTS --mode screen yourconfigfile

PLANTS info: CHO hydrogen found 188 HD2 HIS17
PLANTS info: CHO hydrogen found 189 HE1 HIS17
PLANTS info: CHO hydrogen found 736 HE1 HIS51
PLANTS info: CHO hydrogen found 949 HD2 HIS66
PLANTS info: CHO hydrogen found 950 HE1 HIS66
PLANTS info: CHO hydrogen found 1212 HE1 HIS82
PLANTS info: CHO hydrogen found 1755 HD2 HIS117
PLANTS info: CHO hydrogen found 1756 HE1 HIS117
PLANTS info: CHO hydrogen found 2177 HE1 HIS147
PLANTS info: CHO hydrogen found 3177 HD2 HIS262
PLANTS info: CHO hydrogen found 3178 HE1 HIS262
PLANTS info: CHO hydrogen found 3858 HE1 HIS304
PLANTS info: CHO hydrogen found 4713 HE1 HIS362
PLANTS info: CHO hydrogen found 4744 HE1 HIS364
PLANTS info: CHO hydrogen found 5423 HD2 HIS408
PLANTS info: CHO hydrogen found 5424 HE1 H

TypeError: cannot unpack non-iterable NoneType object

8. LeDock docking

In [42]:
# def get_box(selection='sele', extending = 6.0):
#     ([minX, minY, minZ],[maxX, maxY, maxZ]) = cmd.get_extent(selection)
#     minX = minX - float(extending)
#     minY = minY - float(extending)
#     minZ = minZ - float(extending)
#     maxX = maxX + float(extending)
#     maxY = maxY + float(extending)
#     maxZ = maxZ + float(extending)      
#     SizeX = maxX - minX
#     SizeY = maxY - minY
#     SizeZ = maxZ - minZ
#     CenterX =  (maxX + minX)/2
#     CenterY =  (maxY + minY)/2
#     CenterZ =  (maxZ + minZ)/2        
#     cmd.delete('all')        
#     return minX, maxX, minY, maxY, minZ, maxZ, CenterX, CenterY, CenterZ
# cmd.load(filename=ref_file,format=str(pathlib.Path(ref_file).suffix)[1:],object='lig')
# minX, maxX, minY, maxY, minZ, maxZ, CenterX, CenterY, CenterZ = get_box(selection='lig')
# print(CenterX, CenterY, CenterZ)
# print(binding_site_x, binding_site_y, binding_site_z)

In [27]:
def ledock_docking():
    # Make LeDock temp folders
    final_library = root_dir+'/final_library.sdf'
    ledock_folder = root_dir+'/temp/ledock/'
    ledock_ligands_folder = root_dir+'/temp/ledock/ligands/'
    try:
        os.mkdir(ledock_folder, mode = 0o777)
    except:
        print('LeDock root folder already exists')
    try:
        os.mkdir(ledock_ligands_folder, mode = 0o777)
    except:
        print('LeDock ligand folder already exists')
    #Prepare protein using LePro
    try:
        lepro_command = "cd "+software+" && ./lepro_linux_x86 "+protein_file+" -metal -rot"
        os.system(lepro_command)
        shutil.move(software+"pro.pdb", ledock_folder+"pro.pdb")
        lepro_file = ledock_folder+"pro.pdb"
    except:
        print('LePro command failed!')
    #Split ligands into sdf files
    full_sdf = PandasTools.LoadSDF(final_library, idName=id_column, molColName='Molecule',includeFingerprints=False, embedProps=False, removeHs=False, strictParsing=True)
    for i, row in full_sdf.iterrows():
        id = row[id_column]
        path = ledock_ligands_folder+id+".sdf"
        single_sdf=full_sdf.loc[full_sdf[id_column]==id]
        PandasTools.WriteSDF(single_sdf, path, molColName='Molecule', idName=id_column, properties=None, allNumeric=False)
    #Convert ligand sdf files to mol2 files
    sdf_list = os.listdir(ledock_ligands_folder)
    path_list = []
    for x in sdf_list:
        y = ledock_ligands_folder+x
        path_list.append(y)
    for z in path_list:
        try:
            obabel_command = 'obabel -isdf '+z+' -O '+z.replace(".sdf", ".mol2")
            os.system(obabel_command)
            os.remove(z)
        except:
            print('ERROR: Failed to convert library file to .mol2!')
    def get_box(selection='sele', extending = 6.0):
        ([minX, minY, minZ],[maxX, maxY, maxZ]) = cmd.get_extent(selection)
        minX = minX - float(extending)
        minY = minY - float(extending)
        minZ = minZ - float(extending)
        maxX = maxX + float(extending)
        maxY = maxY + float(extending)
        maxZ = maxZ + float(extending)      
        SizeX = maxX - minX
        SizeY = maxY - minY
        SizeZ = maxZ - minZ
        CenterX =  (maxX + minX)/2
        CenterY =  (maxY + minY)/2
        CenterZ =  (maxZ + minZ)/2        
        cmd.delete('all')        
        return minX, maxX, minY, maxY, minZ, maxZ
    cmd.load(filename=ref_file,format=str(pathlib.Path(ref_file).suffix)[1:],object='lig')
    minX, maxX, minY, maxY, minZ, maxZ = get_box(selection='lig')
    #Generate LeDock docking file
    ledock_ligand_list = root_dir+"/temp/ledock/ligands.list"
    with open(ledock_ligand_list,'w') as l_out:
        for element in os.listdir(ledock_ligands_folder):
            writer = (ledock_ligands_folder+element).replace(".sdf", ".mol2")+'\n'
            l_out.write(writer)
    l_out.close()
    lepro_file = ledock_folder+"pro.pdb"
    file=[
        'Receptor\n',
        lepro_file + '\n\n',
        'RMSD\n',
        '1.0\n\n',
        'Binding pocket\n',
        str(minX),' ',str(maxX),'\n',
        str(minY),' ',str(maxY),'\n',
        str(minZ),' ',str(maxZ),'\n\n',
        'Number of binding poses\n',
        str(n_poses+1 ) + '\n\n',
        'Ligands list\n',
        ledock_ligand_list + '\n\n',
        'END']
    #Write LeDock config file
    ledock_config = root_dir+"/temp/ledock/dock.in"
    with open(ledock_config,'w') as ledock_config_output:
        for line in file:
            ledock_config_output.write(line)
    ledock_config_output.close()
    #Run LeDock
    ledock_docking_command = "cd "+software+" && ./ledock_linux_x86 "+ledock_config
    os.system(ledock_docking_command)
    #Remove .mol2 files
    for x in os.listdir(ledock_ligands_folder):
        if x.endswith('.mol2'):
            os.remove(ledock_ligands_folder+x)
        else:
            pass
    ledock_ligands_folder = root_dir+'/temp/ledock/ligands/'
    #Get .dok files
    ledock_dok_files = []
    for file in os.listdir(ledock_ligands_folder):
        if file.endswith(".dok"):
            dok = ledock_ligands_folder+file
            ledock_dok_files.append(dok)
    #Split .dok files into .txt
    for x in ledock_dok_files:
        op = ''
        start = 0
        counter = 1
        file_name = x.replace(root_dir+"/temp/ledock/ligands/", "")
        file_name = file_name.replace(".dok", "")
        file_name = file_name+"_LeDock_"
        with open(x, 'r') as f:
            for x in f.read().split('\n'):
                if x.__contains__("Cluster"):
                    if (start==1):
                        with open(ledock_ligands_folder+"/"+file_name+str(counter)+".txt", "w") as output_file:
                            output_file.write(op)
                            op=''
                            counter+=1
                    else:
                        start=1
                else:
                    op = op+'\n'+x
    #Fix .txt files
    ledock_txt_files = []
    for file in os.listdir(ledock_ligands_folder):
        if file.endswith(".txt"):
            txt = ledock_ligands_folder+file
            ledock_txt_files.append(txt)
        else:
            pass
    for y in ledock_txt_files:
        new=["NEW FILE START\n"]
        new_file = y.replace(".txt", "_cleaned.txt")
        with open(y, 'r') as txt_file:
            for line in txt_file.read().split('\n'):
                if line.__contains__("ATOM"):
                    line=line+'\n'
                    new.append(line)
                else:
                    pass
            txt_file.close()
            new.append("END\n")
            os.remove(y)
        with open(new_file, 'w') as out_file:
            for lines in new:
                out_file.write(lines)
            out_file.close()
    ledock_cleaned_files = []
    for file in os.listdir(ledock_ligands_folder):
        if file.endswith(".txt"):
            cleaned = ledock_ligands_folder+file
            ledock_cleaned_files.append(cleaned)
        else:
            pass
    for input in ledock_cleaned_files:   
        ledock_pdb=input.replace("cleaned", "mod")
        with open(input, 'r') as f:
            doc=[line for line in f.readlines()]
        doc=[line.replace(line.split()[2],line.split()[2].upper()) if 'ATOM' in line else line for line in doc]
        start=[index for (index,p) in enumerate(doc) if 'NEW FILE' in p]
        finish=[index-1 for (index,p) in enumerate(doc) if 'NEW FILE' in p]
        finish.append(len(doc))
        interval=list(zip(start,finish[1:]))
        for num,i in enumerate(interval):
            block = ",".join(doc[i[0]:i[1]]).replace(',','')
            with open(ledock_pdb, 'w') as w:
                w.write(block)
    for file in os.listdir(ledock_ligands_folder):
        file = root_dir+"/temp/ledock/ligands/"+file
        if file.endswith("_cleaned.txt"):
            os.remove(file)
    for file in os.listdir(ledock_ligands_folder):
        if file.endswith("_mod.txt"):
            id = file.replace("_mod.txt", "")
            output = str(root_dir+"/temp/ledock/ligands/"+id+".pdb")
            with open(str(root_dir+"/temp/ledock/ligands/"+file), 'rt') as i:
                lines = i.readlines()
                test = []
                for l in lines:
                    if l.__contains__('NEW FILE'):
                        l = l.replace("NEW FILE START\n", "HEADER\nTITLE     "+id+'\n')
                        test.append(l)
                    if l.__contains__('ATOM'):
                        line_part_1 = l.split()
                        line_part_2 = l[30:53]
                        if len(str(line_part_1[1]))>1:
                            pass
                        if len(str(line_part_1[1]))==1:
                            fixed_atom_num = ' '+str(line_part_1[1])
                            line_part_1[1] = fixed_atom_num
                        if len(line_part_1[2])>1:
                            pass
                        if len(line_part_1[2])==1:
                            fixed_atom_code = ' '+line_part_1[2]
                            line_part_1[2] = fixed_atom_code
                        new_line = line_part_1[0]+'     '+line_part_1[1]+' '+line_part_1[2]+'   '+line_part_1[3]+'     '+line_part_1[4]+'    '+line_part_2+'                       '+line_part_1[2]+'\n'
                        test.append(new_line)
                output_string = ''
                for t in test:
                    output_string = output_string+t
                with open(output, 'wt') as o:
                    o.write(output_string)
    #Convert to .sdf and remove .dok files
    ledock_pdb_files = []
    for file in os.listdir(ledock_ligands_folder):
        if file.endswith(".pdb"):
            pdb = ledock_ligands_folder+file
            ledock_pdb_files.append(pdb)
        if file.endswith('.dok'):
            os.remove(ledock_ligands_folder+file)
        else:
            pass
    for file in ledock_pdb_files:
        try:
            obabel_command = 'obabel -ipdb '+file+' -O '+file.replace(".pdb", ".sdf")
            os.system(obabel_command)
            os.remove(file)
        except:
            print('ERROR: Failed to convert library file to .sdf!')
    #Concatenate .sdf files
    df = pd.DataFrame()
    ledock_sdf_files = []
    for file in os.listdir(ledock_ligands_folder):
        if file.endswith(".sdf"):
            sdf = ledock_ligands_folder+file
            ledock_sdf_files.append(sdf)
        else:
            pass
    for sdf in ledock_sdf_files:
        df1 = PandasTools.LoadSDF(sdf, idName='TITLE', molColName='Structure',includeFingerprints=False, embedProps=False, removeHs=False, strictParsing=False)
        df = pd.concat([df, df1])
    df = df.rename(columns={'TITLE':'Pose ID'})
    df = df.rename(columns={'HEADER':'ID'})
    for i, row in df.iterrows():
        row['Pose ID'] = row['Pose ID'].replace(root_dir+"/temp/ledock/ligands/", "")
        row[id_column] = row['Pose ID'].split('_')[0]
        row['Pose ID'] = row['Pose ID'].replace(".pdb", "")
    PandasTools.WriteSDF(df, root_dir+"/temp/ledock/ligands/all_poses.sdf", molColName='Structure', idName='Pose ID', properties=list(df.columns), allNumeric=True)
    #Remove extra .sdf files
    ledock_extra_files = []
    for file in os.listdir(ledock_ligands_folder):
        if "_LeDock_" in file:
            sdf = ledock_ligands_folder+file
            ledock_extra_files.append(sdf)
    for file in ledock_extra_files:
        os.remove(file)

with concurrent.futures.ThreadPoolExecutor() as executor:
    result = executor.submit(ledock_docking)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /home/tony/Documents/consensus_docking_python/temp/ledock/ligands/FCG16141527_LeDock_7.pdb)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
  Failed to kekulize aromatic bonds in OBMol::PerceiveB

9. Concatenate poses

In [32]:
def fetch_poses():
    plants_docking_results_mol2 = root_dir+"/temp/plants/results/docked_ligands.mol2"
    plants_docking_results_sdf = plants_docking_results_mol2.replace(".mol2", ".sdf")
    # Convert PLANTS poses to sdf
    try:
        obabel_command = 'obabel -imol2 '+plants_docking_results_mol2+' -O '+plants_docking_results_sdf 
        os.system(obabel_command)
    except:
        print('ERROR: Failed to convert PLANTS poses file to .sdf!')
    smina_docking_results = root_dir+"/temp/smina/docked.sdf"
    gnina_docking_results = root_dir+"/temp/gnina/docked.sdf"
    #Fetch PLANTS poses
    try:
        plants_df = PandasTools.LoadSDF(plants_docking_results_sdf, idName=id_column, molColName='Molecule',includeFingerprints=False, embedProps=False, removeHs=False, strictParsing=True)
        for i, row in plants_df.iterrows():
            split = row[id_column].split("_")
            conformer_id = str(split[4])
            plants_df.loc[i, ['Pose ID']] = split[0]+"_PLANTS_"+conformer_id
            plants_df.loc[i, ['ID']] = split[0]
    except:
        print('ERROR: Failed to Load PLANTS poses SDF file!')
    #Fetch SMINA poses
    try:
        smina_df = PandasTools.LoadSDF(smina_docking_results, idName=id_column, molColName='Molecule',includeFingerprints=False, embedProps=False, removeHs=False, strictParsing=True)
        smina_df = smina_df[[id_column, 'Molecule']]
        list_ = [*range(1, n_poses+1, 1)]
        ser = list_ * int(len(smina_df)/len(list_))
        smina_df['number'] = ser + list_[:len(smina_df)-len(ser)]
        for i, row in smina_df.iterrows():
            smina_df.loc[i, ['Pose ID']] = row[id_column]+"_SMINA_"+str(row['number'])
        smina_df.drop('number', axis=1, inplace=True)
    except:
        print('ERROR: Failed to Load SMINA poses SDF file!')
    #Fetch GNINA poses
    try:
        gnina_df = PandasTools.LoadSDF(gnina_docking_results, idName=id_column, molColName='Molecule',includeFingerprints=False, embedProps=False, removeHs=False, strictParsing=True)
        gnina_df = gnina_df[[id_column, 'Molecule']]
        list_ = [*range(1, n_poses+1, 1)]
        ser = list_ * int(len(gnina_df)/len(list_))
        gnina_df['number'] = ser + list_[:len(gnina_df)-len(ser)]
        for i, row in gnina_df.iterrows():
            gnina_df.loc[i, ['Pose ID']] = row[id_column]+"_GNINA_"+str(row['number'])
        gnina_df.drop('number', axis=1, inplace=True)
    except:
        print('ERROR: Failed to Load GNINA poses SDF file!')
    #Fetch LeDock poses
    ledock_docking_results = root_dir+"/temp/ledock/ligands/all_poses.sdf"
    try:
        ledock_df = PandasTools.LoadSDF(ledock_docking_results, idName='Pose ID', molColName='Molecule',includeFingerprints=False, embedProps=False, removeHs=False, strictParsing=True)
    except:
        print('ERROR: Failed to Load GNINA poses SDF file!')
    #Concatenate all poses to single dataframe
    all_poses = pd.concat([plants_df, smina_df, gnina_df, ledock_df], ignore_index=True)
    return (all_poses)

all_poses = fetch_poses()

  Failed to kekulize aromatic bonds in MOL2 file (title is FCG16600623_entry_00004_conf_01)

  Failed to kekulize aromatic bonds in MOL2 file (title is FCG16600623_entry_00004_conf_02)

  Failed to kekulize aromatic bonds in MOL2 file (title is FCG16600623_entry_00004_conf_03)

  Failed to kekulize aromatic bonds in MOL2 file (title is FCG16600623_entry_00004_conf_04)

  Failed to kekulize aromatic bonds in MOL2 file (title is FCG16600623_entry_00004_conf_05)

  Failed to kekulize aromatic bonds in MOL2 file (title is FCG16600623_entry_00004_conf_06)

  Failed to kekulize aromatic bonds in MOL2 file (title is FCG16600623_entry_00004_conf_07)

  Failed to kekulize aromatic bonds in MOL2 file (title is FCG16600623_entry_00004_conf_08)

  Failed to kekulize aromatic bonds in MOL2 file (title is FCG16600623_entry_00004_conf_09)

  Failed to kekulize aromatic bonds in MOL2 file (title is FCG16600623_entry_00004_conf_10)

  Failed to kekulize aromatic bonds in MOL2 file (title is FCG17585042

In [33]:
#Set _Name property for all poses
poses=all_poses['Molecule']
renamed_poses=[]
for i,row in all_poses.iterrows():
	p = row['Molecule']
	p.SetProp('_Name',str(row['Pose ID']))
	renamed_poses.append(p)
all_poses['Molecule'] = renamed_poses
PandasTools.WriteSDF(all_poses, root_dir+"/temp/allposes.sdf", molColName='Molecule', idName='Pose ID')

In [34]:
clustering_metrics_folder = root_dir+'/temp/clustering_metrics/'
try:
    os.mkdir(clustering_metrics_folder)
except:
    print('Clustering metrics folder already exists')

clustering_folder = root_dir+'/temp/clustering/'
try:
    os.mkdir(clustering_folder)
except:
    print('Clustering folder already exists')

Clustering metrics folder already exists
Clustering folder already exists


10. Functions for clustering metrics

In [29]:
def simpleRMSD(dataframe):
    size=len(dataframe)
    table=pd.DataFrame()
    for i,row in tqdm(dataframe.iterrows()):
        for j,jrow in dataframe.iterrows():
            mol = row['Molecule']
            jmol = jrow['Molecule']
            # MCS identification between reference pose and target pose
            r=rdFMCS.FindMCS([mol,jmol])
            # Atom map for reference and target              
            a=mol.GetSubstructMatch(Chem.MolFromSmarts(r.smartsString))
            b=jmol.GetSubstructMatch(Chem.MolFromSmarts(r.smartsString))
            # Atom map generation     
            amap=list(zip(a,b))
            # distance calculation per atom pair
            distances=[]
            for atomA, atomB in amap:
                pos_A=mol.GetConformer().GetAtomPosition (atomA)
                pos_B=jmol.GetConformer().GetAtomPosition (atomB)
                coord_A=np.array((pos_A.x,pos_A.y,pos_A.z))
                coord_B=np.array ((pos_B.x,pos_B.y,pos_B.z))
                dist_numpy = np.linalg.norm(coord_A-coord_B)        
                distances.append(dist_numpy)      
            # This is the RMSD formula from wikipedia
            rmsd=math.sqrt(1/len(distances)*sum([i*i for i in distances])) 
            #saving the rmsd values to a matrix and a table for clustering
            table.loc[mol.GetProp('_Name'),jmol.GetProp('_Name')]=rmsd
    return table

def spyRMSD(path_list):
  table=pd.DataFrame()
  for mol in tqdm(enumerate(path_list)):
      for jmol in enumerate(path_list):
          ref = io.loadmol(mol[1])
          test = io.loadmol(jmol[1])
          ref.strip()
          test.strip()
          coords_ref = ref.coordinates
          anum_ref = ref.atomicnums
          adj_ref = ref.adjacency_matrix
          coords_test = test.coordinates
          anum_test = test.atomicnums
          adj_test = test.adjacency_matrix
          spyrmsd = rmsd.symmrmsd(coords_ref,coords_test,anum_ref,anum_test,adj_ref,adj_test)
          id = mol[1].replace(clustering_metrics_folder, "").replace(".sdf", "")
          jid = jmol[1].replace(clustering_metrics_folder, "").replace(".sdf", "")
          table.loc[id, jid]=spyrmsd
  return table

def espsim_default(dataframe):
	size=len(dataframe)
	hmap=np.empty(shape=(size,size))
	table=pd.DataFrame()
	for i, row in tqdm(dataframe.iterrows()):
		for j, jrow in dataframe.iterrows():
			mol = row['Molecule']
			jmol = jrow['Molecule']
			id = row['Pose ID']
			jid = jrow['Pose ID']
			espsim = GetEspSim(mol, jmol, renormalize=True)
			table.loc[id,jid]=espsim
	return table

def espsim_mmff(dataframe):
	size=len(dataframe)
	hmap=np.empty(shape=(size,size))
	table=pd.DataFrame()
	for i, row in tqdm(dataframe.iterrows()):
		for j, jrow in dataframe.iterrows():
			mol = row['Molecule']
			jmol = jrow['Molecule']
			id = row['Pose ID']
			jid = jrow['Pose ID']
			espsim = GetEspSim(mol, jmol, partialCharges = 'mmff', renormalize=True)
			table.loc[id,jid]=espsim
	return table

14. Clustering methods

In [30]:
def kmedoids_E_clustering(input_dataframe):
	dataframe = input_dataframe.copy()
	# Get column names for renaming
	column_list = dataframe.columns.tolist()
	# Calculate maximum number of clusters
	if len(dataframe)>20:
		max_clusters = 20
	else:
		max_clusters = len(dataframe)
	# Define function for inertia/sum of squares calculation
	def calculate_ss(data):
		wcss = []
		for n in range(2, max_clusters):
			kmedoids = KMedoids(n_clusters=n, init='heuristic')
			kmedoids.fit(X=dataframe)
			wcss.append(kmedoids.inertia_)
		return wcss
	# Calculate sum of squares and range for k
	sum_of_squares = calculate_ss(dataframe)
	cluster_range = range(2, max_clusters)
	# Determine elbow value
	kn = KneeLocator(cluster_range, sum_of_squares, curve='convex', direction='decreasing')
	if kn.knee is None:
		opt_clusters = 5
	else:
		opt_clusters = 5
	#Plotting
	plt.xlabel('k')
	plt.ylabel('Sum of squared distances')
	plt.plot(cluster_range, sum_of_squares, 'bx-')
	plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')
	# Apply optimised k-medoids clustering
	kmedoids = KMedoids(n_clusters=opt_clusters, init='heuristic')
	clusters = kmedoids.fit_predict(dataframe)
	dataframe['KMedoids Cluster']=clusters
	# Determine centers
	centers_array = kmedoids.cluster_centers_
	cluster_centers = pd.DataFrame(centers_array, columns = column_list)
	# Assign IDs to cluster centers
	cluster_centers['Pose ID'] = "NaN"
	dataframe['Pose ID'] = "NaN"
	for i, row in dataframe.iterrows():
		dataframe.loc[i, 'Pose ID'] = i
	for j, jrow in cluster_centers.iterrows():
		for x in enumerate(cluster_centers.columns):
			if jrow[x[1]]==1 or jrow[x[1]]==0:
				cluster_centers.loc[j, 'Pose ID'] = x[1]
			else:
				pass
	# Remove columns used for clustering
	cluster_centers_cleaned = cluster_centers[['Pose ID']]
	cluster_centers_cleaned = cluster_centers_cleaned.merge(dataframe, on='Pose ID')
	cluster_centers_cleaned = cluster_centers_cleaned.merge(all_poses, on='Pose ID')
	cluster_centers_cleaned = cluster_centers_cleaned.drop(columns=column_list)
	return cluster_centers_cleaned

In [31]:
def kmedoids_S_clustering(input_dataframe):
	dataframe = input_dataframe.copy()
	# Get column names for renaming
	column_list = dataframe.columns.tolist()
	# Calculate maximum number of clusters
	if len(dataframe)>20:
		max_clusters = 20
	else:
		max_clusters = len(dataframe)
	range_n_clusters = range(2,max_clusters)
	# Define function for silhouette calculation
	silhouettes = []
	for n_clusters in range_n_clusters:
		clusterer = KMedoids(n_clusters=n_clusters, init='heuristic')
		cluster_labels = clusterer.fit_predict(dataframe)
		silhouette_n = silhouette_score(dataframe, cluster_labels)
		silhouettes.append(silhouette_n)
	silhouette_df = pd.DataFrame(columns=['n', 'silhouette_score'])
	silhouette_df['n'] = range_n_clusters
	silhouette_df['silhouette_score'] = silhouettes
	opt_clusters = int(silhouette_df.loc[silhouette_df['silhouette_score'].idxmax(), ['n']][0])
	# Plot
	plt.xlabel('k')
	plt.ylabel('silhouettes')
	plt.plot(np.array(silhouette_df['n']), np.array(silhouette_df['silhouette_score']), 'bx-')
	plt.vlines(opt_clusters, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')
	# Apply optimised k-medoids clustering
	kmedoids = KMedoids(n_clusters=opt_clusters, init='heuristic')
	clusters = kmedoids.fit_predict(dataframe)
	dataframe['KMedoids Cluster']=clusters
	# Determine centers
	centers_array = kmedoids.cluster_centers_
	cluster_centers = pd.DataFrame(centers_array, columns = column_list)
	# Assign IDs to cluster centers
	cluster_centers['Pose ID'] = "NaN"
	dataframe['Pose ID'] = "NaN"
	for i, row in dataframe.iterrows():
		dataframe.loc[i, 'Pose ID'] = i
	for j, jrow in cluster_centers.iterrows():
		for x in enumerate(cluster_centers.columns):
			if jrow[x[1]]==1 or jrow[x[1]]==0:
				cluster_centers.loc[j, 'Pose ID'] = x[1]
			else:
				pass
	# Remove columns used for clustering
	cluster_centers_cleaned = cluster_centers[['Pose ID']]
	cluster_centers_cleaned = cluster_centers_cleaned.merge(dataframe, on='Pose ID')
	cluster_centers_cleaned = cluster_centers_cleaned.merge(all_poses, on='Pose ID')
	cluster_centers_cleaned = cluster_centers_cleaned.drop(columns=column_list)
	return cluster_centers_cleaned

In [41]:
id_list = np.unique(np.array(all_poses['ID']))
print(id_list)
RMSD_kE_full = pd.DataFrame()
spyRMSD_kE_full = pd.DataFrame()
espsim_d_kE_full = pd.DataFrame()
mmff_kE_full = pd.DataFrame()
spyRMSD_mmff_kE_full = pd.DataFrame()
RMSD_kS_full = pd.DataFrame()
spyRMSD_kS_full = pd.DataFrame()
espsim_d_kS_full = pd.DataFrame()
mmff_kS_full = pd.DataFrame()
spyRMSD_mmff_kS_full = pd.DataFrame()

@lru_cache(maxsize = 40)
def clustering():
    for id in tqdm(id_list):
        print("Clustering "+id)
        filtered_df = all_poses[all_poses.ID==id]
        for i, row in filtered_df.iterrows():
            single_line = filtered_df[filtered_df.index==i]
            pose_path = clustering_metrics_folder+row['Pose ID']+'.sdf'
            PandasTools.WriteSDF(single_line, pose_path, molColName='Molecule', idName='Pose ID')
        poses = []
        for file in os.listdir(clustering_metrics_folder):
            poses.append(clustering_metrics_folder+file)
        filtered_df_simple_RMSD = simpleRMSD(filtered_df)
        column_reordering = filtered_df_simple_RMSD.columns
        filtered_df_spyRMSD = spyRMSD(poses)
        filtered_df_spyRMSD = filtered_df_spyRMSD.reindex(columns=column_reordering)
        filtered_df_espsim = espsim_default(filtered_df)
        filtered_df_mmff= espsim_mmff(filtered_df)
        filtered_df_spyRMSD_mmff = filtered_df_spyRMSD*filtered_df_mmff
        for file in poses:
            os.remove(file)
        RMSD_kE_centers = kmedoids_E_clustering(filtered_df_simple_RMSD)
        spyRMSD_kE_centers = kmedoids_E_clustering(filtered_df_spyRMSD)
        espsim_d_kE_centers = kmedoids_E_clustering(filtered_df_espsim)
        mmff_kE_centers = kmedoids_E_clustering(filtered_df_mmff)
        spyRMSD_mmff_kE_centers = kmedoids_E_clustering(filtered_df_spyRMSD_mmff)
        RMSD_kS_centers = kmedoids_S_clustering(filtered_df_simple_RMSD)
        spyRMSD_kS_centers = kmedoids_S_clustering(filtered_df_spyRMSD)
        espsim_d_kS_centers = kmedoids_S_clustering(filtered_df_espsim)
        mmff_kS_centers = kmedoids_S_clustering(filtered_df_mmff)
        spyRMSD_mmff_kS_centers = kmedoids_S_clustering(filtered_df_spyRMSD_mmff)

        RMSD_kE_full = RMSD_kE_full.append(RMSD_kE_centers)
        spyRMSD_kE_full = spyRMSD_kE_full.append(spyRMSD_kE_centers)
        espsim_d_kE_full = espsim_d_kE_full.append(espsim_d_kE_centers)
        mmff_kE_full = mmff_kE_full.append(mmff_kE_centers)
        spyRMSD_mmff_kE_full = spyRMSD_mmff_kE_full.append(spyRMSD_mmff_kE_centers)
        RMSD_kS_full = RMSD_kS_full.append(RMSD_kS_centers)
        spyRMSD_kS_full = spyRMSD_kS_full.append(spyRMSD_kS_centers)
        espsim_d_kS_full = espsim_d_kS_full.append(espsim_d_kS_centers)
        mmff_kS_full = mmff_kS_full.append(mmff_kS_centers)
        spyRMSD_mmff_kS_full = spyRMSD_mmff_kS_full.append(spyRMSD_mmff_kS_centers)
    return RMSD_kE_full, spyRMSD_kE_full, espsim_d_kE_full, mmff_kE_full, spyRMSD_mmff_kE_full, RMSD_kS_full, spyRMSD_kS_full, espsim_d_kS_full, mmff_kS_full, spyRMSD_mmff_kS_full

RMSD_kE_full, spyRMSD_kE_full, espsim_d_kE_full, mmff_kE_full, spyRMSD_mmff_kE_full, RMSD_kS_full, spyRMSD_kS_full, espsim_d_kS_full, mmff_kS_full, spyRMSD_mmff_kS_full = clustering()

RMSD_kE_full_path = clustering_folder+'RMSD_kE_full.sdf'
spyRMSD_kE_full_path = clustering_folder+'spyRMSD_kE_full.sdf'
espsim_d_kE_full_path = clustering_folder+'espsim_d_kE_full.sdf'
mmff_kE_full_path = clustering_folder+'mmff_kE_full.sdf'
spyRMSD_mmff_kE_full_path = clustering_folder+'spyRMSD_mmff_kE_full.sdf'
RMSD_kS_full_path = clustering_folder+'RMSD_kS_full.sdf'
spyRMSD_kS_full_path = clustering_folder+'spyRMSD_kS_full.sdf'
espsim_d_kS_full_path = clustering_folder+'espsim_d_kS_full.sdf'
mmff_kS_full_path = clustering_folder+'mmff_kS_full.sdf'
spyRMSD_mmff_kS_full_path = clustering_folder+'spyRMSD_mmff_kS_full.sdf'

PandasTools.WriteSDF(RMSD_kE_full, RMSD_kE_full_path, molColName='Molecule', idName='Pose ID')
PandasTools.WriteSDF(spyRMSD_kE_full, spyRMSD_kE_full_path, molColName='Molecule', idName='Pose ID')
PandasTools.WriteSDF(espsim_d_kE_full, espsim_d_kE_full_path, molColName='Molecule', idName='Pose ID')
PandasTools.WriteSDF(mmff_kE_full, mmff_kE_full_path, molColName='Molecule', idName='Pose ID')
PandasTools.WriteSDF(spyRMSD_mmff_kE_full, spyRMSD_mmff_kE_full_path, molColName='Molecule', idName='Pose ID')
PandasTools.WriteSDF(RMSD_kS_full, RMSD_kS_full_path, molColName='Molecule', idName='Pose ID')
PandasTools.WriteSDF(spyRMSD_kS_full, spyRMSD_kS_full_path, molColName='Molecule', idName='Pose ID')
PandasTools.WriteSDF(espsim_d_kS_full, espsim_d_kS_full_path, molColName='Molecule', idName='Pose ID')
PandasTools.WriteSDF(mmff_kS_full, mmff_kS_full_path, molColName='Molecule', idName='Pose ID')
PandasTools.WriteSDF(spyRMSD_mmff_kS_full, spyRMSD_mmff_kS_full_path, molColName='Molecule', idName='Pose ID')


['FCG1390566' 'FCG16141527' 'FCG16425532' 'FCG16600623' 'FCG16952409'
 'FCG17274676' 'FCG17585042' 'FCG17822054' 'FCG18066182' 'FCG18472628']


  0%|          | 0/10 [00:00<?, ?it/s]

Clustering FCG1390566


40it [00:02, 13.45it/s]
0it [00:00, ?it/s]
  0%|          | 0/10 [00:03<?, ?it/s]


AssertionError: 

Rescoring

In [None]:
rescoring_folder = root_dir+'/temp/rescoring'
gnina_rescoring_folder = rescoring_folder+'/gnina_rescoring/'
smina_rescoring_folder = rescoring_folder+'/smina_rescoring/'
plp_rescoring_folder = rescoring_folder+'/plp_rescoring/'
chemplp_rescoring_folder = rescoring_folder+'/chemplp_rescoring/'
rfscore_rescoring_folder = rescoring_folder+'/rfscore_rescoring/'
vinardo_rescoring_folder = rescoring_folder+'/vinardo_rescoring/'
ecif_rescoring_folder = rescoring_folder+'/ecif_rescoring/'

try:
    os.mkdir(rescoring_folder)
except:
	print("Rescoring folder already exists")
try:
    os.mkdir(gnina_rescoring_folder)
except:
	print("GNINA Rescoring folder already exist")
try:
    os.mkdir(smina_rescoring_folder)
except:
	print("SMINA Rescoring folder already exists")
try:
    os.mkdir(plp_rescoring_folder)
except:
	print("PLP Rescoring folder already exists")
try:
    os.mkdir(chemplp_rescoring_folder)
except:
	print("CHEMPLP Rescoring folder already exists")
try:
    os.mkdir(rfscore_rescoring_folder)
except:
	print("RFScore Rescoring folder already exists")
try:
    os.mkdir(vinardo_rescoring_folder)
except:
	print("VINARDO Rescoring folder already exists")

GNINA

In [None]:
def gnina_rescoring(sdf):
	cnn = 'crossdock_default2018'
	results = gnina_rescoring_folder+'/rescored_'+cnn+'.sdf'
	log_file = gnina_rescoring_folder+'log_'+cnn+'.txt'

	gnina_cmd = "gnina -r "+protein_file+" -l "+sdf+" --autobox_ligand "+ref_file+" -o "+results+" --cnn "+cnn+" --log "+log_file+" --score_only"
	docker_cmd = "sudo docker run --name gnina --rm -v /home:/home gnina/gnina " + gnina_cmd 

	subprocess.call(docker_cmd, shell=True)

	log = open(log_file, "r")
	lines = log.readlines()
	keep = []
	for l in lines:
		if l.startswith("##") or l.startswith("Affinity") or l.startswith("CNN") or l.startswith("Intramolecular"):
			keep.append(l)
		else:
			pass
	for k in keep:
		if k.startswith("## Name"):
			keep.remove(k)
		else:
			pass
	keep = [keep[i:i+5] for i in range(0, len(keep), 5)]
	gnina_rescoring_results = pd.DataFrame()
	for k in enumerate(keep):
		i = k[1]
		for j in i:
			if j.startswith("##"):
				a = j.split()
				gnina_rescoring_results.loc[k[0], 'Pose ID'] = a[1]
			if j.startswith("Affinity"):
				b = j.split()
				gnina_rescoring_results.loc[k[0], 'Affinity_'+cnn] = b[1]
			if j.startswith("CNNscore"):
				c = j.split()
				gnina_rescoring_results.loc[k[0], 'CNNscore_'+cnn] = c[1]
			if j.startswith("CNNaffinity"):
				d = j.split()
				gnina_rescoring_results.loc[k[0], 'CNNaffinity_'+cnn] = d[1]
			if j.startswith("Intramolecular"):
				e = j.split()
				gnina_rescoring_results.loc[k[0], 'Intramolecular'+cnn] = e[2]
			else:
				pass
	return gnina_rescoring_results

gnina_rescoring_results = gnina_rescoring('/home/tony/Documents/consensus_docking_python/temp/clustering/RMSD_kE_full.sdf')

SMINA

In [None]:
def smina_rescoring(sdf):
	results = smina_rescoring_folder+'/rescored_.sdf'
	log_file = smina_rescoring_folder+'log.txt'

	smina_cmd = "gnina -r "+protein_file+" -l "+sdf+" --autobox_ligand "+ref_file+" -o "+results+" --log "+log_file+" --score_only --cnn_scoring none"
	docker_cmd = "sudo docker run --name gnina --rm -v /home:/home gnina/gnina " + smina_cmd 

	subprocess.call(docker_cmd, shell=True)

	log = open(log_file, "r")
	lines = log.readlines()
	keep = []
	for l in lines:
		if l.startswith("##") or l.startswith("Affinity") or l.startswith("CNN") or l.startswith("Intramolecular"):
			keep.append(l)
		else:
			pass

	for k in keep:
		if k.startswith("## Name"):
			keep.remove(k)
		else:
			pass

	keep = [keep[i:i+5] for i in range(0, len(keep), 5)]

	smina_rescoring_results = pd.DataFrame()

	for k in enumerate(keep):
		i = k[1]
		for j in i:
			if j.startswith("##"):
				a = j.split()
				smina_rescoring_results.loc[k[0], 'Pose ID'] = a[1]
			if j.startswith("Affinity"):
				b = j.split()
				smina_rescoring_results.loc[k[0], 'Affinity_smina'] = b[1]
			if j.startswith("CNNscore"):
				c = j.split()
				smina_rescoring_results.loc[k[0], 'CNNscore_smina'] = c[1]
			if j.startswith("CNNaffinity"):
				d = j.split()
				smina_rescoring_results.loc[k[0], 'CNNaffinity_smina'] = d[1]
			if j.startswith("Intramolecular"):
				e = j.split()
				smina_rescoring_results.loc[k[0], 'Intramolecular_smina'] = e[2]
			else:
				pass
	return smina_rescoring_results

smina_rescoring_results = smina_rescoring('/home/tony/Documents/consensus_docking_python/temp/clustering/RMSD_kE_full.sdf')

VINARDO

In [None]:
def vinardo_rescoring(sdf):
	results = vinardo_rescoring_folder+'/rescored_.sdf'
	log_file = vinardo_rescoring_folder+'log.txt'

	vinardo_cmd = "gnina -r "+protein_file+" -l "+sdf+" --autobox_ligand "+ref_file+" -o "+results+" --log "+log_file+" --score_only --scoring vinardo --cnn_scoring none"
	docker_cmd = "sudo docker run --name gnina --rm -v /home:/home gnina/gnina " + vinardo_cmd 

	subprocess.call(docker_cmd, shell=True)

	log = open(log_file, "r")
	lines = log.readlines()
	keep = []
	for l in lines:
		if l.startswith("##") or l.startswith("Affinity") or l.startswith("CNN") or l.startswith("Intramolecular"):
			keep.append(l)
		else:
			pass

	for k in keep:
		if k.startswith("## Name"):
			keep.remove(k)
		else:
			pass

	keep = [keep[i:i+5] for i in range(0, len(keep), 5)]

	vinardo_rescoring_results = pd.DataFrame()

	for k in enumerate(keep):
		i = k[1]
		for j in i:
			if j.startswith("##"):
				a = j.split()
				vinardo_rescoring_results.loc[k[0], 'Pose ID'] = a[1]
			if j.startswith("Affinity"):
				b = j.split()
				vinardo_rescoring_results.loc[k[0], 'Affinity_vinardo'] = b[1]
			if j.startswith("CNNscore"):
				c = j.split()
				vinardo_rescoring_results.loc[k[0], 'CNNscore_vinardo'] = c[1]
			if j.startswith("CNNaffinity"):
				d = j.split()
				vinardo_rescoring_results.loc[k[0], 'CNNaffinity_vinardo'] = d[1]
			if j.startswith("Intramolecular"):
				e = j.split()
				vinardo_rescoring_results.loc[k[0], 'Intramolecular_vinardo'] = e[2]
			else:
				pass
	return vinardo_rescoring_results

vinardo_rescoring_results = vinardo_rescoring('/home/tony/Documents/consensus_docking_python/temp/clustering/RMSD_kE_full.sdf')

PLP

In [None]:
plants_search_speed = "speed1"
ants = "20"

def plp_rescoring(sdf):
    #Read protein and ref files generated during PLANTS docking
    plants_protein_mol2 = root_dir+"/temp/plants/protein.mol2"
    plants_ref_mol2 = root_dir+"/temp/plants/ref.mol2"
    #Convert clustered ligand file to .mol2 using open babel
    plants_ligands_mol2 = root_dir+"/temp/plants/ligands.mol2"
    try:
        obabel_command = 'obabel -isdf '+sdf+' -O '+plants_ligands_mol2
        os.system(obabel_command)
    except:
        print('ERROR: Failed to convert clustered library file to .mol2!')
    #Determine binding site coordinates
    plants_binding_site_command = "cd "+software+" && ./PLANTS --mode bind "+plants_ref_mol2+" 6"
    run_plants_binding_site = os.popen(plants_binding_site_command)
    output_plants_binding_site = run_plants_binding_site.readlines()
    keep = []
    for l in output_plants_binding_site:
        if l.startswith("binding"):
            keep.append(l)
        else:
            pass
    binding_site_center = keep[0].split()
    binding_site_radius = keep[1].split()
    binding_site_radius = binding_site_radius[1]
    binding_site_x = binding_site_center[1]
    binding_site_y = binding_site_center[2]
    binding_site_z = binding_site_center[3]
    #Generate plants config file
    plp_rescoring_config_path_txt = plp_rescoring_folder+"config.txt"
    plp_config = ['# search algorithm\n',
    'search_speed '+plants_search_speed+'\n',
    'aco_ants '+ants+'\n',
    'flip_amide_bonds 0\n',
    'flip_planar_n 1\n',
    'force_flipped_bonds_planarity 0\n',
    'force_planar_bond_rotation 1\n',
    'rescore_mode simplex\n',
    'flip_ring_corners 0\n',
    '# scoring functions\n',
    '# Intermolecular (protein-ligand interaction scoring)\n',
    'scoring_function plp\n',
    'outside_binding_site_penalty 50.0\n',
    'enable_sulphur_acceptors 1\n',
    '# Intramolecular ligand scoring\n',
    'ligand_intra_score clash2\n',
    'chemplp_clash_include_14 1\n',
    'chemplp_clash_include_HH 0\n',

    '# input\n',
    'protein_file '+plants_protein_mol2+'\n',
    'ligand_file '+plants_ligands_mol2+'\n',

    '# output\n',
    'output_dir '+plp_rescoring_folder+'results\n',

    '# write single mol2 files (e.g. for RMSD calculation)\n',
    'write_multi_mol2 1\n',

    '# binding site definition\n',
    'bindingsite_center '+binding_site_x+' '+binding_site_y+' '+binding_site_z+'+\n',
    'bindingsite_radius '+binding_site_radius+'\n',

    '# cluster algorithm\n',
    'cluster_structures 10\n',
    'cluster_rmsd 2.0\n',

    '# write\n',
    'write_ranking_links 0\n',
    'write_protein_bindingsite 1\n',
    'write_protein_conformations 1\n',
    'write_protein_splitted 1\n',
    'write_merged_protein 0\n',
    '####\n']
    #Write config file
    plp_rescoring_config_path_config = plp_rescoring_config_path_txt.replace(".txt", ".config")
    config = open(plp_rescoring_config_path_txt, 'w')
    config.writelines(plp_config)
    os.rename(plp_rescoring_config_path_txt, plp_rescoring_config_path_config)
    #Run PLANTS docking
    plp_rescoring_command = "cd "+software+" && ./PLANTS --mode screen "+plp_rescoring_config_path_config
    os.system(plp_rescoring_command)
    #Fetch results
    results_csv_location = plp_rescoring_folder+'results/ranking.csv'
    plp_results = pd.read_csv(results_csv_location, sep=',', header=0)
    return plp_results

plp_results = plp_rescoring('/home/tony/Documents/consensus_docking_python/temp/clustering/RMSD_kE_full.sdf')


CHEMPLP

In [None]:
plants_search_speed = "speed1"
ants = "20"

def chemplp_rescoring(sdf):
    #Read protein and ref files generated during PLANTS docking
    plants_protein_mol2 = root_dir+"/temp/plants/protein.mol2"
    plants_ref_mol2 = root_dir+"/temp/plants/ref.mol2"
    #Convert clustered ligand file to .mol2 using open babel
    plants_ligands_mol2 = root_dir+"/temp/plants/ligands.mol2"
    try:
        obabel_command = 'obabel -isdf '+sdf+' -O '+plants_ligands_mol2
        os.system(obabel_command)
    except:
        print('ERROR: Failed to convert clustered library file to .mol2!')
    #Determine binding site coordinates
    plants_binding_site_command = "cd "+software+" && ./PLANTS --mode bind "+plants_ref_mol2+" 6"
    run_plants_binding_site = os.popen(plants_binding_site_command)
    output_plants_binding_site = run_plants_binding_site.readlines()
    keep = []
    for l in output_plants_binding_site:
        if l.startswith("binding"):
            keep.append(l)
        else:
            pass
    binding_site_center = keep[0].split()
    binding_site_radius = keep[1].split()
    binding_site_radius = binding_site_radius[1]
    binding_site_x = binding_site_center[1]
    binding_site_y = binding_site_center[2]
    binding_site_z = binding_site_center[3]
    #Generate plants config file
    chemplp_rescoring_config_path_txt = chemplp_rescoring_folder+"config.txt"
    chemplp_config = ['# search algorithm\n',
    'search_speed '+plants_search_speed+'\n',
    'aco_ants '+ants+'\n',
    'flip_amide_bonds 0\n',
    'flip_planar_n 1\n',
    'force_flipped_bonds_planarity 0\n',
    'force_planar_bond_rotation 1\n',
    'rescore_mode simplex\n',
    'flip_ring_corners 0\n',
    '# scoring functions\n',
    '# Intermolecular (protein-ligand interaction scoring)\n',
    'scoring_function chemplp\n',
    'outside_binding_site_penalty 50.0\n',
    'enable_sulphur_acceptors 1\n',
    '# Intramolecular ligand scoring\n',
    'ligand_intra_score clash2\n',
    'chemplp_clash_include_14 1\n',
    'chemplp_clash_include_HH 0\n',

    '# input\n',
    'protein_file '+plants_protein_mol2+'\n',
    'ligand_file '+plants_ligands_mol2+'\n',

    '# output\n',
    'output_dir '+chemplp_rescoring_folder+'results\n',

    '# write single mol2 files (e.g. for RMSD calculation)\n',
    'write_multi_mol2 1\n',

    '# binding site definition\n',
    'bindingsite_center '+binding_site_x+' '+binding_site_y+' '+binding_site_z+'+\n',
    'bindingsite_radius '+binding_site_radius+'\n',

    '# cluster algorithm\n',
    'cluster_structures 10\n',
    'cluster_rmsd 2.0\n',

    '# write\n',
    'write_ranking_links 0\n',
    'write_protein_bindingsite 1\n',
    'write_protein_conformations 1\n',
    'write_protein_splitted 1\n',
    'write_merged_protein 0\n',
    '####\n']
    #Write config file
    chemplp_rescoring_config_path_config = chemplp_rescoring_config_path_txt.replace(".txt", ".config")
    config = open(chemplp_rescoring_config_path_txt, 'w')
    config.writelines(chemplp_config)
    os.rename(chemplp_rescoring_config_path_txt, chemplp_rescoring_config_path_config)
    #Run PLANTS docking
    chemplp_rescoring_command = "cd "+software+" && ./PLANTS --mode screen "+chemplp_rescoring_config_path_config
    os.system(chemplp_rescoring_command)
    #Fetch results
    results_csv_location = chemplp_rescoring_folder+'results/ranking.csv'
    chemplp_results = pd.read_csv(results_csv_location, sep=',', header=0)
    return chemplp_results

chemplp_results = chemplp_rescoring('/home/tony/Documents/consensus_docking_python/temp/clustering/RMSD_kE_full.sdf')

RFSCORE

In [None]:
def rfscore_rescoring(sdf):
    results_path = rfscore_rescoring_folder+"/ligands_rfscore.csv"
    cd = "cd "+software+" &&"
    cmd = "./rf-score-vs --receptor "+protein_file+" "+sdf+" -O "+results_path+" -o csv --field PoseID --field RFScoreVS_v2"
    rfscore_command = cd+cmd
    os.system(rfscore_command)
    rfscore_results = pd.read_csv(results_path, delimiter=',', header=0)
    sdf_read = PandasTools.LoadSDF(sdf, idName='Pose ID', molColName='Molecule', includeFingerprints=False, removeHs=False)
    id_list = list(sdf_read['Pose ID'])
    rfscore_results['PoseID'] = id_list
    return rfscore_results

rfscore_rescoring_results = rfscore_rescoring('/home/tony/Documents/consensus_docking_python/temp/clustering/RMSD_kE_full.sdf')