Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sprint 4 #18

Draft
wants to merge 12 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions synthetic-enumeration/01-fix-csv-files.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@

# Fix CSV files so that SMILES and then CID occur first

for prefix in "primary_amine_enumeration_for_chodera_lab_FEP" "boronic_ester_enumeration_for_chodera_lab_FEP" "nucleophilic_displacement_enumeration_for_FEP"; do
#for prefix in "primary_amine_enumeration_for_chodera_lab_FEP" "boronic_ester_enumeration_for_chodera_lab_FEP" "nucleophilic_displacement_enumeration_for_FEP"; do
for prefix in "UGI_tert_butyl_enumeration_for_chodera_lab"; do
echo "Generating $prefix-permuted.csv"
awk -F',' '{ print $4 "," $1 "," $2 "," $3 }' $prefix.csv > $prefix-permuted.csv
#awk -F',' '{ print $4 "," $1 "," $2 "," $3 }' $prefix.csv > $prefix-permuted.csv
awk -F',' '{ print $5 "," $1 "," $3 }' $prefix.csv > $prefix-permuted.csv
echo "Generating $prefix.pdf"
mols2pdf.py -in $prefix-permuted.csv -out $prefix.pdf -cols 6 -rows 3
mols2pdf.py -in $prefix-permuted.csv -out $prefix.pdf -cols 4 -rows 3
done
159 changes: 82 additions & 77 deletions synthetic-enumeration/02-generate-poses.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def count(self, lig_mol):
bump_count += np.exp(-0.5 * (nb.GetDist() / self.cutoff)**2)
return bump_count

def generate_restricted_conformers(receptor, refmol, mol, core_smarts=None):
def generate_restricted_conformers(receptor, refmol, mol, core_smarts):
"""
Generate and select a conformer of the specified molecule using the reference molecule

Expand All @@ -143,21 +143,25 @@ def generate_restricted_conformers(receptor, refmol, mol, core_smarts=None):
"""
from openeye import oechem, oeomega

# DEBUG: For benzotriazoles, truncate refmol
core_smarts = 'c1ccc(NC(=O)[C,N]n2nnc3ccccc32)cc1' # prospective
core_smarts = 'NC(=O)[C,N]n2nnc3ccccc32' # retrospective

# Get core fragment
if core_smarts:
if core_smarts is not None:
if not smarts_match(refmol, core_smarts):
raise Exception('refmol does not match core SMARTS')
if not smarts_match(mol, core_smarts):
print(f'Molecule {mol.GetTitle()} does not match core_smarts')
return None

# Truncate refmol to SMARTS if specified
#print(f'Trunctating using SMARTS {refmol_smarts}')
ss = oechem.OESubSearch(core_smarts)
oechem.OEPrepareSearch(refmol, ss)
core_fragment = oechem.OEGraphMol()
for match in ss.Match(refmol):
core_fragment = oechem.OEGraphMol()
oechem.OESubsetMol(core_fragment, match)
break
#print(f'refmol has {refmol.NumAtoms()} atoms')
#print(f'core_fragment has {core_fragment.NumAtoms()} atoms')
if core_fragment.NumAtoms() == 0:
return None
else:
core_fragment = GetCoreFragment(refmol, [mol])
oechem.OESuppressHydrogens(core_fragment)
Expand All @@ -175,7 +179,8 @@ def generate_restricted_conformers(receptor, refmol, mol, core_smarts=None):
omegaFixOpts.SetFixMaxMatch(10) # allow multiple MCSS matches
omegaFixOpts.SetFixDeleteH(True) # only use heavy atoms
omegaFixOpts.SetFixMol(core_fragment)
#omegaFixOpts.SetFixSmarts(smarts)

omegaFixOpts.SetFixSmarts(core_smarts)
omegaFixOpts.SetFixRMS(0.5)

atomexpr = oechem.OEExprOpts_Aromaticity | oechem.OEExprOpts_Hybridization
Expand All @@ -188,11 +193,11 @@ def generate_restricted_conformers(receptor, refmol, mol, core_smarts=None):
molBuilderOpts.SetStrictAtomTypes(False) # don't give up if MMFF types are not found
omegaOpts.SetMolBuilderOptions(molBuilderOpts)

omegaOpts.SetWarts(False) # expand molecule title
omegaOpts.SetWarts(True) # expand molecule title
omegaOpts.SetStrictStereo(False) # set strict stereochemistry
omegaOpts.SetIncludeInput(False) # don't include input
omegaOpts.SetMaxConfs(1000) # generate lots of conformers
#omegaOpts.SetEnergyWindow(10.0) # allow high energies
omegaOpts.SetMaxConfs(10000) # generate lots of conformers
omegaOpts.SetEnergyWindow(10.0) # allow high energies
omega = oeomega.OEOmega(omegaOpts)

from openeye import oequacpac
Expand All @@ -204,7 +209,7 @@ def generate_restricted_conformers(receptor, refmol, mol, core_smarts=None):

ret_code = omega.Build(mol)
if (mol.GetDimension() != 3) or (ret_code != oeomega.OEOmegaReturnCode_Success):
print(f'Omega failure: {mol.GetDimension()} and {oeomega.OEGetOmegaError(ret_code)}')
print(f'Omega failure: {mol.GetDimension()} and {oeomega.OEGetOmegaError(ret_code)} for {mol.GetTitle()}')
return None

# Extract poses
Expand Down Expand Up @@ -242,9 +247,9 @@ def __init__(self, conformer):
shapeFunc.Overlap(tmpmol, oeshape_result)
pose.overlap_score = oeshape_result.GetRefTversky()

# Filter poses based on top 10% of overlap
# Filter poses based on top fraction of overlap
poses = sorted(poses, key= lambda pose : pose.overlap_score)
poses = poses[int(0.9*len(poses)):]
poses = poses[int(0.8*len(poses)):]

# Select the best docking score
import numpy as np
Expand Down Expand Up @@ -340,7 +345,7 @@ def check_if_smi_in_series(
def generate_restricted_conformers_star(args):
return generate_restricted_conformers(*args)

def generate_poses(receptor, refmol, target_molecules, output_filename):
def generate_poses(receptor, refmol, target_molecules, scaffold_smarts, output_filename):
"""
Parameters
----------
Expand All @@ -350,6 +355,8 @@ def generate_poses(receptor, refmol, target_molecules, output_filename):
Reference molecule which shares some part in common with the proposed molecule
target_molecules : list of OEMol
List of molecules to build
scaffold_smarts : str
SMARTS to use for scaffold match
output_filename : str
Output filename for generated conformers
"""
Expand All @@ -371,72 +378,90 @@ def generate_poses(receptor, refmol, target_molecules, output_filename):
from tqdm import tqdm

pool = Pool()
args = [ (receptor, refmol, mol) for mol in target_molecules ]
args = [ (receptor, refmol, mol, scaffold_smarts) for mol in target_molecules ]
for pose in track(pool.imap_unordered(generate_restricted_conformers_star, args), total=len(args), description='Enumerating conformers...'):
if pose is not None:
oechem.OEWriteMolecule(ofs, pose)
pool.close()
pool.join()

#for mol in tqdm(target_molecules):
# pose = generate_restricted_conformers(receptor, core_fragment, mol)
# if pose is not None:
# oechem.OEWriteMolecule(ofs, pose)
def smarts_match(mol, smarts):
ss = oechem.OESubSearch()
unique = True
if not ss.Init(smarts):
oechem.OEThrow.Fatal("Cannot parse smarts: %s" % smarts)
matches = ss.Match(mol)
if matches is None:
return False
else:
matches = [ match for match in matches ]
if len(matches) == 0:
return False
return True

if __name__ == '__main__':
#fragment = 'x2646' # TRY-UNI-714a760b-6 (the main amino pyridine core)
#fragment = 'x10789' # TRY-UNI-2eddb1ff-7 (beta-lactam an chloride)

import os

# TODO: Figure out why this single-threading workaround is needed to prevent issues
from openeye import oechem
#oechem.OESetMemPoolMode(oechem.OEMemPoolMode_SingleThreaded |
# oechem.OEMemPoolMode_UnboundedCache)

assay_data_filename = 'activity-data-2020-09-01.csv'
fragments = {
#'x10789' : 'TRY-UNI-2eddb1ff-7',
# Benzotriazoles
'x10876' : 'ALP-POS-d2866bdf-1',
'x10820' : 'ALP-POS-c59291d4-4',
'x10871' : 'ALP-POS-c59291d4-2',
# Latest activity data
# TODO: Download and cache latest data
assay_data_filename = 'activity-data/activity-data-2020-09-05.csv'

# SMARTS pattern to use for common scaffold matches
scaffold_smarts = 'N(C(=O)c2ccco2)C(C(=O)NC)c2cccnc2'

# Reference ligands to build from (aligned in binding site with reference structures)
reference_ligand_fragments = {
'3v3m-2020-04-Jacobs' : '3v3m-2020-04-Jacobs', # SARS-CoV-1 noncovalent Ugi reference
}

# Reference protein structure ID
reference_protein_fragment = 'x3110' # Ugi covalent reference structure

# Load assay data if available
assayed_molecules = list()
with oechem.oemolistream(assay_data_filename) as ifs:
for mol in ifs.GetOEGraphMols():
assayed_molecules.append( oechem.OEGraphMol(mol) )
print(f' There are {len(assayed_molecules)} assayed molecules')
# Retain only those with pIC50s
assayed_molecules = [ mol for mol in assayed_molecules if has_ic50(mol) ]
print(f' There are {len(assayed_molecules)} assayed molecules with pIC50s')

# Filter assayed molecules to include only those that include the required scaffold
assayed_molecules = [ mol for mol in assayed_molecules if smarts_match(mol, scaffold_smarts) ]
print(f' There are {len(assayed_molecules)} assayed molecules with the core SMARTS scaffold')
with oechem.oemolostream(f'filtered.mol2') as ofs:
for mol in assayed_molecules:
oechem.OEWriteMolecule(ofs, oechem.OEGraphMol(mol))

# Load all fragments
sprint_path = 'sprint-4' # location of target molecule sets
for prefix in [
'2020-09-01-benzotriazoles-retrospective',
#'2020-08-20-benzotriazoles',
#'BEN-DND-93268d01',
#'EDG-MED-0da5ad92',
#'RAL-THA-6b94ceba',
#'activity-data-2020-08-11',
#'aminopyridine-retrospective-jdc-2020-08-11',
#'fastgrant-table1',
#'aminopyridine_compounds_for_FEP_benchmarking',
#'nucleophilic_displacement_enumeration_for_FEP-permuted',
#'activity-data-2020-07-29',
#'primary_amine_enumeration_for_chodera_lab_FEP-permuted',
#'boronic_ester_enumeration_for_chodera_lab_FEP-permuted',
'2020-09-06-ugi-tBu', # Ugi P2 enumeration; tBu in P4
#'2020-09-03-ugi-fluorobenzene', # Ugi P2 enumeration; fluorobenzene in P4
]:
for fragment in fragments:
for reference_ligand_fragment in reference_ligand_fragments:
reference_ligand_name = reference_ligand_fragments[reference_ligand_fragment]

# Read receptor
print('Reading receptor...')
from openeye import oechem
receptor = oechem.OEGraphMol()
receptor_filename = f'../receptors/monomer/Mpro-{fragment}_0_bound-receptor.oeb.gz'
receptor_filename = f'../receptors/monomer/Mpro-{reference_protein_fragment}_0_bound-receptor.oeb.gz'
from openeye import oedocking
oedocking.OEReadReceptorFile(receptor, receptor_filename)
print(f' Receptor has {receptor.NumAtoms()} atoms.')

# Read reference fragment with coordinates
refmol_filename = f'../receptors/monomer/Mpro-{fragment}_0_bound-ligand.mol2'
refmol_filename = f'../receptors/monomer/Mpro-{reference_ligand_fragment}_0_bound-ligand.mol2'
refmol = None
with oechem.oemolistream(refmol_filename) as ifs:
for mol in ifs.GetOEGraphMols():
Expand All @@ -445,55 +470,35 @@ def generate_poses(receptor, refmol, target_molecules, output_filename):
if refmol is None:
raise Exception(f'Could not read {refmol_filename}')
print(f'Reference molecule has {refmol.NumAtoms()} atoms')
assert smarts_match(refmol, scaffold_smarts), "refmol does not match scaffold SMARTS"
# Replace title
refmol.SetTitle(fragments[fragment])
# Copy data from assayed molecules (if present)
refmol.SetTitle(reference_ligand_name)
# Copy data from assayed molecules to reference molecule (if present)
for mol in assayed_molecules:
if mol.GetTitle() == fragments[fragment]:
if mol.GetTitle() == reference_ligand_name:
print(f'{refmol.GetTitle()} found in target_molecules; copying SDData')
oechem.OECopySDData(refmol, mol)
break

# Read target molecules
target_molecules_filename = prefix + f'.csv'
target_molecules_filename = os.path.join(sprint_path, prefix + f'.csv')
print('Reading target molecules...')
from openeye import oechem
target_molecules = list()
if not os.path.exists(target_molecules_filename):
raise Exception(f'Target molecules filename {target_molecules_filename} not found.')
with oechem.oemolistream(target_molecules_filename) as ifs:
for mol in ifs.GetOEGraphMols():
# Copy data from assayed molecules (if present)
for assayed_mol in assayed_molecules:
if assayed_mol.GetTitle() == mol.GetTitle():
print(f'{mol.GetTitle()} found in assayed data; copying SDData')
oechem.OECopySDData(refmol, mol)
break
# Store a copy
target_molecules.append( oechem.OEGraphMol(mol) )
target_molecules.append( oechem.OEGraphMol(mol) )

if len(target_molecules) == 0:
raise Exception('No target molecules specified; check filename!')
print(f' There are {len(target_molecules)} target molecules')

# Filter series and include only those that include the required scaffold
#filter_series = '3-aminopyridine-like'
#filter_series = 'benzotriazoles'
filter_series = None
if filter_series is not None:
print(f'Filtering out series {filter_series}...')
target_molecules = [ mol for mol in target_molecules if (get_series(mol) == filter_series) ]
print(f' There are {len(target_molecules)} target molecules')
with oechem.oemolostream(f'filtered.mol2') as ofs:
for mol in target_molecules:
oechem.OEWriteMolecule(ofs, oechem.OEGraphMol(mol))

# Filter series to include only those with IC50s
filter_IC50 = False
if filter_IC50:
print(f'Retaining only molecules with IC50s...')
target_molecules = [ mol for mol in target_molecules if has_ic50(mol) ]
print(f' There are {len(target_molecules)} target molecules')

# Prepend assayed molecules at beginning of target molecules.
target_molecules = assayed_molecules + target_molecules
print(f' Prepended {len(assayed_molecules)} assayed molecules to target molecules')

# Generate poses for all molecules
output_filename = f'{prefix}-dockscores-{fragment}.sdf'
generate_poses(receptor, refmol, target_molecules, output_filename)
output_filename = f'{sprint_path}/{prefix}-{reference_protein_fragment}-{reference_ligand_fragment}.sdf'
generate_poses(receptor, refmol, target_molecules, scaffold_smarts, output_filename)
4 changes: 4 additions & 0 deletions synthetic-enumeration/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Modeling of design conformations suggested by this [excellent blog post from Pat
* `sprint-2/` : Nucleophilic displacement for 3-aminopyridine
* `results/` : results and ordered compounds
* `sprint-3/` : Benzotriazoles
* `sprint-4/` : Ugi enumeration toward P2

#### Sprint 1 : Retrospective and prospective 3-aminopyridine

Expand Down Expand Up @@ -57,6 +58,9 @@ Modeling of design conformations suggested by this [excellent blog post from Pat
* `2020-08-20-benzotriazoles.csv` - benzotriazole derivatives
* `2020-08-20-benzotriazoles-dockscores-x10876.sdf` - docked to x10876

#### Sprint 4 : Ugi enumeration toward P4


## Misc
* `activity-data-2020-07-29.csv`: activity data downloaded from the [COVID Moonshot](https://covid.postera.ai/covid/activity_data.csv) on 2020-07-29
* `activity-data-2020-07-29-conformers-x10789.sdf.gz`: strictly filtered 3-aminopyridine related set for retrospective testing, with activity data preserved as SD tags (40 compounds)
Expand Down
Loading