In [6]:
import os
import shutil
from pathlib import Path

from rdkit import Chem
from rdkit.Chem import rdmolfiles
from yaml import safe_load

# data is downloaded from the zenodo DOI:10.5281/zenodo.4813734
# link: https://zenodo.org/records/6600875 (version 0.2.1)

# Location of the extracted benchmark "data" directory. Set the PLB_DATA_ROOT
# environment variable to point at another copy; the fallback is the path that
# was hard-coded in this notebook.
downloaded_root = os.environ.get(
    "PLB_DATA_ROOT",
    "/mnt/c/Users/david/Downloads/protein-ligand-benchmark-0.2.1/openforcefield-protein-ligand-benchmark-fe6f969/data",
)

In [7]:
def lig_data_from_yml(yml_path: Path):
    """Load a ligands YAML file and flatten every ligand entry into one dictionary.

    Args:
        yml_path: path to the YAML file. Its top level is expected to map ligand
            identifiers to a mapping of fields.

    Returns:
        A dictionary keyed by ligand identifier with "lig_" removed. For each
        ligand the "name" field is dropped, every entry of the nested
        "measurement" mapping is stored as "measurement_<entry>", and all other
        fields are copied with "lig_" removed from the field name.
        An empty dictionary is returned when the file holds no YAML document.
    """
    # safe_load gives None for an empty document; treat that as "no ligands"
    # instead of failing on None.items().
    ligands_data = safe_load(yml_path.read_text()) or {}

    formatted_ligdata = {}
    for key, value in ligands_data.items():
        # str.replace removes every "lig_" occurrence, not only a leading one;
        # load_and_update_mol derives its lookup key the same way, so keep it.
        new_key = key.replace("lig_", "")
        flat_fields = {}
        for name, val in value.items():
            if name == "name":
                continue
            if name == "measurement":
                # hoist the nested measurement entries to prefixed top-level keys
                for meas_name, meas_val in val.items():
                    flat_fields["measurement_" + meas_name] = meas_val
            else:
                flat_fields[name.replace("lig_", "")] = val
        formatted_ligdata[new_key] = flat_fields
    return formatted_ligdata


def load_and_update_mol(sdf_path, formatted_ligdata):
    """Read the single molecule stored in an SDF file and attach its ligand data.

    Args:
        sdf_path: path object of the SDF file; its stem with "lig_" removed is
            used as the ligand identifier.
        formatted_ligdata: dictionary mapping ligand identifiers to a
            dictionary of property names and values.

    Returns:
        The molecule, with "_Name" set to the ligand identifier and every
        property of that ligand stored on it as a string.
    """
    # entries the supplier yields as None are skipped
    parsed = []
    for entry in Chem.SDMolSupplier(str(sdf_path)):
        if entry is not None:
            parsed.append(entry)
    assert len(parsed) == 1
    mol = parsed[0]

    ligand_id = sdf_path.stem.replace("lig_", "")
    mol.SetProp("_Name", ligand_id)
    for prop_name, prop_value in formatted_ligdata[ligand_id].items():
        # properties are stored as text, so non-string values are stringified
        mol.SetProp(prop_name, str(prop_value))
    return mol

In [12]:
# NOTE(review): in the cells visible here, targ_paths is only assigned by a
# later cell, so on a fresh top-to-bottom run this display would raise
# NameError — TODO move below the cell that defines it.
targ_paths

[PosixPath('/mnt/c/Users/david/Downloads/protein-ligand-benchmark-0.2.1/openforcefield-protein-ligand-benchmark-fe6f969/data/2019-09-23_jnk1'),
 PosixPath('/mnt/c/Users/david/Downloads/protein-ligand-benchmark-0.2.1/openforcefield-protein-ligand-benchmark-fe6f969/data/2019-09-23_pde2'),
 PosixPath('/mnt/c/Users/david/Downloads/protein-ligand-benchmark-0.2.1/openforcefield-protein-ligand-benchmark-fe6f969/data/2019-09-23_thrombin'),
 PosixPath('/mnt/c/Users/david/Downloads/protein-ligand-benchmark-0.2.1/openforcefield-protein-ligand-benchmark-fe6f969/data/2019-12-09_p38'),
 PosixPath('/mnt/c/Users/david/Downloads/protein-ligand-benchmark-0.2.1/openforcefield-protein-ligand-benchmark-fe6f969/data/2019-12-12_ptp1b'),
 PosixPath('/mnt/c/Users/david/Downloads/protein-ligand-benchmark-0.2.1/openforcefield-protein-ligand-benchmark-fe6f969/data/2019-12-13_cdk2'),
 PosixPath('/mnt/c/Users/david/Downloads/protein-ligand-benchmark-0.2.1/openforcefield-protein-ligand-benchmark-fe6f969/data/2019-12

In [13]:
def _local_target_name(dir_name: str) -> str:
    """Turn a dated benchmark directory name into the local output folder name.

    The last "_"-separated component is used (``2019-09-23_jnk1`` -> ``jnk1``),
    except for names containing ``bace_hunt`` or ``bace_p2``, which keep their
    last two components (the bare last component would be ``hunt`` / ``p2``).
    """
    parts = dir_name.split("_")
    if "bace_hunt" in dir_name or "bace_p2" in dir_name:
        return "_".join(parts[-2:])
    return parts[-1]


# Dated target directories such as 2019-09-23_jnk1; plain files that happen to
# match the pattern are skipped.
targ_paths = sorted(
    path for path in Path(downloaded_root).glob("*-*-*_*/") if path.is_dir()
)

for target_p in targ_paths:
    targ_name = target_p.name
    local_root = Path(_local_target_name(targ_name))
    local_lig = local_root / "ligands"
    local_prot = local_root / "protein"

    # exist_ok keeps the cell re-runnable without separate existence checks
    local_lig.mkdir(parents=True, exist_ok=True)
    local_prot.mkdir(parents=True, exist_ok=True)

    data_root = target_p / "00_data"

    shutil.copyfile(data_root / "target.yml", local_prot / "target.yml")
    ligands_data = lig_data_from_yml(data_root / "ligands.yml")

    sdfs_root = target_p / "02_ligands"
    lig_sdfs = sorted(sdfs_root.glob("lig_*/crd/*.sdf"))
    lig_mols = [load_and_update_mol(sdf, ligands_data) for sdf in lig_sdfs]

    # Combine all ligands of the target into one multi-record SDF file.
    writer = rdmolfiles.SDWriter(str(local_lig / "ligands.sdf"))
    try:
        for mol in lig_mols:
            writer.write(mol)
    finally:
        # close explicitly so buffered records reach the file even when a
        # write fails or the kernel keeps the writer object alive
        writer.close()

    # also copy the protein data
    pdbs_root = target_p / "01_protein/crd/"
    pdbs = sorted(pdbs_root.glob("*.pdb"))
    for pdb in pdbs:
        shutil.copyfile(pdb, local_prot / pdb.name)

bace_hunt
bace_p


In [71]:
# Inspect the properties set on the first molecule in lig_mols (lig_mols is
# reassigned for every target in the processing loop, so this reflects
# whichever target was handled last).
lig_mols[0].GetPropsAsDict()

{'measurement_comment': 'table 1 cmpd 6t',
 'measurement_doi': '10.1021/jm060199b',
 'measurement_error': 38,
 'measurement_type': 'ic50',
 'measurement_unit': 'nM',
 'measurement_value': 77,
 'smiles': 'CCOc1c(c(cc(n1)NC(=O)Cc2cc(c(cc2OC)Br)OC)N)C#N'}

In [47]:


# Scratch inspection of one target's raw and formatted ligand data.
# NOTE(review): data_root is whatever the target-processing loop assigned last,
# so this cell only works after that loop has run.
target_data = (data_root / "target.yml").read_text()

# Raw text lines and raw parsed YAML, kept for inspection.
ligands_txt = (data_root / "ligands.yml").read_text().split("\n")
ligands_data = safe_load("\n".join(ligands_txt))

# Reuse the helper instead of repeating its flattening loop inline.
formatted_ligdata = lig_data_from_yml(data_root / "ligands.yml")
            

    

In [51]:
# Show the ligand SDF paths collected by the processing loop (lig_sdfs holds
# the list of whichever target was handled last).
lig_sdfs



In [13]:
# PDB files shipped with the first target; sorted so the listing is stable
# (glob itself yields entries in filesystem order).
protein_files = sorted((targ_paths[0] / "01_protein/crd").glob("*.pdb"))
protein_files

[PosixPath('/mnt/c/Users/david/Downloads/DavidHahn_benchmarkingPoses/openforcefield/protein-ligand-benchmark-0.2.0/openforcefield-protein-ligand-benchmark-7fc6021/data/2019-09-23_jnk1/01_protein/crd/cofactors_crystalwater.pdb'),
 PosixPath('/mnt/c/Users/david/Downloads/DavidHahn_benchmarkingPoses/openforcefield/protein-ligand-benchmark-0.2.0/openforcefield-protein-ligand-benchmark-7fc6021/data/2019-09-23_jnk1/01_protein/crd/protein.pdb')]