In [1]:
%load_ext autoreload
%autoreload 2

# Directly from David Hahn's paper

In [26]:
import re
import shutil
from pathlib import Path

import pandas as pd
from QligFEP.functions import convert_error, convert_value
from QligFEP.functions import unit as u
from rdkit import Chem
from rdkit.Chem import rdmolfiles
from yaml import safe_load

# data is downloaded from the zenodo DOI:10.5281/zenodo.4813734
# link: https://zenodo.org/records/6600875 (version 0.2.1)

downloaded_root = "/home/david/openforcefield-protein-ligand-benchmark-fe6f969/data"
Path(downloaded_root).exists()

True

In [2]:
def lig_data_from_yml(yml_path: Path):
    ligands_txt = yml_path.read_text().split("\n")
    ligands_data = safe_load("\n".join(ligands_txt))

    formatted_ligdata = {}
    for key, value in ligands_data.items():
        new_key = key.replace("lig_", "")  # remove lig_ prefix
        formatted_ligdata[new_key] = {}
        for name, val in value.items():
            if name == "name":
                continue
            elif name == "measurement":
                for meas_name, meas_val in val.items():
                    # make measurement names as dictionary keys & values
                    formatted_ligdata[new_key]["measurement_" + meas_name] = meas_val
            else:
                formatted_ligdata[new_key][name.replace("lig_", "")] = val
    return formatted_ligdata


def load_and_update_mol(sdf_path, formatted_ligdata):
    """function to load a molecule and update its properties according to the ligand data dictionary

    Args:
        sdf_path: path to sdf file
        formatted_ligdata: dictionary containing the properties to be set to the file

    Returns:
        retuns the molecule with the properties set.
    """
    mols = [mol for mol in Chem.SDMolSupplier(str(sdf_path)) if mol is not None]
    assert len(mols) == 1
    mol = mols[0]
    mol.SetProp("_Name", sdf_path.stem.replace("lig_", ""))
    for key, value in formatted_ligdata[sdf_path.stem.replace("lig_", "")].items():
        mol.SetProp(key, str(value))
    return mol

In [3]:
targ_paths = sorted(
    list(Path(downloaded_root).glob("*-*-*_*/"))
)  # pattern for the directories
targ_paths

[PosixPath('/home/david/openforcefield-protein-ligand-benchmark-fe6f969/data/2019-09-23_jnk1'),
 PosixPath('/home/david/openforcefield-protein-ligand-benchmark-fe6f969/data/2019-09-23_pde2'),
 PosixPath('/home/david/openforcefield-protein-ligand-benchmark-fe6f969/data/2019-09-23_thrombin'),
 PosixPath('/home/david/openforcefield-protein-ligand-benchmark-fe6f969/data/2019-12-09_p38'),
 PosixPath('/home/david/openforcefield-protein-ligand-benchmark-fe6f969/data/2019-12-12_ptp1b'),
 PosixPath('/home/david/openforcefield-protein-ligand-benchmark-fe6f969/data/2019-12-13_cdk2'),
 PosixPath('/home/david/openforcefield-protein-ligand-benchmark-fe6f969/data/2019-12-13_cmet'),
 PosixPath('/home/david/openforcefield-protein-ligand-benchmark-fe6f969/data/2019-12-13_galectin'),
 PosixPath('/home/david/openforcefield-protein-ligand-benchmark-fe6f969/data/2019-12-13_mcl1'),
 PosixPath('/home/david/openforcefield-protein-ligand-benchmark-fe6f969/data/2020-02-04_bace'),
 PosixPath('/home/david/openforc

In [34]:
all_dfs = []
date_regex = re.compile(r"\d\d\d\d-\d\d-\d\d_")

for target_p in targ_paths:
    targ_name = date_regex.sub("", target_p.name)
    local_root = Path(targ_name)

    local_lig = local_root / "ligands"
    local_prot = local_root / "protein"

    if not local_lig.exists():
        local_lig.mkdir(parents=True)
    if not local_prot.exists():
        local_prot.mkdir(parents=True)

    data_root = target_p / "00_data"

    shutil.copyfile(data_root / "target.yml", local_prot / "target.yml")
    ligands_data = lig_data_from_yml(data_root / "ligands.yml")

    for key, propdict in ligands_data.items():
        value = propdict["measurement_value"]
        unit = u(propdict["measurement_unit"])
        _type = propdict["measurement_type"]
        error_value = propdict["measurement_error"]

        converted_val = convert_value(
            value * unit,
            original_type=propdict["measurement_type"],
            final_type="dg",
            temperature=300,
            out_unit="kcal/mol",
        )
        converted_err = (
            convert_error(
                error_value * unit,
                value * unit,
                original_type=_type,
                final_type="dg",
                temperature=300,
                out_unit="kcal/mol",
            )
            if error_value > 0
            else 0
        )
        ligands_data[key].update(
            {"dg_value": converted_val.magnitude, "dg_error": converted_err}
        )

    # print(ligands_data)
    all_dfs.append(
        pd.DataFrame.from_dict(ligands_data, orient="index")
        .reset_index(names=["lig"])
        .assign(target=targ_name)
    )

    sdfs_root = target_p / "02_ligands"
    lig_sdfs = sorted(list(sdfs_root.glob("lig_*/crd/*.sdf")))
    lig_mols = [load_and_update_mol(sdf, ligands_data) for sdf in lig_sdfs]

    writer = rdmolfiles.SDWriter(f"{str(local_lig)}/ligands.sdf")
    for mol in lig_mols:
        writer.write(mol)

    # also copy the protein data
    pdbs_root = target_p / "01_protein/crd/"
    pdbs = sorted(list(pdbs_root.glob("*.pdb")))
    for pdb in pdbs:
        shutil.copyfile(pdb, local_prot / pdb.name)

all_dfs = pd.concat(all_dfs, ignore_index=True)

In [None]:
target_data = (data_root / "target.yml").read_text()

ligands_txt = (data_root / "ligands.yml").read_text().split("\n")
ligands_data = safe_load("\n".join(ligands_txt))

formatted_ligdata = {}
for key, value in ligands_data.items():
    new_key = key.replace("lig_", "")  # remove lig_ prefix
    formatted_ligdata[new_key] = {}
    for name, val in value.items():
        if name == "name":
            continue
        elif name == "measurement":
            for meas_name, meas_val in val.items():
                # make measurement names as dictionary keys & values
                formatted_ligdata[new_key]["measurement_" + meas_name] = meas_val
        else:
            formatted_ligdata[new_key][name.replace("lig_", "")] = val

# Directly from `protein-ligand-benchmark` repo

# For this we have to install their stuff, which I do with the command:
```bash
python -m pip install git+https://github.com/openforcefield/protein-ligand-benchmark.git@fd88824f9114244f95a14b485e6d6c96c1de716d
```

This ensures that we have the same code present in the commit hash I'm going to use to extract the data...

In [32]:
import json
import re
from io import StringIO
from pathlib import Path
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import requests
import yaml
from plbenchmark import utils

targets = [  # list of the targets available in the given commit hash
    "cdk2",
    "cdk8",
    "cmet",
    "eg5",
    "hif2a",
    "mcl1",
    "p38",
    "pde2",
    "pfkfb3",
    "ptp1b",
    "shp2",
    "syk",
    "thrombin",
    "tnks2",
    "tyk2",
]

ligand_prefix_regex = re.compile(r"^lig_")
pdb_ignore_regex = re.compile(
    r"^TITLE|^REMARK|^SEQRES|^FORMUL|^HELIX|^TURN|^SHEET|^CRYST1|^CONECT"
)

# URL variables to be used for protein and ligand files
commit_hash = "fd88824f9114244f95a14b485e6d6c96c1de716d"
rawurl_base = (
    "https://raw.githubusercontent.com/openforcefield/protein-ligand-benchmark/"
)
protfile_url_fmt = "{commit_hash}/data/{target}/01_protein/crd/protein.pdb"
ligfile_url_fmt = "{commit_hash}/data/{target}/02_ligands/ligands.sdf"
lig_metadata_url_fmt = "{commit_hash}/data/{target}/00_data/ligands.yml"

# URL variables to be used for edge files
edge_files = [
    "kartograf_mapper_lomap_network.yml",
    "kartograf_mapper_lomap_network_no_element_changes.yml",
    "kartograf_mapper_mst_network.yml",
    "kartograf_mapper_mst_network_no_element_changes.yml",
    "kartograf_mapper_radial_network_element_changes.yml",
    "kartograf_mapper_radial_network_no_element_changes.yml",
    "lomap_mapper_lomap_network.yml",
    "lomap_mapper_mst_network.yml",
    "lomap_mapper_mst_network_no_element_changes.yml",
]

edge_url_fmt = "{commit_hash}/data/{target}/03_edges/{edge_file}"

## Define the functions to extract the data

In [38]:
def get_protein_and_ligand_files_txt(commit_hash, target):
    """function to get the protein and ligand files from the given commit hash and target

    Args:
        commit_hash: the commit hash of the repository
        target: the target for which the files are to be fetched

    Returns:
        returns the protein and ligand files as text
    """
    protfile_url = urljoin(
        rawurl_base, protfile_url_fmt.format(commit_hash=commit_hash, target=target)
    )
    ligfile_url = urljoin(
        rawurl_base, ligfile_url_fmt.format(commit_hash=commit_hash, target=target)
    )

    protfile_txt = requests.get(protfile_url).text
    protfile_txt = "\n".join(  # filter out the metadata lines we don't need
        line for line in protfile_txt.split("\n") if not pdb_ignore_regex.match(line)
    )

    ligfile_txt = requests.get(ligfile_url).text
    ligands = []
    sdf_content = []
    is_prop = -1
    for line in ligfile_txt.split("\n"):
        if line.startswith(">"):
            is_prop = 1
        if line.startswith("lig_"):
            line = line.replace("lig_", "")
            if is_prop != 0:  # if it's 0, the is a property of the ligand
                ligands.append(line)
        sdf_content.append(line)
        is_prop -= 1
    ligfile_txt = "\n".join(sdf_content)
    return protfile_txt, ligfile_txt, ligands

In [39]:
def get_plbenchmark_node_dict(rawurl_base, lig_metadata_url_fmt, target):
    ligand_sample_url = urljoin(
        rawurl_base, lig_metadata_url_fmt.format(commit_hash=commit_hash, target=target)
    )
    yml_file = requests.get(ligand_sample_url).text
    nodes_dict = yaml.load(StringIO(yml_file), Loader=yaml.FullLoader)

    for key, value_dict in nodes_dict.items():
        measurement_dict = value_dict.pop("measurement")
        ori_type = measurement_dict["type"]
        unit = utils.unit(measurement_dict["unit"])
        conv_val = utils.convert_value(
            measurement_dict["value"] * unit, ori_type, "dg", out_unit=None
        )
        conv_err = utils.convert_value(
            measurement_dict["error"] * unit,
            measurement_dict["value"] * unit,
            ori_type,
            "dg",
            out_unit=None,
        )
        name = value_dict.pop("name").replace("lig_", "")
        smiles = value_dict.pop("smiles")
        value_dict.update(
            {
                "name": name,
                **{f"measurement_{key}": val for key, val in measurement_dict.items()},
                "smiles": smiles,
                "dg_value": conv_val.magnitude,
                "dg_error": (conv_err.magnitude if conv_err is not None else None),
                "dg_unit": "kilocalorie / mole",
            }
        )
    return {k.replace("lig_", ""): v for k, v in nodes_dict.items()}

In [40]:
def get_edges_list(edge_url_fmt, edge_file, target) -> list[dict]:
    edge_url_final = urljoin(
        rawurl_base,
        edge_url_fmt.format(
            commit_hash=commit_hash, target=target, edge_file=edge_file
        ),
    )
    yml_file = requests.get(edge_url_final).text
    edges_dict = yaml.load(StringIO(yml_file), Loader=yaml.FullLoader)
    edges_metadata = {
        "remarks": edges_dict["remarks"],
        "planner": edges_dict["planner"],
    }
    edges_list = (
        pd.DataFrame.from_dict(edges_dict["edges"], orient="index")
        .reset_index(names=["fep_name"])
        .assign(
            ligand_a=lambda x: x["ligand_a"].str.replace("lig_", ""),
            ligand_b=lambda x: x["ligand_b"].str.replace("lig_", ""),
            fep_name=lambda x: x["fep_name"]
            .str.replace("fep_", "")
            .str.replace("lig_", "")
            .str.replace("edge_", "FEP_"),
            score_value=lambda x: x["score"].apply(lambda _dict: _dict["value"]),
            score_method=lambda x: x["score"].apply(lambda _dict: _dict["method"]),
        )
        .drop(columns=["atom mapping", "score", "mapper", "remarks"])
        .rename(columns={"score_value": "score", "ligand_a": "from", "ligand_b": "to"})
    ).to_dict(orient="records")
    return edges_list, edges_metadata

In [41]:
def add_ddg_to_edges(edges_list, nodes_dict):
    in_edge_nodes = []
    all_nodes = list(nodes_dict.keys())
    for edge in edges_list:
        _from = nodes_dict[edge["from"]]
        _to = nodes_dict[edge["to"]]
        edge.update(
            {
                "ddg_value": round(_to["dg_value"] - _from["dg_value"], 2),
                "ddg_error": (
                    round(
                        np.sqrt(
                            np.power(_to["dg_error"], 2.0)
                            + np.power(_from["dg_error"], 2.0)
                        ),
                        2,
                    )
                    if _to["dg_error"] is not None and _from["dg_error"] is not None
                    else None
                ),
            }
        )
        in_edge_nodes.extend([_from["name"], _to["name"]])
    not_in_edge_nodes = np.setdiff1d(all_nodes, in_edge_nodes).tolist()
    if not_in_edge_nodes:
        print(f"Nodes not in edges: {not_in_edge_nodes}")
    return edges_list

## Fetch & save data per target

In [42]:
for target in targets:
    print(f"Processing {target}...")
    base_path = Path(target)
    protroot_path = base_path / "protein"
    ligroot_path = base_path / "ligands"

    for _dir in [protroot_path, ligroot_path]:
        if not _dir.exists():
            _dir.mkdir(parents=True, exist_ok=True)

    protfile, ligfile, lignames = get_protein_and_ligand_files_txt(commit_hash, target)

    with (protroot_path / "protein.pdb").open("w") as f:
        f.write(protfile)
    with (ligroot_path / "ligands.sdf").open("w") as f:
        f.write(ligfile)

    nodes_dict = get_plbenchmark_node_dict(
        rawurl_base=rawurl_base,
        lig_metadata_url_fmt=lig_metadata_url_fmt,
        target=target,
    )
    edges_list, mapping_metadata = get_edges_list(
        edge_url_fmt=edge_url_fmt,
        edge_file="kartograf_mapper_lomap_network.yml",
        target=target,
    )
    edges_list = add_ddg_to_edges(edges_list, nodes_dict)
    # assert all the lignames in nodes_dict
    for ligname in lignames:
        if ligname not in nodes_dict:
            print(f"ligand {ligname} not in nodes_dict")

    final_mapping_dict = {
        "edges": edges_list,
        "nodes": nodes_dict,
        "mapping_metadata": mapping_metadata,
    }

    with (ligroot_path / "mapping.json").open("w") as f:
        json.dump(final_mapping_dict, f, indent=4)

Processing cdk2...
Processing cdk8...
Processing cmet...
Processing eg5...
Processing hif2a...
Processing mcl1...
Processing p38...
Processing pde2...
Processing pfkfb3...
Processing ptp1b...
Processing shp2...
Processing syk...
Processing thrombin...
Nodes not in edges: ['7d']
Processing tnks2...
Processing tyk2...


In [30]:
nodes_dict

{'ejm_31': {'name': 'ejm_31',
  'measurement_comment': 'Table 4, entry 31',
  'measurement_doi': '10.1016/j.ejmech.2013.03.070',
  'measurement_error': 0.029,
  'measurement_type': 'ki',
  'measurement_unit': 'uM',
  'measurement_value': 0.096,
  'smiles': '[H]c1c(c(c(c(c1[H])Cl)C(=O)N([H])c2c(c(nc(c2[H])N([H])C(=O)C([H])([H])[H])[H])[H])Cl)[H]',
  'dg_value': -9.63,
  'dg_error': None,
  'dg_unit': 'kilocalorie / mole'},
 'ejm_42': {'name': 'ejm_42',
  'measurement_comment': 'Table 4, entry 42',
  'measurement_doi': '10.1016/j.ejmech.2013.03.070',
  'measurement_error': 0.019,
  'measurement_type': 'ki',
  'measurement_unit': 'uM',
  'measurement_value': 0.064,
  'smiles': '[H]c1c(c(c(c(c1[H])Cl)C(=O)N([H])c2c(c(nc(c2[H])N([H])C(=O)C([H])([H])C([H])([H])[H])[H])[H])Cl)[H]',
  'dg_value': -9.88,
  'dg_error': None,
  'dg_unit': 'kilocalorie / mole'},
 'ejm_43': {'name': 'ejm_43',
  'measurement_comment': 'Table 4, entry 43',
  'measurement_doi': '10.1016/j.ejmech.2013.03.070',
  'measu

## Inspect graph topology

In [43]:
def generate_graph(df):
    all_nodes = np.unique(df["ligand_a"].tolist() + df["ligand_b"].tolist())
    G = nx.Graph()
    for node in all_nodes:
        G.add_node(node, label=node)
    for idx, row in df.iterrows():
        G.add_edge(row["ligand_a"], row["ligand_b"])
    # for idx, row in df.iterrows():
    #     G[row["ligand_a"]][row["ligand_b"]]["weight"] = row["weight"]
    return G


for edge_file in edge_files:
    edge_url_final = urljoin(
        rawurl_base,
        edge_url_fmt.format(
            commit_hash=commit_hash, target=targets[1], edge_file=edge_file
        ),
    )
    yml_file = requests.get(edge_url_final).text
    nodes_dict = yaml.load(StringIO(yml_file), Loader=yaml.FullLoader)
    df = (
        pd.DataFrame.from_dict(nodes_dict["edges"], orient="index").reset_index(
            names=["fep_name"]
        )
        # .drop(columns=["atom mapping", "score", "mapper", "remarks"])
        .assign(
            ligand_a=lambda x: x["ligand_a"].str.replace("lig_", ""),
            ligand_b=lambda x: x["ligand_b"].str.replace("lig_", ""),
            fep_name=lambda x: x["fep_name"]
            .str.replace("fep_", "")
            .str.replace("edge_", "FEP_"),
        )
    )
    G = generate_graph(df)

    edges = G.edges()
    # pos = nx.spring_layout(G, seed=4)
    # nx.draw_networkx_nodes(G, pos, node_size=500)
    # nx.draw_networkx_labels(G, pos)
    # # nx.draw_networkx_edges(G, pos, edgelist=edges, edge_color='r', arrows=True)
    # nx.draw_networkx_edges(G, pos, edgelist=edges, arrows=True)
    # ax = plt.gca()
    # ax.set_title(edge_file)
    # plt.show()