In [1]:
# Load IPython's autoreload extension and enable mode 2 so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## `IndustryBenchmarks2024`

### To install the package:

In this notebook we install the package and fetch the data from a specific git commit hash to ensure future reproducibility.

To install the package, run the following command:
```bash
python -m pip install git+https://github.com/OpenFreeEnergy/IndustryBenchmarks2024.git@30f6ec462f00ebc7359982cf827f45285ae2e69a
```

In [2]:
import json
import re
from io import StringIO
from pathlib import Path
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import requests
import yaml
from plbenchmark import utils


# Benchmark sets, each mapped to the set of target names it contains.
targets = dict(
    jacs_set={"bace", "cdk2", "jnk1", "mcl1", "p38", "ptp1b", "thrombin", "tyk2"},
    merck={"cdk8", "cmet", "eg5", "hif2a", "pfkfb3", "shp2", "syk", "tnks2"},
)

# Matches the "lig_" prefix at the very start of a string.
ligand_prefix_regex = re.compile(r"^lig_")

# Metadata record names whose lines are dropped from the downloaded PDB files;
# the compiled pattern matches any line that starts with one of them.
_pdb_ignored_records = (
    "TITLE",
    "REMARK",
    "SEQRES",
    "FORMUL",
    "HELIX",
    "TURN",
    "SHEET",
    "CRYST1",
    "CONECT",
)
pdb_ignore_regex = re.compile(
    "|".join(f"^{record}" for record in _pdb_ignored_records)
)

# URL variables to be used for protein and ligand files.
# The templates are relative to `rawurl_base`; `benchmarkset` is one of the
# keys of `targets` (jacs_set, merck...).
commit_hash = "30f6ec462f00ebc7359982cf827f45285ae2e69a"
rawurl_base = "https://raw.githubusercontent.com/OpenFreeEnergy/IndustryBenchmarks2024/"
_prepared_dir_fmt = (
    "{commit_hash}/industry_benchmarks/input_structures/prepared_structures/"
    "{benchmarkset}/{target}/"
)
protfile_url_fmt = _prepared_dir_fmt + "protein.pdb"
ligfile_url_fmt = _prepared_dir_fmt + "ligands.sdf"

## Define the functions to extract the data

In [3]:
def _fetch_text(url, timeout):
    """Download ``url`` and return the response body as text.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Without this check an error page (e.g. "404: Not Found") would be
    # returned and later saved as if it were a valid structure file.
    response.raise_for_status()
    return response.text


def _strip_ligand_prefix(sdf_txt):
    """Remove the leading ``lig_`` from SDF lines and collect the ligand names.

    Returns:
        A tuple ``(cleaned_sdf_text, ligand_names)``.
    """
    ligands = []
    sdf_content = []
    # True when the previous line started with ">", i.e. the current line is
    # the first value line of an SDF property rather than a molecule title.
    follows_property_header = False
    for line in sdf_txt.split("\n"):
        is_property_header = line.startswith(">")
        if line.startswith("lig_"):
            # Only strip the prefix; a plain str.replace would also remove
            # any later "lig_" occurrence inside the same line.
            line = ligand_prefix_regex.sub("", line)
            if not follows_property_header:
                ligands.append(line)
        sdf_content.append(line)
        follows_property_header = is_property_header
    return "\n".join(sdf_content), ligands


def get_protein_and_ligand_files_txt(commit_hash, target, benchmarkset, timeout=30):
    """Fetch the prepared protein and ligand files of one benchmark target.

    Args:
        commit_hash: the commit hash of the repository to download from
        target: the target for which the files are to be fetched
        benchmarkset: the benchmark set the target belongs to
            (e.g. ``jacs_set`` or ``merck``)
        timeout: seconds passed to ``requests.get`` as its ``timeout``

    Returns:
        A tuple ``(protfile_txt, ligfile_txt, ligands)``: the protein PDB text
        without the lines matched by ``pdb_ignore_regex``, the ligand SDF text
        with the ``lig_`` prefix removed, and the list of ligand names (also
        without the prefix).

    Raises:
        requests.HTTPError: if either download returns an error status.
    """
    url_fields = dict(commit_hash=commit_hash, target=target, benchmarkset=benchmarkset)
    protfile_url = urljoin(rawurl_base, protfile_url_fmt.format(**url_fields))
    ligfile_url = urljoin(rawurl_base, ligfile_url_fmt.format(**url_fields))

    protfile_txt = "\n".join(  # filter out the metadata lines we don't need
        line
        for line in _fetch_text(protfile_url, timeout).split("\n")
        if not pdb_ignore_regex.match(line)
    )

    ligfile_txt, ligands = _strip_ligand_prefix(_fetch_text(ligfile_url, timeout))
    return protfile_txt, ligfile_txt, ligands

## Fetch & save data per target

In [4]:
# Download every target of every benchmark set and store its files as
# <target>/protein/protein.pdb and <target>/ligands/ligands.sdf, relative to
# the current working directory.
for benchmarkset, target_names in targets.items():
    # Sets have no stable iteration order; sorting keeps the processing order
    # (and therefore the printed log) identical between runs.
    for target in sorted(target_names):
        print(f"Processing {target}...")
        base_path = Path(target)
        protroot_path = base_path / "protein"
        ligroot_path = base_path / "ligands"

        # exist_ok=True already makes mkdir a no-op for existing directories,
        # so no separate existence check is needed.
        for _dir in (protroot_path, ligroot_path):
            _dir.mkdir(parents=True, exist_ok=True)

        # The ligand names are returned as well but are not needed here.
        protfile, ligfile, lignames = get_protein_and_ligand_files_txt(
            commit_hash, target, benchmarkset
        )

        # Explicit encoding so the written files do not depend on the
        # platform's default text encoding.
        (protroot_path / "protein.pdb").write_text(protfile, encoding="utf-8")
        (ligroot_path / "ligands.sdf").write_text(ligfile, encoding="utf-8")

Processing mcl1...
Processing p38...
Processing bace...
Processing thrombin...
Processing ptp1b...
Processing tyk2...
Processing jnk1...
Processing cdk2...
Processing syk...
Processing cdk8...
Processing cmet...
Processing eg5...
Processing hif2a...
Processing shp2...
Processing pfkfb3...
Processing tnks2...
