In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from QligFEP.pdb_utils import (
    nest_pdb,
    unnest_pdb,
    read_pdb_to_dataframe,
    write_dataframe_to_pdb,
)
from QligFEP.CLI.pdb_to_amber import asp_search, histidine_search
import pandas as pd

# Define functions

In [3]:
# Need to remove Hs from GLY
# Need to cap the last residue

rename_mapping = {
    "ACE": {
        "3H": "HH33",
        "2H": "HH32",
        "1H": "HH31",
        "H2_3": "HH33",
        "H2_2": "HH32",
        "H2_1": "HH31",
        "3HH3": "HH33",
        "2HH3": "HH32",
        "1HH3": "HH31",
        "C1": "C",
        "C2": "CH3",
        "O1": "O",
    },
    "NMA": {
        "C": "CT",
        "H3": "HA3",
        "H2": "HA2",
        "H1": "HA1",
        "3HA": "HA3",
        "2HA": "HA2",
        "1HA": "HA1",
        "H1_3": "HA3",
        "H1_2": "HA2",
        "H1_1": "HA1",
        "C1": "CA",
        "N1": "N",
    },
    "GLY": {"H1": "H", "H2": "H"},
    "ARG": {"H2": "H"},
    "GLU": {"H2": "H"},
    "GLN": {"H2": "H"},
    "THR": {"H2": "H"},
    "LEU": {"H1": "H"},
    "VAL": {
        "3HG1": "HG13",
        "2HG1": "HG12",
        "1HG1": "HG11",
        "3HG2": "HG13",
        "2HG2": "HG12",
        "1HG2": "HG11",
    },
}


def reindex_pdb_residues(pdb_path: Path, out_pdb_path: str):
    pdb_df = read_pdb_to_dataframe(pdb_path)
    uniq_indexes = pdb_df.set_index(
        ["residue_seq_number", "residue_name", "chain_id", "insertion_code"]
    ).index
    resn_mapping = {resn: idx for idx, resn in enumerate(uniq_indexes.unique(), 1)}
    pdb_df["residue_seq_number"] = uniq_indexes.map(resn_mapping)
    pdb_df["insertion_code"] = ""
    # pdb_df = pdb_df.assign(residue_seq_number=uniq_indexes.map(resn_mapping))
    write_dataframe_to_pdb(pdb_df, out_pdb_path)


def correct_amino_acid_atom_names(npdb_i, resname, rename_mapping):
    """corrects the amino acid atom names according to the mapping provided

    Args:
        npdb_i: nested pdb data structure for a single residue
        resname: the residue name
        rename_mapping: a dictionary mapping old names to new names
    """
    if resname in rename_mapping:
        for old_name, new_name in rename_mapping[resname].items():
            npdb_i = [extract_and_replace(x, old_name, new_name) for x in npdb_i]
            # certify that we have the alignment as expected for pdb files
    return npdb_i


def extract_and_replace(line, old_name, new_name):
    """extracts the atom name and replaces it with the new name"""
    atom_name = line[12:16].strip()
    if atom_name != old_name:
        return line
    new_atom_name = atom_name.replace(old_name, new_name).strip()
    if len(new_atom_name) == 4:
        return line[:12] + new_atom_name + line[16:]
    else:
        # return left aligned atom name always with len() == 3 but with a " " in the beginning
        return line[:12] + f" {new_atom_name:<3}" + line[16:]


def fix_pdb(pdb_path: Path, rename_mapping):
    renamed_pdb_path = pdb_path.with_name(pdb_path.stem + "_renamed.pdb")
    with open(pdb_path) as f:
        pdb_lines = f.readlines()

    npdb = nest_pdb(pdb_lines)
    npdb = asp_search(npdb)
    npdb = histidine_search(npdb)

    for i, res in enumerate(npdb):
        resname = res[-1][17:20]
        if resname == "HIS":  # rename to HIP according to our FF library
            npdb[i] = [x.replace("HIS", "HIP") for x in npdb[i]]
            resname = "HIP"
        if resname == "NME":  # we use NMA in our FF library
            npdb[i] = [x.replace("NME", "NMA") for x in npdb[i]]
            resname = "NMA"
        npdb[i] = correct_amino_acid_atom_names(npdb[i], resname, rename_mapping)
    pdb_lines = unnest_pdb(npdb)

    with open(renamed_pdb_path, "w") as f:
        for line in pdb_lines:
            f.write(line)
    return pdb_lines


def cap_and_reindex_pdb(inp_pdb: Path):
    """Function that removes additionaly hydrogens from N terminal not covered
    in our library files, caps the last residue and reindexes the atoms

    Args:
        inp_pdb: path for the pdb file
    """

    pdb_df = read_pdb_to_dataframe(inp_pdb)

    # remove extra Hs from the first Gly residue
    if pdb_df["residue_name"].values[0] in ["GLY", "LEU", "GLU", "ASH", "ILE", "ASN"]:
        first_residue = pdb_df["residue_seq_number"].values[0]
        # remove atoms with atom_name H2 and H3, and rename H1 to H
        subset_first = pdb_df[
            (pdb_df["residue_seq_number"] == first_residue)
            & (~pdb_df["atom_name"].isin(["H2", "H3"]))
        ].copy()
        subset_first["atom_name"] = subset_first["atom_name"].str.replace("H1", "H")
        rm_idxs = pdb_df.query("residue_seq_number == @first_residue").index
        pdb_df = pd.concat(
            [subset_first, pdb_df.drop(index=rm_idxs)], ignore_index=True
        )
    # cap the last residue
    last_residue = pdb_df["residue_name"].values[-1]
    last_residue_number = pdb_df["residue_seq_number"].values[-1]  # noqa: F841
    if last_residue in ["ILE", "NME"]:
        rm_idxs = pdb_df.query("residue_seq_number == @last_residue_number").index
        pdb_df.drop(index=rm_idxs, inplace=True)
    _len = len(pdb_df)
    pdb_df["atom_serial_number"] = range(1, _len + 1)
    write_dataframe_to_pdb(pdb_df, inp_pdb)

# Rename the protein files

In [4]:
pdb_paths = sorted(Path().glob("*/protein/protein.pdb"))

In [5]:
pdb_path = Path("thrombin/protein/protein.pdb")

reindexed_path = pdb_path.with_stem(pdb_path.stem + "_reindexed")
reindex_pdb_residues(pdb_path, out_pdb_path=reindexed_path)

In [6]:
for pdb_path in pdb_paths:
    reindexed_path = pdb_path.with_stem(pdb_path.stem + "_reindexed")
    reindex_pdb_residues(pdb_path, out_pdb_path=reindexed_path)
    fix_pdb(reindexed_path, rename_mapping)
    # cap_and_reindex_pdb(pdb_path.with_stem(pdb_path.stem + "_renamed"))

# Rename the water & cofactor files

In [7]:
atom_renaming_dict = {
    # salts
    "MG": "MAG",
    "ZN": "ZIN",
    "NA": "SOD",
}
residue_renaming_dict = {
    "MG": "MAG",
    "ZN": "ZIN",
}


def rename_cofactor_atoms(pdb_path: Path, atom_renaming_dict: dict):
    pdb_df = read_pdb_to_dataframe(pdb_path).assign(
        atom_name=lambda x: x["atom_name"].replace(atom_renaming_dict),
        residue_name=lambda x: x["residue_name"]
        .str.strip(" ")
        .replace(residue_renaming_dict),
    )
    write_dataframe_to_pdb(pdb_df, pdb_path)

In [8]:
pdb_paths = sorted(Path().glob("*/protein/protein_reindexed_renamed.pdb"))

for pdb_path in pdb_paths:
    if pdb_path.stat().st_size != 0:
        rename_cofactor_atoms(pdb_path, atom_renaming_dict)

# Merge renamed protein & cofactor files

In [9]:
# prot_root_paths = sorted(Path().glob("*/protein/"))

# for _path in prot_root_paths:
#     processed_pdbs = []
#     protfile = _path / "protein_renamed.pdb"
#     cofactor = _path / "cofactors_crystalwater_renamed.pdb"

#     prot_df = read_pdb_to_dataframe(protfile)

#     # reindex both atom_serial_number and residue_seq_number
#     prot_df["atom_serial_number"] = range(1, len(prot_df) + 1)
#     residue_seq_mapping = {
#         old: new
#         for old, new in zip(
#             prot_df["residue_seq_number"].unique(),
#             range(1, len(prot_df["residue_seq_number"].unique()) + 1),
#         )
#     }
#     prot_df["residue_seq_number"] = prot_df["residue_seq_number"].replace(
#         residue_seq_mapping
#     )
#     last_prot_res = prot_df["residue_seq_number"].max()
#     last_prot_atom = prot_df["atom_serial_number"].max()
#     processed_pdbs.append(prot_df)

#     if cofactor.exists():
#         print("Including cofactors for ", _path)
#         cof_df = read_pdb_to_dataframe(cofactor)
#         cof_df["atom_serial_number"] = range(
#             last_prot_res + 1, last_prot_res + len(cof_df) + 1
#         )
#         residue_seq_mapping = {
#             old: new
#             for old, new in zip(
#                 cof_df["residue_seq_number"].unique(),
#                 range(
#                     last_prot_res + 1,
#                     last_prot_res + len(cof_df["residue_seq_number"].unique() + 1),
#                 ),
#             )
#         }
#         cof_df["residue_seq_number"] = cof_df["residue_seq_number"].replace(
#             residue_seq_mapping
#         )
#         processed_pdbs.append(cof_df)
#     final_df = pd.concat(processed_pdbs, ignore_index=True)
#     write_dataframe_to_pdb(final_df, protfile.parent / "protfile_final.pdb")

# Preparing data

running qprep through the notebook. First we get the Center of Geometry (COG) of all the ligands, and then use it prepare the water spheres of the respective systems. The COG is the center of the water sphere

In [11]:
import shutil
from QligFEP.CLI.qprep_cli import main, QprepError, QprepAtomLibMissingError
from QligFEP.CLI.cog_cli import MolecularCOG
import argparse
import os

prot_root_paths = sorted([p.absolute() for p in Path().glob("*/protein/")])
cwd = Path.cwd()

for _path in prot_root_paths:
    print("Processing: ", _path.parent.name)
    if _path.parent.name == "pfkfb3":
        print(f"{_path.parent.name} not in the repo yet as of 20241010")
        continue
    processed_pdbs = []
    protfile = _path / "protein_reindexed_renamed.pdb"
    qprep_dir = _path / "qprep"
    if not qprep_dir.exists():
        qprep_dir.mkdir()
    shutil.copy(protfile, qprep_dir / "protein.pdb")

    # change the working directory to the qprep directory
    os.chdir(qprep_dir)

    # calculate the center of geometry for the ligands
    ligpath = _path.parent / "ligands/ligands.sdf"
    cog = MolecularCOG(ligpath)
    coords_str = cog()
    coordinates = [n for n in coords_str.strip("[]").split()]

    args = argparse.Namespace()
    args.log_level = "info"
    args.input_pdb_file = "protein.pdb"
    args.FF = "AMBER14sb"
    args.cog = coordinates
    args.sphereradius = 25
    args.cysbond = "auto"
    args.solvent_pack = 3.0

    try:
        main(args)
    except QprepError as e:
        print("Qprep Error: ", e)
        pass
        # continue
    except QprepAtomLibMissingError as e:
        print("Qprep atomlib missing:", e)
        continue

os.chdir(cwd)



Processing:  bace


STOP qprep ended normally
[32m2024-10-11 16:32:20[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m235[0m - [1mqprep run finished. Check the output `qprep.out` for more information.[0m
[32m2024-10-11 16:32:20[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m258[0m - [1mwater.pdb file created.[0m


Processing:  cdk2


STOP qprep ended normally
[32m2024-10-11 16:32:21[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m235[0m - [1mqprep run finished. Check the output `qprep.out` for more information.[0m
[32m2024-10-11 16:32:21[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m258[0m - [1mwater.pdb file created.[0m
[32m2024-10-11 16:32:21[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m278[0m - [1mAll water molecules are inside the sphere radius.[0m


Processing:  cdk8


STOP qprep ended normally
[32m2024-10-11 16:32:22[0m | [31m[1mERROR   [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mqprep_error_check[0m:[36m65[0m - [31m[1mErrors found in qprep output file /zfsdata/data/david/qligfepv2-BenchmarkExperiments/startFiles/cdk8/protein/qprep/qprep.out. Please check if the amino acids in your pdb file match the residue & atom conventions on the forcefield .lib & .prm files:
/zfsdata/data/david/Q/src/QligFEP/FF/AMBER14sb.prm & /zfsdata/data/david/Q/src/QligFEP/FF/AMBER14sb.lib[0m
[32m2024-10-11 16:32:22[0m | [31m[1mERROR   [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mqprep_error_check[0m:[36m65[0m - [31m[1mErrors found in qprep output file /zfsdata/data/david/qligfepv2-BenchmarkExperiments/startFiles/cdk8/protein/qprep/qprep.out. Please check if the amino acids in your pdb file match the residue & atom conventions on the forcefield .lib & .prm files:
/zfsdata/data/david/Q/src/QligFEP/FF/AMBER14sb.prm & /zfsdata/data/david/Q/src/QligFEP/FF/AMBER1

Qprep Error:  {'>>>>> ERROR: Too many atoms in residue ASH    366\n>>>>> ERROR: The check of the PDB file failed.'}
Processing:  cmet


STOP qprep ended normally
[32m2024-10-11 16:32:23[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m235[0m - [1mqprep run finished. Check the output `qprep.out` for more information.[0m
[32m2024-10-11 16:32:23[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m258[0m - [1mwater.pdb file created.[0m


Processing:  eg5


STOP qprep ended normally
[32m2024-10-11 16:32:24[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m235[0m - [1mqprep run finished. Check the output `qprep.out` for more information.[0m
[32m2024-10-11 16:32:24[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m258[0m - [1mwater.pdb file created.[0m


Processing:  hif2a


STOP qprep ended normally
[32m2024-10-11 16:32:25[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m235[0m - [1mqprep run finished. Check the output `qprep.out` for more information.[0m
[32m2024-10-11 16:32:25[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m258[0m - [1mwater.pdb file created.[0m
[32m2024-10-11 16:32:25[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m278[0m - [1mAll water molecules are inside the sphere radius.[0m


Processing:  jnk1


STOP qprep ended normally
[32m2024-10-11 16:32:25[0m | [31m[1mERROR   [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mqprep_error_check[0m:[36m65[0m - [31m[1mErrors found in qprep output file /zfsdata/data/david/qligfepv2-BenchmarkExperiments/startFiles/jnk1/protein/qprep/qprep.out. Please check if the amino acids in your pdb file match the residue & atom conventions on the forcefield .lib & .prm files:
/zfsdata/data/david/Q/src/QligFEP/FF/AMBER14sb.prm & /zfsdata/data/david/Q/src/QligFEP/FF/AMBER14sb.lib[0m
[32m2024-10-11 16:32:25[0m | [31m[1mERROR   [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mqprep_error_check[0m:[36m65[0m - [31m[1mErrors found in qprep output file /zfsdata/data/david/qligfepv2-BenchmarkExperiments/startFiles/jnk1/protein/qprep/qprep.out. Please check if the amino acids in your pdb file match the residue & atom conventions on the forcefield .lib & .prm files:
/zfsdata/data/david/Q/src/QligFEP/FF/AMBER14sb.prm & /zfsdata/data/david/Q/src/QligFEP/FF/AMBER1

Qprep Error:  {'>>>>> ERROR: Too many atoms in residue HIP    358\n>>>>> ERROR: Too many atoms in residue PRO    359\n>>>>> ERROR: Too many atoms in residue PHE    368\n>>>>> ERROR: The check of the PDB file failed.'}
Processing:  mcl1


STOP qprep ended normally
[32m2024-10-11 16:32:26[0m | [31m[1mERROR   [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mqprep_error_check[0m:[36m65[0m - [31m[1mErrors found in qprep output file /zfsdata/data/david/qligfepv2-BenchmarkExperiments/startFiles/mcl1/protein/qprep/qprep.out. Please check if the amino acids in your pdb file match the residue & atom conventions on the forcefield .lib & .prm files:
/zfsdata/data/david/Q/src/QligFEP/FF/AMBER14sb.prm & /zfsdata/data/david/Q/src/QligFEP/FF/AMBER14sb.lib[0m
[32m2024-10-11 16:32:26[0m | [31m[1mERROR   [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mqprep_error_check[0m:[36m65[0m - [31m[1mErrors found in qprep output file /zfsdata/data/david/qligfepv2-BenchmarkExperiments/startFiles/mcl1/protein/qprep/qprep.out. Please check if the amino acids in your pdb file match the residue & atom conventions on the forcefield .lib & .prm files:
/zfsdata/data/david/Q/src/QligFEP/FF/AMBER14sb.prm & /zfsdata/data/david/Q/src/QligFEP/FF/AMBER1

Qprep Error:  {'>>>>> ERROR: Too many atoms in residue GLY      1\n>>>>> ERROR: The check of the PDB file failed.'}
Processing:  p38


STOP qprep ended normally
[32m2024-10-11 16:32:27[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m235[0m - [1mqprep run finished. Check the output `qprep.out` for more information.[0m
[32m2024-10-11 16:32:27[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m258[0m - [1mwater.pdb file created.[0m


Processing:  pfkfb3
pfkfb3 not in the repo yet as of 20241010
Processing:  ptp1b


STOP qprep ended normally
[32m2024-10-11 16:32:28[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m235[0m - [1mqprep run finished. Check the output `qprep.out` for more information.[0m
[32m2024-10-11 16:32:28[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m258[0m - [1mwater.pdb file created.[0m


Processing:  shp2


STOP qprep ended normally
[32m2024-10-11 16:32:29[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m235[0m - [1mqprep run finished. Check the output `qprep.out` for more information.[0m
[32m2024-10-11 16:32:29[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m258[0m - [1mwater.pdb file created.[0m
[32m2024-10-11 16:32:29[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m278[0m - [1mAll water molecules are inside the sphere radius.[0m


Processing:  syk


STOP qprep ended normally
[32m2024-10-11 16:32:30[0m | [31m[1mERROR   [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mqprep_error_check[0m:[36m72[0m - [31m[1mErrors found in qprep output file /zfsdata/data/david/qligfepv2-BenchmarkExperiments/startFiles/syk/protein/qprep/qprep.out. Your protein file likely contains atoms that are not present in the forcefield's .lib & .prm files:, 
/zfsdata/data/david/Q/src/QligFEP/FF/AMBER14sb.prm & /zfsdata/data/david/Q/src/QligFEP/FF/AMBER14sb.lib[0m


Qprep atomlib missing: {'>>> Atom CH3  in residue no.   273 not found in library entry for NMA '}
Processing:  thrombin


STOP qprep ended normally
[32m2024-10-11 16:32:30[0m | [31m[1mERROR   [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mqprep_error_check[0m:[36m65[0m - [31m[1mErrors found in qprep output file /zfsdata/data/david/qligfepv2-BenchmarkExperiments/startFiles/thrombin/protein/qprep/qprep.out. Please check if the amino acids in your pdb file match the residue & atom conventions on the forcefield .lib & .prm files:
/zfsdata/data/david/Q/src/QligFEP/FF/AMBER14sb.prm & /zfsdata/data/david/Q/src/QligFEP/FF/AMBER14sb.lib[0m
[32m2024-10-11 16:32:30[0m | [31m[1mERROR   [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mqprep_error_check[0m:[36m65[0m - [31m[1mErrors found in qprep output file /zfsdata/data/david/qligfepv2-BenchmarkExperiments/startFiles/thrombin/protein/qprep/qprep.out. Please check if the amino acids in your pdb file match the residue & atom conventions on the forcefield .lib & .prm files:
/zfsdata/data/david/Q/src/QligFEP/FF/AMBER14sb.prm & /zfsdata/data/david/Q/src/QligFEP/F

Qprep Error:  {'>>>>> ERROR: Too many atoms in residue ILE     31\n>>>>> ERROR: Too many atoms in residue GLY    290\n>>>>> ERROR: Residue number   324 is of unknown type  NA \n>>>>> ERROR: The check of the PDB file failed.'}
Processing:  tnks2


STOP qprep ended normally
[32m2024-10-11 16:32:31[0m | [31m[1mERROR   [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mqprep_error_check[0m:[36m65[0m - [31m[1mErrors found in qprep output file /zfsdata/data/david/qligfepv2-BenchmarkExperiments/startFiles/tnks2/protein/qprep/qprep.out. Please check if the amino acids in your pdb file match the residue & atom conventions on the forcefield .lib & .prm files:
/zfsdata/data/david/Q/src/QligFEP/FF/AMBER14sb.prm & /zfsdata/data/david/Q/src/QligFEP/FF/AMBER14sb.lib[0m
[32m2024-10-11 16:32:31[0m | [31m[1mERROR   [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mqprep_error_check[0m:[36m65[0m - [31m[1mErrors found in qprep output file /zfsdata/data/david/qligfepv2-BenchmarkExperiments/startFiles/tnks2/protein/qprep/qprep.out. Please check if the amino acids in your pdb file match the residue & atom conventions on the forcefield .lib & .prm files:
/zfsdata/data/david/Q/src/QligFEP/FF/AMBER14sb.prm & /zfsdata/data/david/Q/src/QligFEP/FF/AMBE

Qprep Error:  {'>>>>> ERROR: Too many atoms in residue GLY      1\n>>>>> ERROR: Too many atoms in residue MET    163\n>>>>> ERROR: The check of the PDB file failed.'}
Processing:  tyk2


STOP qprep ended normally
[32m2024-10-11 16:32:31[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m235[0m - [1mqprep run finished. Check the output `qprep.out` for more information.[0m
[32m2024-10-11 16:32:31[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m258[0m - [1mwater.pdb file created.[0m
[32m2024-10-11 16:32:31[0m | [1mINFO    [0m | [36mQligFEP.CLI.qprep_cli[0m:[36mmain[0m:[36m278[0m - [1mAll water molecules are inside the sphere radius.[0m


# TODO:
After running qprep, remove the water molecules from `protein.pdb`.

# Checking for qprep errors:

In [10]:
import subprocess

outqprep_pattern = "*/protein/qprep/qprep.out"
p = subprocess.Popen(
    " ".join(["grep", "-winr", "error", outqprep_pattern]),
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    shell=True,
    text=True,
)
stdout, stderr = p.communicate()
stdout.split("\n")

['']

# Move perturbation files to directory

In [11]:
from pathlib import Path
from joblib import Parallel, delayed
import shutil


def copyfile(src: Path, dest: Path):
    try:
        shutil.copy(src, dest / src.name)
    except FileNotFoundError as e:
        print(f"error for src: {src}:\n{e}")

In [12]:
from QligFEP.chemIO import MoleculeIO

targets = [p for p in sorted(Path().glob("*/")) if p.is_dir()]

destpath = Path.cwd().parent / "perturbations"
for target in targets:
    ligands_sdf = sorted((target / "ligands/").glob("*.sdf"))
    ligands_pdb = sorted((target / "ligands/").glob("*.pdb"))
    ligands_lib = sorted((target / "ligands/").glob("*.lib"))
    ligands_prm = sorted((target / "ligands/").glob("*.prm"))
    lomap_file = target / "ligands/mapping.json"
    protein_file = target / "protein/qprep/protein.pdb"
    noHOH_protfile = target / "protein/qprep/protein_noHOH.pdb"
    water_file = target / "protein/qprep/water.pdb"
    perturbation_root = destpath / target.name
    if not perturbation_root.exists():
        perturbation_root.mkdir(parents=True, exist_ok=True)
    allfiles = (
        ligands_pdb
        + ligands_lib
        + ligands_prm
        + [
            lomap_file,
            (noHOH_protfile if noHOH_protfile.exists() else protein_file),
            water_file,
        ]
    )
    molio = MoleculeIO(str(list(ligands_sdf)[0]))
    molio.write_to_single_sdf(
        str(perturbation_root / "ligands.sdf")
    )  # reindex hydrogens
    Parallel(n_jobs=6, backend="threading")(
        delayed(copyfile)(src, perturbation_root) for src in allfiles
    )
    if (
        perturbation_root / "protein_noHOH.pdb"
    ).exists():  # rename it to just protein.pdb
        shutil.move(
            perturbation_root / "protein_noHOH.pdb", perturbation_root / "protein.pdb"
        )

[32m2024-09-10 10:49:51[0m | [1mINFO    [0m | [36mQligFEP.chemIO[0m:[36mwrite_to_single_sdf[0m:[36m164[0m - [1m`self.molecules` written to /zfsdata/data/david/qligfepv2-BenchmarkExperiments/perturbations/cdk2/ligands.sdf[0m
[32m2024-09-10 10:49:52[0m | [1mINFO    [0m | [36mQligFEP.chemIO[0m:[36mwrite_to_single_sdf[0m:[36m164[0m - [1m`self.molecules` written to /zfsdata/data/david/qligfepv2-BenchmarkExperiments/perturbations/cdk8/ligands.sdf[0m
[32m2024-09-10 10:49:53[0m | [1mINFO    [0m | [36mQligFEP.chemIO[0m:[36mwrite_to_single_sdf[0m:[36m164[0m - [1m`self.molecules` written to /zfsdata/data/david/qligfepv2-BenchmarkExperiments/perturbations/cmet/ligands.sdf[0m
[32m2024-09-10 10:49:54[0m | [1mINFO    [0m | [36mQligFEP.chemIO[0m:[36mwrite_to_single_sdf[0m:[36m164[0m - [1m`self.molecules` written to /zfsdata/data/david/qligfepv2-BenchmarkExperiments/perturbations/eg5/ligands.sdf[0m
[32m2024-09-10 10:49:56[0m | [1mINFO    [0m | [36mQl