# Imports

In [86]:
from pathlib import Path
from QligFEP.pdb_utils import nest_pdb, unnest_pdb, read_pdb_to_dataframe, write_dataframe_to_pdb
from QligFEP.CLI.pdb_to_amber import asp_search
import pandas as pd

# Define functions

In [87]:
# Need to remove Hs from GLY
# Need to cap the last residue

# Atom-name corrections, keyed by residue name: {residue: {old_atom_name: new_atom_name}}.
# Several entries are chained (e.g. "HA1" -> "HA2" together with "HA2" -> "HA3"), so a
# residue's map must be applied to each atom exactly once; applying it repeatedly would
# collapse both hydrogens onto the same name.
rename_mapping = {
    'ARG': {
        "1HH1": "HH11",
        "2HH1": "HH12",
        "1HH2": "HH21",
        "2HH2": "HH22",
        "HA2": "HA3",
        "HA1": "HA2",
        "HB2": "HB3",
        "HB1": "HB2",
        "HG2": "HG3",
        "HG1": "HG2",
        "HD2": "HD3",
        "HD1": "HD2",
    },
    'ILE': {
        "1HG2": "HG21",
        "2HG2": "HG22",
        "3HG2": "HG23",
        # the two HG1 hydrogens are numbered 2 and 3 (old index + 1) in the target naming
        "1HG1": "HG12",
        "2HG1": "HG13",
        # CD and its hydrogens gain the branch index 1
        "CD": "CD1",
        "HD1": "HD11",
        "HD2": "HD12",
        "HD3": "HD13",
    },
    'THR': {
        "1HG2": "HG21",
        "2HG2": "HG22",
        "3HG2": "HG23",
    },
    "LEU": {
        "1HD1": "HD11",
        "2HD1": "HD12",
        "3HD1": "HD13",
        "1HD2": "HD21",
        "2HD2": "HD22",
        "3HD2": "HD23",
        "HA2": "HA3",
        "HA1": "HA2",
        "HB2": "HB3",
        "HB1": "HB2",
    },
    "GLN": {
        "1HE2": "HE21",
        "2HE2": "HE22",
        "HA2": "HA3",
        "HA1": "HA2",
        "HB2": "HB3",
        "HB1": "HB2",
        "HG2": "HG3",
        "HG1": "HG2",
    },
    "GLY": {
        "HA2": "HA3",
        "HA1": "HA2",
    },
    "VAL": {
        "1HG1": "HG11",
        "2HG1": "HG12",
        "3HG1": "HG13",
        "1HG2": "HG21",
        "2HG2": "HG22",
        "3HG2": "HG23",
    },
    "SER": {
        "HB2": "HB3",
        "HB1": "HB2",
    },
    "PHE": {
        "HB2": "HB3",
        "HB1": "HB2",
    },
    "GLU": {
        "HB2": "HB3",
        "HB1": "HB2",
        "HG2": "HG3",
        "HG1": "HG2",
    },
    "ASP": {
        "HB2": "HB3",
        "HB1": "HB2",
    },
    "ASH": {
        "HB2": "HB3",
        "HB1": "HB2",
    },
    "ASN": {
        "HB2": "HB3",
        "HB1": "HB2",
        "1HD2": "HD21",
        "2HD2": "HD22",
    },
    "LYS": {
        "HB2": "HB3",
        "HB1": "HB2",
        "HG2": "HG3",
        "HG1": "HG2",
        "HD2": "HD3",
        "HD1": "HD2",
        "HE2": "HE3",
        "HE1": "HE2",
    },
    "PRO": {
        "HB2": "HB3",
        "HB1": "HB2",
        "HD2": "HD3",
        "HD1": "HD2",
        "HG2": "HG3",
        "HG1": "HG2",
    },
    "MET": {
        "HB2": "HB3",
        "HB1": "HB2",
        "HG2": "HG3",
        "HG1": "HG2",
    },
    "TYR": {
        "HB2": "HB3",
        "HB1": "HB2",
    },
    "HIE": {
        "HB2": "HB3",
        "HB1": "HB2",
    },
    "HIP": {
        "HB2": "HB3",
        "HB1": "HB2",
    },
    "HID": {
        "HB2": "HB3",
        "HB1": "HB2",
    },
    "TRP": {
        "HB2": "HB3",
        "HB1": "HB2",
    },
    "CYS": {
        "HB2": "HB3",
        "HB1": "HB2",
    },
    "CYX": {
        "HB2": "HB3",
        "HB1": "HB2",
    },
}

def correct_amino_acid_atom_names(npdb_i, resname, rename_mapping):
    """Correct the atom names of a single residue according to the mapping provided.

    The residue's map is applied exactly once to every line. The maps contain
    chained entries (e.g. ``HA1 -> HA2`` next to ``HA2 -> HA3``), so applying the
    map more than once would rename ``HA1`` all the way to ``HA3`` and leave the
    residue with two atoms of the same name.

    Args:
        npdb_i: nested pdb data structure for a single residue (its PDB lines)
        resname: the residue name
        rename_mapping: a dictionary mapping residue names to
            ``{old_atom_name: new_atom_name}`` dictionaries

    Returns:
        A new list with every line passed through ``extract_and_replace`` when the
        residue has a non-empty entry in ``rename_mapping``; otherwise ``npdb_i``
        itself, untouched.
    """
    rename_atom_map = rename_mapping.get(resname)
    if not rename_atom_map:
        # unknown residue (or nothing to rename): leave the lines exactly as they are
        return npdb_i
    # single pass; extract_and_replace also re-aligns the atom-name field of every line
    return [extract_and_replace(line, rename_atom_map) for line in npdb_i]

def extract_and_replace(line, rename_mapping):
    """Extract the atom name of a PDB line and replace it with its new name.

    Args:
        line: a single PDB record line
        rename_mapping: dictionary of ``{old_atom_name: new_atom_name}``; names that
            are not in the dictionary are kept

    Returns:
        The line with its atom-name field (string slice ``[12:16]``) rewritten:
        four-character names fill the whole field, shorter names get one leading
        space and are left-aligned. Lines whose record type carries no atom name
        in that field are returned unchanged.
    """
    # PDB record types that store an atom name in columns 13-16. Rewriting that slice
    # on any other line (TER, END, CONECT, REMARK, ...) would corrupt it, e.g. a bare
    # "TER\n" would come back with four spaces appended after its newline.
    if not line.startswith(("ATOM", "HETATM", "ANISOU", "SIGATM", "SIGUIJ")):
        return line

    atom_name = line[12:16].strip()
    new_atom_name = rename_mapping.get(atom_name, atom_name)
    if len(new_atom_name) == 4:
        atom_field = new_atom_name
    else:
        # shorter names: one leading space, then the name left-aligned in three columns
        atom_field = f" {new_atom_name:<3}"
    return line[:12] + atom_field + line[16:]

def fix_pdb(pdb_path:Path, rename_mapping):
    """Write a copy of a PDB file with per-residue atom-name corrections applied.

    The file is read, grouped with ``nest_pdb``, passed through ``asp_search``
    (both imported from QligFEP; see there for their exact semantics), and every
    residue group is then run through ``correct_amino_acid_atom_names``.

    Args:
        pdb_path: path of the input PDB file
        rename_mapping: dictionary mapping residue names to
            ``{old_atom_name: new_atom_name}`` dictionaries

    Returns:
        The renamed PDB lines, as returned by ``unnest_pdb``. The same lines are
        written to ``<stem>_renamed.pdb`` next to the input file.
    """
    out_path = pdb_path.with_name(f"{pdb_path.stem}_renamed.pdb")
    with open(pdb_path) as src:
        raw_lines = src.readlines()

    residues = asp_search(nest_pdb(raw_lines))

    for idx, residue in enumerate(residues):
        # the residue name is taken from the last line of the group (slice [17:20])
        residue_name = residue[-1][17:20]
        residues[idx] = correct_amino_acid_atom_names(residue, residue_name, rename_mapping)
    renamed_lines = unnest_pdb(residues)

    with open(out_path, "w") as dst:
        dst.writelines(renamed_lines)
    return renamed_lines

def cap_and_reindex_pdb(inp_pdb: Path):
    """Clean up both termini of a PDB file and renumber its atoms, in place.

    Steps, in order:
      1. If the first residue is GLY, its ``H2``/``H3`` atoms are removed and
         ``H1`` is renamed to ``H`` (extra N-terminal hydrogens not covered in
         our library files).
      2. If the last residue is named ``ILE`` or ``NME``, that residue is removed.
      3. ``atom_serial_number`` is rewritten as 1..N.
      4. The result overwrites ``inp_pdb``.

    NOTE(review): residues are selected by ``residue_seq_number`` only, so a
    multi-chain file that reuses the same number in another chain would have those
    atoms affected too - confirm the inputs are single-chain or uniquely numbered.

    Args:
        inp_pdb: path for the pdb file
    """
    atoms = read_pdb_to_dataframe(inp_pdb)

    # N-terminus: strip the extra hydrogens from a leading GLY
    if atoms['residue_name'].values[0] == 'GLY':
        first_resnum = atoms['residue_seq_number'].values[0]
        in_first = atoms['residue_seq_number'] == first_resnum
        is_extra_h = atoms['atom_name'].isin(['H2', 'H3'])
        head = atoms[in_first & ~is_extra_h].copy()
        head['atom_name'] = head['atom_name'].str.replace('H1', 'H')
        rest = atoms.drop(index=atoms.index[in_first])
        # the cleaned first residue goes back in front of everything else
        atoms = pd.concat([head, rest], ignore_index=True)

    # C-terminus: drop the final residue when it is ILE or NME
    last_resname = atoms['residue_name'].values[-1]
    last_resnum = atoms['residue_seq_number'].values[-1]
    if last_resname in ['ILE', 'NME']:
        in_last = atoms['residue_seq_number'] == last_resnum
        atoms = atoms.drop(index=atoms.index[in_last])

    # renumber the atoms consecutively, starting at 1
    atoms['atom_serial_number'] = range(1, len(atoms) + 1)
    write_dataframe_to_pdb(atoms, inp_pdb)
# Rename the files

In [84]:
# Collect every "<dir>/protein/protein.pdb" one level below the current working
# directory, in sorted order.
pdb_paths = sorted(Path().glob('*/protein/protein.pdb'))

In [85]:
# For every protein file: write the atom-renamed copy, then clean up that copy in place.
for pdb_path in pdb_paths:
    # fix_pdb writes "<stem>_renamed.pdb" next to the input file
    fix_pdb(pdb_path, rename_mapping)
    # with_stem rebuilds that same output path (Path.with_stem needs Python 3.9+)
    cap_and_reindex_pdb(pdb_path.with_stem(pdb_path.stem + '_renamed'))

[PosixPath('bace/protein/protein.pdb'),
 PosixPath('bace_hunt/protein/protein.pdb'),
 PosixPath('bace_p2/protein/protein.pdb'),
 PosixPath('cdk2/protein/protein.pdb'),
 PosixPath('cdk8/protein/protein.pdb'),
 PosixPath('cmet/protein/protein.pdb'),
 PosixPath('eg5/protein/protein.pdb'),
 PosixPath('galectin/protein/protein.pdb'),
 PosixPath('hif2a/protein/protein.pdb'),
 PosixPath('hunt/protein/protein.pdb'),
 PosixPath('jnk1/protein/protein.pdb'),
 PosixPath('mcl1/protein/protein.pdb'),
 PosixPath('p2/protein/protein.pdb'),
 PosixPath('p38/protein/protein.pdb'),
 PosixPath('pde10/protein/protein.pdb'),
 PosixPath('pde2/protein/protein.pdb'),
 PosixPath('pfkfb3/protein/protein.pdb'),
 PosixPath('ptp1b/protein/protein.pdb'),
 PosixPath('shp2/protein/protein.pdb'),
 PosixPath('syk/protein/protein.pdb'),
 PosixPath('thrombin/protein/protein.pdb'),
 PosixPath('tnks2/protein/protein.pdb'),
 PosixPath('tyk2/protein/protein.pdb')]

# Preparing data

Maybe I can already run qprep from the notebook, so that I get the COG of all the ligands and can then use them to prepare the water spheres of the respective systems!