#### This script uses LocalColabFold to repair the PDB files in AbAGym, specifically:
* Add missing side-chain atoms
* Add missing residues
* Mutate residues that are incorrect in the original files to the correct ones  

In [2]:
import os
import warnings
from pathlib import Path

import pandas as pd
from Bio import SeqIO
from Bio.SeqUtils import seq3
from pdbfixer import PDBFixer

# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")

# The project root is the parent of the current working directory. The two
# sub-directories below are used later as the source of RCSB reference files
# and as the output location for repaired files, respectively.
working_dir = Path.cwd().parent
rcsb_dir = working_dir.joinpath("RCSB_PDBs")
repair_dir = working_dir.joinpath("Problematic_PDBs")

##### Produce a table of metadata

In [3]:
# Collect one metadata row per file found in each project's
# "DMS_interface_data" directory. Rows are gathered in a plain list and turned
# into a DataFrame once, instead of enlarging the frame on every iteration.
metadata_rows = []

# sorted() makes the row order reproducible: Path.iterdir() yields entries in
# arbitrary, filesystem-dependent order.
for data_dir in sorted(working_dir.iterdir()):
    # Project directories have exactly two underscores and no dot in their
    # name; this excludes dirs like "src".
    if data_dir.name.count("_") != 2 or "." in data_dir.name:
        continue
    dms_dir = data_dir / "DMS_interface_data"
    if not ((data_dir / "PDB_structures").exists() and dms_dir.exists()):
        continue
    for fpath in sorted(dms_dir.iterdir()):
        # The project id is the file name without its last three
        # "_"-separated fields; the last field of the project id is stored
        # as the RCSB id.
        project_id = "_".join(fpath.name.split("_")[:-3])
        rcsb_id = project_id.split("_")[-1]
        metadata_rows.append(
            [
                project_id,
                str(fpath),
                str(data_dir / f"PDB_structures/{project_id}.pdb"),
                rcsb_id,
            ]
        )

metatbl = pd.DataFrame(
    metadata_rows,
    columns=["project", "dms_table", "dms_pdb", "rcsb_id"],
)
metatbl.to_csv(working_dir / "DMS_metadata.csv", index=False)

##### Check PDB files

In [4]:
def get_seqres_atom_seq_mapper(rcsb_file):
    """Map each chain's ATOM-derived sequence to its SEQRES sequence.

    Both sequences are read from the same file with Biopython's
    ``pdb-seqres`` and ``pdb-atom`` parsers and paired by chain identifier,
    taken as the last character of each record id.

    Args:
        rcsb_file: Path (or handle) of a PDB file containing SEQRES records.

    Returns:
        dict mapping the ATOM-derived sequence (str) of a chain to the SEQRES
        sequence (str) of the same chain. Chains listed in SEQRES that have no
        ATOM-derived sequence are skipped, because there is nothing to key
        them by. If several chains share one ATOM-derived sequence, the last
        chain read wins.
    """
    seqres_by_chain = {
        r.id[-1]: str(r.seq) for r in SeqIO.parse(rcsb_file, "pdb-seqres")
    }
    atom_by_chain = {
        r.id[-1]: str(r.seq) for r in SeqIO.parse(rcsb_file, "pdb-atom")
    }
    # Only pair chains present in both views; indexing atom_by_chain directly
    # would raise KeyError for a chain that has SEQRES but no ATOM records.
    return {
        atom_by_chain[chain]: seqres_seq
        for chain, seqres_seq in seqres_by_chain.items()
        if chain in atom_by_chain
    }


def get_seqres_lines(chain, seq):
    """Format a one-letter sequence as PDB SEQRES records for one chain.

    Records follow the wwPDB fixed-column layout: record name in columns 1-6,
    serial number right-justified in columns 8-10, chain id in column 12,
    total residue count right-justified in columns 14-17, and up to 13
    three-letter residue names starting at column 20.

    Args:
        chain: Single-character chain identifier.
        seq: Sequence in one-letter amino-acid codes.

    Returns:
        The SEQRES records joined by newlines, without a trailing newline.
        An empty sequence yields an empty string.
    """
    residues_per_record = 13
    codes = [seq3(aa).upper() for aa in seq]
    total = len(codes)

    records = []
    # Stepping through the sequence in strides of 13 handles every length
    # uniformly: sequences shorter than 13 residues get their single record
    # (numbered 1), and lengths that are an exact multiple of 13 do not get a
    # trailing record with no residues.
    for serial, start in enumerate(range(0, total, residues_per_record), start=1):
        chunk = codes[start:start + residues_per_record]
        records.append(
            f"SEQRES {serial:>3} {chain} {total:>4}  {' '.join(chunk)}"
        )
    return "\n".join(records)


def write_seqres(sample_file, rcsb_file, outdir):
    """Prepend SEQRES records to a PDB file and save the result in *outdir*.

    For every chain in ``sample_file`` the ATOM-derived sequence is looked up
    among the ATOM-derived sequences of ``rcsb_file``; the SEQRES sequence of
    the matching RCSB chain is then written as SEQRES records in front of the
    unchanged content of ``sample_file``.

    Args:
        sample_file: Path of the PDB file to augment.
        rcsb_file: Path of the reference PDB file that contains SEQRES records.
        outdir: Directory for the output file; created if it does not exist.

    Returns:
        Path of the written file as a string, ``"{outdir}/{sample file name}"``.

    Raises:
        KeyError: If the ATOM-derived sequence of a chain in ``sample_file``
            matches no ATOM-derived sequence in ``rcsb_file``.
    """
    sample_atom_seqs = {
        r.id[-1]: str(r.seq) for r in SeqIO.parse(sample_file, "pdb-atom")
    }
    rcsb_atom_to_seqres = get_seqres_atom_seq_mapper(rcsb_file)

    seqres_blocks = []
    for chain, atom_seq in sample_atom_seqs.items():
        try:
            full_seq = rcsb_atom_to_seqres[atom_seq]
        except KeyError:
            # Name the offending chain and files; the bare KeyError would only
            # show the (long) sequence string.
            raise KeyError(
                f"chain {chain!r} of {sample_file}: ATOM sequence not found "
                f"among the chains of {rcsb_file}"
            ) from None
        seqres_blocks.append(get_seqres_lines(chain, full_seq))
    seqres_text = "\n".join(seqres_blocks)

    # Merge structure and seqres
    with open(sample_file, "r") as fp:
        pdb_content = fp.read()
    Path(outdir).mkdir(parents=True, exist_ok=True)
    outfile = f"{outdir}/{Path(sample_file).name}"
    with open(outfile, "w") as fp:
        fp.write(seqres_text + "\n" + pdb_content)
    return outfile


# Write a SEQRES-augmented copy of every structure listed in the metadata table
# and record where each copy was saved.
for row in metatbl.index:
    sample_pdb = metatbl.loc[row, "dms_pdb"]
    reference_pdb = f"{rcsb_dir}/{metatbl.loc[row, 'rcsb_id']}.pdb"
    metatbl.loc[row, "repaired_pdb"] = write_seqres(
        sample_pdb, reference_pdb, str(repair_dir)
    )

# Persist the table with the new "repaired_pdb" column.
metatbl.to_csv(working_dir / "DMS_metadata.csv", index=False)

In [7]:
# Residue substitutions per project, looked up by the "project" column of the
# metadata table. Each entry is a (residue number, one-letter code, one-letter
# code) tuple.
# NOTE(review): presumably (position, residue found in the file, correct
# residue) -- confirm the order of the two letters against the mutation step.
pdbs_to_mutate = {
    "BD55-3152_7wr8": [
        (339, "D", "G"),
    ],
    "BD55-5840_7wrz": [
        (339, "D", "G"),
    ],
    "CAB-A17_8c2r": [
        (417, "N", "K"),
        (496, "S", "G"),
        (477, "N", "S"),
        (501, "Y", "N"),
        (505, "H", "Y"),
        (493, "R", "Q"),
    ],
    "D441_1mlc": [
        (32, "Y", "N"),
        (31, "T", "N"),
        (50, "E", "Y"),
        (33, "W", "L"),
    ],
    "EDE1-C8_5lbs": [
        (317, "I", "V"),
    ],
    "ZKA64_5kvf": [
        (343, "A", "V"),
    ],
    "ZV-67_5kvg": [
        (393, "E", "D"),
        (317, "I", "V"),
    ],
}


def check_seq_entirety(pdbfixer: PDBFixer):
    """Grade how complete the residue sequence of a structure is.

    Every key of ``pdbfixer.missingResidues`` is a ``(chain index, position)``
    pair. An entry counts as terminal when its position is 0 or equals the
    number of residues in that chain, and as internal otherwise.

    Args:
        pdbfixer: A PDBFixer whose ``missingResidues`` attribute has already
            been populated.

    Returns:
        0 if there are no missing-residue entries, 1 if all entries are
        terminal, 2 if at least one entry is internal.
    """
    gaps = pdbfixer.missingResidues
    all_chains = list(pdbfixer.topology.chains())

    def grade(chain_idx, posit):
        # 1 = entry at either end of the chain, 2 = entry inside the chain.
        n_residues = len(list(all_chains[chain_idx].residues()))
        return 1 if posit in (0, n_residues) else 2

    # The worst grade decides; with no entries at all the result is 0.
    return max((grade(chain_idx, posit) for chain_idx, posit in gaps), default=0)


# Inspect every file listed under "repaired_pdb" with PDBFixer and record what
# it reports as missing, together with the substitutions configured for the
# project.
for row in metatbl.index:
    structure = PDBFixer(metatbl.loc[row, "repaired_pdb"])
    structure.findMissingResidues()
    structure.findMissingAtoms()

    metatbl.loc[row, "seq_entirety"] = check_seq_entirety(structure)
    # 1 when PDBFixer reported any missing atoms, 0 otherwise.
    metatbl.loc[row, "atom_entirety"] = int(bool(structure.missingAtoms))

    # Encode each (number, letter, letter) tuple as letter-number-letter,
    # e.g. "D339G", and join the entries with commas.
    project_mutations = pdbs_to_mutate.get(metatbl.loc[row, "project"], [])
    metatbl.loc[row, "mutation_dict"] = ",".join(
        f"{first}{number}{second}" for number, first, second in project_mutations
    )

metatbl.to_csv(working_dir / "DMS_metadata.csv", index=False)

##### Get target sequences