In [None]:
import os
import warnings
from pathlib import Path

import requests
from Bio import SeqIO
from Bio.SeqUtils import seq3
from openmm.app import PDBFile
from pdbfixer import PDBFixer

# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# The project root is the parent of the current working directory; downloaded
# RCSB entries are kept in a "RCSB_PDBs" folder directly beneath it.
working_dir = Path.cwd().parent
rcsb_dir = working_dir.joinpath("RCSB_PDBs")
rcsb_dir.mkdir(parents=True, exist_ok=True)

#### Write SEQRES to ATOM-only PDBs 

##### 1) Collect PDB IDs

In [None]:
# Maps sample ID (structure file name up to the first ".") to its RCSB PDB ID
# and the path of the DMS structure file. The variable keeps its original
# spelling ("rscb") so any cell that already refers to it keeps working.
sample_to_rscb = {}
# sorted() makes the scan order (and therefore the displayed dict) reproducible;
# Path.iterdir() yields entries in arbitrary order.
for data_dir in sorted(working_dir.iterdir()):
    # Dataset folders have exactly two underscores and no dot in their name
    # (e.g. "HIV_JBloom_2017"); this excludes dirs like "src".
    if data_dir.name.count("_") != 2 or "." in data_dir.name:
        continue
    structures_dir = data_dir / "PDB_structures"
    if not structures_dir.is_dir():
        continue
    for fpath in sorted(structures_dir.iterdir()):
        # Skip sub-directories and hidden files (e.g. ".DS_Store"), which would
        # otherwise be registered as samples with a bogus or empty ID.
        if not fpath.is_file() or fpath.name.startswith("."):
            continue
        sample_id = fpath.name.split(".")[0]
        tokens = sample_id.split("_")
        # The RCSB ID is normally the last "_"-separated token, but names such
        # as "6xdg_REGN10933-only" put it first. Prefer the right-most token
        # that looks like a PDB ID (4 alphanumerics starting with a digit) and
        # fall back to the last token when none does.
        rcsb_id = next(
            (
                tok
                for tok in reversed(tokens)
                if len(tok) == 4 and tok[0].isdigit() and tok.isalnum()
            ),
            tokens[-1],
        )
        sample_to_rscb[sample_id] = {"rcsb": rcsb_id, "dms_pdb_file": str(fpath)}
sample_to_rscb

In [None]:
# Sample registry consumed by the download and SEQRES cells below.
# Each key is a sample ID; each value holds:
#   "rcsb"         - the RCSB entry ID whose full PDB file supplies the SEQRES records
#   "dms_pdb_file" - path of the DMS structure file for that sample
# NOTE(review): this looks like a hand-corrected copy of the `sample_to_rscb`
# output (e.g. the "6xdg_*-only" entries carry "6xdg") - confirm and keep in sync.
# NOTE(review): the paths are absolute and machine-specific; they only resolve on
# the original author's filesystem.
metadata = {
    "4edw": {
        "rcsb": "4edw",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/NGF_whiteheadlab_2018/PDB_structures/4edw.pdb"
    },
    "FC08_7dx4": {
        "rcsb": "7dx4",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2022/PDB_structures/FC08_7dx4.pdb"
    },
    "BD-812_7ezv": {
        "rcsb": "7ezv",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2022/PDB_structures/BD-812_7ezv.pdb"
    },
    "BD55-3372_7wro": {
        "rcsb": "7wro",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2022/PDB_structures/BD55-3372_7wro.pdb"
    },
    "BD-804_7eya": {
        "rcsb": "7eya",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2022/PDB_structures/BD-804_7eya.pdb"
    },
    "BD55-5840_7wrz": {
        "rcsb": "7wrz",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2022/PDB_structures/BD55-5840_7wrz.pdb"
    },
    "BD55-3152_7wr8": {
        "rcsb": "7wr8",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2022/PDB_structures/BD55-3152_7wr8.pdb"
    },
    "BD55-1239_7wrl": {
        "rcsb": "7wrl",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2022/PDB_structures/BD55-1239_7wrl.pdb"
    },
    "BD-836_7ezv": {
        "rcsb": "7ezv",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2022/PDB_structures/BD-836_7ezv.pdb"
    },
    "LY-CoV1404_7mmo": {
        "rcsb": "7mmo",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2022/PDB_structures/LY-CoV1404_7mmo.pdb"
    },
    "7y71": {
        "rcsb": "7y71",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2023/PDB_structures/7y71.pdb_not_clean"
    },
    "D441_1mlc": {
        "rcsb": "1mlc",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/lysozyme_fleishmanlab_2019/PDB_structures/D441_1mlc.pdb"
    },
    # NOTE(review): entry disabled by the author - presumably because the
    # structure is an mmCIF (.cif) file rather than PDB; confirm before re-enabling.
    # "5fuu": {
    #     "rcsb": "5fuu",
    #     "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/HIV_JBloom_2017/PDB_structures/5fuu.cif"
    # },
    "LY-CoV555_7kmg": {
        "rcsb": "7kmg",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021c/PDB_structures/LY-CoV555_7kmg.pdb"
    },
    "C105_6xcm": {
        "rcsb": "6xcm",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021c/PDB_structures/C105_6xcm.pdb"
    },
    "REGN10933_6xdg": {
        "rcsb": "6xdg",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021c/PDB_structures/REGN10933_6xdg.pdb"
    },
    "C135_7k8z": {
        "rcsb": "7k8z",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021c/PDB_structures/C135_7k8z.pdb"
    },
    "C002_7k8s": {
        "rcsb": "7k8s",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021c/PDB_structures/C002_7k8s.pdb"
    },
    "C110_7k8v": {
        "rcsb": "7k8v",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021c/PDB_structures/C110_7k8v.pdb"
    },
    "LY-CoV016_7c01": {
        "rcsb": "7c01",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021c/PDB_structures/LY-CoV016_7c01.pdb"
    },
    "C121_7k8x": {
        "rcsb": "7k8x",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021c/PDB_structures/C121_7k8x.pdb"
    },
    "REGN10987_6xdg": {
        "rcsb": "6xdg",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021c/PDB_structures/REGN10987_6xdg.pdb"
    },
    "C144_7k90": {
        "rcsb": "7k90",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021c/PDB_structures/C144_7k90.pdb"
    },
    "VRC01_5fyk": {
        "rcsb": "5fyk",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/HIV_JBloom_2019a/PDB_structures/VRC01_5fyk.pdb"
    },
    "PGT151_5fuu": {
        "rcsb": "5fuu",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/HIV_JBloom_2019a/PDB_structures/PGT151_5fuu.pdb"
    },
    "PG9_3u4e": {
        "rcsb": "3u4e",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/HIV_JBloom_2019a/PDB_structures/PG9_3u4e.pdb"
    },
    "3BNC117_5v8m": {
        "rcsb": "5v8m",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/HIV_JBloom_2019a/PDB_structures/3BNC117_5v8m.pdb"
    },
    "3BN-1074_5t3z": {
        "rcsb": "5t3z",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/HIV_JBloom_2019a/PDB_structures/3BN-1074_5t3z.pdb"
    },
    "PGT145_5v8l": {
        "rcsb": "5v8l",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/HIV_JBloom_2019a/PDB_structures/PGT145_5v8l.pdb"
    },
    "PGT121_5fyl": {
        "rcsb": "5fyl",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/HIV_JBloom_2019a/PDB_structures/PGT121_5fyl.pdb"
    },
    "ZKA64_5kvf": {
        "rcsb": "5kvf",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/zika_JBloom_2019/PDB_structures/ZKA64_5kvf.pdb"
    },
    "6xdg": {
        "rcsb": "6xdg",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021e/PDB_structures/6xdg.pdb"
    },
    "6xdg_REGN10933-only": {
        "rcsb": "6xdg",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021e/PDB_structures/6xdg_REGN10933-only.pdb"
    },
    "6xdg_REGN10987-only": {
        "rcsb": "6xdg",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021e/PDB_structures/6xdg_REGN10987-only.pdb"
    },
    "7c01": {
        "rcsb": "7c01",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021e/PDB_structures/7c01.pdb"
    },
    "AZD1061_7l7e": {
        "rcsb": "7l7e",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021a/PDB_structures/AZD1061_7l7e.pdb"
    },
    "AZD8895_7l7d": {
        "rcsb": "7l7d",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021a/PDB_structures/AZD8895_7l7d.pdb"
    },
    "FI6V_3ztn": {
        "rcsb": "3ztn",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/Influenza_JBloom_2018/PDB_structures/FI6V_3ztn.pdb"
    },
    "C179_4hlz": {
        "rcsb": "4hlz",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/Influenza_JBloom_2018/PDB_structures/C179_4hlz.pdb"
    },
    "S139_4gms": {
        "rcsb": "4gms",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/Influenza_JBloom_2018/PDB_structures/S139_4gms.pdb"
    },
    "S2X259_7m7w": {
        "rcsb": "7m7w",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021d/PDB_structures/S2X259_7m7w.pdb"
    },
    "S309_7r6w": {
        "rcsb": "7r6w",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021d/PDB_structures/S309_7r6w.pdb"
    },
    "S2D106_7r7n": {
        "rcsb": "7r7n",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021d/PDB_structures/S2D106_7r7n.pdb"
    },
    "S2H13_7jv6": {
        "rcsb": "7jv6",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021d/PDB_structures/S2H13_7jv6.pdb"
    },
    "S2H97_7m7w": {
        "rcsb": "7m7w",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021d/PDB_structures/S2H97_7m7w.pdb"
    },
    "S304_7jx3": {
        "rcsb": "7jx3",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021d/PDB_structures/S304_7jx3.pdb"
    },
    "S2H14_7jx3": {
        "rcsb": "7jx3",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021d/PDB_structures/S2H14_7jx3.pdb"
    },
    "S2E12_7r6x": {
        "rcsb": "7r6x",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021d/PDB_structures/S2E12_7r6x.pdb"
    },
    "S2X35_7r6w": {
        "rcsb": "7r6w",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2021d/PDB_structures/S2X35_7r6w.pdb"
    },
    "106E6_6n1w": {
        "rcsb": "6n1w",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/HIV_JBloom_2019b/PDB_structures/106E6_6n1w.pdb"
    },
    "OPV20_6osy": {
        "rcsb": "6osy",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/HIV_JBloom_2019b/PDB_structures/OPV20_6osy.pdb"
    },
    "DF1W314_6mph": {
        "rcsb": "6mph",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/HIV_JBloom_2019b/PDB_structures/DF1W314_6mph.pdb"
    },
    "17D4_6n1v": {
        "rcsb": "6n1v",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/HIV_JBloom_2019b/PDB_structures/17D4_6n1v.pdb"
    },
    "110D12_6mph": {
        "rcsb": "6mph",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/HIV_JBloom_2019b/PDB_structures/110D12_6mph.pdb"
    },
    "OPV12_6ot1": {
        "rcsb": "6ot1",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/HIV_JBloom_2019b/PDB_structures/OPV12_6ot1.pdb"
    },
    "SIgN-3C_7bua": {
        "rcsb": "7bua",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/zika_JBloom_2023/PDB_structures/SIgN-3C_7bua.pdb"
    },
    "MZ4_6niu": {
        "rcsb": "6niu",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/zika_JBloom_2023/PDB_structures/MZ4_6niu.pdb"
    },
    "ZV-67_5kvg": {
        "rcsb": "5kvg",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/zika_JBloom_2023/PDB_structures/ZV-67_5kvg.pdb"
    },
    "EDE1-C8_5lbs": {
        "rcsb": "5lbs",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/zika_JBloom_2023/PDB_structures/EDE1-C8_5lbs.pdb"
    },
    "EDE1-C10_5h37": {
        "rcsb": "5h37",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/zika_JBloom_2023/PDB_structures/EDE1-C10_5h37.pdb"
    },
    "118_6udj": {
        "rcsb": "6udj",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/HIV_JBloom_2020/PDB_structures/118_6udj.pdb"
    },
    "FP20-01_6cde": {
        "rcsb": "6cde",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/HIV_JBloom_2018/PDB_structures/FP20-01_6cde.pdb"
    },
    "FP16-02_6cdi": {
        "rcsb": "6cdi",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/HIV_JBloom_2018/PDB_structures/FP16-02_6cdi.pdb"
    },
    "VRC34-01_5i8h": {
        "rcsb": "5i8h",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/HIV_JBloom_2018/PDB_structures/VRC34-01_5i8h.pdb"
    },
    "FI6V3-H1_3ztn": {
        "rcsb": "3ztn",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/Influenza_JBloom_2020/PDB_structures/FI6V3-H1_3ztn.pdb"
    },
    "CR9114_4fqy": {
        "rcsb": "4fqy",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/Influenza_JBloom_2020/PDB_structures/CR9114_4fqy.pdb"
    },
    "FI6V3-H3_3ztj": {
        "rcsb": "3ztj",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/Influenza_JBloom_2020/PDB_structures/FI6V3-H3_3ztj.pdb"
    },
    "CAB-A17_8c2r": {
        "rcsb": "8c2r",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/COVID-19_JBloom_2024/PDB_structures/CAB-A17_8c2r.pdb"
    },
    "Cetuximab_1yy9": {
        "rcsb": "1yy9",
        "dms_pdb_file": "/home/dongli/DMS_antibody_dataset/EGFR_AbbVie_2013/PDB_structures/Cetuximab_1yy9.pdb"
    }
}

##### 2) Crawl full PDB files from RCSB

In [None]:
def crawl_pdb(rcsb_ids, rcsb_dir, timeout=30):
    """Download PDB-format entries from RCSB into ``rcsb_dir``.

    Args:
        rcsb_ids: Iterable of RCSB entry IDs (e.g. "6xdg"). Repeated IDs are
            downloaded only once, in first-seen order.
        rcsb_dir: Existing directory; each entry is saved there as "<id>.pdb".
        timeout: Seconds to wait for the server before giving up on an entry.

    Returns:
        None. An entry that cannot be fetched (non-200 status or a request
        error) is reported on stdout as "<id>: <status or error>" and skipped,
        so one bad entry does not abort the whole batch.
    """
    out_dir = Path(rcsb_dir)
    # dict.fromkeys drops repeated IDs while preserving their original order.
    for rcsb_id in dict.fromkeys(rcsb_ids):
        url = f"https://files.rcsb.org/view/{rcsb_id}.pdb"
        try:
            # Without a timeout, requests waits indefinitely on a stalled connection.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            print(f"{rcsb_id}: {err}")
            continue
        if response.status_code == 200:
            content = response.content.decode("utf-8")
            # Explicit encoding keeps the written file independent of the platform default.
            with open(out_dir / f"{rcsb_id}.pdb", "w", encoding="utf-8") as fp:
                fp.write(content)
        else:
            print(f"{rcsb_id}: {response.status_code}")


# One RCSB ID per metadata entry (repeats included), fetched into rcsb_dir.
rcsb_ids = [entry["rcsb"] for entry in metadata.values()]
crawl_pdb(rcsb_ids, rcsb_dir)

##### 3) Write SEQRES lines

In [None]:
def get_seqres_lines(chain, seq):
    """Format a one-letter sequence as PDB SEQRES records for one chain.

    Args:
        chain: Single-character chain identifier.
        seq: Sequence in one-letter codes (a string or iterable of letters).

    Returns:
        The SEQRES records joined by newlines (no trailing newline), 13
        residues per record. An empty sequence yields an empty string.

    Each record follows the PDB fixed-column layout: "SEQRES" in columns 1-6,
    the record serial right-justified in 8-10, the chain ID in column 12, the
    chain's residue count right-justified in 14-17, and three-letter residue
    names starting at column 20.
    """
    per_record = 13  # residues per SEQRES record in the PDB format
    residues = [seq3(aa).upper() for aa in seq]
    n_res = len(residues)
    records = []
    # Stepping through the start offsets handles every length uniformly:
    # chains shorter than one record, and lengths that are exact multiples of
    # 13, no longer lose residues or gain an empty trailing record.
    for serial, start in enumerate(range(0, n_res, per_record), start=1):
        names = " ".join(residues[start:start + per_record])
        records.append(f"SEQRES {serial:>3} {chain} {n_res:>4}  {names}")
    return "\n".join(records)


def write_seqres(sample_file, rcsb_file, outdir):
    """Write a copy of ``sample_file`` with SEQRES records prepended.

    Args:
        sample_file: Path of the ATOM-only DMS structure (PDB format).
        rcsb_file: Path of the full RCSB PDB file that contains SEQRES records.
        outdir: Directory for the result; created if it does not exist. The
            output keeps the file name of ``sample_file``.

    Only chains whose ID occurs in ``sample_file`` get SEQRES records. Chains
    are matched purely by ID (the last character of each SeqIO record id).
    After chain renaming, the DMS chains and RCSB chains are not the same, so
    sample chains without a match are reported on stdout rather than silently
    ignored.
    """
    sample_path = Path(sample_file)
    sample_chains = {rec.id[-1] for rec in SeqIO.parse(sample_file, "pdb-atom")}
    chain_to_seq = {
        rec.id[-1]: str(rec.seq)
        for rec in SeqIO.parse(rcsb_file, "pdb-seqres")
        if rec.id[-1] in sample_chains
    }
    unmatched = sorted(sample_chains - chain_to_seq.keys())
    if unmatched:
        print(f"{sample_path.name}: no SEQRES found for chain(s) {', '.join(unmatched)}")

    blocks = [get_seqres_lines(chain, seq) for chain, seq in chain_to_seq.items()]
    header = "\n".join(block for block in blocks if block)

    # Merge structure and seqres
    with open(sample_file, "r") as fp:
        pdb_content = fp.read()
    out_dir = Path(outdir)
    # Create the destination so a fresh run does not fail with FileNotFoundError.
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / sample_path.name, "w") as fp:
        # Only emit the header (and its newline) when there is one, so the
        # output never starts with a blank line.
        if header:
            fp.write(header + "\n")
        fp.write(pdb_content)


# Write one SEQRES-augmented copy of every DMS structure into "Repaired_PDBs".
# TODO: this absolute path is machine-specific; move it to the configuration cell.
repaired_dir = Path("/home/dongli/DMS_antibody_dataset/Repaired_PDBs")
# Create the destination up front so the first write cannot fail with
# FileNotFoundError on a fresh checkout.
repaired_dir.mkdir(parents=True, exist_ok=True)

for data in metadata.values():
    write_seqres(
        data["dms_pdb_file"],
        rcsb_dir / f"{data['rcsb']}.pdb",
        repaired_dir,
    )

#### Mutate residues in DMS PDB files

In [None]:
# TODO(review): unfinished placeholder cell. `PDBFixer(...)` is called with the
# Ellipsis literal instead of a real input, and the loop body is `pass`, so no
# residue is mutated yet.
fixer = PDBFixer(...)
topo = fixer.topology
# Walks the chains of the loaded topology; currently a no-op.
for chain in topo.chains():
    pass

#### Add missing side-chain atoms

In [None]:
# TODO(review): unfinished placeholder cell. `PDBFixer(...)` is called with the
# Ellipsis literal instead of a real input.
fixer = PDBFixer(...)
# NOTE(review): PDBFixer's documented workflow appears to call
# findMissingResidues() before this step and addMissingAtoms() after it;
# neither is present here - confirm the intended sequence.
fixer.findMissingAtoms()
