# pdbparsing.ipynb
Uses BioPyton to find clothia formatted pdb files in multiple directories and and identify the amino acid chains, their intra- and inter-molecular relationships, and display the sequence data. 
The routines in the cells form the basis for extracting data to format for H and T pdb files for RFantibody and sequence analyses.  

In [14]:
import Bio
print(Bio.__version__)

1.85


## Find and count the number of Clothia pdb files

In [15]:
from pathlib import Path

base_dir = Path(".")  # Replace with your root folder

pdb_files = []

for file in base_dir.rglob("*chothia.pdb"):
    # Check if the grandparent directory name is 4 chars (pdbId)
    # file.parent = 4-char dir, file.parent.parent = one of the 10 dirs
    if len(file.parent.name) == 4 and file.parent.parent.parent == base_dir:
        pdb_files.append(file)

print(f"Found {len(pdb_files)} *chothia.pdb files")


Found 23 *chothia.pdb files


## Number of aa chains/file
Count the number of chains in each file, list wether the file as a single or multiple structures, and note the chains' IDs

In [24]:
from collections import Counter
from Bio.PDB import PDBParser
import pandas as pd

# Create parser
parser = PDBParser(QUIET=True)
chain_counts = []

for pdb_file in sorted(pdb_files):
    structure = parser.get_structure(pdb_file.stem, pdb_file)
    chain_ids = set()
    for model in structure:
        for chain in model:
            chain_ids.add(chain.id)
        break  # first model only
    num_chains = len(chain_ids)
    classification = "single" if num_chains == 2 else "multi" if num_chains % 2 == 0 else "other"
    chain_counts.append({
        "PDB": pdb_file.name,
        "NumChains": num_chains,
        "Classification": classification,
        "Chains": ",".join(sorted(chain_ids))
    })

df_chains = pd.DataFrame(chain_counts)
df_chains_sorted = df_chains.sort_values(by="NumChains")
df_chains_sorted

Unnamed: 0,PDB,NumChains,Classification,Chains
0,6lr7_chothia.pdb,2,single,"A,B"
19,8g0i_chothia.pdb,2,single,"A,D"
15,7sak_chothia.pdb,2,single,"A,B"
13,7sai_chothia.pdb,2,single,"A,C"
12,7sah_chothia.pdb,2,single,"A,B"
10,6ir1_chothia.pdb,2,single,"A,B"
9,7e53_chothia.pdb,2,single,"A,B"
11,6ir2_chothia.pdb,2,single,"A,B"
7,3g9a_chothia.pdb,2,single,"A,B"
4,8rle_chothia.pdb,3,other,"B,C,D"


## Chains and structures
For each chain in a file, get the filename, the files structures (listed as {'ID1':'ID2', 'ID3':'ID4', ... }), the chain's ID, chain's role, and its aa sequence (length, and sequence)

In [26]:
import re
from Bio.PDB import PDBParser, Polypeptide
from Bio.SeqUtils import seq1
from pathlib import Path
import pandas as pd

parser = PDBParser(QUIET=True)
# pdb_dir = Path("Rcomb prot purification/7saj")  # adjust path


def parse_remark_5_roles(pdb_path):
    struct = {}
    hchains = set()
    agchains = set()
    with open(pdb_path) as f:
        for line in f:
            if line.startswith("REMARK   5 SINGLE HCHAIN="):
                match = re.search(r"HCHAIN=(\w)\s+AGCHAIN=(\w)", line)
                if match:
                    h, ag = match.group(1), match.group(2)
                    struct[h] = ag
                    hchains.add(h)
                    agchains.add(ag)
    return struct, hchains, agchains

def extract_chain_sequences_selective(pdb_path, agchains_wanted):
    structure = parser.get_structure(pdb_path.stem, pdb_path)
    chain_seqs = {}
    for model in structure:
        for chain in model:
            residues = []
            for res in chain:
                hetfield, resseq, icode = res.id
                is_hetatm = hetfield != " "
                is_ag = chain.id in agchains_wanted

                if is_hetatm and not is_ag:
                    continue  # skip HETATM for HChains

                try:
                    aa = seq1(res.get_resname(), custom_map={"MSE": "M", "SEP": "S", "TPO": "T", "PTR": "Y"})
                    residues.append(aa)
                except KeyError:
                    continue
            if residues:
                chain_seqs[chain.id] = "".join(residues)
        break  # only first model
    return chain_seqs

records = []

for pdb_file in sorted(pdb_files):
    # pdb_file in sorted(pdb_dir.glob("*chothia.pdb")):
    struct, hchains, agchains = parse_remark_5_roles(pdb_file)
    chain_seqs = extract_chain_sequences_selective(pdb_file, agchains)

    for cid, seq in chain_seqs.items():
        role = "HChain" if cid in hchains else "AGChain" if cid in agchains else "Other"
        records.append({
            "PDB": pdb_file.name,
            "Structure": struct,
            "Chain": cid,
            "Role": role,
            "Length": len(seq),
            "Sequence": seq
        })

df = pd.DataFrame(records)
df


Unnamed: 0,PDB,Structure,Chain,Role,Length,Sequence
0,6lr7_chothia.pdb,{'B': 'A'},A,AGChain,364,GEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICT...
1,6lr7_chothia.pdb,{'B': 'A'},B,HChain,125,VQLVESGGRLVQAGDSLRLSCAASGRTFSTSAMAWFRQAPGREREF...
2,8rl9_chothia.pdb,"{'D': 'B', 'K': 'A'}",B,AGChain,225,EELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKFICTT...
3,8rl9_chothia.pdb,"{'D': 'B', 'K': 'A'}",A,AGChain,435,PERRVRSTLKKVFGFDSFKTPLQESATMAVVKGNKDVFVCMPTGAG...
4,8rl9_chothia.pdb,"{'D': 'B', 'K': 'A'}",D,HChain,114,QVQLVENGGACVKPGGSLRLSCAASGFPVNRYSMRWYRQAPGKERE...
...,...,...,...,...,...,...
86,8sfz_chothia.pdb,"{'C': 'D', 'A': 'F', 'B': 'F'}",E,Other,226,KGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFIC...
87,8sfz_chothia.pdb,"{'C': 'D', 'A': 'F', 'B': 'F'}",F,AGChain,282,KGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFIC...
88,8sfz_chothia.pdb,"{'C': 'D', 'A': 'F', 'B': 'F'}",C,HChain,129,DVQLVESGGGLVQAGGSLRLSCTVSGRTFSNYAMGWFRQAPGKERE...
89,8sfz_chothia.pdb,"{'C': 'D', 'A': 'F', 'B': 'F'}",A,HChain,123,DVQLVESGGGLVQAGGSLRLSCTVSGRTFSNYAMGWFRQAPGKERE...


## Chain pairings
Same as above, but also note chain parings

In [27]:
import re
from Bio.PDB import PDBParser, Polypeptide
from Bio.SeqUtils import seq1
from pathlib import Path
import pandas as pd

parser = PDBParser(QUIET=True)
# pdb_dir = Path(".")  # adjust path
# pdb_files = sorted(pdb_dir.glob("*chothia.pdb"))


def parse_remark_5_roles(pdb_path):
    struct = {}
    hchains = set()
    agchains = set()
    with open(pdb_path) as f:
        for line in f:
            if line.startswith("REMARK   5 SINGLE HCHAIN="):
                match = re.search(r"HCHAIN=(\w)\s+AGCHAIN=(\w)", line)
                if match:
                    h, ag = match.group(1), match.group(2)
                    struct[h] = ag
                    hchains.add(h)
                    agchains.add(ag)
    return struct, hchains, agchains


def extract_chain_sequences_selective(pdb_path, agchains_wanted):
    structure = parser.get_structure(pdb_path.stem, pdb_path)
    chain_seqs = {}
    for model in structure:
        for chain in model:
            residues = []
            for res in chain:
                hetfield, resseq, icode = res.id
                is_hetatm = hetfield != " "
                is_ag = chain.id in agchains_wanted

                if is_hetatm and not is_ag:
                    continue  # skip HETATM for HChains

                try:
                    aa = seq1(res.get_resname(), custom_map={"MSE": "M", "SEP": "S", "TPO": "T", "PTR": "Y"})
                    residues.append(aa)
                except KeyError:
                    continue
            if residues:
                chain_seqs[chain.id] = "".join(residues)
        break  # only first model
    return chain_seqs


records = []

for pdb_file in pdb_files:
    struct, hchains, agchains = parse_remark_5_roles(pdb_file)
    chain_seqs = extract_chain_sequences_selective(pdb_file, agchains)

    for cid, seq in chain_seqs.items():
        role = "HChain" if cid in hchains else "AGChain" if cid in agchains else "Other"
        # Determine which chain this one pairs with, if any
        if role == "HChain":
            pairs_with = struct.get(cid, None)
        elif role == "AGChain":
            # reverse lookup for AGChain
            pairs_with = [h for h, ag in struct.items() if ag == cid]
            pairs_with = ",".join(pairs_with) if pairs_with else None
        else:
            pairs_with = None

        records.append({
            "PDB": pdb_file.name,
            "Structure": struct,
            "Chain": cid,
            "PairsWith": pairs_with,
            "Role": role,
            "Length": len(seq),
            "Sequence": seq
        })

df = pd.DataFrame(records)
df

Unnamed: 0,PDB,Structure,Chain,PairsWith,Role,Length,Sequence
0,7e53_chothia.pdb,{'B': 'A'},A,B,AGChain,284,SKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKFI...
1,7e53_chothia.pdb,{'B': 'A'},B,A,HChain,126,QVQLQESGGGSVQAGGSLRLSCAASGPTYSSYFMAWFRQAPGMERE...
2,6itc_chothia.pdb,"{'V': 'Y', 'C': 'G'}",A,,Other,765,RTLNRYEKIANDIDAIRGDYENLSDDALKHKTIEFKERLEKGATTD...
3,6itc_chothia.pdb,"{'V': 'Y', 'C': 'G'}",Y,V,AGChain,422,FRTISNFMRVSDIRNKIIFTLLMLIVFRIGTFIPVPSVNTDVLKLQ...
4,6itc_chothia.pdb,"{'V': 'Y', 'C': 'G'}",E,,Other,58,QRVTNFFKEVVRELKKVSWPNRKELVNYTAVVLATVAFFTVFFAVI...
...,...,...,...,...,...,...,...
86,3ogo_chothia.pdb,"{'E': 'A', 'F': 'C', 'G': 'B', 'H': 'B'}",F,C,HChain,115,QVQLVESGGALVQPGGSLRLSCAASGFPVNRYSMRWYRQAPGKERE...
87,3ogo_chothia.pdb,"{'E': 'A', 'F': 'C', 'G': 'B', 'H': 'B'}",G,B,HChain,116,QVQLVESGGALVQPGGSLRLSCAASGFPVNRYSMRWYRQAPGKERE...
88,3ogo_chothia.pdb,"{'E': 'A', 'F': 'C', 'G': 'B', 'H': 'B'}",H,B,HChain,116,QVQLVESGGALVQPGGSLRLSCAASGFPVNRYSMRWYRQAPGKERE...
89,6lr7_chothia.pdb,{'B': 'A'},A,B,AGChain,364,GEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICT...


## Data Display
Display the above data in a scrollable table

In [29]:
from IPython.display import display, HTML

def display_scrollable_df(df):
    html = df.to_html(index=False)
    display(HTML(f"""
        <div style="max-width: 100%; overflow-x: auto; white-space: nowrap; border: 1px solid #ddd;">
            <style>
                table {{border-collapse: collapse;}}
                td, th {{
                    text-align: left !important;
                    padding: 4px 8px;
                }}
            </style>
            {html}
        </div>
    """))

# Usage
display_scrollable_df(df)


PDB,Structure,Chain,PairsWith,Role,Length,Sequence
7e53_chothia.pdb,{'B': 'A'},A,B,AGChain,284,SKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKFICTTGKLPVPWPTLVTTLXVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGTYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNFNSHNVYITADKQKNGIKANFKIRHNVEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSVLSKDPNEKRDHMVLLEFVTAAGITXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
7e53_chothia.pdb,{'B': 'A'},B,A,HChain,126,QVQLQESGGGSVQAGGSLRLSCAASGPTYSSYFMAWFRQAPGMEREGVAASSYDGSTTLYADSVKGRFTISQGNAKNTKFLLLNNLEPEDTAIYYCALRRRGWSNTSGWKQPGWYDYWGQGTQVTV
6itc_chothia.pdb,"{'V': 'Y', 'C': 'G'}",A,,Other,765,RTLNRYEKIANDIDAIRGDYENLSDDALKHKTIEFKERLEKGATTDDLLVEAFAVVREASRRVTGMFPFKVQLMGGVALHDGNIAEMKTGEGKTLTSTLPVYLNALTGKGVHVVTVNEYLASRDAEQMGKIFEFLGLTVGLNLNSMSKDEKREAYAADITYSTNNELGFDYLRDNMVLYKEQMVQRPLHFAVIDEVDSILIDEARTPLIISGQAAKSTKLYVQANAFVRTLKAEKDYTYDIKTKAVQLTEEGMTKAEKAFGIDNLFDVKHVALNHHINQALKAHVAMQKDVDYVVEDGQVVIVDSFTGRLMKGRRYSEGLHQAIEAKEGLEIQNESMTLATITFQNYFRMYEKLAGMTGTAKTEEEEFRNIYNMQVVTIPTNRPVVRDDRPDLIYRTMEGKFKAVAEDVAQRYMTGQPVLVGTVAVETSELISKLLKNKGIPHQVLNAKNHEREAQIIEEAGQKGAVTIATNMAGRGTDIKLGEGVKELGGLAVVGTERHESRRIDNQLRGRSGRQGDPGITQFYLSMEDELMRRFGAERTMAMLDRFGMDDSTPIQSKMVSRAVESSQKRVEGNNFDSRKQLLQYDDVLRQQREVIYKQRFEVIDSENLREIVENMIKSSLERAIAAYTPREELPEEWKLDGLVDLINTTYLDEGALEKSDIFGKEPDEMLELIMDRIITKYNEKEEQFGKEQMREFEKVIVLRAVDSKWMDHIDAMDQLRQGIHLRAYAQTNPLREYQMEGFAMFEHMIESIEDEVAKFVMKA
6itc_chothia.pdb,"{'V': 'Y', 'C': 'G'}",Y,V,AGChain,422,FRTISNFMRVSDIRNKIIFTLLMLIVFRIGTFIPVPSVNTDVLKLQDQLNAFGVLNIFCGGALQNFSIFAMGVMPYITASIIVQLLQMDVVPKFAEWSKQGEMGRRKLAQFTRYFTIVLGFIQALGMSYGFNNLAGGMLIQNPGIGTYLLIAVVLTAGTAFLMWLGEQITAKGVGNGISIIIFAGIVSGIPTILNQIYAQTLNIVRLLLVALAVVAVIVGVIYIQQAFRKIPIQYAKRLEGRNPVGGHSTHLPLKVNPAGVIPVIFAVSFLIAPPTIASFFGTNDVTLWIRRTFDYTHPVGMTIYVVLIIAFTYFYAFVQVNPEQMADNLKKQGGYIPGIRPGKNTQEYVTRILYRLTLVGSLFLAFIAVLPVFFVNFANLPPSAQIGGTSLLIVVGVALETMKQLESQLVKRHYRGFIKXX
6itc_chothia.pdb,"{'V': 'Y', 'C': 'G'}",E,,Other,58,QRVTNFFKEVVRELKKVSWPNRKELVNYTAVVLATVAFFTVFFAVIDLGISQLIRLVF
6itc_chothia.pdb,"{'V': 'Y', 'C': 'G'}",B,,Other,43,AKKTAIAIAVALAGFATVASYAQYEDGCSGELERDSPHSYHSG
6itc_chothia.pdb,"{'V': 'Y', 'C': 'G'}",G,C,AGChain,225,KGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFXVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGI
6itc_chothia.pdb,"{'V': 'Y', 'C': 'G'}",V,Y,HChain,116,QVQLVETGGGLVQPGGSLRLSCGASGSIFNMYAMGWYRQAPGKRREVVARIATDDSTMYPDSVKGRFTISRDNAKNTVYLQMNSLKPEDTAVYYCYYQRTVMSQPYWGQGTQVTVS
6itc_chothia.pdb,"{'V': 'Y', 'C': 'G'}",C,G,HChain,112,VALVESGGALVQPGGSLRLSCAASGFPVNRYSMRWYRQAPGKEREWVAGMSAGDRSSYEDSVKGRFTISRDDARNTVYLQMNSLKPEDTAVYYCNVNVGFEYWGQGTQVTVS
7cz0_chothia.pdb,"{'E': 'A', 'F': 'B', 'G': 'C', 'H': 'D'}",A,E,AGChain,220,SVIKPEMKIKLRMEGAVNGHKFVIEGEGIGKPYEGTQTLDLTVEEGAPLPFSYDILTPAFXNRAFTKYPEDIPDYFKQAFPEGYSWERSMTYEDQGICIATSDITMEGDCFFYEIRFDGTNFPPNGPVMQKKTLKWEPSTEKMYVEDGVLKGDVEMALLLEGGGHYRCDFKTTYKAKKDVRLPDAHEVDHRIEILSHDKDYNKVRLYEHAEARYSXXXXX
