In [2]:
# Explicit %pip magic (targets the running kernel's environment) with a pinned
# version, so a fresh kernel installs the same release this notebook was run with.
%pip install -q biopython==1.85

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.3/3.3 MB[0m [31m126.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m67.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [4]:
import os
import requests
from Bio.PDB import PDBParser

def download_pdb(pdb_id, save_dir=".", timeout=30):
    """
    Downloads a PDB file from RCSB for the given PDB ID and saves it locally.
    pdb_id: The PDB ID (e.g., '1CRN'). Must consist only of ASCII letters,
        digits and underscores, because it is interpolated into both the
        download URL and the local file name.
    save_dir: Directory to save the downloaded PDB file. Created if it does
        not exist yet.
    timeout: Seconds to wait for the server, forwarded to requests.get.
    return: The local file path to the downloaded PDB file.
    raises: ValueError if pdb_id contains characters outside the accepted set.
        RuntimeError if the server answers with a status code other than 200.
        Exceptions raised by requests.get itself are not caught here.
    """
    # The ID becomes part of a file path, so reject anything that could carry
    # path separators or other unexpected characters before touching the
    # network or the file system.
    if (
        not isinstance(pdb_id, str)
        or not pdb_id.isascii()
        or not pdb_id.replace("_", "").isalnum()
    ):
        raise ValueError(f"Invalid PDB ID: {pdb_id!r}")

    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    local_filename = os.path.join(save_dir, f"{pdb_id}.pdb")

    print(f"Downloading PDB {pdb_id} from {url}")
    # Without an explicit timeout a stalled connection would block forever.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(f"Error downloading {pdb_id}, status code {response.status_code}")

    # Create the target directory only once there is something to write.
    # An empty save_dir means "current directory" to os.path.join, and
    # os.makedirs("") would raise, so skip it in that case.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    with open(local_filename, "wb") as f:
        f.write(response.content)
    print(f"File saved to {local_filename}")

    return local_filename

def parse_pdb(file_path):
    """
    Reads a PDB file with Bio.PDB and flattens its contents into plain lists.
    file_path: The path to the PDB file.
    return: A dictionary with three keys:
        "chains"   - the ID of every chain encountered
        "residues" - (chain_id, residue_name, residue_id) tuples
        "atoms"    - dicts with keys "chain", "residue_name", "residue_id",
                     "atom_name" and "coordinates" (a list, not a NumPy array)

    Every model in the structure is walked, so a chain that appears in
    several models contributes one entry per model. residue_id is only the
    second element of Bio.PDB's residue.id tuple
    (hetero_flag, sequence_identifier, insertion_code); the hetero flag and
    insertion code are not kept.
    """
    structure = PDBParser(QUIET=True).get_structure("structure", file_path)

    chain_ids = []
    residue_records = []
    atom_records = []

    for model in structure:
        for chain in model:
            chain_ids.append(chain.id)

            for residue in chain:
                resname = residue.get_resname()
                seq_number = residue.id[1]  # e.g. 50 out of (' ', 50, ' ')
                residue_records.append((chain.id, resname, seq_number))

                # One record per atom; .tolist() turns the coordinate array
                # into a plain Python list.
                atom_records.extend(
                    {
                        "chain": chain.id,
                        "residue_name": resname,
                        "residue_id": seq_number,
                        "atom_name": atom.get_name(),
                        "coordinates": atom.get_coord().tolist(),
                    }
                    for atom in residue
                )

    return {
        "chains": chain_ids,
        "residues": residue_records,
        "atoms": atom_records,
    }

# Demo driver: fetch one structure from RCSB, parse it, and print a short
# summary of what was extracted.
if __name__ == "__main__":
    # Example usage: download and parse the PDB file for 1CRN (Crambin).
    # The file is written to the current working directory.
    pdb_id = "1CRN"
    pdb_file_path = download_pdb(pdb_id, save_dir=".")

    # Parse the downloaded file into the {"chains", "residues", "atoms"} dictionary.
    extracted_info = parse_pdb(pdb_file_path)

    # Print basic extracted information.
    # parse_pdb appends a chain ID for every chain of every model, so the list
    # can contain repeats; set() collapses them to the distinct IDs.
    print("\nExtracted Chains:")
    print(set(extracted_info["chains"]))

    # Length of the flat residue list returned by parse_pdb.
    print("\nNumber of Residues:")
    print(len(extracted_info["residues"]))

    # Show only a small slice rather than dumping every atom record.
    print("\nFirst 5 Atoms Extracted:")
    for atom_info in extracted_info["atoms"][:5]:
        print(atom_info)


Downloading PDB 1CRN from https://files.rcsb.org/download/1CRN.pdb
File saved to ./1CRN.pdb

Extracted Chains:
{'A'}

Number of Residues:
46

First 5 Atoms Extracted:
{'chain': 'A', 'residue_name': 'THR', 'residue_id': 1, 'atom_name': 'N', 'coordinates': [17.047000885009766, 14.098999977111816, 3.625]}
{'chain': 'A', 'residue_name': 'THR', 'residue_id': 1, 'atom_name': 'CA', 'coordinates': [16.966999053955078, 12.784000396728516, 4.3379998207092285]}
{'chain': 'A', 'residue_name': 'THR', 'residue_id': 1, 'atom_name': 'C', 'coordinates': [15.6850004196167, 12.755000114440918, 5.132999897003174]}
{'chain': 'A', 'residue_name': 'THR', 'residue_id': 1, 'atom_name': 'O', 'coordinates': [15.267999649047852, 13.824999809265137, 5.593999862670898]}
{'chain': 'A', 'residue_name': 'THR', 'residue_id': 1, 'atom_name': 'CB', 'coordinates': [18.170000076293945, 12.70300006866455, 5.336999893188477]}
