In [1]:
import os
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt

In [8]:
!pip install --upgrade rcsb-api


Collecting rcsb-api
  Downloading rcsb_api-1.5.0-py3-none-any.whl.metadata (11 kB)
Downloading rcsb_api-1.5.0-py3-none-any.whl (70 kB)
Installing collected packages: rcsb-api
  Attempting uninstall: rcsb-api
    Found existing installation: rcsb-api 1.4.2
    Uninstalling rcsb-api-1.4.2:
      Successfully uninstalled rcsb-api-1.4.2
Successfully installed rcsb-api-1.5.0


## PDB: retrieve all unnatural biopolymers

In [7]:
#!/usr/bin/env python3
"""
Find PDB polymer entities that contain modified (non-standard) monomers
using the RCSB Search API + Data API.

Definition of "unnatural biopolymer" here:
- polymer entities with rcsb_polymer_entity_feature_summary.type == "modified chemical component"
- and rcsb_polymer_entity_feature_summary.count > 0

Output: CSV with one row per polymer_entity ID.
"""

import csv
import time
from typing import Dict, Any, List, Optional

import requests

SEARCH_URL = "https://search.rcsb.org/rcsbsearch/v2/query"
DATA_URL_TEMPLATE = "https://data.rcsb.org/rest/v1/core/polymer_entity/{entity_id}"

# --- Search parameters ---
PAGE_SIZE = 1000      # rows per page (max 10000, but 1000 is safer)
SLEEP_BETWEEN_PAGES = 0.2  # polite pause between requests (seconds)


def build_search_payload(start: int = 0, rows: int = PAGE_SIZE) -> Dict[str, Any]:
    """
    Build the JSON payload for the RCSB Search API.

    We search for polymer entities whose feature summary includes
    type == "modified chemical component" AND count > 0.
    """
    return {
        "query": {
            "type": "group",
            "logical_operator": "and",
            "nodes": [
                {
                    "type": "terminal",
                    "service": "attribute",
                    "parameters": {
                        "attribute": "rcsb_polymer_entity_feature_summary.type",
                        "operator": "exact_match",
                        "value": "modified chemical component",
                    },
                },
                {
                    "type": "terminal",
                    "service": "attribute",
                    "parameters": {
                        "attribute": "rcsb_polymer_entity_feature_summary.count",
                        "operator": "greater",
                        "value": 0,
                    },
                },
            ],
        },
        "return_type": "polymer_entity",
        "request_options": {
            # pagination options
            "paginate": {
                "start": start,
                "rows": rows,
            },
            # We only really need the identifiers; no facets.
            "results_content_type": ["identifier"],
        },
    }


def fetch_all_polymer_entities() -> List[str]:
    """
    Paginate through the Search API and collect all polymer_entity IDs
    that contain modified chemical components.
    """
    polymer_entities: List[str] = []
    start = 0

    while True:
        payload = build_search_payload(start=start, rows=PAGE_SIZE)
        r = requests.post(SEARCH_URL, json=payload)
        r.raise_for_status()
        data = r.json()

        results = data.get("result_set", [])
        if not results:
            break

        # Each result has "identifier" like "1ABC_1"
        ids = [hit["identifier"] for hit in results]
        polymer_entities.extend(ids)

        print(f"Fetched {len(ids)} entities (start={start}), total so far={len(polymer_entities)}")

        # If fewer than PAGE_SIZE returned, we've reached the end
        if len(results) < PAGE_SIZE:
            break

        start += PAGE_SIZE
        time.sleep(SLEEP_BETWEEN_PAGES)

    return polymer_entities


def fetch_polymer_entity_core(entity_id: str) -> Optional[Dict[str, Any]]:
    """
    Fetch core polymer_entity data from the Data API.

    entity_id example: '1ABC_1'
    """
    url = DATA_URL_TEMPLATE.format(entity_id=entity_id)
    r = requests.get(url)
    if r.status_code != 200:
        print(f"[WARN] Failed to fetch data for {entity_id}: HTTP {r.status_code}")
        return None
    return r.json()


def extract_unnatural_info(entity_id: str, core: Dict[str, Any]) -> Dict[str, Any]:
    """
    From the Data API JSON, extract:
    - entry_id
    - description
    - polymer_type
    - total_length
    - number of modified monomers (from feature_summary)
    - coverage fraction of modified monomers
    - a list of chem_comp IDs that are modified (if available)
    """
    # Basic identifiers
    entry_id = core.get("rcsb_polymer_entity_container_identifiers", {}).get(
        "entry_id"
    )

    description = core.get("rcsb_polymer_entity", {}).get("pdbx_description")

    # polymer type, e.g. 'polypeptide(L)', 'polydeoxyribonucleotide', etc.
    polymer_type = core.get("entity_poly", {}).get("type")

    # length: number of monomers in the polymer
    total_length = core.get("rcsb_polymer_entity", {}).get("rcsb_sample_sequence_length")

    # Summaries of features per entity
    feature_summaries = core.get("rcsb_polymer_entity_feature_summary", []) or []
    modified_count = 0
    modified_coverage = 0.0

    for fs in feature_summaries:
        if fs.get("type") == "modified chemical component":
            modified_count = fs.get("count", 0) or 0
            modified_coverage = fs.get("coverage", 0.0) or 0.0
            break

    # More detailed feature list (optional, may be absent)
    modified_comp_ids = set()
    features = core.get("rcsb_polymer_entity_feature", []) or []
    for feat in features:
        if feat.get("type") == "modified chemical component":
            # chem_comp_id is usually in "monomer_chem_comp_id"
            comp_id = feat.get("monomer_chem_comp_id")
            if comp_id:
                modified_comp_ids.add(comp_id)

    return {
        "polymer_entity_id": entity_id,
        "entry_id": entry_id,
        "description": description,
        "polymer_type": polymer_type,
        "total_length": total_length,
        "modified_monomer_count": modified_count,
        "modified_monomer_coverage": modified_coverage,
        "modified_chem_comp_ids": ";".join(sorted(modified_comp_ids)) if modified_comp_ids else "",
    }


def main(output_csv: str = "unnatural_biopolymers_from_pdb.csv"):
    print("Searching for polymer entities with modified chemical components...")
    polymer_ids = fetch_all_polymer_entities()
    print(f"\nTotal polymer entities with modified components: {len(polymer_ids)}")

    fieldnames = [
        "polymer_entity_id",
        "entry_id",
        "description",
        "polymer_type",
        "total_length",
        "modified_monomer_count",
        "modified_monomer_coverage",
        "modified_chem_comp_ids",
    ]

    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

        for i, entity_id in enumerate(polymer_ids, start=1):
            core = fetch_polymer_entity_core(entity_id)
            if core is None:
                continue

            row = extract_unnatural_info(entity_id, core)
            writer.writerow(row)

            if i % 100 == 0:
                print(f"Wrote {i} entities to CSV...")

    print(f"\nDone. Wrote results to: {output_csv}")


if __name__ == "__main__":
    main()


Searching for polymer entities with modified chemical components...


HTTPError: 400 Client Error: Bad Request for url: https://search.rcsb.org/rcsbsearch/v2/query