# Cofactors via PDB API

This repository contains a Jupyter Notebook to query and analyze cofactor information from the Protein Data Bank (PDB) API.

**What this notebook does**  
- Calls the PDB REST/JSON API to retrieve structures and annotations.  
- Parses ligand/cofactor information.  
- Aggregates results into tidy tables for downstream analysis.  
- Saves clean CSV outputs.


> **Note on provenance**  
> This notebook is **based on PDBe API notebooks** and **reuses some helper functions** (adapted here) for PDBe endpoints and data normalization.  
> See PDBe resources: https://www.ebi.ac.uk/pdbe/

In [33]:
import os
import time
import csv
import re
import requests
from pathlib import Path
from typing import Any, Dict, Optional, List

# Find cofactors in PDB


In [35]:
# Filesystem layout: all paths are anchored at the resolved current directory.
PROJECT_ROOT = Path(".").resolve()
DATA_DIR = PROJECT_ROOT.joinpath("data")
OUTPUT_DIR = PROJECT_ROOT.joinpath("outputs")
# Make sure both working directories exist (no error if they already do).
OUTPUT_DIR.mkdir(exist_ok=True)
DATA_DIR.mkdir(exist_ok=True)

# HTTP settings shared by the request helpers.
PDB_API_BASE = "https://data.rcsb.org/rest/v1"  # change here to target another endpoint
RETRY_ATTEMPTS = 3
REQUEST_TIMEOUT = 30  # seconds

# Fixed seed so that any randomized step can be reproduced.
SEED = 42

In [36]:

# Import guard: if the third-party `requests` package is missing, re-raise the
# ImportError with an installation hint instead of the default message.
try:
    import requests
except ImportError:
    raise ImportError("Please install 'requests' (pip install requests)")


def http_get(url: str, params: Optional[Dict[str, Any]] = None, timeout: int = REQUEST_TIMEOUT) -> requests.Response:
    """GET ``url`` and return the response, retrying failed requests.

    Args:
        url: Address to request.
        params: Optional query-string parameters passed to ``requests.get``.
        timeout: Per-request timeout in seconds.

    Returns:
        The ``requests.Response`` of the first attempt that succeeds, i.e.
        one for which ``raise_for_status()`` does not raise.

    Raises:
        requests.RequestException: The error of the final attempt when every
            attempt fails (connection errors, timeouts, HTTP error statuses).
    """
    # Always try at least once, even if RETRY_ATTEMPTS is misconfigured to 0
    # or a negative number (previously that ended in ``raise None``).
    attempts = max(1, RETRY_ATTEMPTS)
    for attempt in range(1, attempts + 1):
        try:
            resp = requests.get(url, params=params, timeout=timeout)
            resp.raise_for_status()
        except requests.RequestException:
            # Only request-level failures are retried; unrelated errors
            # (e.g. a TypeError from bad arguments) surface immediately.
            if attempt == attempts:
                raise
            # Exponential back-off between attempts, capped at 10 seconds.
            time.sleep(min(2**attempt, 10))
        else:
            return resp


def save_csv(df, path: Path):
    """Write ``df`` to ``path`` as CSV without the index and return ``path``.

    Missing parent directories of ``path`` are created first. ``df`` must
    provide a ``to_csv(path, index=...)`` method.
    """
    target_dir = path.parent
    target_dir.mkdir(exist_ok=True, parents=True)
    df.to_csv(path, index=False)
    return path

# Find PDB entries containing a specific ligand (cofactor)


## Making imports and setting variables 

First, we import some packages that we will use, and set some variables.

Note: The full list of available PDBe API endpoints is documented at https://www.ebi.ac.uk/pdbe/api/doc/


In [40]:
# Root of the PDBe web site; the URLs below are all derived from it.
base_url = "https://www.ebi.ac.uk/pdbe/"

# Root of the PDBe REST API.
api_base = f"{base_url}api/"

# Endpoint that returns summary information about a PDB entry.
summary_url = f"{api_base}pdb/entry/summary/"

# Endpoint that returns binding-site information for a PDB entry.
binding_url = f"{api_base}pdb/entry/binding_sites/"


def get_cofactor_information(cof_api):
    """Download the PDBe cofactor annotation.

    Args:
        cof_api: Unused. The request URL is fixed and does not depend on
            this argument; it is kept so existing callers keep working.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (after printing "No data available"). Callers in
        this file index the result by cofactor class name.

    Raises:
        requests.RequestException: If the request itself fails (connection
            error, timeout).
    """
    URL_base = "https://www.ebi.ac.uk/pdbe/api/pdb/compound/cofactors"
    # Without a timeout an unresponsive server would block the notebook
    # indefinitely, so the shared REQUEST_TIMEOUT setting is applied here.
    response = requests.get(URL_base, timeout=REQUEST_TIMEOUT)
    if response.status_code == 200:
        return response.json()
    print("No data available")
    return None


def retrieve_cofactors_PDB(cof_api):
    """Return the top-level keys of the PDBe cofactor annotation as a list.

    The previous body referenced an undefined name (``cofactorclass``) and
    returned a bound ``keys`` attribute instead of the keys themselves, so it
    could never succeed.
    NOTE(review): returning the keys of the response is the presumed intent;
    confirm against the notebook's usage.

    Args:
        cof_api: Forwarded to ``get_cofactor_information``.

    Returns:
        A list of the response's keys, or an empty list when no data could
        be retrieved.
    """
    response = get_cofactor_information(cof_api)
    if not response:
        return []
    return list(response)


def save_cofactor_information(cofactorclass):
    """Print the ``"cofactors"`` value of every entry of one cofactor class.

    Despite its name, this function only prints; nothing is written to disk.

    Args:
        cofactorclass: Key to look up in the cofactor annotation.

    Returns:
        Always ``None``.
    """
    response = get_cofactor_information(cofactorclass)
    if not response:
        # The download failed or returned nothing; the fetch helper has
        # already reported it, and indexing ``None`` would raise TypeError.
        return None
    if cofactorclass not in response:
        # Unknown class name: report it instead of raising KeyError.
        print("No data available")
        return None
    for ci in response[cofactorclass]:
        print(cofactorclass, ci["cofactors"])
    return None


def save_cofactor_information1(cofactorclass):
    """Return ``[cofactorclass, codes]`` for one cofactor class.

    ``codes`` is the ``"cofactors"`` value of the first entry stored under
    ``cofactorclass`` in the cofactor annotation.

    Args:
        cofactorclass: Key to look up in the cofactor annotation.

    Returns:
        A two-item list ``[cofactorclass, codes]``. ``codes`` is an empty
        list when the download failed, the class is unknown, or the class
        has no entries, so callers can always unpack the result into two
        names (previously those cases raised or returned ``None``).
    """
    response = get_cofactor_information(cofactorclass)
    entries = response.get(cofactorclass) if response else None
    if not entries:
        return [cofactorclass, []]
    # Only the first entry is used, matching the original early return
    # from inside the loop.
    return [cofactorclass, entries[0]["cofactors"]]

In [41]:
# Coenzyme class names to look up, one per line.
cof_list = [
    "Ascorbic acid",
    "Factor F430",
    "MIO",
    "Phosphopantetheine",
    "Nicotinamide-adenine dinucleotide",
    "Dipyrromethane",
    "Molybdopterin",
    "Adenosylcobalamin",
    "Flavin adenine dinucleotide",
    "Tetrahydrofolic acid",
    "Coenzyme A",
    "Coenzyme B",
    "Flavin Mononucleotide",
    "Menaquinone",
    "Coenzyme M",
    "Heme",
    "Biopterin",
    "Pyrroloquinoline Quinone",
    "Biotin",
    "Lipoic acid",
    "Ubiquinone",
    "Glutathione",
    "Orthoquinone residues (LTQ, TTQ, CTQ)",
    "S-adenosylmethionine",
    "Thiamine diphosphate",
    "Pyridoxal 5'-phosphate",
    "Topaquinone",
]

# Print, for each coenzyme class, its name together with the chemical
# component codes returned by the cofactor lookup.
for cof in cof_list:
    print(save_cofactor_information1(cof))

['Ascorbic acid', ['ASC']]
['Factor F430', ['F43', 'M43']]
['MIO', ['MDO']]
['Phosphopantetheine', ['PNS']]
['Nicotinamide-adenine dinucleotide', ['0WD', '1DG', '3AA', '3CD', '6V0', '8ID', 'A3D', 'AP0', 'CND', 'DG1', 'DN4', 'EAD', 'ENA', 'LNC', 'N01', 'NA0', 'NAD', 'NAE', 'NAI', 'NAJ', 'NAP', 'NAQ', 'NAX', 'NBD', 'NBP', 'NDC', 'NDE', 'NDO', 'NDP', 'NHD', 'NPW', 'ODP', 'P1H', 'PAD', 'SAD', 'SAE', 'SND', 'TAD', 'TAP', 'TDT', 'TXD', 'TXE', 'TXP', 'ZID']]
['Dipyrromethane', ['18W', '29P', 'DPM']]
['Molybdopterin', ['2MD', 'MCN', 'MGD', 'MSS', 'MTE', 'MTQ', 'MTV', 'PCD', 'XAX']]
['Adenosylcobalamin', ['B12', 'CNC', 'COB', 'COY']]
['Flavin adenine dinucleotide', ['6FA', 'FA8', 'FAA', 'FAB', 'FAD', 'FAE', 'FAO', 'FAS', 'FCG', 'FDA', 'FED', 'FSH', 'P5F', 'RFL', 'SFD']]
['Tetrahydrofolic acid', ['1YJ', 'C2F', 'FFO', 'FON', 'FOZ', 'THF', 'THG', 'THH']]
['Coenzyme A', ['01A', '01K', '0ET', '1C4', '1CV', '1CZ', '1HA', '1VU', '1XE', '2CP', '2NE', '3CP', '3H9', '3HC', '4CA', '4CO', '8JD', '8Z2', 'AC

## Retrieve PDB codes for each coenzyme class

In [57]:
# One output file per coenzyme class is written into this directory.
OUT_DIR_CLASSES = OUTPUT_DIR / "cofactors_by_class"
OUT_DIR_CLASSES.mkdir(parents=True, exist_ok=True)

# Fall back to a small default set of coenzyme classes when `cof_list` has
# not been defined by an earlier cell. The names must match the class names
# used as keys in the cofactor annotation exactly (including case and
# hyphenation), otherwise the per-class lookup fails.
try:
    cof_list
except NameError:
    cof_list = [
        "Flavin Mononucleotide",
        "Flavin adenine dinucleotide",
        "Nicotinamide-adenine dinucleotide",
        "Coenzyme A",
    ]


def get_PDB_entries_associated_cofactor(cofId):
    """Retrieve PDB entries associated with a given cofactor CCD code.

    The lookup is best-effort: any failure yields an empty list so that a
    batch export can continue. Unlike before, failures are no longer silent;
    a ``RuntimeWarning`` is issued so incomplete results can be noticed.

    Args:
        cofId: CCD code appended to the request URL and used as the key into
            the JSON response.

    Returns:
        The value stored under ``cofId`` in the JSON response, or an empty
        list when the key is missing, the request fails, the status is not
        200, or the body is not valid JSON.
    """
    import warnings  # local import keeps this block self-contained

    URL_base = "https://www.ebi.ac.uk/pdbe/api/pdb/compound/in_pdb"
    try:
        response = requests.get(f"{URL_base}/{cofId}", timeout=10)
    except requests.RequestException as err:
        warnings.warn(
            f"PDB entry lookup for {cofId} failed: {err}", RuntimeWarning
        )
        return []

    if response.status_code != 200:
        warnings.warn(
            f"PDB entry lookup for {cofId} returned HTTP {response.status_code}",
            RuntimeWarning,
        )
        return []

    try:
        return response.json().get(cofId, [])
    except (ValueError, requests.RequestException) as err:
        # Raised when the body cannot be decoded as JSON.
        warnings.warn(
            f"PDB entry lookup for {cofId} returned invalid JSON: {err}",
            RuntimeWarning,
        )
        return []


def _slugify(name: str) -> str:
    """Turn ``name`` into a string that is safe to use in a file name.

    Each run of characters other than ASCII letters and digits becomes a
    single underscore; leading and trailing underscores are removed.
    """
    trimmed = name.strip()
    underscored = re.sub(r'[^A-Za-z0-9]+', '_', trimmed)
    return underscored.strip('_')


def save_single_CCD_PDB_entries(cofId, filename):
    """Write the PDB entries found for one CCD code to ``filename``.

    The file gets two lines: a ``#`` header with the code and the number of
    distinct entries, then the sorted, de-duplicated entries rendered as a
    Python list literal.
    """
    unique_codes = sorted(set(get_PDB_entries_associated_cofactor(cofId)))
    header_line = f"# {cofId} ({len(unique_codes)} PDB codes)\n"
    codes_line = f"{unique_codes}\n"
    with open(filename, "w") as out:
        out.write(header_line)
        out.write(codes_line)


# Export PDB entries for coenzyme classes: one file per class containing the
# union of the PDB entries found for every CCD code of that class.
for cof_class in cof_list:
    try:
        class_info = save_cofactor_information1(cof_class)
    except (KeyError, TypeError):
        # KeyError: the class name is not a key of the cofactor annotation.
        # TypeError: the annotation could not be downloaded (lookup on None).
        # Either way, skip this class instead of aborting the whole export.
        print(f"Skipping {cof_class}: no cofactor data available")
        continue
    if not class_info:
        # The lookup yields None for a class without entries; unpacking it
        # below would raise.
        print(f"Skipping {cof_class}: no cofactor data available")
        continue

    cls_name, ccd_codes = class_info
    if not ccd_codes:
        continue  # skip if no CCD codes

    out_path = OUT_DIR_CLASSES / f"{_slugify(cof_class)}.csv"

    # Collect all PDB entries for this class; the set removes entries that
    # are reported for more than one CCD code.
    all_entries = set()
    for code in ccd_codes:
        entries = get_PDB_entries_associated_cofactor(code)
        all_entries.update(entries)

    all_entries_list = sorted(all_entries)

    # Write a header line with the number of PDB codes followed by the codes
    # as a Python-style list (same layout as save_single_CCD_PDB_entries).
    with open(out_path, "w") as f:
        f.write(f"# {cof_class} ({len(all_entries_list)} PDB codes)\n")
        f.write(f"{all_entries_list}\n")


# Include "ATP" as an additional coenzyme class
save_single_CCD_PDB_entries("ATP", OUT_DIR_CLASSES / "ATP_codes.csv")
