In [2]:
import requests
import pandas as pd

# Root PubChem concept ID used as the starting point of the retrieval below.
# NOTE(review): the output recorded with this cell reports 0 child concepts and
# 0 compounds for this ID — confirm the ID and the endpoints it is used with.
CONCEPT_ID = 166466  # PubChem Concept: Non-proteinogenic amino acids

def _collect_child_concept_ids(payload):
    """Return the unique ``ConceptID`` values found under any ``ConceptChildren`` key.

    The payload is walked once with an explicit stack, so deeply nested
    responses neither hit the recursion limit nor get re-visited.
    """
    found = {}  # dict used as an insertion-ordered set
    pending = [payload]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            children = node.get("ConceptChildren")
            if isinstance(children, list):
                for child in children:
                    # Skip malformed entries instead of raising KeyError/TypeError.
                    if not isinstance(child, dict):
                        continue
                    child_id = child.get("ConceptID")
                    if isinstance(child_id, (int, str)):
                        found[child_id] = None
            # Nested children are reached through the generic walk below.
            pending.extend(node.values())
        elif isinstance(node, list):
            pending.extend(node)
    return list(found)


def get_concept_children(concept_id, timeout=30):
    """Return all child concept IDs for a PubChem concept.

    Args:
        concept_id: ID of the parent concept; interpolated into the request URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of unique ``ConceptID`` values found under any
        ``ConceptChildren`` key of the JSON response, at any nesting depth.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    # NOTE(review): the output recorded with this cell shows 0 children for the
    # root concept — confirm this endpoint and the expected payload keys.
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/heading/JSON/?heading=Concept&cid={concept_id}"
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors rather than parsing an error body into "no children".
    response.raise_for_status()
    return _collect_child_concept_ids(response.json())


def _collect_cids(payload):
    """Return the unique values stored under any ``CID`` key in *payload*.

    A ``CID`` entry may be a scalar or a list of scalars; list values are
    flattened so the result always contains hashable IDs.
    """
    found = {}  # dict used as an insertion-ordered set
    pending = [payload]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            value = node.get("CID")
            candidates = value if isinstance(value, list) else [value]
            for candidate in candidates:
                # Ignores a missing key (None) and any non-scalar entries.
                if isinstance(candidate, (int, str)):
                    found[candidate] = None
            pending.extend(node.values())
        elif isinstance(node, list):
            pending.extend(node)
    return list(found)


def get_cids_for_concept(concept_id, timeout=30):
    """Return all compound CIDs associated with a PubChem concept.

    Args:
        concept_id: ID of the concept; interpolated into the request URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of unique values found under any ``CID`` key of the JSON
        response, at any nesting depth.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    # NOTE(review): the output recorded with this cell shows 0 compounds for the
    # root concept — confirm this endpoint and the expected payload keys.
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/categories/compound/{concept_id}/JSON"
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors rather than parsing an error body into "no compounds".
    response.raise_for_status()
    return _collect_cids(response.json())


# ---- MAIN PIPELINE ----
# Collect the compound CIDs of the root concept and of every child concept,
# then write the de-duplicated, sorted list to a CSV file.
print("Fetching child concepts...")
children = get_concept_children(CONCEPT_ID)
all_concepts = [CONCEPT_ID] + children

print(f"Found {len(children)} child concepts.")

all_cids = set()
failed_concepts = []
for concept_id in all_concepts:
    # Best effort per concept: one failing lookup must not abort the whole run,
    # but it is reported instead of being silently discarded.
    try:
        cids = get_cids_for_concept(concept_id)
    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
        failed_concepts.append(concept_id)
        print(f"Concept {concept_id}: lookup failed ({exc!r})")
        continue
    all_cids.update(cids)
    print(f"Concept {concept_id}: {len(cids)} compounds")

if failed_concepts:
    print(f"\nWarning: {len(failed_concepts)} concept lookup(s) failed: {failed_concepts}")

print(f"\nTotal non-proteinogenic amino acids retrieved: {len(all_cids)}")
if not all_cids:
    # An empty result most likely means the requests did not return the
    # expected data; flag it so the empty CSV is not mistaken for a real answer.
    print("Warning: no compounds were retrieved; the CSV will contain only the header.")

# Save to CSV
df = pd.DataFrame({"PubChem_CID": sorted(all_cids)})
df.to_csv("non_proteinogenic_amino_acids_pubchem.csv", index=False)

print("\nSaved to non_proteinogenic_amino_acids_pubchem.csv")


Fetching child concepts...
Found 0 child concepts.
Concept 166466: 0 compounds

Total non-proteinogenic amino acids retrieved: 0

Saved to non_proteinogenic_amino_acids_pubchem.csv
