In [1]:
import json
import re

import requests
from rich import inspect

In [2]:
# Download the MIBiG annotation record for BGC0001792, version 5.
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() makes an HTTP error fail here instead of surfacing
# later as a JSON decoding error or a missing key.
with requests.get(
    "https://mibig.secondarymetabolites.org/repository/BGC0001792.5/annotations.json",
    timeout=30,
) as r:
    r.raise_for_status()
    bgc_data = r.json()

In [3]:
# Display the parsed annotation record (a nested dict) as the cell output.
bgc_data

{'accession': 'BGC0001792',
 'version': 5,
 'changelog': {'releases': [{'version': '1',
    'entries': [{'contributors': ['AAAAAAAAAAAAAAAAAAAAAAAA'],
      'reviewers': ['AAAAAAAAAAAAAAAAAAAAAAAA'],
      'date': '2018-08-06',
      'comment': 'Submitted'}],
    'date': '2018-08-06'},
   {'version': '2',
    'entries': [{'contributors': ['AAAAAAAAAAAAAAAAAAAAAAAA'],
      'reviewers': ['AAAAAAAAAAAAAAAAAAAAAAAA'],
      'date': '2019-10-16',
      'comment': 'Migrated from v1.4'},
     {'contributors': ['AAAAAAAAAAAAAAAAAAAAAAAA'],
      'reviewers': ['AAAAAAAAAAAAAAAAAAAAAAAA'],
      'date': '2019-10-16',
      'comment': 'Updated compound(s) information (NPAtlas curation)'}],
    'date': '2019-10-16'},
   {'version': '3',
    'entries': [{'contributors': ['AAAAAAAAAAAAAAAAAAAAAAAA'],
      'reviewers': ['AAAAAAAAAAAAAAAAAAAAAAAA'],
      'date': '2022-09-15',
      'comment': 'Updated bioactivity data'},
     {'contributors': ['AAAAAAAAAAAAAAAAAAAAAAAA'],
      'reviewers': ['AAAAA

In [4]:
# Look at the record's 'compounds' entry with rich's object inspector.
inspect(bgc_data['compounds'])

In [5]:
# Collect every compound entry named "surugamide A", then read the structure
# string of the first match (IndexError if the record has no such compound).
surugamide_entries = [
    compound
    for compound in bgc_data['compounds']
    if compound['name'] == 'surugamide A'
]
surugamide_smiles_string = surugamide_entries[0]['structure']
print(surugamide_smiles_string)

CC[C@H](C)[C@@H]1NC(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC2=CC=CC=C2)NC(=O)[C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](NC(=O)[C@@H](NC(=O)[C@@H](C)NC1=O)[C@@H](C)CC)[C@H](C)CC)[C@@H](C)CC


In [6]:
def get_compound_smiles(bgc_id, timeout=30):
    """Return a ``{compound name: structure string}`` mapping for one MIBiG record.

    Downloads ``annotations.json`` for ``bgc_id`` from the MIBiG repository and
    collects the ``structure`` field of every entry in its ``compounds`` list.

    Args:
        bgc_id: Repository identifier used in the URL, e.g. ``"BGC0001792.5"``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        dict mapping each compound's ``name`` to its ``structure``. If several
        compounds share a name, only the first one is kept.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        KeyError: if the record has no ``compounds`` list, or a compound entry
            lacks ``name`` or ``structure``.
    """
    url = f"https://mibig.secondarymetabolites.org/repository/{bgc_id}/annotations.json"
    with requests.get(url, timeout=timeout) as r:
        # Fail on 4xx/5xx here rather than trying to parse an error page as JSON.
        r.raise_for_status()
        bgc_data = r.json()

    compound_smiles = {}
    for compound in bgc_data['compounds']:
        # setdefault keeps the first structure seen for a name; a plain dict
        # comprehension would silently keep the last one instead.
        compound_smiles.setdefault(compound['name'], compound['structure'])
    return compound_smiles

In [7]:
# Identifier passed into the repository URL by get_compound_smiles.
bgc_id = "BGC0001792.5"
# Mapping of compound name -> structure string for every compound in the record.
smiles_dict = get_compound_smiles(bgc_id)
print(smiles_dict)

{'surugamide A': 'CC[C@H](C)[C@@H]1NC(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC2=CC=CC=C2)NC(=O)[C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](NC(=O)[C@@H](NC(=O)[C@@H](C)NC1=O)[C@@H](C)CC)[C@H](C)CC)[C@@H](C)CC', 'surugamide D': 'CC[C@H](C)[C@@H]1NC(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC2=CC=CC=C2)NC(=O)[C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](NC(=O)[C@@H](NC(=O)[C@@H](C)NC1=O)C(C)C)[C@H](C)CC)[C@@H](C)CC'}


# Search NP Atlas for a compound


API documentation: https://www.npatlas.org/api/v1/docs


> Developers and other researchers are free to download all the data in the Atlas from the Download page and use them for any research purpose, provided that this falls within the requirements of the **Creative Commons Attribution-Noncommercial 4.0 International license**.

> Note: This API is rate limited to 20 requests per minute. If you require higher bandwidth, please contact us (support@npatlas.org), and we can provide an API key which dramatically increases this limit.



In [8]:
# Get the SMILES string for Surugamide A using the dictionary from above.
# Index directly instead of using .get(): a missing name then raises KeyError
# here, rather than silently passing None on as the search structure.
surugamide_smiles_string = smiles_dict["surugamide A"]

In [9]:

# Define full endpoint URL
url = "https://www.npatlas.org/api/v1/compounds/structureSearch"

# Query parameters must be in the URL for this endpoint
query_params = {
    "structure": surugamide_smiles_string,
    "type": "smiles",
    "method": "full",
    "threshold": 0.8,
    "skip": 0,
    "limit": 10,
    "stereo": "false"
}

headers = {
    "Content-Type": "application/json",
    "Accept": "application/json"
}

# Send POST request with query parameters in URL, empty JSON body.
# The timeout keeps the cell from blocking forever if the server stalls.
response = requests.post(url, headers=headers, params=query_params, json={}, timeout=30)

# Raise on an HTTP error (e.g. 429 when rate limited) instead of skipping the
# assignment: otherwise `data` would be undefined on a fresh kernel, or would
# silently keep the value from an earlier run.
response.raise_for_status()
data = response.json()

In [10]:
# Pretty-print the structure-search response as indented JSON.
print(json.dumps(data, indent=2))

[
  {
    "id": 5450,
    "npaid": "NPA005450",
    "original_name": "Surugamide A",
    "mol_formula": "C48H81N9O8",
    "mol_weight": "912.2310",
    "exact_mass": "911.6208",
    "inchikey": "NPYICXUUGUJPMM-QIUOYRCFSA-N",
    "smiles": "CC[C@H](C)[C@H]1C(=O)N[C@@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@@H](C(=O)N[C@@H](C(=O)N[C@H](C(=O)N[C@@H](C(=O)N1)C)[C@@H](C)CC)CC(C)C)CC2=CC=CC=C2)[C@@H](C)CC)CCCCN)[C@@H](C)CC",
    "cluster_id": 26,
    "node_id": 25,
    "has_exclusions": false,
    "similarity": 1.0
  }
]


But that doesn't have all the info!

To get the rest we have to make a second API call, using the `npaid` from the first result to request the full record for that specific compound.

In [11]:
# Take the NP Atlas accession ('npaid') of the first search hit; it identifies
# the compound in the detail endpoint below.
surugamide_npatlas_id = data[0]['npaid']

# Define full endpoint URL
url = f"https://www.npatlas.org/api/v1/compound/{surugamide_npatlas_id}"

# The timeout keeps the cell from blocking forever if the server stalls.
response = requests.get(url, timeout=30)

# Raise on an HTTP error instead of skipping the assignment, so `data2` is
# never left undefined or stale for the cells that follow.
response.raise_for_status()
data2 = response.json()


In [12]:
# Pretty-print the detailed compound record as indented JSON.
print(json.dumps(data2, indent=2))

{
  "id": 5450,
  "npaid": "NPA005450",
  "original_name": "Surugamide A",
  "mol_formula": "C48H81N9O8",
  "mol_weight": "912.2310",
  "exact_mass": "911.6208",
  "inchikey": "NPYICXUUGUJPMM-QIUOYRCFSA-N",
  "smiles": "CC[C@H](C)[C@H]1C(=O)N[C@@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@@H](C(=O)N[C@@H](C(=O)N[C@H](C(=O)N[C@@H](C(=O)N1)C)[C@@H](C)CC)CC(C)C)CC2=CC=CC=C2)[C@@H](C)CC)CCCCN)[C@@H](C)CC",
  "cluster_id": 26,
  "node_id": 25,
  "has_exclusions": false,
  "synonyms": [],
  "inchi": "InChI=1S/C48H81N9O8/c1-12-28(7)37-45(62)50-32(11)41(58)54-40(31(10)15-4)48(65)57-39(30(9)14-3)46(63)51-34(23-19-20-24-49)42(59)55-38(29(8)13-2)47(64)53-36(26-33-21-17-16-18-22-33)43(60)52-35(25-27(5)6)44(61)56-37/h16-18,21-22,27-32,34-40H,12-15,19-20,23-26,49H2,1-11H3,(H,50,62)(H,51,63)(H,52,60)(H,53,64)(H,54,58)(H,55,59)(H,56,61)(H,57,65)/t28-,29-,30-,31-,32+,34-,35+,36+,37-,38-,39+,40-/m0/s1",
  "m_plus_h": "912.6281",
  "m_plus_na": "934.6100",
  "origin_reference": {
    "doi": "10.1021/jo400708u",


In [13]:
# List each cross-reference to another database, one per line.
for external_ref in data2['external_ids']:
    print(external_ref)

{'external_db_name': 'mibig', 'external_db_code': 'BGC0001792'}
{'external_db_name': 'gnps', 'external_db_code': "CCMSLIB00000839206%3%Dereplicator&Identification&-&E'Surugamide_A'"}
{'external_db_name': 'gnps', 'external_db_code': 'CCMSLIB00012161430%Suspect related to Surugamide A (predicted molecular formula SIRIUS: C39H79N19O7 / BUDDY: C48H87N5O12) with delta m/z 14.016 (putative explanation: Asn->Gln substitution|Asp->Glu substitution|Gly->Ala substitution|Methylation|Ser->Thr substitution|Val->Leu/Ile substitution|chain elongation; atomic difference: 1C,2H|1C,2H|1C,2H|1C,2H|1C,2H|1C,2H|1C,2H) [M+H]+%4'}
{'external_db_name': 'gnps', 'external_db_code': 'CCMSLIB00000579271%1%Surugamide&A'}
{'external_db_name': 'gnps', 'external_db_code': 'CCMSLIB00010001223%Suspect related to Surugamide A (predicted molecular formula: C42H75N17O5) with delta m/z -14.016 (putative explanation: Ala->Gly substitution|Gln->Asn substitution|Glu->Asp substitution|Leu/Ile->Val substitution|Thr->Ser substi

In [14]:
external_ids = []

# Clean up the external IDs. Some codes carry extra text after the accession
# ("CCMSLIB...%<description>"), so keep only the CCMSLIB accession when one is
# present. Codes without one are passed through unchanged.
for entry in data2['external_ids']:
    raw_code = entry['external_db_code']
    match = re.search(r'CCMSLIB\d+', raw_code)
    # Use a descriptive local name rather than `id`, which would rebind the
    # builtin id() for the rest of the notebook session.
    db_code = match.group(0) if match else raw_code
    external_ids.append({
        'external_db_name': entry['external_db_name'],
        'external_db_code': db_code,
    })
        

In [15]:
# Display the cleaned list of external database references.
external_ids

[{'external_db_name': 'mibig', 'external_db_code': 'BGC0001792'},
 {'external_db_name': 'gnps', 'external_db_code': 'CCMSLIB00000839206'},
 {'external_db_name': 'gnps', 'external_db_code': 'CCMSLIB00012161430'},
 {'external_db_name': 'gnps', 'external_db_code': 'CCMSLIB00000579271'},
 {'external_db_name': 'gnps', 'external_db_code': 'CCMSLIB00010001223'},
 {'external_db_name': 'gnps', 'external_db_code': 'CCMSLIB00010001227'},
 {'external_db_name': 'gnps', 'external_db_code': 'CCMSLIB00012161426'},
 {'external_db_name': 'npmrd', 'external_db_code': 'NP0011813'}]

In [16]:
# Keep only the codes of entries whose database name is 'gnps'.
gnps_codes = [
    record['external_db_code']
    for record in external_ids
    if record['external_db_name'] == 'gnps'
]
gnps_codes


['CCMSLIB00000839206',
 'CCMSLIB00012161430',
 'CCMSLIB00000579271',
 'CCMSLIB00010001223',
 'CCMSLIB00010001227',
 'CCMSLIB00012161426']