# Retrieving cofactor-protein interatomic interactions through PDBe API

This repository contains a Jupyter Notebook that queries and analyzes cofactor information through the Protein Data Bank in Europe (PDBe) API.

**Quick start**  
1. Install dependencies: `pip install -r requirements.txt`  
2. Open the notebook: `jupyter lab` or `jupyter notebook`  
3. Run cells top-to-bottom.

**What this notebook does**  
- Queries the PDBe REST/JSON API to retrieve structural and annotation data.
- Assigns interatomic cofactor-protein interactions to each PDB entry from every coenzyme class.
- Distinguishes between side-chain and backbone amino acid interactions.
- Generates and saves clean, structured CSV files for downstream analysis.


In [20]:
import pandas as pd
import numpy as np
import requests
import csv
import matplotlib.pyplot as plt
from IPython.display import SVG, display
from pprint import pprint
import sys
sys.path.insert(0,'..')
# python_modules are adapted from the PDBe API Jupyter notebooks
from api_modules import run_sequence_search, explode_dataset, get_ligand_site_data, run_search, pandas_dataset, pandas_count, pandas_plot, pandas_plot_multi_groupby 


## Get bound molecule codes


In [None]:
def get_bound_molecules(pdbId):
    """
    Request the bound-molecules record of one PDB entry from the PDBe graph API.
    :param str pdbId: entry identifier appended to the endpoint URL
    :return: the decoded JSON body on HTTP 200; otherwise None, after
        printing "No data available"
    """
    endpoint = "https://www.ebi.ac.uk/pdbe/graph-api/pdb/bound_molecules"
    reply = requests.get(endpoint + "/" + pdbId)
    # Every non-200 status is reported the same way and yields None.
    if reply.status_code != 200:
        print("No data available")
        return None
    return reply.json()


def save_pdb_chain_auth(pdbId, filename):
    """
    Append one comma-separated line per bound ligand of an entry to a file.

    Each line holds: pdbId, chain_id, author_residue_number, chem_comp_id
    and entity. A TypeError raised while fetching or writing (for example
    when get_bound_molecules returned None) is reported as "bad pdb code".
    :param str pdbId: entry identifier; also the key read from the response
    :param str filename: path of the file opened in append mode
    :return: None
    """
    ligand_keys = ("chain_id", "author_residue_number", "chem_comp_id", "entity")
    try:
        payload = get_bound_molecules(pdbId)
        with open(filename, "a") as out:
            for molecule in payload[pdbId]:
                for ligand in molecule["composition"]["ligands"]:
                    columns = [pdbId] + [ligand[key] for key in ligand_keys]
                    print(*columns, sep=",", file=out)
    except TypeError:
        print("bad pdb code")
    return None


def iterate_list1(my_list):
    """
    Append the bound-ligand rows of every given PDB entry to cofmapping.csv.
    :param my_list: iterable of PDB identifiers; each is passed to
        save_pdb_chain_auth
    :return: None
    """
    # Iterate the argument itself, not the notebook-global pdb_list, so the
    # function works for whichever list the caller passes.
    for pdbId in my_list:
        save_pdb_chain_auth(pdbId, "cofmapping.csv")
        
# NOTE: pdb_list is assigned in a later cell of this notebook; run that cell
# before this call, otherwise pdb_list is undefined here.
iterate_list1(pdb_list)
    

In [None]:
# PDB codes to process, one list per coenzyme class (this one: Coenzyme_B).
# Replace the list to analyse another class.
pdb_list = [
    '1hbo', '3m30', '5g0r', '5n2a', '1hbm', '1e6v', '1hbu', '1mro', '5a8k',
    '5n28', '3m2u', '5a0y', '5a8w', '1e6y', '1hbn', '5odh', '5odr', '3m1v',
    '3m2r', '3m2v', '3pot', '3sqg', '3m32', '5a8r', '5n1q',
]

In [None]:
# Keep only the rows of cofmapping.csv whose chem_comp_id is one of the
# cofactor-associated IDs below (the original note gives 453 IDs), and write
# them to cofmapping_cofactors.csv.
my_file_name = "cofmapping.csv"
cleaned_file = "cofmapping_cofactors.csv"

save_words = ['ASC', 'F43', 'M43', 'MDO', 'PNS', '0WD', '1DG', '3AA', '3CD', '5J8', '6V0', '80F', '8ID', 'A3D', 'AP0', 'CNA', 'CND', 'DG1', 'DN4', 'DND', 'DQV', 'EAD', 'ENA', 'LNC', 'N01', 'NA0', 'NAD', 'NAE', 'NAI', 'NAJ', 'NAP', 'NAQ', 'NAX', 'NBD', 'NBP', 'NDA', 'NDC', 'NDE', 'NDO', 'NDP', 'NHD', 'NHO', 'NJP', 'NPW', 'ODP', 'P1H', 'PAD', 'SAD', 'SAE', 'SND', 'TAD', 'TAP', 'TDT', 'TXD', 'TXE', 'TXP', 'ZID', '18W', '29P', 'DPM', '2MD', 'MCN', 'MGD', 'MSS', 'MTE', 'MTQ', 'MTV', 'PCD', 'PGD', 'XAX', 'B12', 'B1M', 'CNC', 'COB', 'COY', '6FA', 'FA8', 'FAA', 'FAB', 'FAD', 'FAE', 'FAO', 'FAS', 'FCG', 'FDA', 'FED', 'FNK', 'FSH', 'P5F', 'RFL', 'SFD', '1YJ', 'C2F', 'DHF', 'FFO', 'FOL', 'FON', 'FOZ', 'THF', 'THG', 'THH', '01A', '01K', '0ET', '1C4', '1CV', '1CZ', '1HA', '1VU', '1XE', '2CP', '2NE', '3CP', '3H9', '3HC', '3VV', '4CA', '4CO', '52O', '7L1', '8JD', '8Z2', '94Q', 'ACO', 'AMX', 'BCA', 'BCO', 'BSJ', 'BYC', 'CA3', 'CA5', 'CA6', 'CA8', 'CAA', 'CAJ', 'CAO', 'CIC', 'CMC', 'CMX', 'CO6', 'CO7', 'CO8', 'COA', 'COD', 'COF', 'COO', 'COT', 'COW', 'COZ', 'DCA', 'DCC', 'FAM', 'FCX', 'FRE', 'FYN', 'GRA', 'HAX', 'HMG', 'HSC', 'HXC', 'IVC', 'MCA', 'MCD', 'MDE', 'MLC', 'MYA', 'NHM', 'NHQ', 'NHW', 'NMX', 'OXK', 'Q5B', 'QHD', 'RMW', 'S0N', 'SCA', 'SCD', 'SCO', 'SDX', 'SOP', 'T1G', 'TC6', 'TUY', 'UT7', 'UTA', 'WCA', 'YNC', 'ZOZ', 'SHT', 'TP7', 'TPZ', 'TXZ', 'XP8', 'XP9', '4LS', '4LU', '9O9', '9P3', '9PF', '9Q6', '9QF', 'F7F', 'FMN', 'FNR', 'FNS', 'IRF', 'RBF', 'MQ7', 'COM', '1FH', '2FH', '522', '6HE', '76R', '7HE', 'BW9', 'CCH', 'COH', 'CV0', 'DDH', 'DHE', 'F0L', 'F0X', 'FDD', 'FDE', 'FEC', 'FMI', 'H02', 'HAS', 'HDD', 'HDE', 'HEA', 'HEB', 'HEC', 'HEM', 'HEO', 'HEV', 'HIF', 'HP5', 'ISW', 'MH0', 'MI9', 'MNH', 'MNR', 'MP1', 'N7H', 'OBV', 'PP9', 'SH0', 'SIR', 'SRM', 'UFE', 'VEA', 'VER', 'VOV', 'ZEM', 'ZNH', '4AB', '7AP', 'BHS', 'BIO', 'H2B', 'H4B', 'HBI', 'WSD', 'PQQ', 'BC4', 'BTI', 'BTN', 'BYT', 'DTB', 'Y7Y', 'LPA', 'LPB', '4YP', '9BL', 'AT5', 'DBT', 'RQX', 'UHD', 'UQ1', 'UQ2', 'UQ5',
'UQ6', '0HG', '0HH', '1JO', '1JP', '1R4', '3GC', '48T', '5AU', '6SG', 'ABY', 'AHE', 'ATA', 'BOB', 'BWS', 'BYG', 'EPY', 'ESG', 'GBI', 'GBP', 'GBX', 'GDN', 'GDS', 'GF5', 'GGC', 'GIP', 'GNB', 'GPR', 'GPS', 'GS8', 'GSB', 'GSF', 'GSH', 'GSM', 'GSN', 'GSO',
'GTB', 'GTD', 'GTS', 'GTX', 'GTY', 'GVX', 'HAG', 'HGS', 'IBG', 'ICY', 'JM2', 'JM5', 'JM7', 'L9X', 'LEE', 'LZ6', 'P9H', 'RGE', 'TGG', 'TS5', 'VDW', 'VWW', 'ZBF', '0AF', 'TOQ', 'TQQ', 'TRQ', '0UM', '0XU', '0Y0', '0Y1', '0Y2', '36A', '37H', '4IK', '62X', '6D6', '6NR', '76H', '76J', '76K', '76L', '76M', 'AN6', 'EEM', 'K15', 'P2J', 'SA8', 'SAH', 'SAM', 'SFG', 'SMM', 'SX0', 'TT8', '1TP', '1U0', '2TP', '5GY', '5SR', '8EF', '8EL', '8EO', '8FL', '8ML', '8N9', '8PA', 'A5X', 'D7K', 'EN0', 'HTL', 'M6T', 'N1T', 'N3T', 'NDQ', 'O2T', 'QSP', 'R1T', 'S1T', 'T5X', 'T6F', 'TD5', 'TD6', 'TD7', 'TD8', 'TD9', 'TDK', 'TDL', 'TDM', 'TDN', 'TDP', 'TDW', 'THD', 'THV', 'THW', 'THY', 'TOG', 'TOI', 'TP8', 'TPP', 'TPU', 'TPW', 'TZD', 'WWF', 'ZP1', 'EM2', 'MPL', 'NOP', 'NPL', 'PDP', 'PLP', 'PLR', 'PMP', 'PXP', 'PZP', 'UAH', 'X04',
'1TY', '2TY', '3TY', '4HL', 'AGQ', 'ESB', 'G27', 'HCC', 'P2Q', 'P3Q', 'PAQ', 'T0I', 'TPQ', 'TTS', 'TYQ', 'TYY', 'YPZ', 'ATP']

# Rows of cofmapping.csv are written by save_pdb_chain_auth as:
#   pdbId, chain_id, author_residue_number, chem_comp_id, entity
cofmapping_chem_comp_col = 3
# Exact set membership on the chem_comp_id column only. A substring test over
# every column would also keep rows whose residue number merely looks like an
# ID (e.g. residue 522 vs. the cofactor ID '522').
cofmapping_cofactor_ids = set(save_words)

with open(my_file_name, 'r', newline='') as infile, \
     open(cleaned_file, 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    for line in csv.reader(infile, delimiter=','):
        # Short or blank rows have no chem_comp_id column and are dropped.
        if (len(line) > cofmapping_chem_comp_col
                and line[cofmapping_chem_comp_col] in cofmapping_cofactor_ids):
            writer.writerow(line)

## Get PDB bound ligand interatomic interactions 


In [None]:
# PDBe API endpoints. Everything is derived from base_url, the common prefix
# of all PDBe services used in this notebook.
base_url = "https://www.ebi.ac.uk/pdbe/"
search_url = base_url + "search/pdb/select?"  # PDBe search API

# Graph API, UniProt-keyed endpoints.
pdbe_kb_api_uniprot_base_url = base_url + "graph-api/uniprot/"
pdbe_kb_interacting_residues_api = pdbe_kb_api_uniprot_base_url + "ligand_sites/"

# Graph API, PDB-keyed endpoints.
pdbe_kb_api_pdb_base_url = base_url + "graph-api/pdb/"
pdbe_kb_api_bound_ligand_interactions = pdbe_kb_api_pdb_base_url + "bound_ligand_interactions/"

# Base of the URL used by get_sifts_mappings_url.
pdbe_sifts_mappings_api_url = base_url + "api/"


def get_ligand_site_url():
    """
    Build the ligand-sites URL.
    :return str: pdbe_kb_api_uniprot_base_url followed by "ligand_sites/"
    """
    suffix = "ligand_sites/"
    return pdbe_kb_api_uniprot_base_url + suffix


def get_interaction_site_url():
    """
    Build the interface-residues URL.
    :return str: pdbe_kb_api_uniprot_base_url followed by "interface_residues/"
    """
    suffix = "interface_residues/"
    return pdbe_kb_api_uniprot_base_url + suffix


def get_url_with_accession(url, accession):
    """
    Request url + accession and pick the accession's entry from the result.
    :param str url: endpoint URL the accession is appended to
    :param str accession: identifier; also the key looked up in the response
    :return: the value stored under accession, or an empty dict if absent
    """
    payload = get_url(url + accession)
    return payload.get(accession, {})


def get_url(url):
    """
    Perform a GET request and decode the JSON body.
    :param str url: address to request
    :return dict: decoded JSON on HTTP 200; otherwise an empty dict, after
        printing the status code and response text
    """
    reply = requests.get(url)

    # Failure path first: report what the server answered and return nothing.
    if reply.status_code != 200:
        print("[No data retrieved - %s] %s" % (reply.status_code, reply.text))
        return {}

    return reply.json()


def get_bound_ligand_interactions_url():
    """
    Build the bound-ligand-interactions URL.
    :return str: pdbe_kb_api_pdb_base_url followed by
        "bound_ligand_interactions/"
    """
    suffix = "bound_ligand_interactions/"
    return pdbe_kb_api_pdb_base_url + suffix


def get_sifts_mappings_url():
    """
    Build the mappings URL.
    :return str: pdbe_sifts_mappings_api_url followed by "mappings/"
    """
    suffix = "mappings/"
    return pdbe_sifts_mappings_api_url + suffix


def bound_ligand_interactions(pdbId, chain, seqId, chem_comp_id, entity):
    """
    Fetch and flatten the interactions of one bound ligand.

    Requests <bound_ligand_interactions URL>/pdbId/chain/seqId, then turns
    every interaction of every returned row into a flat dict. The nested
    'end' and 'distance' entries are replaced by flat columns, and the
    request parameters plus the ligand identity reported by the API are
    added. The interaction dicts of the response are modified in place.
    :param str pdbId: PDB entry identifier
    :param str chain: chain identifier of the ligand
    :param seqId: author residue number of the ligand (str or int)
    :param chem_comp_id: not used by this function; kept so existing
        callers keep working
    :param entity: stored unchanged under 'entity_b'
    :return list: one dict per interaction; empty if nothing was retrieved
    """
    # str() lets callers pass the residue number as an int as well.
    url = get_bound_ligand_interactions_url() + "/".join((pdbId, chain, str(seqId)))
    print(url)
    data = get_url(url=url)
    data_to_ret = []
    for accession_data in data.values():
        for row in accession_data:
            ligand = row.get('ligand') or {}
            for interaction in row.get('interactions', []):
                # Pop with a default: a record lacking 'end' or 'distance'
                # yields None columns instead of raising KeyError, matching
                # the tolerant .get() lookups used for every other field.
                end = interaction.pop('end', None) or {}
                distance = interaction.pop('distance', None)
                interaction['amino_acid'] = end.get('chem_comp_id')
                interaction['chain_id'] = end.get('chain_id')
                interaction['sequence_residue'] = end.get('author_residue_number')
                interaction['atom_names_features'] = end.get('atom_names')
                interaction['distance_A'] = distance
                # What was asked for ...
                interaction['pdb_id'] = pdbId
                interaction['chain_request'] = chain
                interaction['auth_res_num_request'] = seqId
                # ... and which ligand the API reports for this row.
                interaction['chem_comp_id_api'] = ligand.get('chem_comp_id')
                interaction['chain_id_api'] = ligand.get('chain_id')
                interaction['author_residue_number_api'] = ligand.get('author_residue_number')
                interaction['entity_b'] = entity
                data_to_ret.append(interaction)
    return data_to_ret


def get_interactions_tidy(filename):
    """
    Collect the interactions of every ligand listed in a CSV file.

    Each row must have exactly five columns (pdbId, chain id, author residue
    number, chem_comp_id, entity); they are passed to
    bound_ligand_interactions and the results are concatenated.
    :param str filename: path of the comma-separated input file
    :return: whatever explode_dataset returns for the collected list
        (presumably a pandas DataFrame -- confirm in api_modules)
    """
    collected = []
    with open(filename) as handle:
        for pdbId, chain_Id, author_residue_number, chem_comp_id, entity in csv.reader(handle, delimiter=","):
            collected += bound_ligand_interactions(
                pdbId, chain_Id, author_residue_number, chem_comp_id, entity
            )
    return explode_dataset(collected)

In [None]:
# Retrieve the interactions of every ligand row in cofmapping_cofactors.csv
# and save the resulting table as CSV.
results = get_interactions_tidy("cofmapping_cofactors.csv")
results.to_csv("cofmapping_cofactors_results.csv")

## Get ligand info for mappings

In [None]:
def get_ligands(pdbId):
    """
    Request the ligand-monomers record of one PDB entry from the PDBe API.
    :param str pdbId: entry identifier appended to the endpoint URL
    :return: the decoded JSON body on HTTP 200; otherwise None, after
        printing "No data available"
    """
    endpoint = "https://www.ebi.ac.uk/pdbe/api/pdb/entry/ligand_monomers"
    reply = requests.get(endpoint + "/" + pdbId)
    # Every non-200 status is reported the same way and yields None.
    if reply.status_code != 200:
        print("No data available")
        return None
    return reply.json()


def save_ligands(pdbId, filename):
    """
    Append one comma-separated line per ligand monomer of an entry to a file.

    Each line holds: pdbId, entity_id, author_residue_number, chain_id and
    chem_comp_id.
    :param str pdbId: entry identifier; also the key read from the response
    :param str filename: path of the file opened in append mode
    :return: None
    """
    response = get_ligands(pdbId)
    if response is None:
        # No data for this entry: report and skip it, as save_pdb_chain_auth
        # and save_cofactors do, instead of failing on None[pdbId] and
        # aborting the loop over the remaining entries.
        print("bad pdb code")
        return None
    with open(filename, "a") as file:
        for ligand in response[pdbId]:
            print(pdbId, ligand["entity_id"], ligand["author_residue_number"], ligand["chain_id"], ligand["chem_comp_id"], sep=",", file=file)
    return None


def iterate_list_save_ligands(my_list):
    """
    Append the ligand-monomer rows of every given PDB entry to
    get_ligands_list.csv.
    :param my_list: iterable of PDB identifiers; each is passed to
        save_ligands
    :return: None
    """
    # Iterate the argument itself, not the notebook-global pdb_list, so the
    # function works for whichever list the caller passes.
    for pdbId in my_list:
        save_ligands(pdbId, "get_ligands_list.csv")
            

In [None]:
# Write the ligand rows of the entries in pdb_list to get_ligands_list.csv
# (rows are appended, so re-running this cell adds them again).
iterate_list_save_ligands(pdb_list)

In [None]:
# Keep only the rows of get_ligands_list.csv whose chem_comp_id is one of the
# cofactor-associated IDs below (the original note gives 453 IDs), and write
# them to get_ligands_list_cofactors.csv.
my_file_name = "get_ligands_list.csv"
cleaned_file = "get_ligands_list_cofactors.csv"
save_words = ['ASC', 'F43', 'M43', 'MDO', 'PNS', '0WD', '1DG', '3AA', '3CD', '5J8', '6V0', '80F', '8ID', 'A3D', 'AP0', 'CNA', 'CND', 'DG1', 'DN4', 'DND', 'DQV', 'EAD', 'ENA', 'LNC', 'N01', 'NA0', 'NAD', 'NAE', 'NAI', 'NAJ', 'NAP', 'NAQ', 'NAX', 'NBD', 'NBP', 'NDA', 'NDC', 'NDE', 'NDO', 'NDP', 'NHD', 'NHO', 'NJP', 'NPW', 'ODP', 'P1H', 'PAD', 'SAD', 'SAE', 'SND', 'TAD', 'TAP', 'TDT', 'TXD', 'TXE', 'TXP', 'ZID', '18W', '29P', 'DPM', '2MD', 'MCN', 'MGD', 'MSS', 'MTE', 'MTQ', 'MTV', 'PCD', 'PGD', 'XAX', 'B12', 'B1M', 'CNC', 'COB', 'COY', '6FA', 'FA8', 'FAA', 'FAB', 'FAD', 'FAE', 'FAO', 'FAS', 'FCG', 'FDA', 'FED', 'FNK', 'FSH', 'P5F', 'RFL', 'SFD', '1YJ', 'C2F', 'DHF', 'FFO', 'FOL', 'FON', 'FOZ', 'THF', 'THG', 'THH', '01A', '01K', '0ET', '1C4', '1CV', '1CZ', '1HA', '1VU', '1XE', '2CP', '2NE', '3CP', '3H9', '3HC', '3VV', '4CA', '4CO', '52O', '7L1', '8JD', '8Z2', '94Q', 'ACO', 'AMX', 'BCA', 'BCO', 'BSJ', 'BYC', 'CA3', 'CA5', 'CA6', 'CA8', 'CAA', 'CAJ', 'CAO', 'CIC', 'CMC', 'CMX', 'CO6', 'CO7', 'CO8', 'COA', 'COD', 'COF', 'COO', 'COT', 'COW', 'COZ', 'DCA', 'DCC', 'FAM', 'FCX', 'FRE', 'FYN', 'GRA', 'HAX', 'HMG', 'HSC', 'HXC', 'IVC', 'MCA', 'MCD', 'MDE', 'MLC', 'MYA', 'NHM', 'NHQ', 'NHW', 'NMX', 'OXK', 'Q5B', 'QHD', 'RMW', 'S0N', 'SCA', 'SCD', 'SCO', 'SDX', 'SOP', 'T1G', 'TC6', 'TUY', 'UT7', 'UTA', 'WCA', 'YNC', 'ZOZ', 'SHT', 'TP7', 'TPZ', 'TXZ', 'XP8', 'XP9', '4LS', '4LU', '9O9', '9P3', '9PF', '9Q6', '9QF', 'F7F', 'FMN', 'FNR', 'FNS', 'IRF', 'RBF', 'MQ7', 'COM', '1FH', '2FH', '522', '6HE', '76R', '7HE', 'BW9', 'CCH', 'COH', 'CV0', 'DDH', 'DHE', 'F0L', 'F0X', 'FDD', 'FDE', 'FEC', 'FMI', 'H02', 'HAS', 'HDD', 'HDE', 'HEA', 'HEB', 'HEC', 'HEM', 'HEO', 'HEV', 'HIF', 'HP5', 'ISW', 'MH0', 'MI9', 'MNH', 'MNR', 'MP1', 'N7H', 'OBV', 'PP9', 'SH0', 'SIR', 'SRM', 'UFE', 'VEA', 'VER', 'VOV', 'ZEM', 'ZNH', '4AB', '7AP', 'BHS', 'BIO', 'H2B', 'H4B', 'HBI', 'WSD', 'PQQ', 'BC4', 'BTI', 'BTN', 'BYT', 'DTB', 'Y7Y', 'LPA', 'LPB', '4YP', '9BL', 'AT5', 'DBT', 'RQX', 'UHD', 'UQ1', 'UQ2', 'UQ5',
'UQ6', '0HG', '0HH', '1JO', '1JP', '1R4', '3GC', '48T', '5AU', '6SG', 'ABY', 'AHE', 'ATA', 'BOB', 'BWS', 'BYG', 'EPY', 'ESG', 'GBI', 'GBP', 'GBX', 'GDN', 'GDS', 'GF5', 'GGC', 'GIP', 'GNB', 'GPR', 'GPS', 'GS8', 'GSB', 'GSF', 'GSH', 'GSM', 'GSN', 'GSO',
'GTB', 'GTD', 'GTS', 'GTX', 'GTY', 'GVX', 'HAG', 'HGS', 'IBG', 'ICY', 'JM2', 'JM5', 'JM7', 'L9X', 'LEE', 'LZ6', 'P9H', 'RGE', 'TGG', 'TS5', 'VDW', 'VWW', 'ZBF', '0AF', 'TOQ', 'TQQ', 'TRQ', '0UM', '0XU', '0Y0', '0Y1', '0Y2', '36A', '37H', '4IK', '62X', '6D6', '6NR', '76H', '76J', '76K', '76L', '76M', 'AN6', 'EEM', 'K15', 'P2J', 'SA8', 'SAH', 'SAM', 'SFG', 'SMM', 'SX0', 'TT8', '1TP', '1U0', '2TP', '5GY', '5SR', '8EF', '8EL', '8EO', '8FL', '8ML', '8N9', '8PA', 'A5X', 'D7K', 'EN0', 'HTL', 'M6T', 'N1T', 'N3T', 'NDQ', 'O2T', 'QSP', 'R1T', 'S1T', 'T5X', 'T6F', 'TD5', 'TD6', 'TD7', 'TD8', 'TD9', 'TDK', 'TDL', 'TDM', 'TDN', 'TDP', 'TDW', 'THD', 'THV', 'THW', 'THY', 'TOG', 'TOI', 'TP8', 'TPP', 'TPU', 'TPW', 'TZD', 'WWF', 'ZP1', 'EM2', 'MPL', 'NOP', 'NPL', 'PDP', 'PLP', 'PLR', 'PMP', 'PXP', 'PZP', 'UAH', 'X04',
'1TY', '2TY', '3TY', '4HL', 'AGQ', 'ESB', 'G27', 'HCC', 'P2Q', 'P3Q', 'PAQ', 'T0I', 'TPQ', 'TTS', 'TYQ', 'TYY', 'YPZ', 'ATP']

# Rows of get_ligands_list.csv are written by save_ligands as:
#   pdbId, entity_id, author_residue_number, chain_id, chem_comp_id
ligands_chem_comp_col = 4
# Exact set membership on the chem_comp_id column only. A substring test over
# every column would also keep rows whose residue number merely looks like an
# ID (e.g. residue 522 vs. the cofactor ID '522').
ligands_cofactor_ids = set(save_words)

with open(my_file_name, 'r', newline='') as infile, \
     open(cleaned_file, 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    for line in csv.reader(infile, delimiter=','):
        # Short or blank rows have no chem_comp_id column and are dropped.
        if (len(line) > ligands_chem_comp_col
                and line[ligands_chem_comp_col] in ligands_cofactor_ids):
            writer.writerow(line)

## Filter Cofactors

In [None]:
def get_cofactors(pdbId):
    """
    Request the cofactor record of one PDB entry from the PDBe API.
    :param str pdbId: entry identifier appended to the endpoint URL
    :return: the decoded JSON body on HTTP 200; otherwise None, after
        printing "No data available"
    """
    endpoint = "https://www.ebi.ac.uk/pdbe/api/pdb/entry/cofactor"
    reply = requests.get(endpoint + "/" + pdbId)
    # Every non-200 status is reported the same way and yields None.
    if reply.status_code != 200:
        print("No data available")
        return None
    return reply.json()


def save_cofactors(pdbId, filename):
    """
    Append one comma-separated line per cofactor of an entry to a file.

    Each line holds: pdbId, entity_id, author_residue_number, chain_id,
    chem_comp_id and class. A TypeError raised while fetching or writing
    (for example when get_cofactors returned None) is reported as
    "badpdbforcofactor".
    :param str pdbId: entry identifier; also the key read from the response
    :param str filename: path of the file opened in append mode
    :return: None
    """
    cofactor_keys = ("entity_id", "author_residue_number", "chain_id", "chem_comp_id", "class")
    try:
        payload = get_cofactors(pdbId)
        with open(filename, "a") as out:
            for cofactor in payload[pdbId]:
                columns = [pdbId] + [cofactor[key] for key in cofactor_keys]
                print(*columns, sep=",", file=out)
    except TypeError:
        print("badpdbforcofactor")
    return None


def iterate_list_save_cofactors(my_list):
    """
    Append the cofactor rows of every given PDB entry to
    get_cofactors_list.csv.
    :param my_list: iterable of PDB identifiers; each is passed to
        save_cofactors
    :return: None
    """
    # Iterate the argument itself, not the notebook-global pdb_list, so the
    # function works for whichever list the caller passes.
    for pdbId in my_list:
        save_cofactors(pdbId, "get_cofactors_list.csv")
 

In [None]:
# Write the cofactor rows of the entries in pdb_list to get_cofactors_list.csv
# (rows are appended, so re-running this cell adds them again).
iterate_list_save_cofactors(pdb_list)

## Distinguishing between side-chain and backbone amino acid interactions

Whether an interaction involves the side chain or the backbone (main chain) of a residue is decided from the name of the interacting atom. We assume that backbone interactions of standard and modified amino acids are made through atoms named "N", "C", "CA" or "O"; an interaction through any other atom is classified as a side-chain interaction.


In [None]:
# Preview the first rows of the interactions table.
results.head()

In [None]:
# An interaction is labelled 'main_chain' when the protein atom is one of the
# backbone atom names below; any other atom name is labelled 'side_chain'.
backbone_atom_names = ('N', 'C', 'CA', 'O')
conditions = [results['atom_names_features'] == atom_name
              for atom_name in backbone_atom_names]
choices = ['main_chain'] * len(backbone_atom_names)
results['chain_atom_type'] = np.select(conditions, choices, default='side_chain')

# Residues are grouped into 'early_AA' and 'late_AA'; any component that is
# in neither group is labelled 'other'.
early_amino_acids = ('GLY', 'ALA', 'VAL', 'LEU', 'ILE',
                     'PRO', 'THR', 'SER', 'GLU', 'ASP')
late_amino_acids = ('ARG', 'LYS', 'HIS', 'PHE', 'TRP',
                    'TYR', 'CYS', 'MET', 'GLN', 'ASN')
conditions1 = [results['amino_acid'] == residue_name
               for residue_name in early_amino_acids + late_amino_acids]
choices = (['early_AA'] * len(early_amino_acids)
           + ['late_AA'] * len(late_amino_acids))
results['aminoacid_type'] = np.select(conditions1, choices, default='other')
print(results)
 

In [None]:
# Write the annotated table to results_api.csv, then preview its first rows.
results.to_csv("results_api.csv")
results.head()