In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs

In [2]:
raw_data = pd.read_csv('S4_LigBoundConf_information.csv')
raw_data.head()

Unnamed: 0,LigandID,PDBid,Resid,Chain,3-letter-code,LigandSmiles,Resolution,EDIAm
0,9O3_6A1B_A_402,6A1B,402,A,9O3,C(=O)(C(C(F)(F)F)(O)O)O,1.47,1.05
1,OGA_2QRL_A_500,2QRL,500,A,OGA,C(C(=O)O)NC(=O)C(=O)O,1.6,0.86
2,AKG_2CJH_A_1451,2CJH,1451,A,AKG,C(CC(=O)O)C(=O)C(=O)O,2.0,0.89
3,G3H_1UXU_A_504,1UXU,504,A,G3H,C([C@H](C=O)O)OP(=O)(O)O,2.25,0.85
4,AKG_2YKX_C_1446,2YKX,1446,C,AKG,C(CC(=O)O)C(=O)C(=O)O,1.85,0.93


In [24]:
# Specify the path to the sdf file
sdf_file = "S2_LigBoundConf_minimized.sdf"

# Read the sdf file
suppl = Chem.SDMolSupplier(sdf_file)

# Create empty lists to hold the properties
names = []
molecules = []
chiral_flags = []

# Iterate over the molecules in the sdf file
for mol in suppl:
    if mol is not None:
        # Get the name of the molecule if it exists
        name = mol.GetProp('_Name') if mol.HasProp('_Name') else None
        # Get the chiral flag of the molecule if it exists
        chiral_flag = mol.GetProp('chiral flag') if mol.HasProp('chiral flag') else None
        
        # Append the properties to the respective lists
        names.append(name)
        molecules.append(mol)
        chiral_flags.append(chiral_flag)

# Create a dictionary with the properties
data = {'Name': names, 'Molecule': molecules, 'Chiral Flag': chiral_flags}

# Create a DataFrame from the dictionary
df = pd.DataFrame(data)


In [28]:
df.describe()


Unnamed: 0,Name,Molecule,Chiral Flag
count,8134,8134,8134
unique,8134,8134,1
top,0A1_3QTC_A_811,<rdkit.Chem.rdchem.Mol object at 0x16328e110>,1
freq,1,1,8134


In [29]:
merged_df = df.merge(raw_data, left_on='Name', right_on='LigandID')
display(merged_df.head())
display(merged_df.describe())

Unnamed: 0,Name,Molecule,Chiral Flag,LigandID,PDBid,Resid,Chain,3-letter-code,LigandSmiles,Resolution,EDIAm
0,0A1_3QTC_A_811,<rdkit.Chem.rdchem.Mol object at 0x16328e110>,1,0A1_3QTC_A_811,3QTC,811,A,0A1,COc1ccc(cc1)C[C@@H](C(=O)O)N,1.75,0.9
1,0A9_3QGO_A_407,<rdkit.Chem.rdchem.Mol object at 0x16328e180>,1,0A9_3QGO_A_407,3QGO,407,A,0A9,COC(=O)[C@H](Cc1ccccc1)N,1.45,0.92
2,0B1_3VV7_A_509,<rdkit.Chem.rdchem.Mol object at 0x16328e1f0>,1,0B1_3VV7_A_509,3VV7,509,A,0B1,CN1C(=O)C=C(N=C1N)[C@H]2C[C@H]2c3cccc(c3)c4ccc...,2.1,0.84
3,0B3_3WB4_A_507,<rdkit.Chem.rdchem.Mol object at 0x16328e260>,1,0B3_3WB4_A_507,3WB4,507,A,0B3,C[C@]1(CC(=O)N(C(=N1)N)C)CCc2ccccc2,2.25,0.86
4,0B5_5YGX_A_509,<rdkit.Chem.rdchem.Mol object at 0x16328e2d0>,1,0B5_5YGX_A_509,5YGX,509,A,0B5,C[C@]1([C@H]([C@H](OC(=N1)N)C(C)(F)F)F)c2cc(cc...,2.2,0.97


Unnamed: 0,Resid,Resolution,EDIAm
count,8134.0,8134.0,8134.0
mean,706.231006,1.880633,0.965022
std,762.932234,0.287971,0.067923
min,0.0,0.86,0.8
25%,301.0,1.7,0.92
50%,501.0,1.9,0.98
75%,901.0,2.1,1.02
max,9901.0,2.5,1.12


In [31]:
import requests

# Specify the PDBid
pdbid = '5XRA'  # Replace with your PDBid

# Send a GET request to the PDB API
response = requests.get(f'https://files.rcsb.org/view/{pdbid}.pdb')

# Check if the request was successful
if response.status_code == 200:
    # Print the content of the response
    print(response.text)
else:
    print(f'Failed to get PDB file for {pdbid}')

HEADER    SIGNALING PROTEIN                       08-JUN-17   5XRA              
TITLE     CRYSTAL STRUCTURE OF THE HUMAN CB1 IN COMPLEX WITH AGONIST AM11542    
COMPND    MOL_ID: 1;                                                            
COMPND   2 MOLECULE: CANNABINOID RECEPTOR 1,FLAVODOXIN,CANNABINOID RECEPTOR 1;  
COMPND   3 CHAIN: A;                                                            
COMPND   4 FRAGMENT: UNP RESIDUES 99-306,UNP RESIDUES 3-148,UNP RESIDUES 332-   
COMPND   5 414;                                                                 
COMPND   6 SYNONYM: CB1,CANN6,CB1,CANN6;                                        
COMPND   7 ENGINEERED: YES;                                                     
COMPND   8 MUTATION: YES                                                        
SOURCE    MOL_ID: 1;                                                            
SOURCE   2 ORGANISM_SCIENTIFIC: HOMO SAPIENS, DESULFOVIBRIO VULGARIS (STRAIN    
SOURCE   3 HILDENBOROUGH / A

In [32]:
import requests

# Example for querying UniProt API for PDB ID "5XRA"


def query_uniprot(pdb_id):
    uniprot_api_url = f"https://www.uniprot.org/uploadlists/"

    params = {"from": "PDB_ID", "to": "ACC", "format": "tab", "query": pdb_id}

    response = requests.get(uniprot_api_url, params=params)
    if response.ok:
        return response.text
    else:
        return None


# Querying UniProt for PDB ID 5XRA
uniprot_response = query_uniprot("5XRA")
uniprot_response

In [34]:
print(uniprot_response)

None


In [35]:
import requests


def get_protein_data(pdb_id):
    url = f"https://www.rcsb.org/pdb/rest/customReport.xml?pdbids={pdb_id}&customReportColumns=structureId,structureTitle,classification"
    response = requests.get(url)

    if response.status_code == 200:
        from xml.etree import ElementTree as ET

        tree = ET.fromstring(response.content)
        for child in tree:
            if child.tag == "record":
                structure_title = child.find("dimStructure.structureTitle").text
                classification = child.find("dimStructure.classification").text
                return structure_title, classification
    else:
        return None, None


pdb_id = "5XRA"
name, target_class = get_protein_data(pdb_id)
if name and target_class:
    print(f"Name: {name}\nTarget Class: {target_class}")
else:
    print("Data not found or an error occurred.")

Data not found or an error occurred.


In [36]:
import requests


def get_from_rcsb(pdb_id):
    url = f"https://www.rcsb.org/pdb/rest/customReport.xml?pdbids={pdb_id}&customReportColumns=structureId,structureTitle,classification"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            from xml.etree import ElementTree as ET

            tree = ET.fromstring(response.content)
            for child in tree:
                if child.tag == "record":
                    title = child.find("dimStructure.structureTitle").text
                    classification = child.find("dimStructure.classification").text
                    return title, classification
    except Exception as e:
        print(f"Error accessing RCSB PDB: {e}")
    return None, None


def get_from_uniprot(pdb_id):
    url = f"https://www.ebi.ac.uk/proteins/api/proteins/{pdb_id}"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            name = data.get("name")
            classification = (
                data.get("protein", {})
                .get("recommendedName", {})
                .get("fullName", {})
                .get("value")
            )
            return name, classification
    except Exception as e:
        print(f"Error accessing UniProt: {e}")
    return None, None


def get_from_pdbe(pdb_id):
    url = f"https://www.ebi.ac.uk/pdbe/api/pdb/entry/summary/{pdb_id}"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            entry = data.get(pdb_id, [{}])[0]
            title = entry.get("title")
            classification = entry.get("experimental_method")
            return title, classification
    except Exception as e:
        print(f"Error accessing PDBe: {e}")
    return None, None


pdb_id = "5XRA"
print("Attempting to fetch data from multiple sources...")

# Attempt to fetch from RCSB PDB
name, target_class = get_from_rcsb(pdb_id)
if name and target_class:
    print(f"RCSB PDB - Name: {name}, Target Class: {target_class}")

# Attempt to fetch from UniProt
name, target_class = get_from_uniprot(pdb_id)
if name and target_class:
    print(f"UniProt - Name: {name}, Target Class: {target_class}")

# Attempt to fetch from PDBe
name, target_class = get_from_pdbe(pdb_id)
if name and target_class:
    print(f"PDBe - Name: {name}, Target Class: {target_class}")

Attempting to fetch data from multiple sources...


In [37]:
import requests
import time

uniprot_ids = ["P26378", "O35433", "Q02910"]
url = "https://www.uniprot.org/uniprot/"

protein_to_pdb = {}
for protein in uniprot_ids:
    params = {
        "format": "tab",
        "query": "ID:{}".format(protein),
        "columns": "id,database(PDB)",
    }
    contact = ""  # Please set your email address here.
    headers = {"User-Agent": "Python {}".format(contact)}
    r = requests.get(url, params=params, headers=headers)

    protein_to_pdb[protein] = str(r.text).splitlines()[-1].split("\t")[-1].split(";")
    protein_to_pdb[protein].pop(-1)
    time.sleep(1)  # be respectful and don't overwhelm the server with requests

print(protein_to_pdb)

{'P26378': ['{"url":"http://rest.uniprot.org/uniprotkb/format=tab&amp', 'query=ID:P26378&amp'], 'O35433': ['{"url":"http://rest.uniprot.org/uniprotkb/format=tab&amp', 'query=ID:O35433&amp'], 'Q02910': ['{"url":"http://rest.uniprot.org/uniprotkb/format=tab&amp', 'query=ID:Q02910&amp']}


In [38]:
import requests


def get_protein_data(pdb_id):
    url = f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}"
    response = requests.get(url)
    data = response.json()

    return data


# Example call for PDB ID '5XRA'
protein_data = get_protein_data("5XRA")
protein_data

{'5xra': {'UniProt': {'P21554': {'name': 'CNR1_HUMAN',
    'mappings': [{'entity_id': 1,
      'chain_id': 'A',
      'struct_asym_id': 'A',
      'unp_start': 99,
      'unp_end': 306,
      'start': {'residue_number': 1,
       'author_residue_number': None,
       'author_insertion_code': ''},
      'end': {'residue_number': 208,
       'author_residue_number': 306,
       'author_insertion_code': ''}},
     {'entity_id': 1,
      'chain_id': 'A',
      'struct_asym_id': 'A',
      'unp_start': 332,
      'unp_end': 414,
      'start': {'residue_number': 356,
       'author_residue_number': None,
       'author_insertion_code': ''},
      'end': {'residue_number': 438,
       'author_residue_number': 414,
       'author_insertion_code': ''}}],
    'identifier': 'CNR1_HUMAN'},
   'P00323': {'name': 'FLAV_DESVH',
    'mappings': [{'entity_id': 1,
      'chain_id': 'A',
      'struct_asym_id': 'A',
      'unp_start': 3,
      'unp_end': 148,
      'start': {'residue_number': 210,
     

In [39]:
# Analyzing the provided JSON data to identify the primary protein

data = {
    "5xra": {
        "UniProt": {
            "P21554": {
                "name": "CNR1_HUMAN",
                "mappings": [
                    {
                        "entity_id": 1,
                        "chain_id": "A",
                        "struct_asym_id": "A",
                        "unp_start": 99,
                        "unp_end": 306,
                    },
                    {
                        "entity_id": 1,
                        "chain_id": "A",
                        "struct_asym_id": "A",
                        "unp_start": 332,
                        "unp_end": 414,
                    },
                ],
                "identifier": "CNR1_HUMAN",
            },
            "P00323": {
                "name": "FLAV_DESVH",
                "mappings": [
                    {
                        "entity_id": 1,
                        "chain_id": "A",
                        "struct_asym_id": "A",
                        "unp_start": 3,
                        "unp_end": 148,
                    }
                ],
                "identifier": "FLAV_DESVH",
            },
        }
    }
}


def identify_primary_protein(data):
    proteins = data["5xra"]["UniProt"]
    primary_protein = None
    max_coverage = 0

    for uniprot_id, protein_info in proteins.items():
        total_coverage = sum(
            [
                mapping["unp_end"] - mapping["unp_start"] + 1
                for mapping in protein_info["mappings"]
            ]
        )

        if total_coverage > max_coverage:
            max_coverage = total_coverage
            primary_protein = protein_info

    return primary_protein


primary_protein = identify_primary_protein(data)
primary_protein_name = primary_protein["name"] if primary_protein else "Not found"
primary_protein_name

'CNR1_HUMAN'

In [40]:
import requests


def get_aggregated_mappings(pdb_id):
    base_url = "https://www.ebi.ac.uk/pdbe/api/mappings"
    data = {}

    # For UniProt
    uniprot_url = f"{base_url}/uniprot/{pdb_id}"
    data["UniProt"] = requests.get(uniprot_url).json()

    # For Pfam
    pfam_url = f"{base_url}/pfam/{pdb_id}"
    data["Pfam"] = requests.get(pfam_url).json()

    # For GO
    go_url = f"{base_url}/go/{pdb_id}"
    data["GO"] = requests.get(go_url).json()

    return data


# Example usage
pdb_id = "5XRA"  # Replace with your PDB ID
aggregated_data = get_aggregated_mappings(pdb_id)

In [42]:
aggregated_data

{'UniProt': {'5xra': {'UniProt': {'P21554': {'name': 'CNR1_HUMAN',
     'mappings': [{'entity_id': 1,
       'chain_id': 'A',
       'struct_asym_id': 'A',
       'unp_start': 99,
       'unp_end': 306,
       'start': {'residue_number': 1,
        'author_residue_number': None,
        'author_insertion_code': ''},
       'end': {'residue_number': 208,
        'author_residue_number': 306,
        'author_insertion_code': ''}},
      {'entity_id': 1,
       'chain_id': 'A',
       'struct_asym_id': 'A',
       'unp_start': 332,
       'unp_end': 414,
       'start': {'residue_number': 356,
        'author_residue_number': None,
        'author_insertion_code': ''},
       'end': {'residue_number': 438,
        'author_residue_number': 414,
        'author_insertion_code': ''}}],
     'identifier': 'CNR1_HUMAN'},
    'P00323': {'name': 'FLAV_DESVH',
     'mappings': [{'entity_id': 1,
       'chain_id': 'A',
       'struct_asym_id': 'A',
       'unp_start': 3,
       'unp_end': 148,
   

In [43]:
import requests
import pandas as pd


def get_and_process_data(pdb_id):
    url = f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}"
    response = requests.get(url)
    data = response.json()

    # Process the data to extract only the necessary information
    processed_data = []
    for entry in data[pdb_id]["UniProt"]:
        processed_entry = {
            "UniProt_ID": entry,
            "Protein_Name": data[pdb_id]["UniProt"][entry]["name"],
            # Add other fields as needed
        }
        processed_data.append(processed_entry)

    return processed_data


# Example usage
pdb_id = "5XRA"
subset_data = get_and_process_data(pdb_id)
df = pd.DataFrame(subset_data)


KeyError: '5XRA'

In [44]:
import pandas as pd
import requests

# Sample DataFrame
data = {"pdb": ["5XRA"], "ligand": ["placeholder"]}
df = pd.DataFrame(data)


# Function to get and process data
def get_and_process_data(pdb_id):
    url = f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}"
    response = requests.get(url)
    data = response.json()

    # Extract the necessary information (example: UniProt_ID)
    if pdb_id in data and "UniProt" in data[pdb_id]:
        for entry in data[pdb_id]["UniProt"]:
            return {
                "UniProt_ID": entry,
                "Protein_Name": data[pdb_id]["UniProt"][entry]["name"],
            }
    return {"UniProt_ID": None, "Protein_Name": None}


# Iterate over DataFrame and update with API data
for index, row in df.iterrows():
    pdb_id = row["pdb"]
    api_data = get_and_process_data(pdb_id)
    df.at[index, "UniProt_ID"] = api_data["UniProt_ID"]
    df.at[index, "Protein_Name"] = api_data["Protein_Name"]

# Display the updated DataFrame
df

Unnamed: 0,pdb,ligand,UniProt_ID,Protein_Name
0,5XRA,placeholder,,


In [45]:
aggregated_data

{'UniProt': {'5xra': {'UniProt': {'P21554': {'name': 'CNR1_HUMAN',
     'mappings': [{'entity_id': 1,
       'chain_id': 'A',
       'struct_asym_id': 'A',
       'unp_start': 99,
       'unp_end': 306,
       'start': {'residue_number': 1,
        'author_residue_number': None,
        'author_insertion_code': ''},
       'end': {'residue_number': 208,
        'author_residue_number': 306,
        'author_insertion_code': ''}},
      {'entity_id': 1,
       'chain_id': 'A',
       'struct_asym_id': 'A',
       'unp_start': 332,
       'unp_end': 414,
       'start': {'residue_number': 356,
        'author_residue_number': None,
        'author_insertion_code': ''},
       'end': {'residue_number': 438,
        'author_residue_number': 414,
        'author_insertion_code': ''}}],
     'identifier': 'CNR1_HUMAN'},
    'P00323': {'name': 'FLAV_DESVH',
     'mappings': [{'entity_id': 1,
       'chain_id': 'A',
       'struct_asym_id': 'A',
       'unp_start': 3,
       'unp_end': 148,
   

In [46]:
import pandas as pd
import requests

# Sample DataFrame
data = {"pdb": ["5XRA"], "ligand": ["placeholder"]}
df = pd.DataFrame(data)


# Function to get Pfam data and process it
def get_pfam_data(pdb_id):
    url = f"https://www.ebi.ac.uk/pdbe/api/mappings/pfam/{pdb_id}"
    response = requests.get(url)
    data = response.json()

    # Find the Pfam entry for "7 transmembrane receptor (rhodopsin family)" or similar
    if pdb_id in data and "Pfam" in data[pdb_id]:
        for pfam_id, pfam_info in data[pdb_id]["Pfam"].items():
            if "transmembrane receptor" in pfam_info["description"]:
                return pfam_info["identifier"]
    return None


# Iterate over DataFrame and update with API data
for index, row in df.iterrows():
    pdb_id = row["pdb"]
    pfam_identifier = get_pfam_data(pdb_id)
    df.at[index, "Pfam_Identifier"] = pfam_identifier

# Display the updated DataFrame
df

Unnamed: 0,pdb,ligand,Pfam_Identifier
0,5XRA,placeholder,


In [47]:
import pandas as pd
import requests


def get_first_pfam_identifier(pdb_id):
    try:
        url = f"https://www.ebi.ac.uk/pdbe/api/mappings/pfam/{pdb_id}"
        response = requests.get(url)
        data = response.json()

        # Get the first Pfam identifier
        if pdb_id in data and "Pfam" in data[pdb_id]:
            first_pfam_id = next(iter(data[pdb_id]["Pfam"]), None)
            if first_pfam_id:
                return data[pdb_id]["Pfam"][first_pfam_id]["identifier"]
        return None
    except Exception as e:
        print(f"Error fetching data for PDB ID {pdb_id}: {e}")
        return None


# Sample DataFrame
data = {"pdb": ["5XRA"], "ligand": ["placeholder"]}
df = pd.DataFrame(data)

# Iterate over DataFrame and update with API data
for index, row in df.iterrows():
    pdb_id = row["pdb"]
    pfam_identifier = get_first_pfam_identifier(pdb_id)
    df.at[index, "Pfam_Identifier"] = pfam_identifier

# Print the updated DataFrame
print(df)

    pdb       ligand  Pfam_Identifier
0  5XRA  placeholder              NaN


In [48]:
import pandas as pd

# Sample DataFrame with initial data
data = {"pdb": ["5XRA"], "ligand": ["placeholder"]}
df = pd.DataFrame(data)

# Provided JSON data for the PDB ID '5XRA'
json_data = {
    "5xra": {
        "Pfam": {
            "PF00001": {
                "description": "7 transmembrane receptor (rhodopsin family)",
                "identifier": "7 transmembrane receptor (rhodopsin family)",
                # ... other data omitted for brevity
            },
            # ... other entries omitted for brevity
        }
    }
}


# Function to extract the first identifier from the JSON data
def get_first_identifier(pdb_id, json_data):
    pfam_data = json_data[pdb_id.lower()]["Pfam"]
    first_pfam_id = next(iter(pfam_data), None)
    return pfam_data[first_pfam_id]["identifier"] if first_pfam_id else None


# Update the DataFrame with the extracted identifier
df["Pfam_Identifier"] = df["pdb"].apply(
    lambda pdb_id: get_first_identifier(pdb_id, json_data)
)

# Display the updated DataFrame
df

Unnamed: 0,pdb,ligand,Pfam_Identifier
0,5XRA,placeholder,7 transmembrane receptor (rhodopsin family)


In [49]:
def get_protein_data(pdb_id):
    url = f"https://www.ebi.ac.uk/pdbe/api/mappings/pfam/{pdb_id}"
    response = requests.get(url)
    data = response.json()

    return data


# Example call for PDB ID '5XRA'
protein_data = get_protein_data("5XRA")
protein_data

{'5xra': {'Pfam': {'PF00001': {'description': '7 transmembrane receptor (rhodopsin family)',
    'mappings': [{'chain_id': 'A',
      'entity_id': 1,
      'struct_asym_id': 'A',
      'coverage': 0.906,
      'start': {'residue_number': 35,
       'author_residue_number': 133,
       'author_insertion_code': ''},
      'end': {'residue_number': 421,
       'author_residue_number': 397,
       'author_insertion_code': ''}}],
    'identifier': '7 transmembrane receptor (rhodopsin family)',
    'name': '7tm_1'},
   'PF00258': {'description': 'Flavodoxin',
    'mappings': [{'chain_id': 'A',
      'entity_id': 1,
      'struct_asym_id': 'A',
      'coverage': 1,
      'start': {'residue_number': 213,
       'author_residue_number': 1006,
       'author_insertion_code': ''},
      'end': {'residue_number': 347,
       'author_residue_number': 1140,
       'author_insertion_code': ''}}],
    'identifier': 'Flavodoxin',
    'name': 'Flavodoxin_1'}}}}

In [52]:
# Sample DataFrame with initial data
data = {"pdb": ["5XRA"], "ligand": ["placeholder"]}
df = pd.DataFrame(data)
display(df)


def get_protein_data(pdb_id):
    url = f"https://www.ebi.ac.uk/pdbe/api/mappings/pfam/{pdb_id}"
    response = requests.get(url)
    data = response.json()

    return data


# Example call for PDB ID '5XRA'
protein_data = get_protein_data("5XRA")
protein_data


# Function to extract the first identifier from the JSON data
def get_first_identifier(pdb_id, json_data):
    pfam_data = json_data[pdb_id.lower()]["Pfam"]
    first_pfam_id = next(iter(pfam_data), None)
    return pfam_data[first_pfam_id]["identifier"] if first_pfam_id else None


# Update the DataFrame with the extracted identifier
df["Pfam_Identifier"] = df["pdb"].apply(
    lambda pdb_id: get_first_identifier(pdb_id, protein_data)
)

display(df)

Unnamed: 0,pdb,ligand
0,5XRA,placeholder


Unnamed: 0,pdb,ligand,Pfam_Identifier
0,5XRA,placeholder,7 transmembrane receptor (rhodopsin family)


Unnamed: 0,pdb,ligand,Pfam_Identifier
0,5XRA,placeholder,7 transmembrane receptor (rhodopsin family)


In [53]:
import pandas as pd
import requests

# Sample DataFrame with initial data
data = {"pdb": ["5XRA", "1A80", "2HI1"], "ligand": ["placeholder", "placeholder", "placeholder"]}
df = pd.DataFrame(data)

def get_protein_data(pdb_id):
    url = f"https://www.ebi.ac.uk/pdbe/api/mappings/pfam/{pdb_id}"
    response = requests.get(url)
    data = response.json()
    return data

# Function to extract the first identifier from the JSON data
def get_first_identifier(pdb_id, json_data):
    pfam_data = json_data[pdb_id.lower()]["Pfam"]
    first_pfam_id = next(iter(pfam_data), None)
    return pfam_data[first_pfam_id]["identifier"] if first_pfam_id else None

# Create a new column 'Pfam_Identifier' in the DataFrame
df['Pfam_Identifier'] = None

# Loop through the 'pdb' column in the DataFrame
for index, row in df.iterrows():
    # Get the PDB ID
    pdb_id = row['pdb']
    # Call get_protein_data for the PDB ID
    protein_data = get_protein_data(pdb_id)
    # Get the identifier
    identifier = get_first_identifier(pdb_id, protein_data)
    # Add the identifier to the DataFrame
    df.loc[index, 'Pfam_Identifier'] = identifier

display(df)

Unnamed: 0,pdb,ligand,Pfam_Identifier
0,5XRA,placeholder,7 transmembrane receptor (rhodopsin family)
1,1A80,placeholder,Aldo/keto reductase family
2,2HI1,placeholder,Pyridoxal phosphate biosynthetic protein PdxA


In [54]:
merged_df

Unnamed: 0,Name,Molecule,Chiral Flag,LigandID,PDBid,Resid,Chain,3-letter-code,LigandSmiles,Resolution,EDIAm
0,0A1_3QTC_A_811,<rdkit.Chem.rdchem.Mol object at 0x16328e110>,1,0A1_3QTC_A_811,3QTC,811,A,0A1,COc1ccc(cc1)C[C@@H](C(=O)O)N,1.75,0.90
1,0A9_3QGO_A_407,<rdkit.Chem.rdchem.Mol object at 0x16328e180>,1,0A9_3QGO_A_407,3QGO,407,A,0A9,COC(=O)[C@H](Cc1ccccc1)N,1.45,0.92
2,0B1_3VV7_A_509,<rdkit.Chem.rdchem.Mol object at 0x16328e1f0>,1,0B1_3VV7_A_509,3VV7,509,A,0B1,CN1C(=O)C=C(N=C1N)[C@H]2C[C@H]2c3cccc(c3)c4ccc...,2.10,0.84
3,0B3_3WB4_A_507,<rdkit.Chem.rdchem.Mol object at 0x16328e260>,1,0B3_3WB4_A_507,3WB4,507,A,0B3,C[C@]1(CC(=O)N(C(=N1)N)C)CCc2ccccc2,2.25,0.86
4,0B5_5YGX_A_509,<rdkit.Chem.rdchem.Mol object at 0x16328e2d0>,1,0B5_5YGX_A_509,5YGX,509,A,0B5,C[C@]1([C@H]([C@H](OC(=N1)N)C(C)(F)F)F)c2cc(cc...,2.20,0.97
...,...,...,...,...,...,...,...,...,...,...,...
8129,ZZD_2WOG_B_1365,<rdkit.Chem.rdchem.Mol object at 0x2b275ce40>,1,ZZD_2WOG_B_1365,2WOG,1365,B,ZZD,c1ccc(cc1)C(c2ccccc2)(c3ccccc3)SC[C@@H](C(=O)O)N,2.00,1.01
8130,ZZG_2WOT_A_1500,<rdkit.Chem.rdchem.Mol object at 0x2b275cdd0>,1,ZZG_2WOT_A_1500,2WOT,1500,A,ZZG,Cc1cc(c(nc1C)c2ccccn2)Oc3ccnc(c3)Nc4cc(c(c(c4)...,1.85,0.98
8131,ZZO_2WXH_A_1500,<rdkit.Chem.rdchem.Mol object at 0x2b275cd60>,1,ZZO_2WXH_A_1500,2WXH,1500,A,ZZO,Cc1ccccc1N2C(=Nc3cccc(c3C2=O)C)Cn4c5c(c(n4)c6c...,1.90,1.01
8132,ZZY_2WD1_A_2347,<rdkit.Chem.rdchem.Mol object at 0x2b275ccf0>,1,ZZY_2WD1_A_2347,2WD1,2347,A,ZZY,c1ccc(c(c1)[N+](=O)[O-])S(=O)(=O)n2ccc3c2cc(cn...,2.00,0.98


In [56]:
def get_pdbe_data(pdb_id):
    url = f"https://www.ebi.ac.uk/pdbe/graph-api/pdbe_pages/annotations/{pdb_id}/1"
    response = requests.get(url)
    data = response.json()
    return data

pdbe_5xra = get_pdbe_data('5XRA')
pdbe_5xra

{'5xra': {'sequence': 'GENFMDIECFMVLNPSQQLAIAVLSLTLGTFTVLENLLVLCVILHSRSLRCRPSYHFIGSLAVADLLGSVIFVYSFIDFHVFHRKDSRNVFLFKLGGVTASFTASVGSLFLAAIDRYISIHRPLAYKRIVTRPKAVVAFCLMWTIAIVIAVLPLLGWNCEKLQSVCSDIFPHIDKTYLMFWIGVVSVLLLFIVYAYMYILWKAHSHAVAKALIVYGSTTGNTEYTAETIARELADAGYEVDSRDAASVEAGGLFEGFDLVLLGCSTWGDDSIELQDDFIPLFDSLEETGAQGRKVACFGCGDSSWEYFCGAVDAIEEKLKNLGAEIVQDGLRIDGDPRAARDDIVGWAHDVRGAIPDQARMDIELAKTLVLILVVLIICWGPLLAIMVYDVFGKMNKLIKTVFAFCSMLCLLNSTVNPIIYALRSKDLRHAFRSMFPS',
  'length': 438,
  'dataType': 'ANNOTATIONS',
  'data': [{'name': 'Total SASA',
    'accession': 'Total SASA',
    'dataType': 'Total SASA',
    'residues': [{'startIndex': 6,
      'endIndex': 6,
      'indexType': 'PDB',
      'startCode': 'ASP',
      'endCode': 'ASP',
      'additionalData': {'resourceUrl': 'https://github.com/Fraternalilab/POPScomp',
       'rawScore': 108.1286,
       'confidenceScore': 0.9,
       'confidenceLevel': 'high',
       'groupLabel': 'total SASA [A^2]',
       'ordinalId': 3}},
     {'startIndex'

In [65]:
test_df = merged_df.copy()

In [66]:
test_df

Unnamed: 0,Name,Molecule,Chiral Flag,LigandID,PDBid,Resid,Chain,3-letter-code,LigandSmiles,Resolution,EDIAm
0,0A1_3QTC_A_811,<rdkit.Chem.rdchem.Mol object at 0x16328e110>,1,0A1_3QTC_A_811,3QTC,811,A,0A1,COc1ccc(cc1)C[C@@H](C(=O)O)N,1.75,0.90
1,0A9_3QGO_A_407,<rdkit.Chem.rdchem.Mol object at 0x16328e180>,1,0A9_3QGO_A_407,3QGO,407,A,0A9,COC(=O)[C@H](Cc1ccccc1)N,1.45,0.92
2,0B1_3VV7_A_509,<rdkit.Chem.rdchem.Mol object at 0x16328e1f0>,1,0B1_3VV7_A_509,3VV7,509,A,0B1,CN1C(=O)C=C(N=C1N)[C@H]2C[C@H]2c3cccc(c3)c4ccc...,2.10,0.84
3,0B3_3WB4_A_507,<rdkit.Chem.rdchem.Mol object at 0x16328e260>,1,0B3_3WB4_A_507,3WB4,507,A,0B3,C[C@]1(CC(=O)N(C(=N1)N)C)CCc2ccccc2,2.25,0.86
4,0B5_5YGX_A_509,<rdkit.Chem.rdchem.Mol object at 0x16328e2d0>,1,0B5_5YGX_A_509,5YGX,509,A,0B5,C[C@]1([C@H]([C@H](OC(=N1)N)C(C)(F)F)F)c2cc(cc...,2.20,0.97
...,...,...,...,...,...,...,...,...,...,...,...
8129,ZZD_2WOG_B_1365,<rdkit.Chem.rdchem.Mol object at 0x2b275ce40>,1,ZZD_2WOG_B_1365,2WOG,1365,B,ZZD,c1ccc(cc1)C(c2ccccc2)(c3ccccc3)SC[C@@H](C(=O)O)N,2.00,1.01
8130,ZZG_2WOT_A_1500,<rdkit.Chem.rdchem.Mol object at 0x2b275cdd0>,1,ZZG_2WOT_A_1500,2WOT,1500,A,ZZG,Cc1cc(c(nc1C)c2ccccn2)Oc3ccnc(c3)Nc4cc(c(c(c4)...,1.85,0.98
8131,ZZO_2WXH_A_1500,<rdkit.Chem.rdchem.Mol object at 0x2b275cd60>,1,ZZO_2WXH_A_1500,2WXH,1500,A,ZZO,Cc1ccccc1N2C(=Nc3cccc(c3C2=O)C)Cn4c5c(c(n4)c6c...,1.90,1.01
8132,ZZY_2WD1_A_2347,<rdkit.Chem.rdchem.Mol object at 0x2b275ccf0>,1,ZZY_2WD1_A_2347,2WD1,2347,A,ZZY,c1ccc(c(c1)[N+](=O)[O-])S(=O)(=O)n2ccc3c2cc(cn...,2.00,0.98


In [67]:
test_df_10 = test_df.head(10)

In [68]:
test_df_10

Unnamed: 0,Name,Molecule,Chiral Flag,LigandID,PDBid,Resid,Chain,3-letter-code,LigandSmiles,Resolution,EDIAm
0,0A1_3QTC_A_811,<rdkit.Chem.rdchem.Mol object at 0x16328e110>,1,0A1_3QTC_A_811,3QTC,811,A,0A1,COc1ccc(cc1)C[C@@H](C(=O)O)N,1.75,0.9
1,0A9_3QGO_A_407,<rdkit.Chem.rdchem.Mol object at 0x16328e180>,1,0A9_3QGO_A_407,3QGO,407,A,0A9,COC(=O)[C@H](Cc1ccccc1)N,1.45,0.92
2,0B1_3VV7_A_509,<rdkit.Chem.rdchem.Mol object at 0x16328e1f0>,1,0B1_3VV7_A_509,3VV7,509,A,0B1,CN1C(=O)C=C(N=C1N)[C@H]2C[C@H]2c3cccc(c3)c4ccc...,2.1,0.84
3,0B3_3WB4_A_507,<rdkit.Chem.rdchem.Mol object at 0x16328e260>,1,0B3_3WB4_A_507,3WB4,507,A,0B3,C[C@]1(CC(=O)N(C(=N1)N)C)CCc2ccccc2,2.25,0.86
4,0B5_5YGX_A_509,<rdkit.Chem.rdchem.Mol object at 0x16328e2d0>,1,0B5_5YGX_A_509,5YGX,509,A,0B5,C[C@]1([C@H]([C@H](OC(=N1)N)C(C)(F)F)F)c2cc(cc...,2.2,0.97
5,0BI_3K5C_B_387,<rdkit.Chem.rdchem.Mol object at 0x16328e340>,1,0BI_3K5C_B_387,3K5C,387,B,0BI,CC(C)c1cccc(c1)C2(CC2)NC[C@H]([C@@H]3Cc4cccc(c...,2.12,0.89
6,0C1_3UPH_A_578,<rdkit.Chem.rdchem.Mol object at 0x16328e3b0>,1,0C1_3UPH_A_578,3UPH,578,A,0C1,CS(=O)(=O)NC(=O)c1c(c2c(n1Cc3cc(ccc3F)F)ccc4c2...,2.0,0.94
7,0C8_3UP2_A_1,<rdkit.Chem.rdchem.Mol object at 0x16328e420>,1,0C8_3UP2_A_1,3UP2,1,A,0C8,c1ccc(c(c1)Nc2ccnc(n2)Nc3ccc(cc3)C(=O)O)OC(F)(F)F,2.3,0.95
8,0CA_3UPE_A_481,<rdkit.Chem.rdchem.Mol object at 0x16328e490>,1,0CA_3UPE_A_481,3UPE,481,A,0CA,[H]/N=C(\c1ccc2ccc(cc2c1)Cn3c4cc(cc(c4cc3C(C)C...,1.54,1.0
9,0CB_3UUZ_B_481,<rdkit.Chem.rdchem.Mol object at 0x16328e500>,1,0CB_3UUZ_B_481,3UUZ,481,B,0CB,[H]/N=C(\C)/N1CCC(CC1)Oc2ccc(cc2)N(Cc3ccc4ccc(...,2.1,0.91


In [70]:
def add_pfam_identifiers(df):
    # Create a new column 'Pfam_Identifier' in the DataFrame
    df['Pfam_Identifier'] = None

    # Loop through the 'pdb' column in the DataFrame
    for index, row in df.iterrows():
        # Get the PDB ID
        pdb_id = row['PDBid']
        # Call get_protein_data for the PDB ID
        protein_data = get_protein_data(pdb_id)
        # Get the identifier
        identifier = get_first_identifier(pdb_id, protein_data)
        # Add the identifier to the DataFrame
        df.loc[index, 'Pfam_Identifier'] = identifier

    return df

# Use the function to add Pfam identifiers to the DataFrame
test_df_10 = add_pfam_identifiers(test_df_10)
display(test_df_10)

Unnamed: 0,Name,Molecule,Chiral Flag,LigandID,PDBid,Resid,Chain,3-letter-code,LigandSmiles,Resolution,EDIAm,Pfam_Identifier
0,0A1_3QTC_A_811,<rdkit.Chem.rdchem.Mol object at 0x16328e110>,1,0A1_3QTC_A_811,3QTC,811,A,0A1,COc1ccc(cc1)C[C@@H](C(=O)O)N,1.75,0.9,tRNA synthetases class II core domain (F)
1,0A9_3QGO_A_407,<rdkit.Chem.rdchem.Mol object at 0x16328e180>,1,0A9_3QGO_A_407,3QGO,407,A,0A9,COC(=O)[C@H](Cc1ccccc1)N,1.45,0.92,"Thermolysin metallopeptidase, catalytic domain"
2,0B1_3VV7_A_509,<rdkit.Chem.rdchem.Mol object at 0x16328e1f0>,1,0B1_3VV7_A_509,3VV7,509,A,0B1,CN1C(=O)C=C(N=C1N)[C@H]2C[C@H]2c3cccc(c3)c4ccc...,2.1,0.84,Eukaryotic aspartyl protease
3,0B3_3WB4_A_507,<rdkit.Chem.rdchem.Mol object at 0x16328e260>,1,0B3_3WB4_A_507,3WB4,507,A,0B3,C[C@]1(CC(=O)N(C(=N1)N)C)CCc2ccccc2,2.25,0.86,Eukaryotic aspartyl protease
4,0B5_5YGX_A_509,<rdkit.Chem.rdchem.Mol object at 0x16328e2d0>,1,0B5_5YGX_A_509,5YGX,509,A,0B5,C[C@]1([C@H]([C@H](OC(=N1)N)C(C)(F)F)F)c2cc(cc...,2.2,0.97,Eukaryotic aspartyl protease
5,0BI_3K5C_B_387,<rdkit.Chem.rdchem.Mol object at 0x16328e340>,1,0BI_3K5C_B_387,3K5C,387,B,0BI,CC(C)c1cccc(c1)C2(CC2)NC[C@H]([C@@H]3Cc4cccc(c...,2.12,0.89,Eukaryotic aspartyl protease
6,0C1_3UPH_A_578,<rdkit.Chem.rdchem.Mol object at 0x16328e3b0>,1,0C1_3UPH_A_578,3UPH,578,A,0C1,CS(=O)(=O)NC(=O)c1c(c2c(n1Cc3cc(ccc3F)F)ccc4c2...,2.0,0.94,Viral RNA dependent RNA polymerase
7,0C8_3UP2_A_1,<rdkit.Chem.rdchem.Mol object at 0x16328e420>,1,0C8_3UP2_A_1,3UP2,1,A,0C8,c1ccc(c(c1)Nc2ccnc(n2)Nc3ccc(cc3)C(=O)O)OC(F)(F)F,2.3,0.95,Protein kinase domain
8,0CA_3UPE_A_481,<rdkit.Chem.rdchem.Mol object at 0x16328e490>,1,0CA_3UPE_A_481,3UPE,481,A,0CA,[H]/N=C(\c1ccc2ccc(cc2c1)Cn3c4cc(cc(c4cc3C(C)C...,1.54,1.0,Trypsin
9,0CB_3UUZ_B_481,<rdkit.Chem.rdchem.Mol object at 0x16328e500>,1,0CB_3UUZ_B_481,3UUZ,481,B,0CB,[H]/N=C(\C)/N1CCC(CC1)Oc2ccc(cc2)N(Cc3ccc4ccc(...,2.1,0.91,Trypsin


In [71]:
#display value counts for 'Pfam_Identifier' in test_df_10 
test_df_10['Pfam_Identifier'].value_counts()

Pfam_Identifier
Eukaryotic aspartyl protease                      4
Trypsin                                           2
tRNA synthetases class II core domain (F)         1
Thermolysin metallopeptidase, catalytic domain    1
Viral RNA dependent RNA polymerase                1
Protein kinase domain                             1
Name: count, dtype: int64

In [72]:
test_df = merged_df.copy()
hundred_df = test_df.head(100)
hundred_df = add_pfam_identifiers(hundred_df)
display(hundred_df)
hundred_df['Pfam_Identifier'].value_counts()


Unnamed: 0,Name,Molecule,Chiral Flag,LigandID,PDBid,Resid,Chain,3-letter-code,LigandSmiles,Resolution,EDIAm,Pfam_Identifier
0,0A1_3QTC_A_811,<rdkit.Chem.rdchem.Mol object at 0x16328e110>,1,0A1_3QTC_A_811,3QTC,811,A,0A1,COc1ccc(cc1)C[C@@H](C(=O)O)N,1.75,0.90,tRNA synthetases class II core domain (F)
1,0A9_3QGO_A_407,<rdkit.Chem.rdchem.Mol object at 0x16328e180>,1,0A9_3QGO_A_407,3QGO,407,A,0A9,COC(=O)[C@H](Cc1ccccc1)N,1.45,0.92,"Thermolysin metallopeptidase, catalytic domain"
2,0B1_3VV7_A_509,<rdkit.Chem.rdchem.Mol object at 0x16328e1f0>,1,0B1_3VV7_A_509,3VV7,509,A,0B1,CN1C(=O)C=C(N=C1N)[C@H]2C[C@H]2c3cccc(c3)c4ccc...,2.10,0.84,Eukaryotic aspartyl protease
3,0B3_3WB4_A_507,<rdkit.Chem.rdchem.Mol object at 0x16328e260>,1,0B3_3WB4_A_507,3WB4,507,A,0B3,C[C@]1(CC(=O)N(C(=N1)N)C)CCc2ccccc2,2.25,0.86,Eukaryotic aspartyl protease
4,0B5_5YGX_A_509,<rdkit.Chem.rdchem.Mol object at 0x16328e2d0>,1,0B5_5YGX_A_509,5YGX,509,A,0B5,C[C@]1([C@H]([C@H](OC(=N1)N)C(C)(F)F)F)c2cc(cc...,2.20,0.97,Eukaryotic aspartyl protease
...,...,...,...,...,...,...,...,...,...,...,...,...
95,0O1_4GKT_A_403,<rdkit.Chem.rdchem.Mol object at 0x1652a8a50>,1,0O1_4GKT_A_403,4GKT,403,A,0O1,c1ccc(cc1)CCNc2[nH]c3c(n2)cc4c(c3CCNCC5CCCC5)N...,1.53,0.87,Queuine tRNA-ribosyltransferase
96,0O4_4EE0_A_202,<rdkit.Chem.rdchem.Mol object at 0x1652a8ac0>,1,0O4_4EE0_A_202,4EE0,202,A,0O4,c1ccc2c(c1)ccnc2c3ccc(cc3)C(=O)NCCN4CCOCC4,1.75,0.89,"Glutathione S-transferase, N-terminal domain"
97,0O5_4EDZ_D_202,<rdkit.Chem.rdchem.Mol object at 0x1652a8b30>,1,0O5_4EDZ_D_202,4EDZ,202,D,0O5,Cc1cc2ccccc2c(n1)c3ccc(cc3)C(=O)NCCN4CCOCC4,2.00,0.93,"Glutathione S-transferase, N-terminal domain"
98,0O7_4EBV_A_700,<rdkit.Chem.rdchem.Mol object at 0x1652a8ba0>,1,0O7_4EBV_A_700,4EBV,700,A,0O7,CCc1ccc(cc1)c2ccc3c(c2)-c4c(c[nH]n4)S(=O)(=O)N3C,1.67,1.01,Protein tyrosine and serine/threonine kinase


Pfam_Identifier
Protein tyrosine and serine/threonine kinase             19
Eukaryotic aspartyl protease                             10
Protein kinase domain                                     6
Formyl transferase                                        4
Beta-lactamase                                            3
Pantoate-beta-alanine ligase                              3
Trypsin                                                   3
LytB protein                                              3
Aminotransferase class I and II                           3
Thymidylate kinase                                        3
14-3-3 protein                                            2
Pyridoxal-phosphate dependent enzyme                      2
Phosphotransferase enzyme family                          2
Pterin binding enzyme                                     2
Glutathione S-transferase, N-terminal domain              2
Nitric oxide synthase, oxygenase domain                   2
Poly(ADP-ribose) polymer

In [73]:
import pickle

with open('merged_df.pkl', 'wb') as f:
    pickle.dump(merged_df, f)

In [None]:
from tqdm import tqdm

def add_pfam_identifiers(df):
    """
    Adds Pfam identifiers to the DataFrame.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the protein data.

    Returns:
    pandas.DataFrame: The DataFrame with the added Pfam identifiers.
    """
    # Create a new column 'Pfam_Identifier' in the DataFrame
    df["Pfam_Identifier"] = None

    # Wrap the DataFrame with tqdm for a progress bar
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        # Get the PDB ID
        pdb_id = row["PDBid"]
        # Call get_protein_data for the PDB ID
        protein_data = get_pfam_data(pdb_id)
        # Get the identifier
        identifier = get_first_pfam_identifier(pdb_id, protein_data)
        # Add the identifier to the DataFrame
        df.loc[index, "Pfam_Identifier"] = identifier

    return df