In [3]:
import requests
import json
import csv

# "IPR000276" is the InterPro family accession for GPCRs; fetch this entry by
# its accession and store whatever the API returns for it.
base_url = "https://www.ebi.ac.uk/interpro/api/entry/interpro/IPR000276"
headers = {"Accept": "application/json"}

# requests applies no timeout unless one is given, so a stalled connection
# would block this cell forever; fail after 60 seconds instead.
response = requests.get(base_url, headers=headers, timeout=60)

if response.status_code == 200:
    gpcr_data = response.json()

    # Persist the decoded JSON payload, pretty-printed, for later inspection.
    with open('gpcr_data.json', 'w') as json_file:
        json.dump(gpcr_data, json_file, indent=4)
    print("Data saved to gpcr_data.json")

else:
    # Any non-200 status is reported; nothing is written to disk.
    print(f"Error: {response.status_code}")

Data saved to gpcr_data.json


## Fetch all InterPro domain entries for a protein


In [9]:
import requests
import json

def get_protein_data(uniprot_id, timeout=60):
    """Fetch the InterPro ``/entry`` view of a reviewed protein.

    Args:
        uniprot_id: Accession appended to the ``protein/reviewed/`` API path.
        timeout: Seconds handed to ``requests.get``. Without a timeout a
            stalled connection would block the notebook indefinitely.

    Returns:
        The decoded JSON body when the server answers HTTP 200; otherwise
        ``None``, after printing the status code.
    """
    url = f"https://www.ebi.ac.uk/interpro/api/protein/reviewed/{uniprot_id}/entry"
    response = requests.get(
        url,
        headers={"Accept": "application/json"},
        timeout=timeout,
    )

    # Only a plain 200 counts as success; every other status is reported.
    if response.status_code != 200:
        print(f"Error fetching data: {response.status_code}")
        return None
    return response.json()

def print_structure(data, indent=0):
    """Print an outline of a nested dict/list structure.

    Dict keys are printed one per line; a non-container value is shown as its
    type on the next, deeper line. A list is summarised by its length and only
    its first item is descended into. Anything that is neither a dict nor a
    list prints nothing. Each nesting level is indented by two spaces.
    """
    pad = "  " * indent

    if isinstance(data, list):
        print(pad + f"List with {len(data)} items")
        if data:
            # The first element stands in for the whole list.
            print_structure(data[0], indent + 1)
        return

    if not isinstance(data, dict):
        return

    leaf_pad = "  " * (indent + 1)
    for key, value in data.items():
        print(pad + str(key))
        if isinstance(value, (dict, list)):
            print_structure(value, indent + 1)
        else:
            print(leaf_pad + str(type(value)))
# UniProt accession of the human beta-2 adrenergic receptor
uniprot_id = "P07550"

# Query the API; a falsy result means nothing usable came back.
protein_data = get_protein_data(uniprot_id)

if not protein_data:
    print("Failed to fetch protein data.")
else:
    print("API Response Structure:")
    print_structure(protein_data)

    # Keep a complete, pretty-printed copy of the response on disk.
    with open('adrb2_full_data.json', 'w') as f:
        json.dump(protein_data, f, indent=4)
    print(f"\nFull protein data for ADRB2 (UniProt ID: {uniprot_id}) has been saved to adrb2_full_data.json")

    # Preview at most three entries, taken from a 'results' key when the
    # response is a dict that has one, or from the response itself when it
    # is a list. Any other shape is reported as unexpected.
    has_results = isinstance(protein_data, dict) and 'results' in protein_data
    if has_results or isinstance(protein_data, list):
        print("\nFirst few entries:")
        entries = protein_data['results'] if has_results else protein_data
        for entry in entries[:3]:
            print(json.dumps(entry, indent=2))
    else:
        print("\nUnexpected data structure. Check the full data in adrb2_full_data.json")

API Response Structure:
metadata
  accession
    <class 'str'>
  id
    <class 'str'>
  source_organism
    taxId
      <class 'str'>
    scientificName
      <class 'str'>
    fullName
      <class 'str'>
  name
    <class 'str'>
  description
    List with 1 items
  length
    <class 'int'>
  sequence
    <class 'str'>
  proteome
    <class 'str'>
  gene
    <class 'str'>
  go_terms
    List with 8 items
      identifier
        <class 'str'>
      name
        <class 'str'>
      category
        code
          <class 'str'>
        name
          <class 'str'>
  protein_evidence
    <class 'int'>
  source_database
    <class 'str'>
  is_fragment
    <class 'bool'>
  in_alphafold
    <class 'bool'>
  ida_accession
    <class 'str'>
  counters
    entries
      <class 'int'>
    structures
      <class 'int'>
    taxa
      <class 'int'>
    proteomes
      <class 'int'>
    sets
      <class 'int'>
    similar_proteins
      <class 'int'>
entries
  member_databases
    prints
      

## Fetch (almost) all InterPro properties for a particular protein

In [6]:
import requests
import json

def get_interpro_protein_info(protein_id, timeout=60):
    """Collect several InterPro API views of one reviewed protein.

    Five URLs under ``protein/reviewed/`` are requested in turn: the protein
    itself, then its ``entry``, ``structure``, ``taxonomy`` and ``proteome``
    sub-resources.

    Args:
        protein_id: Accession appended to the ``protein/reviewed/`` path.
        timeout: Seconds handed to each ``requests.get`` call so that one
            stalled request cannot hang the whole loop.

    Returns:
        A dict keyed by ``'metadata'`` (the bare protein URL) and by the last
        path segment of every other endpoint. Only endpoints that answered
        HTTP 200 are included; the others are reported with ``print`` and
        skipped.
    """
    base_url = "https://www.ebi.ac.uk/interpro/api/protein/reviewed/"
    endpoints = [
        f"{protein_id}",
        f"{protein_id}/entry",
        f"{protein_id}/structure",
        f"{protein_id}/taxonomy",
        f"{protein_id}/proteome"
    ]
    # Identical for every request, so build it once outside the loop.
    headers = {"Accept": "application/json"}

    protein_data = {}

    for endpoint in endpoints:
        response = requests.get(f"{base_url}{endpoint}", headers=headers, timeout=timeout)

        if response.status_code == 200:
            # The bare protein endpoint has no '/' and is stored as 'metadata'.
            endpoint_name = endpoint.split('/')[-1] if '/' in endpoint else 'metadata'
            protein_data[endpoint_name] = response.json()
        else:
            print(f"Error fetching {endpoint}: {response.status_code}")

    return protein_data

# Example run for one accession (14-3-3 protein beta/alpha)
protein_id = "P31946"
protein_info = get_interpro_protein_info(protein_id)

# Write everything that was fetched to a per-protein JSON file.
output_name = f'interpro_protein_{protein_id}.json'
with open(output_name, 'w') as f:
    json.dump(protein_info, f, indent=4)

print(f"Protein information for {protein_id} has been saved to {output_name}")

# One summary line per fetched section: containers report their length,
# anything else reports its type.
print("\nSummary of fetched data:")
for key, value in protein_info.items():
    if isinstance(value, (dict, list)):
        summary = f"{len(value)} items"
    else:
        summary = f"{type(value)}"
    print(f"{key}: {summary}")

Protein information for P31946 has been saved to interpro_protein_P31946.json

Summary of fetched data:
metadata: 1 items
entry: 2 items
structure: 2 items
taxonomy: 2 items
proteome: 2 items


In [4]:
import requests
import json

# UniProtKB search endpoint, queried by keyword.
base_url = "https://rest.uniprot.org/uniprotkb/search"
query = "keyword:KW-0297"  # KW-0297 is the keyword for G-protein coupled receptor
params = {
    "query": query,
    "format": "json",
    "size": 10  # Adjust this to get more results
}

# requests applies no timeout unless one is given; bound the wait so a
# stalled connection cannot hang the cell.
response = requests.get(base_url, params=params, timeout=60)

if response.status_code == 200:
    gpcr_data = response.json()

    # Save the decoded response to a JSON file
    with open('uniprot_gpcr_data.json', 'w') as json_file:
        json.dump(gpcr_data, json_file, indent=4)
    print("Data saved to uniprot_gpcr_data.json")

else:
    # Non-200 responses are only reported; no file is written.
    print(f"Error: {response.status_code}")

Data saved to uniprot_gpcr_data.json


In [5]:
import requests
import json

# GPCRdb protein service; the entry name is appended to this base URL.
base_url = "https://gpcrdb.org/services/protein/"
gpcr_id = "adrb2_human"  # Example: beta-2 adrenergic receptor
params = {"format": "json"}

# requests applies no timeout unless one is given; bound the wait so a
# stalled connection cannot hang the cell.
response = requests.get(f"{base_url}{gpcr_id}", params=params, timeout=60)

if response.status_code == 200:
    gpcr_data = response.json()

    # Save the decoded response to a JSON file
    with open('gpcrdb_gpcr_data.json', 'w') as json_file:
        json.dump(gpcr_data, json_file, indent=4)
    print("Data saved to gpcrdb_gpcr_data.json")

else:
    # Non-200 responses are only reported; no file is written.
    print(f"Error: {response.status_code}")

Data saved to gpcrdb_gpcr_data.json


## Fetch all GPCRs from GPCRdb

In [None]:
import requests
import json
from time import sleep

# Root of the GPCRdb web services; endpoint paths are appended to it.
base_url = "https://gpcrdb.org/services/"

def get_all_gpcrs(timeout=60):
    """Request the ``protein`` listing from GPCRdb.

    Args:
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            cannot block the notebook indefinitely.

    Returns:
        The decoded JSON body on HTTP 200; otherwise an empty list, after
        printing the status code.

    NOTE(review): this cell has no recorded execution (``In [None]``), so
    whether ``{base_url}protein`` really returns the full list is unverified;
    confirm against the GPCRdb API documentation.
    """
    response = requests.get(f"{base_url}protein", timeout=timeout)
    if response.status_code != 200:
        print(f"Error fetching GPCR list: {response.status_code}")
        return []
    return response.json()

def get_gpcr_details(protein_id, timeout=60):
    """Fetch four GPCRdb resources for one protein.

    The URLs requested are ``protein/<id>`` plus its ``family``, ``feature``
    and ``residue`` sub-paths, each appended to the module-level ``base_url``.

    Args:
        protein_id: Identifier inserted into every endpoint path.
        timeout: Seconds handed to each ``requests.get`` call so that one
            stalled request cannot hang the loop.

    Returns:
        A dict keyed by the last path segment of each endpoint, so the bare
        protein record is stored under ``protein_id`` itself and the others
        under ``'family'``, ``'feature'`` and ``'residue'``. Endpoints that do
        not answer HTTP 200 are reported with ``print`` and left out.

    NOTE(review): these sub-paths have no recorded successful run in this
    notebook; confirm them against the GPCRdb API documentation.
    """
    endpoints = [
        f"protein/{protein_id}",
        f"protein/{protein_id}/family",
        f"protein/{protein_id}/feature",
        f"protein/{protein_id}/residue"
    ]

    details = {}
    for endpoint in endpoints:
        response = requests.get(f"{base_url}{endpoint}", timeout=timeout)
        if response.status_code == 200:
            details[endpoint.split('/')[-1]] = response.json()
        else:
            print(f"Error fetching {endpoint}: {response.status_code}")

    return details

# Retrieve the list of GPCR records to iterate over.
all_gpcrs = get_all_gpcrs()
total = len(all_gpcrs)

# Download the detail set for every record, keyed by its entry name.
gpcr_data = {}
for position, gpcr in enumerate(all_gpcrs, start=1):
    entry_name = gpcr['entry_name']
    print(f"Fetching data for GPCR {position}/{total}: {entry_name}")
    gpcr_data[entry_name] = get_gpcr_details(entry_name)
    sleep(1)  # pause between proteins so the API is not overwhelmed

# Everything is written in one go once the loop has finished.
with open('all_gpcr_data.json', 'w') as f:
    json.dump(gpcr_data, f, indent=4)

print("All GPCR data has been saved to all_gpcr_data.json")

## protein domains 

In [17]:
import requests

import time

# InterProScan 5 REST endpoint used to submit a new job
url = "https://www.ebi.ac.uk/Tools/services/rest/iprscan5/run"

# Protein sequence to analyse, in FASTA format
fasta_sequence = """>ALI87457.1 OR51D1 [Homo sapiens]
MQKPQLLVPIIATSNGNLVHAAYFLLVGIPGLGPTIHFWLAFPLCFMYALATLGNLTIVLIIRVERRLHE
PMYLFLAMLSTIDLVLSSITMPKMASLFLMGIQEIEFNICLAQMFLIHALSAVESAVLLAMAFDRFVAIC
HPLRHASVLTGCTVAKIGLSALTRGFVFFFPLPFILKWLSYCQTHTVTHSFCLHQDIMKLSCTDTRVNVV
YGLFIILSVMGVDSLFIGFSYILILWAVLELSSRRAALKAFNTCISHLCAVLVFYVPLIGLSVVHRLGGP
TSLLHVVMANTYLLLPPVVNPLVYGAKTKEICSRVLCMFSQGGK
"""

# Form fields sent with the submission.
# NOTE(review): a personal e-mail address is hard-coded here; consider reading
# it from configuration or an environment variable before sharing the notebook.
params = {
    "email": "bhavikaberwal131@gmail.com",
    "title": "protein_domain_search",
    "sequence": fasta_sequence
}

# Submit the job. A rejected submission returns an error body rather than a
# job id, so stop immediately instead of polling a bogus id.
response = requests.post(url, data=params, timeout=60)
response.raise_for_status()
job_id = response.text.strip()

# Poll the job status, pausing between requests so the status endpoint is not
# hit in a tight loop.
status_url = f"https://www.ebi.ac.uk/Tools/services/rest/iprscan5/status/{job_id}"
while True:
    status_response = requests.get(status_url, timeout=60)
    status = status_response.text.strip()
    if status == "FINISHED":
        break
    elif status in ["RUNNING", "PENDING", "QUEUED"]:
        # Still waiting. QUEUED is accepted as a not-yet-finished state.
        # NOTE(review): confirm it against the service's documented status values.
        time.sleep(5)
        continue
    else:
        raise Exception(f"Job failed with status: {status_response.text}")

# Retrieve the results as XML; fail loudly on an HTTP error so an error page
# is never saved as if it were the result document.
result_url = f"https://www.ebi.ac.uk/Tools/services/rest/iprscan5/result/{job_id}/xml"
result_response = requests.get(result_url, timeout=60)
result_response.raise_for_status()
results = result_response.text

# Save the payload exactly as received. Writing the raw bytes avoids
# re-encoding the document with the platform's default text encoding.
with open("protein_domain_results.xml", "wb") as file:
    file.write(result_response.content)

print("Domain information fetched successfully!")


Domain information fetched successfully!


### xml parsing


In [14]:
import xml.etree.ElementTree as ET

# Parse the XML results
tree = ET.parse('protein_domain_results.xml')
root = tree.getroot()

# The result file's tags are namespace-qualified (e.g.
# '{https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas}protein'), and the
# children of <matches> are named '<something>-match', not 'match'. An
# unqualified path such as './/protein/matches/match' therefore matches nothing
# and would leave the output file empty. Reuse whatever namespace the root
# element carries so every lookup below is qualified the same way.
ns_prefix = root.tag[:root.tag.index('}') + 1] if root.tag.startswith('{') else ''

# Open a file to write the results
with open("parsed_domain_information.txt", "w") as file:
    # Every child of a <matches> element is one match, whatever its type.
    for match in root.findall(f'.//{ns_prefix}matches/*'):
        # <entry> is nested inside the match's <signature>, not directly
        # under the match itself.
        signature = match.find(f'{ns_prefix}signature')
        entry = signature.find(f'{ns_prefix}entry') if signature is not None else None
        release = (signature.find(f'{ns_prefix}signature-library-release')
                   if signature is not None else None)

        # NOTE(review): the 'library' and 'ac' attribute names follow the
        # InterProScan 5 XML schema; confirm against the downloaded file.
        # A missing element or attribute yields None.
        db_name = release.get('library') if release is not None else None
        db_ac = signature.get('ac') if signature is not None else None

        entry_name = entry.get('name') if entry is not None else "No entry name"
        entry_ac = entry.get('ac') if entry is not None else "No entry accession"
        description = entry.get('desc') if entry is not None else "No description"

        # Format the information
        info = (f"Database: {db_name}, DB Accession: {db_ac}, "
                f"Entry Name: {entry_name}, Entry Accession: {entry_ac}, "
                f"Description: {description}\n")

        # Write the information to the file
        file.write(info)

print("Domain information saved successfully to parsed_domain_information.txt")


Domain information saved successfully to parsed_domain_information.txt


In [15]:
import xml.etree.ElementTree as ET

# Load 'protein_domain_results.xml' from the working directory and keep both
# the parsed tree and its root element for the inspection below.
tree = ET.parse('protein_domain_results.xml')
root = tree.getroot()

# Helper that outlines the tag hierarchy of an XML tree
def print_xml_structure(element, indent=""):
    """Print ``element``'s tag and those of all descendants, one per line.

    Tags appear in document order. ``indent`` prefixes the starting element's
    line and every nesting level below it adds two more spaces.
    """
    # An explicit stack gives the same pre-order walk as recursing into each
    # child; children are pushed reversed so the first child is popped first.
    pending = [(element, indent)]
    while pending:
        node, pad = pending.pop()
        print(pad + node.tag)
        deeper = pad + "  "
        pending.extend((child, deeper) for child in reversed(list(node)))

# Print the tag hierarchy of the parsed document, starting at the root, to see
# the exact (namespace-qualified) element names the file contains.
print_xml_structure(root)


{https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas}protein-matches
  {https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas}protein
    {https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas}sequence
    {https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas}xref
    {https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas}matches
      {https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas}fingerprints-match
        {https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas}signature
          {https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas}entry
            {https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas}go-xref
            {https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas}go-xref
            {https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas}go-xref
            {https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas}pathway-xref
            {https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas}pathway-xref
    

In [16]:
import xml.etree.ElementTree as ET

# Parse the XML results
tree = ET.parse('protein_domain_results.xml')
root = tree.getroot()

# Tags in the result file carry an XML namespace (e.g.
# '{https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas}matches') and the
# match elements are named '<something>-match'. Unqualified lookups for
# 'match' or 'entry' find nothing, so take the namespace from the root tag and
# qualify every lookup with it.
ns_prefix = root.tag[:root.tag.index('}') + 1] if root.tag.startswith('{') else ''

# Open a file to write the results
with open("parsed_domain_information.txt", "w") as file:
    # Each child of a <matches> element is one match, regardless of its type.
    for match in root.findall(f'.//{ns_prefix}matches/*'):
        # The entry sits under the match's <signature>, not under the match.
        signature = match.find(f'{ns_prefix}signature')
        entry_element = None
        release = None
        if signature is not None:
            entry_element = signature.find(f'{ns_prefix}entry')
            release = signature.find(f'{ns_prefix}signature-library-release')

        # NOTE(review): the 'library' and 'ac' attribute names follow the
        # InterProScan 5 XML schema; confirm against the downloaded file.
        # A missing element or attribute yields None.
        db_name = release.get('library') if release is not None else None
        db_ac = signature.get('ac') if signature is not None else None

        if entry_element is not None:
            entry_name = entry_element.get('name', "No entry name")
            entry_ac = entry_element.get('ac', "No entry accession")
            description = entry_element.get('desc', "No description")
        else:
            # Signatures that are not integrated into an entry get placeholders.
            entry_name = "No entry name"
            entry_ac = "No entry accession"
            description = "No description"

        # Format the information
        info = (f"Database: {db_name}, DB Accession: {db_ac}, "
                f"Entry Name: {entry_name}, Entry Accession: {entry_ac}, "
                f"Description: {description}\n")

        # Write the information to the file
        file.write(info)

print("Domain information saved successfully to parsed_domain_information.txt")


Domain information saved successfully to parsed_domain_information.txt
