<a href="https://colab.research.google.com/github/ErickArtola/AboutMe/blob/main/Programmatic_access_to_UniProt_2024_10_course.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Programmatic access to UniProt (2024-10)

# Basic setup

In [None]:
import json
import sys
import time

import requests
from prettytable import PrettyTable

# Base URL of the UniProt website REST API.
# No trailing slash: the requests below append paths that start with "/",
# so a trailing slash here would produce "https://rest.uniprot.org//...".
# Documentation: https://www.uniprot.org/help/api
WEBSITE_API = "https://rest.uniprot.org"

# Base URL of the EBI Proteins API.
# Documentation: https://www.ebi.ac.uk/proteins/api/doc/
PROTEINS_API = "https://www.ebi.ac.uk/proteins/api"

# Helper function to download data
def get_url(url, **kwargs):
  """Send a GET request and return the response, raising on HTTP errors.

  Args:
    url: Address to request.
    **kwargs: Passed through unchanged to ``requests.get`` (for example
      ``headers`` or ``params``). A ``timeout`` of 60 seconds is applied
      unless the caller supplies its own.

  Returns:
    The response object returned by ``requests.get`` when ``response.ok``
    is true.

  Raises:
    requests.HTTPError: Raised by ``response.raise_for_status()`` when the
      response is not OK. The response body is printed before raising.
  """
  # Without a timeout, requests waits indefinitely on an unresponsive server.
  kwargs.setdefault("timeout", 60)
  response = requests.get(url, **kwargs)

  if not response.ok:
    # Show the server's message first, then raise.
    print(response.text)
    response.raise_for_status()

  return response

# Basic search request

In [None]:
# Search UniProtKB with a wildcard query and decode the JSON payload
r = get_url(f"{WEBSITE_API}/uniprotkb/search?query=*")
data = r.json()

# report how many entries the payload carries
n_results = len(data["results"])
print(f"Number of results: {n_results}\n")

# tabulate every header of the server response, sorted by header name
headers = dict(sorted(r.headers.items()))
table = PrettyTable()
table.add_column("Header", [name for name in headers])
table.add_column("Value", [headers[name] for name in headers])
table.align = "l"
print(table)

Number of results: 25

+----------------------------------+-----------------------------------------------------------------------------------------------------------------------+
| Header                           | Value                                                                                                                 |
+----------------------------------+-----------------------------------------------------------------------------------------------------------------------+
| Accept-Ranges                    | bytes                                                                                                                 |
| Access-Control-Allow-Credentials | true                                                                                                                  |
| Access-Control-Allow-Headers     | DNT,Keep-Alive,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization           |
| Access-Control-Allow-Methods     

*Back to "search query format" slide*

# Complex search request, paginated results

In [None]:
# First page: search for gene parkin, restricted to human (taxonomy 9606)
url = f"{WEBSITE_API}/uniprotkb/search?query=Parkin AND (taxonomy_id:9606)"

# Keep requesting pages for as long as the server hands back a "next" link
tally = 0
while url:
  r = get_url(url)
  data = r.json()
  total = r.headers.get("x-total-results")
  tally = tally + len(data["results"])
  print(f"{tally:<3} / {total} | {url}")

  # Requests exposes the parsed "Link" response header as r.links;
  # when there is no "next" entry, url becomes None and the loop ends.
  next_link = r.links.get("next", {})
  url = next_link.get("url")

25  / 144 | https://rest.uniprot.org//uniprotkb/search?query=Parkin AND (taxonomy_id:9606)
50  / 144 | https://rest.uniprot.org/uniprotkb/search?query=Parkin%20AND%20%28taxonomy_id%3A9606%29&cursor=1mkycb2xwxbouubkzhokb8yviqricsifugqc&size=25
75  / 144 | https://rest.uniprot.org/uniprotkb/search?query=Parkin%20AND%20%28taxonomy_id%3A9606%29&cursor=1q25awc2b43xtmcx4f3iwkmx8gea4vm8lkj7&size=25
100 / 144 | https://rest.uniprot.org/uniprotkb/search?query=Parkin%20AND%20%28taxonomy_id%3A9606%29&cursor=1sez60yweyzjbr3c4p47jm3jrz10mhmb3sc4&size=25
125 / 144 | https://rest.uniprot.org/uniprotkb/search?query=Parkin%20AND%20%28taxonomy_id%3A9606%29&cursor=bc4hnkgsgmiobbq4mx3bip47w77b4e6aro2ux&size=25
144 / 144 | https://rest.uniprot.org/uniprotkb/search?query=Parkin%20AND%20%28taxonomy_id%3A9606%29&cursor=bc6uhflfaqdjwtt47gq6avw4zxroupluizgol&size=25


# Complex search request, stream

In [None]:
# The stream endpoint is simple to use (no pagination), but...
#  - harder to follow progress
#  - harder to resume on failure
#  - not sorted by score
#  - limited to <10M entries
stream_url = f"{WEBSITE_API}/uniprotkb/stream?query=parkin AND (taxonomy_id:9606)"
r = get_url(stream_url)
data = r.json()

# everything arrives in a single payload, so its length is the full count
total = len(data["results"])
print(f"total: {total}")

total: 144


# Complex search request, other formats

In [None]:
# For all supported formats see https://www.uniprot.org/help/api_queries#list-of-all-formats
# Uncomment exactly one of the requests below to try a different format.

# Search shared by every example in this cell
search_url = f"{WEBSITE_API}/uniprotkb/search?query=parkin AND (taxonomy_id:9606)"

# No format defined, defaults to JSON
# r = get_url(f"{search_url}&size=1")

# XML - specify format explicitly in the URL
# r = get_url(f"{search_url}&size=1&format=xml")

# XML - specify format using request headers
# r = get_url(f"{search_url}&size=1", headers={"Accept": "application/xml"})

# Accession list
# r = get_url(f"{search_url}&format=list")

# FASTA
# r = get_url(f"{search_url}&format=fasta")

# TSV
r = get_url(f"{search_url}&format=tsv")

print(r.text)

Entry	Entry Name	Reviewed	Protein names	Gene Names	Organism	Length
O60260	PRKN_HUMAN	reviewed	E3 ubiquitin-protein ligase parkin (Parkin) (EC 2.3.2.31) (Parkin RBR E3 ubiquitin-protein ligase) (Parkinson juvenile disease protein 2) (Parkinson disease protein 2)	PRKN PARK2	Homo sapiens (Human)	465
Q6NUN9	ZN746_HUMAN	reviewed	Zinc finger protein 746 (Parkin-interacting substrate) (PARIS)	ZNF746 PARIS	Homo sapiens (Human)	644
Q96M98	PACRG_HUMAN	reviewed	Parkin coregulated gene protein (Molecular chaperone/chaperonin-binding protein) (PARK2 coregulated gene protein)	PACRG GLUP	Homo sapiens (Human)	296
Q8IWT3	CUL9_HUMAN	reviewed	Cullin-9 (CUL-9) (UbcH7-associated protein 1) (p53-associated parkin-like cytoplasmic protein)	CUL9 H7AP1 KIAA0708 PARC	Homo sapiens (Human)	2517
X5DR79	X5DR79_HUMAN	unreviewed	E3 ubiquitin-protein ligase parkin (EC 2.3.2.31)	PARK2	Homo sapiens (Human)	465
O15354	GPR37_HUMAN	reviewed	Prosaposin receptor GPR37 (Endothelin B receptor-like protein 1) (ETBR-LP-1) (G-pro

# Complex search request, customise column choice

In [None]:
# Columns to request, joined into the comma-separated "fields" parameter
fields = ",".join(["id", "accession", "length", "cc_catalytic_activity"])
r = get_url(f"{WEBSITE_API}/uniprotkb/search?query=parkin AND (taxonomy_id:9606)&fields={fields}&format=tsv")
print(r.text)

Entry Name	Entry	Length	Catalytic activity
PRKN_HUMAN	O60260	465	CATALYTIC ACTIVITY: Reaction=[E2 ubiquitin-conjugating enzyme]-S-ubiquitinyl-L-cysteine + [acceptor protein]-L-lysine = [E2 ubiquitin-conjugating enzyme]-L-cysteine + [acceptor protein]-N(6)-ubiquitinyl-L-lysine.; EC=2.3.2.31; Evidence={ECO:0000269|PubMed:23770887};
ZN746_HUMAN	Q6NUN9	644	
PACRG_HUMAN	Q96M98	296	
CUL9_HUMAN	Q8IWT3	2517	
X5DR79_HUMAN	X5DR79	465	CATALYTIC ACTIVITY: Reaction=[E2 ubiquitin-conjugating enzyme]-S-ubiquitinyl-L-cysteine + [acceptor protein]-L-lysine = [E2 ubiquitin-conjugating enzyme]-L-cysteine + [acceptor protein]-N(6)-ubiquitinyl-L-lysine.; EC=2.3.2.31; Evidence={ECO:0000256|ARBA:ARBA00001798};
GPR37_HUMAN	O15354	613	
UBP30_HUMAN	Q70CQ3	517	CATALYTIC ACTIVITY: Reaction=Thiol-dependent hydrolysis of ester, thioester, amide, peptide and isopeptide bonds formed by the C-terminal Gly of ubiquitin (a 76-residue protein attached to proteins as an intracellular targeting signal).; EC=3.4.19.12; Evid

*Back to "Retrieve an entry" slide*

# Single entry

In [None]:
# To fetch the whole entry instead, use:
# r = get_url(f"{WEBSITE_API}/uniprotkb/O60260")
# Here the payload is restricted to the catalytic activity comments
r = get_url(f"{WEBSITE_API}/uniprotkb/O60260?fields=cc_catalytic_activity")
entry = r.json()
print(json.dumps(entry, indent=2))

{
  "entryType": "UniProtKB reviewed (Swiss-Prot)",
  "primaryAccession": "O60260",
  "comments": [
    {
      "commentType": "CATALYTIC ACTIVITY",
      "reaction": {
        "name": "[E2 ubiquitin-conjugating enzyme]-S-ubiquitinyl-L-cysteine + [acceptor protein]-L-lysine = [E2 ubiquitin-conjugating enzyme]-L-cysteine + [acceptor protein]-N(6)-ubiquitinyl-L-lysine.",
        "ecNumber": "2.3.2.31",
        "evidences": [
          {
            "evidenceCode": "ECO:0000269",
            "source": "PubMed",
            "id": "23770887"
          }
        ]
      }
    }
  ],
  "extraAttributes": {
    "uniParcId": "UPI00003673FE"
  }
}


*Back to "understanding annotation types" slide*

# All isoforms of an entry

In [None]:
# isoform info for XBP1_HUMAN: includeIsoform=true is passed alongside the query
isoform_fields = "accession,cc_function,cc_subcellular_location,cc_ptm,sequence"
r = get_url(f"{WEBSITE_API}/uniprotkb/search?query=P17861&includeIsoform=true&fields={isoform_fields}&format=tsv")

print(r.text)

# Natural variants information for an entry

In [None]:
# natural variants info for O60260 / PRKN_HUMAN, from the UniProt website API
r_uniprot_api = get_url(f"{WEBSITE_API}/uniprotkb/O60260?fields=ft_variant")
uniprot_variants = r_uniprot_api.json()['features']

print(f"number of variants in UniProt website API: {len(uniprot_variants)}")
# show the first variant as an example of the record layout
print(json.dumps(uniprot_variants[0], indent=2))

number of variants in UniProt website API: 40
{
  "type": "Natural variant",
  "location": {
    "start": {
      "value": 15,
      "modifier": "EXACT"
    },
    "end": {
      "value": 15,
      "modifier": "EXACT"
    }
  },
  "description": "in PARK2; dbSNP:rs532703934",
  "featureCrossReferences": [
    {
      "database": "dbSNP",
      "id": "rs532703934"
    }
  ],
  "evidences": [
    {
      "evidenceCode": "ECO:0000269",
      "source": "PubMed",
      "id": "12397156"
    }
  ],
  "featureId": "VAR_019733",
  "alternativeSequence": {
    "originalSequence": "V",
    "alternativeSequences": [
      "M"
    ]
  }
}


In [None]:
# same accession, this time through the Proteins API variation endpoint
r_proteins_api = get_url(f"{PROTEINS_API}/variation/O60260")
proteins_variants = r_proteins_api.json()['features']

print(f"number of variants in Proteins API: {len(proteins_variants)}")
# show second variant from list
print(json.dumps(proteins_variants[1], indent=2))

number of variants in Proteins API: 896
{
  "type": "VARIANT",
  "alternativeSequence": "T",
  "begin": "1",
  "end": "1",
  "xrefs": [
    {
      "name": "ClinVar",
      "id": "RCV000992706",
      "url": "https://www.ncbi.nlm.nih.gov/clinvar/RCV000992706",
      "alternativeUrl": "https://www.ensembl.org/homo_sapiens/Variation/Explore?v=RCV000992706"
    },
    {
      "name": "ClinVar",
      "id": "RCV001784521",
      "url": "https://www.ncbi.nlm.nih.gov/clinvar/RCV001784521",
      "alternativeUrl": "https://www.ensembl.org/homo_sapiens/Variation/Explore?v=RCV001784521"
    },
    {
      "name": "dbSNP",
      "id": "rs771586218",
      "url": "https://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?type=rs&rs=rs771586218"
    }
  ],
  "cytogeneticBand": "6q26",
  "genomicLocation": [
    "NC_000006.12:g.162727667A>G"
  ],
  "locations": [
    {
      "loc": "p.Met1Thr",
      "seqId": "ENST00000366898",
      "source": "ClinVar"
    }
  ],
  "consequenceType": "missense",
  "wildType": 

*Back to "retrieve and compare orthologs" slide*

# Alignment flow

In [None]:
# manually selected accessions, joined into one comma-separated string
selected = ("O60260", "Q7KTX7", "Q9WVS6", "Q9JK66")
accessions = ",".join(selected)

# fetch all of them in one request, as FASTA; kept in `fasta` for the alignment job
r = get_url(f"{WEBSITE_API}/uniprotkb/accessions?accessions={accessions}&format=fasta")
fasta = r.text
print(fasta)

>sp|O60260|PRKN_HUMAN E3 ubiquitin-protein ligase parkin OS=Homo sapiens OX=9606 GN=PRKN PE=1 SV=2
MIVFVRFNSSHGFPVEVDSDTSIFQLKEVVAKRQGVPADQLRVIFAGKELRNDWTVQNCD
LDQQSIVHIVQRPWRKGQEMNATGGDDPRNAAGGCEREPQSLTRVDLSSSVLPGDSVGLA
VILHTDSRKDSPPAGSPAGRSIYNSFYVYCKGPCQRVQPGKLRVQCSTCRQATLTLTQGP
SCWDDVLIPNRMSGECQSPHCPGTSAEFFFKCGAHPTSDKETSVALHLIATNSRNITCIT
CTDVRSPVLVFQCNSRHVICLDCFHLYCVTRLNDRQFVHDPQLGYSLPCVAGCPNSLIKE
LHHFRILGEEQYNRYQQYGAEECVLQMGGVLCPRPGCGAGLLPEPDQRKVTCEGGNGLGC
GFAFCRECKEAYHEGECSAVFEASGTTTQAYRVDERAAEQARWEAASKETIKKTTKPCPR
CHVPVEKNGGCMHMKCPQPQCRLEWCWNCGCEWNRVCMGDHWFDV
>sp|Q7KTX7|PRKN_DROME E3 ubiquitin-protein ligase parkin OS=Drosophila melanogaster OX=7227 GN=park PE=1 SV=1
MSFIFKFIATFVRKMLELLQFGGKTLTHTLSIYVKTNTGKTLTVNLEPQWDIKNVKELVA
PQLGLQPDDLKIIFAGKELSDATTIEQCDLGQQSVLHAIRLRPPVQRQKIQSATLEEEEP
SLSDEASKPLNETLLDLQLESEERLNITDEERVRAKAHFFVHCSQCDKLCNGKLRVRCAL
CKGGAFTVHRDPECWDDVLKSRRIPGHCESLEVACVDNAAGDPPFAEFFFKCAEHVSGGE
KDFAAPLNLIKNNIKNVPCLACTDVSDTVLVFPCASQHVTCIDCFRHYCRSRLGERQFMP
HPDFGYTLPCPAG

In [None]:
# submit align job using clustalo
# documentation here https://www.ebi.ac.uk/seqdb/confluence/display/JDSAT/Clustal+Omega+Help+and+Documentation#ClustalOmegaHelpandDocumentation-RESTAPI
# NOTE(review): "example@example.com" is a placeholder — presumably it should
# be replaced with a real contact address; confirm against the service docs.
r = requests.post("https://www.ebi.ac.uk/Tools/services/rest/clustalo/run", data={
    "email": "example@example.com",
    "iterations": 0,
    "outfmt": "clustal_num",
    "order": "aligned",
    "sequence": fasta
})
# Fail loudly if the submission was rejected; otherwise the error body would
# be stored as the job ID and used to build the status/result URLs.
r.raise_for_status()

# on success the response body is used as the job identifier
job_id = r.text
print(job_id)

# get job status
r = get_url(f"https://www.ebi.ac.uk/Tools/services/rest/clustalo/status/{job_id}")
print(r.text)

clustalo-R20241031-153802-0309-29252165-p1m
QUEUED


In [None]:
# Poll the job status until it leaves the waiting states, so the result cell
# can run straight after this one (e.g. with "Run all") without manual re-runs.
# NOTE(review): "QUEUED" is the waiting state seen in this notebook's output;
# "PENDING" and "RUNNING" are assumed — confirm against the service docs.
WAITING_STATES = {"QUEUED", "PENDING", "RUNNING"}
POLL_INTERVAL_S = 5   # seconds to wait between two status checks
MAX_POLLS = 60        # give up after about 5 minutes

for _ in range(MAX_POLLS):
  r = get_url(f"https://www.ebi.ac.uk/Tools/services/rest/clustalo/status/{job_id}")
  print(r.text)
  if r.text.strip() not in WAITING_STATES:
    break
  time.sleep(POLL_INTERVAL_S)
else:
  print(f"Job {job_id} still not finished after {MAX_POLLS} checks; re-run this cell.")

FINISHED


In [None]:
# Legend for the symbols on the line under each alignment block:
# * : Fully conserved residues.
# : : Conservation between groups of strongly similar properties (Gonnet PAM 250 score > 0.5).
# . : Conservation between groups of weakly similar properties (Gonnet PAM 250 score ≤ 0.5).
#   : Non-conserved residues.

# download the alignment of the job in the "aln-clustal_num" result type
alignment_url = f"https://www.ebi.ac.uk/Tools/services/rest/clustalo/result/{job_id}/aln-clustal_num"
r = get_url(alignment_url)
print(r.text)

CLUSTAL O(1.2.4) multiple sequence alignment


sp|Q7KTX7|PRKN_DROME      MSFIFKFIATFVRKMLELLQFGGKTLTHTLSIYVKTNTGKTLTVNLEPQWDIKNVKELVA	60
sp|O60260|PRKN_HUMAN      -----------------------------MIVFVRFNSSHGFPVEVDSDTSIFQLKEVVA	31
sp|Q9WVS6|PRKN_MOUSE      -----------------------------MIVFVRFNSSYGFPVEVDSDTSILQLKEVVA	31
sp|Q9JK66|PRKN_RAT        -----------------------------MIVFVRFNSSYGFPVEVDSDTSIFQLKEVVA	31
                                                       : ::*: *:.  : *::: : .* ::**:**

sp|Q7KTX7|PRKN_DROME      PQLGLQPDDLKIIFAGKELSDATTIEQCDLGQQSVLHAIRLRPPVQRQKIQSATLEEEEP	120
sp|O60260|PRKN_HUMAN      KRQGVPADQLRVIFAGKELRNDWTVQNCDLDQQSIVHIVQRP-WRKGQEMNAT--GGDDP	88
sp|Q9WVS6|PRKN_MOUSE      KRQGVPADQLRVIFAGKELPNHLTVQNCDLEQQSIVHIVQRP-RRRSHETNAS--GGDEP	88
sp|Q9JK66|PRKN_RAT        KRQGVPADQLRVIFAGKELQNHLTVQNCDLEQQSIVHIVQRP-QRKSHETNAS--GGDKP	88
                           : *:  *:*::******* :  *:::*** ***::* ::     : :: :::    :.*

sp|Q7KTX7|PRKN_DROME      SLSDEA--SKPL--------------NETL

*Back to "retrieve ID mapping service" slide*

# ID mapping flow

In [None]:
# Manually selected accessions
accessions = ["O60260", "Q7KTX7", "Q9WVS6", "Q9JK66"]

# Send job to ID mapping endpoint.
# The IDs are sent as one comma-separated string, the form shown in the
# UniProt ID mapping documentation.
r = requests.post(f"{WEBSITE_API}/idmapping/run", data={
    "from": "UniProtKB_AC-ID",
    "to": "RefSeq_Nucleotide",
    "ids": ",".join(accessions),
})
# Fail loudly if the submission was rejected, rather than hitting a KeyError
# on 'jobId' below.
r.raise_for_status()
job_id = r.json()['jobId']
print("job ID:", job_id)

# Ask for the job status and print whatever JSON comes back
r = get_url(f"{WEBSITE_API}/idmapping/status/{job_id}")
print(json.dumps(r.json(), indent=2))

job ID: a1e2f0093beee90a823830662be4c961d685852c
{
  "results": [
    {
      "from": "O60260",
      "to": "NM_004562.2"
    },
    {
      "from": "O60260",
      "to": "NM_013987.2"
    },
    {
      "from": "O60260",
      "to": "NM_013988.2"
    },
    {
      "from": "Q7KTX7",
      "to": "NM_168884.2"
    },
    {
      "from": "Q7KTX7",
      "to": "NM_168885.3"
    },
    {
      "from": "Q9WVS6",
      "to": "NM_001317726.1"
    },
    {
      "from": "Q9WVS6",
      "to": "NM_016694.4"
    },
    {
      "from": "Q9JK66",
      "to": "NM_020093.1"
    }
  ]
}


In [None]:
# Run the following again to check the status until finished
# r = get_url(f"{WEBSITE_API}/idmapping/status/{job_id}")
# print(json.dumps(r.json(), indent=2))

*Back to last slides*

# *Extra:* BLAST flow

In [None]:
# get FASTA file
r = get_url(f"{WEBSITE_API}/uniprotkb/Q13496?format=fasta")
query_fasta = r.text
print(query_fasta)

# submit blast job
# documentation here https://www.ebi.ac.uk/seqdb/confluence/pages/viewpage.action?pageId=94147939#NCBIBLAST+HelpandDocumentation-RESTAPI
# NOTE(review): "example@example.com" is a placeholder — presumably it should
# be replaced with a real contact address; confirm against the service docs.
r = requests.post("https://www.ebi.ac.uk/Tools/services/rest/ncbiblast/run", data={
    "email": "example@example.com",
    "program": "blastp",
    "matrix": "BLOSUM62",
    "alignments": 250,
    "scores": 250,
    "exp": 10,
    "filter": "F",
    "gapalign": "true",
    "stype": "protein",
    "database": "uniprotkb_refprotswissprot",
    # restricted to Mammalia in this example
    "taxids": "40674",
    "sequence": query_fasta,
})
# Fail loudly if the submission was rejected; otherwise the error body would
# be stored as the job ID and used to build the status/result URLs.
r.raise_for_status()

# on success the response body is used as the job identifier
job_id = r.text
print(job_id)

# get job status
r = get_url(f"https://www.ebi.ac.uk/Tools/services/rest/ncbiblast/status/{job_id}")
print(r.text)

In [None]:
# Poll the job status until it leaves the waiting states, so the result cell
# can run straight after this one (e.g. with "Run all") without manual re-runs.
# NOTE(review): the waiting state names below are assumed for this service —
# confirm against the service docs.
WAITING_STATES = {"QUEUED", "PENDING", "RUNNING"}
POLL_INTERVAL_S = 5   # seconds to wait between two status checks
MAX_POLLS = 60        # give up after about 5 minutes

for _ in range(MAX_POLLS):
  r = get_url(f"https://www.ebi.ac.uk/Tools/services/rest/ncbiblast/status/{job_id}")
  print(r.text)
  if r.text.strip() not in WAITING_STATES:
    break
  time.sleep(POLL_INTERVAL_S)
else:
  print(f"Job {job_id} still not finished after {MAX_POLLS} checks; re-run this cell.")

In [None]:
# download the "out" result of the BLAST job and print it
blast_result_url = f"https://www.ebi.ac.uk/Tools/services/rest/ncbiblast/result/{job_id}/out"
r = get_url(blast_result_url)
print(r.text)

# *Extra:* PTM retrievals

In [None]:
# Use B9FXV5 protein (Eukaryotic translation initiation factor 4G in rice)
accession = "B9FXV5"
# documentation https://www.ebi.ac.uk/proteins/api/doc/#!/proteomics-ptm/getByAccession
r = get_url(f"{PROTEINS_API}/proteomics-ptm/{accession}")

data = r.json()

t = PrettyTable(['name', 'position', 'sources', 'id', 'confidence'])

# One row per (PTM, database reference) pair found in the payload.
# NOTE(review): the position is computed as feature begin + PTM position - 1;
# presumably this converts a feature-relative position into a protein
# coordinate — confirm against the API documentation.
all_rows = [
    (ptm['name'],
     int(feature['begin']) + int(ptm['position']) - 1,
     ','.join(ptm['sources']),
     dbRef['id'],
     dbRef['properties']['Confidence score'])
    for feature in data['features']
    for ptm in feature['ptms']
    for dbRef in ptm['dbReferences']
]

# Keep only rows whose "confidence" column is "Gold", sorted by "position".
# Stored as a list (not a one-shot filter iterator) so table_data can still
# be inspected or reused after it has been added to the table.
table_data = sorted(
    (row for row in all_rows if row[4] == 'Gold'),
    key=lambda row: row[1],
)

t.add_rows(table_data)

print(t)

# *Extra:* Visualisation example

*Note: These examples have no biological meaning; they are only here to put some data on the screen. Please adapt them to your own interests.*

In [None]:
# Stream every entry matching gene MTM1, requesting only the listed fields
viz_fields = "mass,reviewed,length,cc_catalytic_activity,annotation_score"
r = get_url(f"{WEBSITE_API}/uniprotkb/stream?query=(gene:MTM1)&fields={viz_fields}")
data = r.json()

# show the number of entries and the first one as a sample of the layout
results = data["results"]
print(len(results), results[0])

In [None]:
import matplotlib.pyplot as plt

# One value per entry in data["results"]; these lists are reused by the
# scatter plots in the next cell.
# colour per entry: grey when "unreviewed" appears in entryType, gold otherwise
reviewed = ["grey" if "unreviewed" in entry["entryType"] else "gold" for entry in data["results"]]
mass = [entry["sequence"]["molWeight"] for entry in data["results"]]
length = [entry["sequence"]["length"] for entry in data["results"]]
# number of comments per entry (0 when the entry has no "comments" key)
n_cat = [len(entry.get("comments", [])) for entry in data["results"]]
score = [entry["annotationScore"] for entry in data["results"]]

# Draw one labelled histogram per quantity, so each figure can be read on
# its own.
for values, label in (
    (mass, "mass (Da)"),
    (length, "length (aa)"),
    (n_cat, "# catalytic activity comments"),
    (score, "annotation score"),
):
  plt.hist(values, bins=100)
  plt.xlabel(label)
  plt.ylabel("# entries")
  plt.show()

In [None]:
# Each spec: x values, y values, extra scatter options, x label, y label.
# The last two plots colour every point by the per-entry colours in `reviewed`.
scatter_specs = [
    (length, mass, {}, "length (aa)", "mass (Da)"),
    (length, n_cat, {"c": reviewed}, "length (aa)", "# catalytic activity comments"),
    (score, n_cat, {"c": reviewed}, "annotation score", "# catalytic activity comments"),
]

for xs, ys, extra, x_label, y_label in scatter_specs:
  plt.scatter(xs, ys, alpha=0.3, **extra)
  plt.xlabel(x_label)
  plt.ylabel(y_label)
  plt.show()