# Proteins API

Documentation: https://www.ebi.ac.uk/proteins/api/doc/

The API domain is **https://www.ebi.ac.uk/proteins/api**. It offers many end-points; here we will look at the **proteins**, **taxonomy** and **uniparc** end-points.

The output format has to be specified in the request header under the **Accept** field. 
Available options for the proteins end-point are: 

- text/x-flatfile
- text/x-fasta
- application/json
- application/xml


In [None]:
# Documentation of the requests module
# https://docs.python-requests.org/en/latest/

import requests

In [1]:
# Base address of the Proteins API and of its "proteins" end-point
URL = "https://www.ebi.ac.uk/proteins/api"
URL_proteins = "{}/proteins".format(URL)

# Fetch a single entry by appending its accession to the end-point;
# the Accept header asks for the FASTA representation
fasta_headers = {'Accept': 'text/x-fasta'}
r = requests.get("{}/P04050".format(URL_proteins), headers=fasta_headers)
print(r.status_code, r.text, sep="\n")

200
>sp|P04050|RPB1_YEAST DNA-directed RNA polymerase II subunit RPB1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=RPO21 PE=1 SV=2
MVGQQYSSAPLRTVKEVQFGLFSPEEVRAISVAKIRFPETMDETQTRAKIGGLNDPRLGS
IDRNLKCQTCQEGMNECPGHFGHIDLAKPVFHVGFIAKIKKVCECVCMHCGKLLLDEHNE
LMRQALAIKDSKKRFAAIWTLCKTKMVCETDVPSEDDPTQLVSRGGCGNTQPTIRKDGLK
LVGSWKKDRATGDADEPELRVLSTEEILNIFKHISVKDFTSLGFNEVFSRPEWMILTCLP
VPPPPVRPSISFNESQRGEDDLTFKLADILKANISLETLEHNGAPHHAIEEAESLLQFHV
ATYMDNDIAGQPQALQKSGRPVKSIRARLKGKEGRIRGNLMGKRVDFSARTVISGDPNLE
LDQVGVPKSIAKTLTYPEVVTPYNIDRLTQLVRNGPNEHPGAKYVIRDSGDRIDLRYSKR
AGDIQLQYGWKVERHIMDNDPVLFNRQPSLHKMSMMAHRVKVIPYSTFRLNLSVTSPYNA
DFDGDEMNLHVPQSEETRAELSQLCAVPLQIVSPQSNKPCMGIVQDTLCGIRKLTLRDTF
IELDQVLNMLYWVPDWDGVIPTPAIIKPKPLWSGKQILSVAIPNGIHLQRFDEGTTLLSP
KDNGMLIIDGQIIFGVVEKKTVGSSNGGLIHVVTREKGPQVCAKLFGNIQKVVNFWLLHN
GFSTGIGDTIADGPTMREITETIAEAKKKVLDVTKEAQANLLTAKHGMTLRESFEDNVVR
FLNEARDKAGRLAEVNLKDLNNVKQMVMAGSKGSFINIAQMSACVGQQSVEGKRIAFGFV
DRTLPHFSKDDYSPESKGFVENSYLRGLTPQEFFFHAMGGREGLIDTAVKT

In [2]:
# Fetch several entries with one request: the accessions are sent as a
# comma-separated "accession" query parameter
accession_query = {'accession': 'P04050,P04637'}
r = requests.get(URL_proteins, headers={'Accept': 'text/x-fasta'}, params=accession_query)
print(r.status_code, r.text, sep="\n")

200
>sp|P04050|RPB1_YEAST DNA-directed RNA polymerase II subunit RPB1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=RPO21 PE=1 SV=2
MVGQQYSSAPLRTVKEVQFGLFSPEEVRAISVAKIRFPETMDETQTRAKIGGLNDPRLGS
IDRNLKCQTCQEGMNECPGHFGHIDLAKPVFHVGFIAKIKKVCECVCMHCGKLLLDEHNE
LMRQALAIKDSKKRFAAIWTLCKTKMVCETDVPSEDDPTQLVSRGGCGNTQPTIRKDGLK
LVGSWKKDRATGDADEPELRVLSTEEILNIFKHISVKDFTSLGFNEVFSRPEWMILTCLP
VPPPPVRPSISFNESQRGEDDLTFKLADILKANISLETLEHNGAPHHAIEEAESLLQFHV
ATYMDNDIAGQPQALQKSGRPVKSIRARLKGKEGRIRGNLMGKRVDFSARTVISGDPNLE
LDQVGVPKSIAKTLTYPEVVTPYNIDRLTQLVRNGPNEHPGAKYVIRDSGDRIDLRYSKR
AGDIQLQYGWKVERHIMDNDPVLFNRQPSLHKMSMMAHRVKVIPYSTFRLNLSVTSPYNA
DFDGDEMNLHVPQSEETRAELSQLCAVPLQIVSPQSNKPCMGIVQDTLCGIRKLTLRDTF
IELDQVLNMLYWVPDWDGVIPTPAIIKPKPLWSGKQILSVAIPNGIHLQRFDEGTTLLSP
KDNGMLIIDGQIIFGVVEKKTVGSSNGGLIHVVTREKGPQVCAKLFGNIQKVVNFWLLHN
GFSTGIGDTIADGPTMREITETIAEAKKKVLDVTKEAQANLLTAKHGMTLRESFEDNVVR
FLNEARDKAGRLAEVNLKDLNNVKQMVMAGSKGSFINIAQMSACVGQQSVEGKRIAFGFV
DRTLPHFSKDDYSPESKGFVENSYLRGLTPQEFFFHAMGGREGLIDTAVKT

### Taxonomy

Sub end-points: 
- id
- lineage

Available formats:
- application/json
- application/xml

In [4]:

URL_taxonomy = "{}/taxonomy".format(URL)

# Look up one taxonomy node by taxon ID (9606) through the "id" sub end-point
node_url = "{}/id/9606".format(URL_taxonomy)
r = requests.get(node_url, headers={'Accept': 'application/json'})
print(r.status_code)
print(r.json())


200
{'taxonomyId': 9606, 'mnemonic': 'HUMAN', 'scientificName': 'Homo sapiens', 'commonName': 'Human', 'rank': 'species', 'superregnum': 'E', 'hidden': True, 'parentLink': 'https://www.ebi.ac.uk/proteins/api/taxonomy/id/9605', 'childrenLinks': ['https://www.ebi.ac.uk/proteins/api/taxonomy/id/741158', 'https://www.ebi.ac.uk/proteins/api/taxonomy/id/63221'], 'siblingsLinks': ['https://www.ebi.ac.uk/proteins/api/taxonomy/id/2665952', 'https://www.ebi.ac.uk/proteins/api/taxonomy/id/1425170']}


In [16]:
# Complete lineage of taxon 9606 through the "lineage" sub end-point.
# Some nodes are flagged as hidden; hidden nodes are left out of the UniProt standard lineage
lineage_url = "{}/lineage/9606".format(URL_taxonomy)
r = requests.get(lineage_url, headers={'Accept': 'application/json'})
print(r.status_code)
taxonomies = r.json()['taxonomies']
for node in taxonomies:
    print(node)

200
{'taxonomyId': 9606, 'scientificName': 'Homo sapiens', 'rank': 'species', 'hidden': True}
{'taxonomyId': 9605, 'scientificName': 'Homo', 'rank': 'genus', 'hidden': False}
{'taxonomyId': 207598, 'scientificName': 'Homininae', 'rank': 'subfamily', 'hidden': True}
{'taxonomyId': 9604, 'scientificName': 'Hominidae', 'rank': 'family', 'hidden': False}
{'taxonomyId': 314295, 'scientificName': 'Hominoidea', 'rank': 'superfamily', 'hidden': True}
{'taxonomyId': 9526, 'scientificName': 'Catarrhini', 'rank': 'parvorder', 'hidden': False}
{'taxonomyId': 314293, 'scientificName': 'Simiiformes', 'rank': 'infraorder', 'hidden': True}
{'taxonomyId': 376913, 'scientificName': 'Haplorrhini', 'rank': 'suborder', 'hidden': False}
{'taxonomyId': 9443, 'scientificName': 'Primates', 'rank': 'order', 'hidden': False}
{'taxonomyId': 314146, 'scientificName': 'Euarchontoglires', 'rank': 'superorder', 'hidden': False}
{'taxonomyId': 1437010, 'scientificName': 'Boreoeutheria', 'rank': 'no rank', 'hidden': Tr

In [6]:
# Find full Homo sapiens lineage (taxon ID: 9606) using a recursive function

def build_lineage(taxon_id, url, lineage=None, include_hidden=False):
    """Recursively collect taxonomy nodes from ``taxon_id`` up to the top of the tree.

    Each call requests ``{url}/{taxon_id}`` as JSON, optionally stores the
    returned node, and then follows the node's ``parentLink`` until a node
    without a parent link is reached.

    Parameters
    ----------
    taxon_id : str or int
        Identifier of the node to start from.
    url : str
        Base address the taxon identifier is appended to.
    lineage : list, optional
        List the nodes are appended to. When omitted (``None``) a new list is
        created for this call, so independent calls never share results.
    include_hidden : bool
        When False (default), nodes whose ``hidden`` field is truthy are not
        stored (their parent link is still followed).

    Returns
    -------
    list or response object
        The list of node dictionaries, ordered from the starting node upwards.
        If a request does not answer with status code 200, the failing
        response object is returned instead of the list.
    """
    # A literal [] default would be created once at definition time and shared
    # by every call, so a second call would also return the first call's nodes.
    if lineage is None:
        lineage = []

    req = "{}/{}".format(url, taxon_id)
    r = requests.get(req, headers={'Accept': 'application/json'})
    if r.status_code != 200:
        # Hand the failed response back so the caller can inspect it
        return r

    node = r.json()
    if include_hidden or not node.get("hidden"):
        lineage.append(node)

    parent_link = node.get("parentLink")
    if not parent_link:
        # No parent link: the top of the lineage has been reached
        return lineage

    # The parent identifier is the last path component of the link;
    # rstrip guards against a trailing slash producing an empty identifier
    parent_id = parent_link.rstrip("/").split("/")[-1]
    return build_lineage(parent_id, url, lineage, include_hidden)


# Build the "id" sub end-point address from URL_taxonomy rather than from URL:
# URL is re-assigned to another domain further down in this notebook, which
# would make this cell query the wrong server when it is re-run afterwards.
path = build_lineage("9606", "{}/id".format(URL_taxonomy), include_hidden=True)
fields = ['rank', 'scientificName', 'commonName']
row_format = "{:<20}{:<25}{:<20}"

if isinstance(path, list):
    # One left-aligned, fixed-width row per node; missing fields are shown as "None"
    print(row_format.format(*fields))
    for node in path:
        print(row_format.format(*[str(node.get(field)) for field in fields]))
else:
    # build_lineage returns the failed response instead of a list when a request does not succeed
    print("Lineage request failed with status code {}".format(path.status_code))

rank                scientificName           commonName          
species             Homo sapiens             Human               
genus               Homo                     None                
subfamily           Homininae                None                
family              Hominidae                great apes          
superfamily         Hominoidea               apes                
parvorder           Catarrhini               None                
infraorder          Simiiformes              None                
suborder            Haplorrhini              None                
order               Primates                 None                
superorder          Euarchontoglires         None                
no rank             Boreoeutheria            None                
no rank             Eutheria                 placentals          
no rank             Theria                   None                
class               Mammalia                 mammals             
no rank   

# Website API

Documentation: https://www.uniprot.org/help/api

The API domain is **https://www.uniprot.org**. It offers many end-points; here we will look at the **uniprot** and **uploadlists** end-points.

The **uniprot** end-point allows you to search (**query**) and download UniProt entries in various formats. The format can be specified either in the URL (as an extension) or in the request headers (as in the "Proteins API" above).


In [8]:
import requests

URL = "https://www.uniprot.org"

# Fetch one entry; the ".txt" extension in the address selects the output format
# (without an extension, the 'Accept' header is looked at instead)
entry_url = "{}/uniprot/P04050.txt".format(URL)
r = requests.get(entry_url)
print(r.status_code, r.text, sep="\n")


200
ID   RPB1_YEAST              Reviewed;        1733 AA.
AC   P04050; D6VRK8; Q12364; Q92315;
DT   01-NOV-1986, integrated into UniProtKB/Swiss-Prot.
DT   01-NOV-1997, sequence version 2.
DT   29-SEP-2021, entry version 233.
DE   RecName: Full=DNA-directed RNA polymerase II subunit RPB1;
DE            Short=RNA polymerase II subunit 1;
DE            Short=RNA polymerase II subunit B1;
DE            EC=2.7.7.6;
DE   AltName: Full=DNA-directed RNA polymerase III largest subunit;
DE   AltName: Full=RNA polymerase II subunit B220;
GN   Name=RPO21; Synonyms=RPB1, RPB220, SUA8; OrderedLocusNames=YDL140C;
GN   ORFNames=D2150;
OS   Saccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast).
OC   Eukaryota; Fungi; Dikarya; Ascomycota; Saccharomycotina; Saccharomycetes;
OC   Saccharomycetales; Saccharomycetaceae; Saccharomyces.
OX   NCBI_TaxID=559292;
RN   [1]
RP   NUCLEOTIDE SEQUENCE [GENOMIC DNA].
RC   STRAIN=ATCC 204626 / S288c / A364A;
RX   PubMed=3896517; DOI=10.1016/0092-8674(

In [9]:
# Run a search: the query and the requested output format are both part of the address
search_url = "{}/uniprot/?query=reviewed:yes+AND+organism:\"homo sapiens\"&format=list".format(URL)
r = requests.get(search_url)
print(r.status_code)
print(r.text[:100])  # only the first 100 characters of the response

# Split the response text on newlines and keep the distinct values
result_lines = r.text.strip().split("\n")
r_name = set(result_lines)

200
Q96NG5
Q6ZN19
Q9UI25
Q86XN6
A8MUZ8
Q08ER8
Q9GZX5
Q9GZP7
Q3MJ13
Q9NY84
B1APH4
Q9HBT8
Q76KX8
Q96E35
P3


In [10]:
# Why do we get a different number of proteins when using 9606 instead of "homo sapiens" in the organism field?
name_response = requests.get("{}/uniprot/?query=reviewed:yes+AND+organism:\"homo sapiens\"&format=list".format(URL))
taxon_response = requests.get("{}/uniprot/?query=reviewed:yes+AND+organism:9606&format=list".format(URL))
print(name_response.status_code, taxon_response.status_code)

# Turn each newline-separated response into a set of values
r_name = set(name_response.text.strip().split("\n"))
r_taxon = set(taxon_response.text.strip().split("\n"))

# Values returned by only one of the two queries (name-only first, taxon-only second)
only_in_name = r_name - r_taxon
only_in_taxon = r_taxon - r_name
print(only_in_name, only_in_taxon, sep="\n")

200 200
{'D4N3P2', 'P84351', 'D4N3P3'}
set()


In [11]:
# Map identifiers through the "uploadlists" end-point: from 'ACC' to 'PDB_ID',
# with the result requested in 'tab' format
mapping_params = {
    'from': 'ACC',
    'to': 'PDB_ID',
    'format': 'tab',
    'query': 'P04050 P04637',
}
r = requests.get("{}/uploadlists/".format(URL), params=mapping_params)
print(r.status_code, r.text, sep="\n")

200
From	To
P04050	1I3Q
P04050	1I50
P04050	1I6H
P04050	1K83
P04050	1NIK
P04050	1NT9
P04050	1PQV
P04050	1R5U
P04050	1R9S
P04050	1R9T
P04050	1SFO
P04050	1TWA
P04050	1TWC
P04050	1TWF
P04050	1TWG
P04050	1TWH
P04050	1WCM
P04050	1Y1V
P04050	1Y1W
P04050	1Y1Y
P04050	1Y77
P04050	2B63
P04050	2B8K
P04050	2E2H
P04050	2E2I
P04050	2E2J
P04050	2JA5
P04050	2JA6
P04050	2JA7
P04050	2JA8
P04050	2L0I
P04050	2LO6
P04050	2NVQ
P04050	2NVT
P04050	2NVX
P04050	2NVY
P04050	2NVZ
P04050	2R7Z
P04050	2R92
P04050	2R93
P04050	2VUM
P04050	2YU9
P04050	3CQZ
P04050	3FKI
P04050	3GTG
P04050	3GTJ
P04050	3GTK
P04050	3GTL
P04050	3GTM
P04050	3GTO
P04050	3GTP
P04050	3GTQ
P04050	3H3V
P04050	3HOU
P04050	3HOV
P04050	3HOW
P04050	3HOX
P04050	3HOY
P04050	3HOZ
P04050	3I4M
P04050	3I4N
P04050	3J0K
P04050	3J1N
P04050	3K1F
P04050	3K7A
P04050	3M3Y
P04050	3M4O
P04050	3PO2
P04050	3PO3
P04050	3QT1
P04050	3RZD
P04050	3RZO
P04050	3S14
P04050	3S15
P04050	3S16
P04050	3S17
P04050	3S1M
P04050	3S1N
P04050	3S1Q
P04050	3S1R
P04050	3S2D
P04050	3S2H
P040

In [15]:
# Working with XML
import xml.etree.ElementTree as ET

# Download the entry as XML; the ".xml" extension in the address selects the format
# (without an extension, the 'Accept' header is looked at instead)
xml_url = "{}/uniprot/P33313.xml".format(URL)
r = requests.get(xml_url)

# Prefix-to-namespace mapping used by every search path below
NS = {'uniprot': 'http://uniprot.org/uniprot'}
root = ET.fromstring(r.text)

# Collect (attributes, text) for each organism name element of the entry
organism_names = []
for ele in root.findall("uniprot:entry/uniprot:organism/uniprot:name", NS):
    organism_names.append((ele.attrib, ele.text))
print(organism_names)

[({'type': 'scientific'}, 'Saccharomyces cerevisiae (strain ATCC 204508 / S288c)'), ({'type': 'common'}, "Baker's yeast")]


In [19]:
# Find GO terms annotated to P33313


# Download the entry as XML; the ".xml" extension in the address selects the format
r = requests.get("{}/uniprot/P33313.xml".format(URL))
NS = {'uniprot': 'http://uniprot.org/uniprot'}
root = ET.fromstring(r.text)

# Walk every database cross-reference of the entry and keep only those of type "GO"
for ele in root.findall("uniprot:entry/uniprot:dbReference", NS):
    if ele.attrib["type"] != "GO":
        continue
    print(ele.attrib)
    # Each cross-reference carries its details as child "property" elements
    for ele2 in ele.findall("uniprot:property", NS):
        print(ele2.attrib)
                    

{'type': 'GO', 'id': 'GO:0005737'}
{'type': 'term', 'value': 'C:cytoplasm'}
{'type': 'evidence', 'value': 'ECO:0000353'}
{'type': 'project', 'value': 'SGD'}
{'type': 'GO', 'id': 'GO:0030544'}
{'type': 'term', 'value': 'F:Hsp70 protein binding'}
{'type': 'evidence', 'value': 'ECO:0000314'}
{'type': 'project', 'value': 'SGD'}
{'type': 'GO', 'id': 'GO:0051879'}
{'type': 'term', 'value': 'F:Hsp90 protein binding'}
{'type': 'evidence', 'value': 'ECO:0000314'}
{'type': 'project', 'value': 'SGD'}
{'type': 'GO', 'id': 'GO:0043022'}
{'type': 'term', 'value': 'F:ribosome binding'}
{'type': 'evidence', 'value': 'ECO:0000314'}
{'type': 'project', 'value': 'SGD'}
{'type': 'GO', 'id': 'GO:0006457'}
{'type': 'term', 'value': 'P:protein folding'}
{'type': 'evidence', 'value': 'ECO:0000353'}
{'type': 'project', 'value': 'SGD'}
{'type': 'GO', 'id': 'GO:0042026'}
{'type': 'term', 'value': 'P:protein refolding'}
{'type': 'evidence', 'value': 'ECO:0000316'}
{'type': 'project', 'value': 'SGD'}


In [14]:
# Find the fraction of residues covered by PDB structures in P33313. How many residues are covered by at least 2 structures?

r = requests.get("{}/uniprot/P33313.xml".format(URL))  # Without extension, it will look at the 'Accept' header
NS = {'uniprot': 'http://uniprot.org/uniprot'}
root = ET.fromstring(r.text)

sequence_length = int(root.find("uniprot:entry/uniprot:sequence", NS).attrib["length"])

# sequence_state[i] holds the number of PDB structures covering residue i + 1
# (positions in the XML are 1-based, list indices are 0-based)
sequence_state = [0] * sequence_length
for ele in root.findall("uniprot:entry/uniprot:dbReference", NS):
    if ele.attrib["type"] == "PDB":
        print(ele.attrib)

        # Indices covered by THIS structure. Collected in a set first so that a
        # residue listed under several chain groups of the same structure is
        # counted once per structure, not once per chain group.
        covered = set()
        for ele2 in ele.findall("uniprot:property", NS):
            if ele2.attrib["type"] == "chains":
                # The value is split on ", " into groups shaped like "<chains>=<start>-<end>"
                chains = ele2.attrib["value"].split(", ")
                for chain in chains:
                    ch, pos = chain.split("=")
                    start, end = pos.split("-")
                    start = int(start)
                    end = int(end)
                    print(ch, start, end)

                    # Clamp to the sequence so an out-of-range position can neither raise
                    # nor wrap around to the end of the list through a negative index
                    covered.update(range(max(start - 1, 0), min(end, sequence_length)))

        for i in covered:
            sequence_state[i] += 1

# PDB coverage for each sequence residue
print(sequence_state)

# Fraction of sequence residues covered by at least one PDB structure
covered_residues = sum(1 for count in sequence_state if count >= 1)
print(covered_residues / len(sequence_state))

# Number of residues covered by at least 2 structures
print(sum(1 for count in sequence_state if count >= 2))

{'type': 'PDB', 'id': '6HFM'}
A/B 221 385
{'type': 'PDB', 'id': '6HFT'}
A 70 385
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 