In [12]:
import requests
import csv
import io
import gzip
import pandas

In [20]:
# Stream endpoint for the results of a UniProt ID-mapping job. The query string
# asks for a TSV table (`format=tsv`) delivered as a gzip payload (`compressed=true`).
request_url = "https://rest.uniprot.org/idmapping/uniprotkb/results/stream/560328d6804c8449f2e60266cf18bf468be93f39?compressed=true&fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Cxref_geneid&format=tsv"

# File stem for the local copy; ".csv" is appended when writing and reading.
# NOTE: the content written here is tab-separated despite the ".csv" suffix.
data_filename = "full-mapping"

# Without a timeout `requests` waits indefinitely on a stalled connection.
# The value bounds the connect wait and each wait for data, not the whole download.
response = requests.get(request_url, timeout=120)

# Stop on an HTTP error status: otherwise the error body would be handed to
# gzip.decompress and fail with a misleading "not a gzipped file" error.
response.raise_for_status()

# The payload is gzip-compressed, so decompress it before saving.
decompressed_file = gzip.decompress(response.content)

# Save the decompressed bytes to disk.
with open(data_filename + ".csv", "wb") as file:
    file.write(decompressed_file)

# Load the saved table (tab-separated) and display it.
data = pandas.read_csv(data_filename + ".csv", sep="\t")

data

Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,GeneID
0,258469,Q0VEL5,unreviewed,Q0VEL5_MOUSE,Olfactory receptor,Or2h2 Olfr90,Mus musculus (Mouse),310,258469;
1,5008,P13725,reviewed,ONCM_HUMAN,Oncostatin-M (OSM),OSM,Homo sapiens (Human),252,5008;
2,5008,B5MCX1,unreviewed,B5MCX1_HUMAN,Oncostatin M,OSM,Homo sapiens (Human),231,5008;
3,7078,P35625,reviewed,TIMP3_HUMAN,Metalloproteinase inhibitor 3 (Protein MIG-5) ...,TIMP3,Homo sapiens (Human),211,7078;
4,10236,O43390,reviewed,HNRPR_HUMAN,Heterogeneous nuclear ribonucleoprotein R (hnR...,HNRNPR HNRPR,Homo sapiens (Human),633,10236;
5,10236,B4DMB1,unreviewed,B4DMB1_HUMAN,"cDNA FLJ53358, highly similar to Heterogeneous...",,Homo sapiens (Human),595,10236;
6,10236,B4DMD1,unreviewed,B4DMD1_HUMAN,"cDNA FLJ53360, highly similar to Heterogeneous...",,Homo sapiens (Human),473,10236;
7,10236,B4DT28,unreviewed,B4DT28_HUMAN,Heterogeneous nuclear ribonucleoprotein R (cDN...,HNRNPR HNRPR hCG_38907,Homo sapiens (Human),494,10236;
8,10236,Q0VGD6,unreviewed,Q0VGD6_HUMAN,HNRPR protein,HNRPR,Homo sapiens (Human),607,10236;
9,10236,Q6MZS5,unreviewed,Q6MZS5_HUMAN,Uncharacterized protein DKFZp686A13234,DKFZp686A13234,Homo sapiens (Human),613,10236;


In [21]:
# Refinement of the full mapping table.

# Drop the "GeneID" column, then relabel the identifier columns in one pass:
#   "From"       -> "EntrezID"
#   "Entry"      -> "UniProtID"
#   "Entry Name" -> "UniProtName"
data = data.drop("GeneID", axis="columns").rename(
    columns={
        "From": "EntrezID",
        "Entry": "UniProtID",
        "Entry Name": "UniProtName",
    }
)

data

Unnamed: 0,EntrezID,UniProtID,Reviewed,UniProtName,Protein names,Gene Names,Organism,Length
0,258469,Q0VEL5,unreviewed,Q0VEL5_MOUSE,Olfactory receptor,Or2h2 Olfr90,Mus musculus (Mouse),310
1,5008,P13725,reviewed,ONCM_HUMAN,Oncostatin-M (OSM),OSM,Homo sapiens (Human),252
2,5008,B5MCX1,unreviewed,B5MCX1_HUMAN,Oncostatin M,OSM,Homo sapiens (Human),231
3,7078,P35625,reviewed,TIMP3_HUMAN,Metalloproteinase inhibitor 3 (Protein MIG-5) ...,TIMP3,Homo sapiens (Human),211
4,10236,O43390,reviewed,HNRPR_HUMAN,Heterogeneous nuclear ribonucleoprotein R (hnR...,HNRNPR HNRPR,Homo sapiens (Human),633
5,10236,B4DMB1,unreviewed,B4DMB1_HUMAN,"cDNA FLJ53358, highly similar to Heterogeneous...",,Homo sapiens (Human),595
6,10236,B4DMD1,unreviewed,B4DMD1_HUMAN,"cDNA FLJ53360, highly similar to Heterogeneous...",,Homo sapiens (Human),473
7,10236,B4DT28,unreviewed,B4DT28_HUMAN,Heterogeneous nuclear ribonucleoprotein R (cDN...,HNRNPR HNRPR hCG_38907,Homo sapiens (Human),494
8,10236,Q0VGD6,unreviewed,Q0VGD6_HUMAN,HNRPR protein,HNRPR,Homo sapiens (Human),607
9,10236,Q6MZS5,unreviewed,Q6MZS5_HUMAN,Uncharacterized protein DKFZp686A13234,DKFZp686A13234,Homo sapiens (Human),613


In [22]:
# Persist the refined table to "<data_filename>.csv"; the row index is not written.
data.to_csv(f"{data_filename}.csv", index=False)

In [23]:
# Now gather only the reviewed proteins: the same ID-mapping job is queried with
# the filter `reviewed:true` and a wider set of fields (sequence, isoforms, GO terms).
# NOTE(review): the variable name is misspelled ("reviwed"); it is kept as-is in
# case later cells refer to it.
reviwed_request_url = "https://rest.uniprot.org/idmapping/uniprotkb/results/stream/560328d6804c8449f2e60266cf18bf468be93f39?fields=accession%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Csequence%2Ccc_alternative_products%2Cgo_p%2Cgo_c%2Cgo_f&format=tsv&query=%28reviewed%3Atrue%29"

# File stem for the local copy; ".csv" is appended when writing and reading.
# NOTE: the content written here is tab-separated despite the ".csv" suffix.
reviewed_data_filename = "reviewed-mapping"

# Without a timeout `requests` waits indefinitely on a stalled connection.
# The value bounds the connect wait and each wait for data, not the whole download.
reviewed_response = requests.get(reviwed_request_url, timeout=120)

# Stop on an HTTP error status: otherwise the error body would be saved to disk
# and parsed below as if it were the results table.
reviewed_response.raise_for_status()

# This request does not ask for compression, so the body is stored unchanged.
with open(reviewed_data_filename + ".csv", "wb") as file:
    file.write(reviewed_response.content)

# Load the saved table (tab-separated) and display it.
reviewed_data = pandas.read_csv(reviewed_data_filename + ".csv", sep="\t")

reviewed_data

Unnamed: 0,From,Entry,Entry Name,Protein names,Gene Names,Organism,Length,Sequence,Alternative products (isoforms),Gene Ontology (biological process),Gene Ontology (cellular component),Gene Ontology (molecular function)
0,10236,O43390,HNRPR_HUMAN,Heterogeneous nuclear ribonucleoprotein R (hnR...,HNRNPR HNRPR,Homo sapiens (Human),633,MANQVNGNAVQLKEEEEPMDTSSVTHTEHYKTLIEAGLPQKVAERL...,ALTERNATIVE PRODUCTS: Event=Alternative splic...,"mRNA processing [GO:0006397]; mRNA splicing, v...",catalytic step 2 spliceosome [GO:0071013]; end...,mRNA binding [GO:0003729]; RNA binding [GO:000...
1,8906,O75843,AP1G2_HUMAN,AP-1 complex subunit gamma-like 2 (Gamma2-adap...,AP1G2,Homo sapiens (Human),785,MVVPSLKLQDLIEEIRGAKTQAQEREVIQKECAHIRASFRDGDPVH...,ALTERNATIVE PRODUCTS:,Golgi to vacuole transport [GO:0006896]; intra...,AP-1 adaptor complex [GO:0030121]; endosome me...,clathrin adaptor activity [GO:0035615]
2,5008,P13725,ONCM_HUMAN,Oncostatin-M (OSM),OSM,Homo sapiens (Human),252,MGVLLTQRTLLSLVLALLFPSMASMAAIGSCSKEYRVLLGQLQKQT...,ALTERNATIVE PRODUCTS:,immune response [GO:0006955]; negative regulat...,extracellular region [GO:0005576]; extracellul...,cytokine activity [GO:0005125]; growth factor ...
3,7078,P35625,TIMP3_HUMAN,Metalloproteinase inhibitor 3 (Protein MIG-5) ...,TIMP3,Homo sapiens (Human),211,MTPWLGLIVLLGSWSLGDWGAEACTCSPSHPQDAFCNSDIVIRAKV...,ALTERNATIVE PRODUCTS:,cellular response to organic substance [GO:007...,basement membrane [GO:0005604]; collagen-conta...,metal ion binding [GO:0046872]; metalloendopep...


In [24]:
# Refinement of the reviewed table.

# Relabel the identifier columns in a single call:
#   "From"       -> "EntrezID"
#   "Entry"      -> "UniProtID"
#   "Entry Name" -> "UniProtName"
reviewed_data = reviewed_data.rename(
    columns={
        "From": "EntrezID",
        "Entry": "UniProtID",
        "Entry Name": "UniProtName",
    }
)

# Write the renamed table to "<reviewed_data_filename>.csv" without the row index.
reviewed_data.to_csv(f"{reviewed_data_filename}.csv", index=False)

reviewed_data

Unnamed: 0,EntrezID,UniProtID,UniProtName,Protein names,Gene Names,Organism,Length,Sequence,Alternative products (isoforms),Gene Ontology (biological process),Gene Ontology (cellular component),Gene Ontology (molecular function)
0,10236,O43390,HNRPR_HUMAN,Heterogeneous nuclear ribonucleoprotein R (hnR...,HNRNPR HNRPR,Homo sapiens (Human),633,MANQVNGNAVQLKEEEEPMDTSSVTHTEHYKTLIEAGLPQKVAERL...,ALTERNATIVE PRODUCTS: Event=Alternative splic...,"mRNA processing [GO:0006397]; mRNA splicing, v...",catalytic step 2 spliceosome [GO:0071013]; end...,mRNA binding [GO:0003729]; RNA binding [GO:000...
1,8906,O75843,AP1G2_HUMAN,AP-1 complex subunit gamma-like 2 (Gamma2-adap...,AP1G2,Homo sapiens (Human),785,MVVPSLKLQDLIEEIRGAKTQAQEREVIQKECAHIRASFRDGDPVH...,ALTERNATIVE PRODUCTS:,Golgi to vacuole transport [GO:0006896]; intra...,AP-1 adaptor complex [GO:0030121]; endosome me...,clathrin adaptor activity [GO:0035615]
2,5008,P13725,ONCM_HUMAN,Oncostatin-M (OSM),OSM,Homo sapiens (Human),252,MGVLLTQRTLLSLVLALLFPSMASMAAIGSCSKEYRVLLGQLQKQT...,ALTERNATIVE PRODUCTS:,immune response [GO:0006955]; negative regulat...,extracellular region [GO:0005576]; extracellul...,cytokine activity [GO:0005125]; growth factor ...
3,7078,P35625,TIMP3_HUMAN,Metalloproteinase inhibitor 3 (Protein MIG-5) ...,TIMP3,Homo sapiens (Human),211,MTPWLGLIVLLGSWSLGDWGAEACTCSPSHPQDAFCNSDIVIRAKV...,ALTERNATIVE PRODUCTS:,cellular response to organic substance [GO:007...,basement membrane [GO:0005604]; collagen-conta...,metal ion binding [GO:0046872]; metalloendopep...
