In [32]:
import requests
import json
import pandas as pd

"requests" is the recommended module for requesting and sending resources to a web-based API endpoint

"json" is the built-in module for working with text data in JSON format

In [3]:
uniprot_url = "https://www.uniprot.org/uploadlists"
headers = {
    "User-Agent": "Python, toan.phung@uq.net.au"
}
acc_file = "./data/testlist.txt"

"https://www.uniprot.org/uploadlists" is the url of the uniprot REST API that we will used to request information

"headers" is the metadata that should be include with every api requests for potential debugging purpose from uniprot admin

In [47]:
with open(acc_file, "rt") as source_acc:
    
    l = [i.strip() for i in source_acc]
    parameters = {
                "query": " ".join(l),
                "format": "tab",
                "from": "ACC,ID",
                "to": "ACC",
                "columns": "id,entry name,reviewed,protein names,genes,organism,length,database(RefSeq)," \
                                   "organism-id,go-id,go(cellular component),comment(SUBCELLULAR LOCATION)," \
                                   "feature(TOPOLOGICAL_DOMAIN),feature(GLYCOSYLATION),comment(MASS SPECTROMETRY)," \
                                   "sequence,feature(ALTERNATIVE SEQUENCE),comment(ALTERNATIVE PRODUCTS) "
            }
    response = requests.get(uniprot_url, params=parameters, headers=headers)

Opening the file containing our list of Uniprot accession id and store as an string array

Build a dictionary with 5 keys:
- "query": value is a string constructed from the array above with each item joined by a space
- "format": the desired return format
- "from": input format id type
- "to": output format id type
- "columns": string composed of the columns name of desired data corresponding to the id

In [48]:
response.content

b"Entry\tEntry name\tStatus\tProtein names\tGene names\tOrganism\tLength\tCross-reference (RefSeq)\tOrganism ID\tGene ontology IDs\tGene ontology (cellular component)\tSubcellular location [CC]\tTopological domain\tGlycosylation\tMass spectrometry\tSequence\tAlternative sequence\tAlternative products (isoforms)\tyourlist:M20190510216DA2B77BFBD2E6699CA9B6D1C41EB212C540E\nP25045\tLCB1_YEAST\treviewed\tSerine palmitoyltransferase 1 (SPT 1) (SPT1) (EC 2.3.1.50) (Long chain base biosynthesis protein 1)\tLCB1 END8 TSC2 YMR296C\tSaccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast)\t558\tNP_014025.1;\t559292\tGO:0004758; GO:0005783; GO:0016021; GO:0017059; GO:0030148; GO:0030170; GO:0035339\tendoplasmic reticulum [GO:0005783]; integral component of membrane [GO:0016021]; serine C-palmitoyltransferase complex [GO:0017059]; SPOTS complex [GO:0035339]\tSUBCELLULAR LOCATION: Cytoplasm. Endoplasmic reticulum membrane; Multi-pass membrane protein.\tTOPO_DOM 1 49 Lumenal. {ECO:000026

In [50]:
acc_list = ['NM_009417','NM_000547','NM_001003009','NM_019353']
query = ""
for i in range(len(acc_list)):
    acc_list[i] = acc_list[i] + "[accn]"
query = "+OR+".join(acc_list)
params = {
    "db": "nuccore",
    "term": query
}


In [58]:
res = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nuccore&term=NM_009417[accn]+OR+NM_000547[accn]+OR+NM_001003009[accn]+OR+NM_019353[accn]&usehistory=y")

In [59]:
res.content

b'<?xml version="1.0" encoding="UTF-8" ?>\n<!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD esearch 20060628//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd">\n<eSearchResult><Count>4</Count><RetMax>4</RetMax><RetStart>0</RetStart><QueryKey>1</QueryKey><WebEnv>NCID_1_64190802_130.14.18.48_9001_1557463661_1443873040_0MetA0_S_MegaStore</WebEnv><IdList>\n<Id>253735815</Id>\n<Id>927442695</Id>\n<Id>402766536</Id>\n<Id>350529408</Id>\n</IdList><TranslationSet/><TranslationStack>   <TermSet>    <Term>NM_009417[accn]</Term>    <Field>accn</Field>    <Count>1</Count>    <Explode>N</Explode>   </TermSet>   <TermSet>    <Term>NM_000547[accn]</Term>    <Field>accn</Field>    <Count>1</Count>    <Explode>N</Explode>   </TermSet>   <OP>OR</OP>   <TermSet>    <Term>NM_001003009[accn]</Term>    <Field>accn</Field>    <Count>1</Count>    <Explode>N</Explode>   </TermSet>   <OP>OR</OP>   <TermSet>    <Term>NM_019353[accn]</Term>    <Field>accn</Field>    <Count>1</Count>    <Explode>N<

In [52]:
from bs4 import BeautifulSoup

In [60]:
soup = BeautifulSoup(res.content)

In [63]:
es_result = soup.find("eSearchResult")
print(es_result)

None


In [57]:
print(params)

{'db': 'nuccore', 'term': 'NM_009417[accn]+OR+NM_000547[accn]+OR+NM_001003009[accn]+OR+NM_019353[accn]'}
