In [1]:
import requests
import json
import pandas as pd
from io import StringIO

`requests` is the recommended module for requesting and sending resources to a web-based API endpoint

`json` is the built-in module for working with text data in JSON format

In [21]:
# Endpoint of the UniProt "uploadlists" service that the request below is sent to.
# NOTE(review): UniProt has since moved programmatic access to rest.uniprot.org;
# confirm this legacy endpoint still answers before relying on it.
uniprot_url = "https://www.uniprot.org/uploadlists"

# Text file holding the accession ids to look up; it is read line by line below.
acc_file = "../data/testlist.txt"

# Sent with every request in this notebook so the service operators can
# identify (and contact) the caller.
headers = {"User-Agent": "Python, toan.phung@uq.net.au"}

`https://www.uniprot.org/uploadlists` is the URL of the UniProt REST API that we will use to request information

`headers` is the metadata that should be included with every API request, so that the UniProt administrators can identify the caller for debugging purposes

In [7]:
# Read the accession ids, one per line. Blank lines are skipped so they do not
# end up as empty terms in the space-separated query string.
with open(acc_file, "rt") as source_acc:
    accessions = [line.strip() for line in source_acc if line.strip()]

# Query parameters for the UniProt request:
#   query   - the accession ids, separated by single spaces
#   format  - return format (tab-separated text)
#   from/to - id type of the input and of the output
#   columns - comma-separated list of the data columns to return
parameters = {
    "query": " ".join(accessions),
    "format": "tab",
    "from": "ACC,ID",
    "to": "ACC",
    "columns": "id,entry name,reviewed,protein names,genes,organism,length,database(RefSeq),"
               "organism-id,go-id,go(cellular component),comment(SUBCELLULAR LOCATION),"
               "feature(TOPOLOGICAL_DOMAIN),feature(GLYCOSYLATION),comment(MASS SPECTROMETRY),"
               "sequence,feature(ALTERNATIVE SEQUENCE),comment(ALTERNATIVE PRODUCTS) ",
}

# The input file is already closed here, so it is not held open for the
# duration of the network round trip. The timeout (seconds) keeps a stalled
# connection from hanging the kernel indefinitely.
response = requests.get(uniprot_url, params=parameters, headers=headers, timeout=60)

# Stop here on an HTTP error instead of handing an error page to the parser
# in the next cell.
response.raise_for_status()

Open the file containing our list of UniProt accession ids and store them as a list of strings

Build a dictionary with 5 keys:
- `query` value is a string constructed from the array above with each item joined by a space
- `format` the desired return format
- `from` input format id type
- `to` output format id type
- `columns` a comma-separated string naming the data columns to return for each id

`response` is the variable containing the request result from Uniprot.

In [9]:
# Wrap the response body in an in-memory text buffer so pandas can parse the
# tab-separated table without going through a file on disk.
tsv_buffer = StringIO(response.text)
result = pd.read_csv(tsv_buffer, sep="\t")


`result` stores the tabular UniProt data in a `pandas` DataFrame.


In [4]:
# Base URL shared by the NCBI E-utilities calls below.
eutil_path = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

# Append the [accn] field tag to every accession so each term is matched
# against the accession field only.
acc_list = [
    accession + "[accn]"
    for accession in ('NM_009417', 'NM_000547', 'NM_001003009', 'NM_019353')
]

# OR the tagged accessions together into a single search term.
query = "+OR+".join(acc_list)

# Query-string pieces for esearch against the nuccore database.
# usehistory=y is the setting under which esearch returns the
# QueryKey/WebEnv pair that the later efetch call relies on.
params = ["db=nuccore", "term=" + query, "usehistory=y"]
url = "&".join(params)

In [5]:
# Run the esearch query. The timeout (seconds) keeps a stalled connection from
# hanging the kernel indefinitely.
res = requests.get(eutil_path + "esearch.fcgi?" + url, headers=headers, timeout=60)

# Stop here on an HTTP error rather than parsing an error page as if it were
# a search result.
res.raise_for_status()

In [6]:
# Display the raw esearch reply (bytes) to check that it carries the
# QueryKey and WebEnv elements needed for the follow-up efetch call.
res.content

b'<?xml version="1.0" encoding="UTF-8" ?>\n<!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD esearch 20060628//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd">\n<eSearchResult><Count>4</Count><RetMax>4</RetMax><RetStart>0</RetStart><QueryKey>1</QueryKey><WebEnv>NCID_1_54036580_130.14.22.33_9001_1559264498_1086444762_0MetA0_S_MegaStore</WebEnv><IdList>\n<Id>253735815</Id>\n<Id>927442695</Id>\n<Id>402766536</Id>\n<Id>350529408</Id>\n</IdList><TranslationSet/><TranslationStack>   <TermSet>    <Term>NM_009417[accn]</Term>    <Field>accn</Field>    <Count>1</Count>    <Explode>N</Explode>   </TermSet>   <TermSet>    <Term>NM_000547[accn]</Term>    <Field>accn</Field>    <Count>1</Count>    <Explode>N</Explode>   </TermSet>   <OP>OR</OP>   <TermSet>    <Term>NM_001003009[accn]</Term>    <Field>accn</Field>    <Count>1</Count>    <Explode>N</Explode>   </TermSet>   <OP>OR</OP>   <TermSet>    <Term>NM_019353[accn]</Term>    <Field>accn</Field>    <Count>1</Count>    <Explode>N<

In [7]:
from bs4 import BeautifulSoup

In [15]:
# Parse the esearch reply as XML; "lxml-xml" selects lxml's XML parser
# rather than an HTML parser.
soup = BeautifulSoup(
    markup=res.content,
    features="lxml-xml",
)

In [17]:
# Pull out the two elements that identify the stored search result.
query_key = soup.find("QueryKey")
web_env = soup.find("WebEnv")

# soup.find() returns None when a tag is absent. Fail here with a clear
# message instead of with an AttributeError on `.text` in a later cell.
if query_key is None or web_env is None:
    raise ValueError(
        "esearch response contains no QueryKey/WebEnv element; "
        "inspect res.content for an error message"
    )

In [30]:
# Query-string pieces for efetch: same database as the search, plus the
# QueryKey/WebEnv values taken from the esearch reply so the records found
# there can be fetched, and the requested return type and mode.
retrieve_params = [
    "db=nuccore",
    "query_key=" + query_key.text,
    "WebEnv=" + web_env.text,
    "rettype=gb",
    "retmode=xml",
]
retrieve_url = "&".join(retrieve_params)

More information on return datatype
`https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/`

In [31]:
# Download the records selected by retrieve_url. The timeout (seconds) keeps a
# stalled connection from hanging the kernel indefinitely.
res = requests.get(eutil_path + "efetch.fcgi?" + retrieve_url, headers=headers, timeout=60)

# Check the status before writing, so an HTTP error page is never saved to
# disk as if it were sequence data.
res.raise_for_status()

# NOTE(review): retrieve_url asks for retmode=xml, so the payload is presumably
# XML even though the file is named *.gb; confirm which format is intended.
with open("test.gb", "wb") as test_write:
    test_write.write(res.content)
