## Convert GenBank ID (Accession number) to UniProt ID (Accession number)

In [10]:
import requests

def result_converter(row):
    """Return the last tab-separated field of one result row (the mapped ID)."""
    return row.split("\t")[-1]


# Retrieve/ID mapping service from UniProt
# NOTE(review): this looks like the legacy "uploadlists" endpoint; UniProt's
# current ID-mapping service is the job-based REST API under
# https://rest.uniprot.org/idmapping -- confirm and migrate if this URL no
# longer answers.
url = 'https://www.uniprot.org/uploadlists/'

params = {
    'from': 'EMBL',
    'to': 'ACC',
    'format': 'tab',
    'query': 'ATE90961.1 YP_010084900.1'
}

# Without a timeout, requests waits indefinitely on an unresponsive server,
# which forces a manual kernel interrupt.
r = requests.post(url, data=params, timeout=30)
# Fail loudly on HTTP errors instead of parsing an error page as a result table.
r.raise_for_status()
print(r.text)
# The first line of the tab-separated response is the header row; skip it.
print([result_converter(row) for row in r.text.splitlines()[1:]])

KeyboardInterrupt: 

## Read a sample protein sequence from FASTA file

In [None]:
from Bio import SeqIO
# Load the single sequence stored in the sample FASTA file.
record = SeqIO.read("sample.fasta", format="fasta")

# The record id is pipe-delimited; the second field is shown as the ID.
accession = record.id.split("|")[1]

for label, value in (
    ("ID:", accession),
    ("Description:", record.description),
    ("Sequence:", record.seq),
):
    print(label, value)

ID: A0A291B645
Description: tr|A0A291B645|A0A291B645_9VIRU Major capsid protein OS=Shrimp hemocyte iridescent virus OX=2039780 GN=148L PE=3 SV=1
Sequence: MLRFIYEKKILLIKNCKMANIAGALQDMANLGAVERYQYGTTNAVTYFIRETRKSTLFSQLPIQLSSKNGNPDFDREWSVEPSKAFDYLIHMWIRVTVPEVKLLAGNVYKEHGRIRWTRNFMHNLIKKVSFNVNDLEIEKFDNYFLDFWNQFTLSSSKKDGYNNMIGNDDDLLIPKSKDGKIESKSLTLPIPFFFSRDSGLALPVGGVKWNKLRIDFEFRNWTELLILENVGAAHNGEKNPCKVPQVGSDIAVAPSLSNVQCWVNGGLIPEAERARMGCVHRDMLIESIQTSSKLNFNPVLNPNPSYDIRFQRTVKALFFGVRNTTNPNVWSNYTTASPVPDADKIDFDPDQSAFDPIGTANIRYESSDRIPVMTADYFSLIEPYYKAPAIPELTGYHMFSYALKMNNVDPSGSANYSILNNVSIQLQCSEAAIKAAKGEGEAKTGTDYAQSFQFLVIAISQNVLTLKNGMLGLPFM


## Check accession number

In [None]:
import re

# UniProt accession: 6 characters starting with O/P/Q, or 6 or 10 characters
# starting with any other letter, optionally followed by ".<version>".
# Character classes must not contain commas (a comma inside [...] matches a
# literal ","), and the alternation is grouped so that the anchors and the
# optional version suffix apply to both branches.
UNIPROT_ACCESSION_NUMBER_FORMAT = r"^(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})(?:\.\d+)?$"  # noqa: E501
# GenBank-style accession (alphanumeric prefix ending in digits) or a RefSeq
# protein accession (AP_/NP_/WP_/XP_/YP_ + digits), each optionally followed
# by ".<version>". The first branch is intentionally left as permissive as
# before so previously accepted identifiers stay accepted.
GENBANK_ACCESSION_NUMBER_FORMAT = r"^(?:[A-Za-z0-9]+\d+|[ANWXY]P_\d+)(?:\.\d+)?$"


def is_accession_format(v: str) -> bool:
    """Tell whether ``v`` looks like a GenBank/RefSeq or UniProt accession.

    Args:
        v: Candidate accession string, with or without a ".<version>" suffix.

    Returns:
        True when the whole string matches either accession pattern,
        False otherwise.
    """
    return (
        re.fullmatch(GENBANK_ACCESSION_NUMBER_FORMAT, v) is not None
        or re.fullmatch(UNIPROT_ACCESSION_NUMBER_FORMAT, v) is not None
    )

is_accession_format("WP_123456")

True

## NCBI BLAST service test

In [None]:
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
from Bio import SeqIO
# Keep the query sequence under its own name so that `record` only ever
# refers to the parsed BLAST result used by the following cells.
query_record = SeqIO.read("sample.fasta", format="fasta")
# entrez query builder: https://www.ncbi.nlm.nih.gov/protein/advanced
result = NCBIWWW.qblast("blastp", "nr", query_record.format("fasta"),
                        expect=0.05,
                        hitlist_size=5)
try:
    record = NCBIXML.read(result)
finally:
    # Release the result handle even when parsing the XML fails.
    result.close()

In [None]:
# Title line of every hit description in the BLAST record.
for hit_summary in record.descriptions:
    print(hit_summary.title)

# Blank separator line, then the hit definition of every alignment.
print()
for hit in record.alignments:
    print(hit.hit_def)

ref|YP_009552282.1| 001R, partial [Cherax quadricarinatus iridovirus] >ref|YP_010084900.1| major capsid Protein [Shrimp hemocyte iridescent virus] >gb|ASZ84981.1| 001R, partial [Cherax quadricarinatus iridovirus] >gb|ATE87157.1| major capsid Protein [Shrimp hemocyte iridescent virus] >gb|ATE90961.1| major capsid protein [Shrimp hemocyte iridescent virus]
gb|QUI88197.1| major capsid protein, partial [Decapod iridescent virus 1] >gb|QUI88198.1| major capsid protein, partial [Decapod iridescent virus 1] >gb|QUI88199.1| major capsid protein, partial [Decapod iridescent virus 1]
gb|QXT57809.1| major capsid protein [Rhinella marina erythrocytic-like virus]
ref|NP_149737.1| 274L [Invertebrate iridescent virus 6] >sp|Q05815.2| RecName: Full=Major capsid protein; Short=MCP; AltName: Full=P50 [Invertebrate iridescent virus 6] >gb|AAK82135.1| 274L [Invertebrate iridescent virus 6]
ref|YP_008357369.1| Major Capsid Protein [Invertebrate iridovirus 22] >ref|YP_009010836.1| Major Capsid Protein [Inve

In [None]:
# The hit definition of the best alignment lists several entries separated
# by " >". A search with no hits leaves `alignments` empty, so fall back to an
# empty list instead of raising IndexError on `[0]`.
descs = record.alignments[0].hit_def.split(" >") if record.alignments else []

# Take the first entry that is not marked as a partial sequence.
target = next((desc for desc in descs if "partial" not in desc), None)

if target:
    print("Found sequence accession number!")
    # Entries are pipe-delimited ("db|accession| title"); field 1 is the accession.
    print("Accession:", target.split("|")[1])
else:
    print("404 not found!")

Found sequence accession number!
Accession: YP_010084900.1


## UniProt BLAST
##### Documentation: https://www.ebi.ac.uk/seqdb/confluence/pages/viewpage.action?pageId=94147939

In [None]:
import requests

# NOTE(review): `url` and `params` are the ID-mapping values defined in the
# first cell of this notebook -- confirm this is the request intended here.
# The timeout keeps the cell from blocking forever on an unresponsive server.
r = requests.post(url, data=params, timeout=30)

In [8]:
import time, asyncio

def blocking_io():
    """Stand in for a blocking, IO-bound call.

    Prints a timestamp, blocks the calling thread for one second, prints a
    second timestamp and returns True.
    """
    pause_seconds = 1
    print(f"start blocking_io at {time.strftime('%X')}")
    # time.sleep() is only a placeholder; any blocking IO-bound operation
    # (for example file access) could take its place.
    time.sleep(pause_seconds)
    print(f"blocking_io complete at {time.strftime('%X')}")
    return True

async def main():
    """Run blocking_io in a worker thread and return the gathered results.

    Returns:
        A one-element list holding the return value of blocking_io.
    """
    print(f"started main at {time.strftime('%X')}")

    # Off-load the blocking call to a thread so the event loop stays free.
    worker = asyncio.to_thread(blocking_io)
    outcomes = await asyncio.gather(worker)

    print(f"finished main at {time.strftime('%X')}")

    return outcomes


# Schedule main() as a task on the already-running event loop. This cell uses
# top-level `await`, so it relies on the notebook providing that loop.
task = asyncio.create_task(main())
# Printed before main() starts: the task only runs once this cell awaits.
print("sease")

# Awaiting hands control back to the event loop, so main() and its worker
# thread make progress while this cell is suspended.
await asyncio.sleep(1)

print("sease")
# Wait for main() to finish and show its result (the list from gather()).
print(await task)

sease
started main at 22:10:14
start blocking_io at 22:10:14
blocking_io complete at 22:10:15
sease
finished main at 22:10:15
[True]
