## Convert a GenBank ID (accession number) to a UniProt ID (accession number)

In [17]:
import requests

def result_converter(row: str) -> str:
    """Return the last tab-separated field of a mapping row (the "To" column)."""
    return row.split("\t")[-1]


# Retrieve/ID mapping service from UniProt
# NOTE(review): this looks like UniProt's legacy mapping endpoint, which appears
# to have been superseded by the REST ID-mapping API (rest.uniprot.org/idmapping)
# — confirm the endpoint and the 'from'/'to' database codes before relying on it.
url = 'https://www.uniprot.org/uploadlists/'

params = {
    'from': 'EMBL',
    'to': 'ACC',
    'format': 'tab',
    'query': 'ATE90961.1'
}

# A timeout keeps the cell from hanging indefinitely; raise_for_status stops an
# HTTP error page from being parsed as if it were a mapping table.
r = requests.post(url, data=params, timeout=30)
r.raise_for_status()
print(r.text)
# Skip the "From\tTo" header row and keep only the mapped accession of each row.
print([result_converter(i) for i in r.text.splitlines()[1:]])

From	To
ATE90961.1	A0A291B645

['A0A291B645']


## Read a sample protein sequence from a FASTA file

In [15]:
from Bio import SeqIO
# Parse the FASTA file into a single sequence record.
record = SeqIO.read("sample.fasta", format="fasta")

# The record id is pipe-delimited (e.g. "tr|A0A291B645|A0A291B645_9VIRU");
# the accession is its second field.
for label, value in (
    ("ID:", record.id.split("|")[1]),
    ("Description:", record.description),
    ("Sequence:", record.seq),
):
    print(label, value)

ID: A0A291B645
Description: tr|A0A291B645|A0A291B645_9VIRU Major capsid protein OS=Shrimp hemocyte iridescent virus OX=2039780 GN=148L PE=3 SV=1
Sequence: MLRFIYEKKILLIKNCKMANIAGALQDMANLGAVERYQYGTTNAVTYFIRETRKSTLFSQLPIQLSSKNGNPDFDREWSVEPSKAFDYLIHMWIRVTVPEVKLLAGNVYKEHGRIRWTRNFMHNLIKKVSFNVNDLEIEKFDNYFLDFWNQFTLSSSKKDGYNNMIGNDDDLLIPKSKDGKIESKSLTLPIPFFFSRDSGLALPVGGVKWNKLRIDFEFRNWTELLILENVGAAHNGEKNPCKVPQVGSDIAVAPSLSNVQCWVNGGLIPEAERARMGCVHRDMLIESIQTSSKLNFNPVLNPNPSYDIRFQRTVKALFFGVRNTTNPNVWSNYTTASPVPDADKIDFDPDQSAFDPIGTANIRYESSDRIPVMTADYFSLIEPYYKAPAIPELTGYHMFSYALKMNNVDPSGSANYSILNNVSIQLQCSEAAIKAAKGEGEAKTGTDYAQSFQFLVIAISQNVLTLKNGMLGLPFM


## Identify accession number

In [38]:
import re

# UniProt accession: 6 or 10 characters, optionally followed by ".<version>".
# Character classes must not contain commas (a comma inside [...] is a literal),
# and the alternation is wrapped in a non-capturing group so that the anchors
# and the version suffix apply to both alternatives. Capture-group numbering is
# unchanged: 1 = [A-NR-Z] form, 2 = its repeated tail, 3 = [OPQ] form, 4 = version.
UNIPROT_ACCESSION_NUMBER_FORMAT = r"^(?:([A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})|([OPQ][0-9][A-Z0-9]{3}[0-9]))(\.\d+)?$"  # noqa: E501
# GenBank-style accession (alphanumerics ending in digits, optional version) or a
# RefSeq protein accession (NP_/XP_/WP_/AP_/YP_ + digits, optional version).
# Capture groups: 1 = GenBank form, 2 = its version, 3 = RefSeq form, 4 = prefix letter.
GENBANK_ACCESSION_NUMBER_FORMAT = r"^(?:([A-Za-z0-9]+\d+(\.\d+)?)|((N|X|W|A|Y)P_\d+(?:\.\d+)?))$"  # noqa: E501


def is_accession_format(v: str) -> bool:
    """Tell whether ``v`` looks like a GenBank/RefSeq or UniProt accession number.

    The check is case-insensitive (the value is upper-cased first) and must
    match the whole string.

    Args:
        v: Candidate accession string, with or without a ".<version>" suffix.

    Returns:
        True if ``v`` fully matches either accession pattern, False otherwise.
    """
    candidate = v.upper()
    return any(
        re.fullmatch(pattern, candidate) is not None
        for pattern in (
            GENBANK_ACCESSION_NUMBER_FORMAT,
            UNIPROT_ACCESSION_NUMBER_FORMAT,
        )
    )

is_accession_format("WP_123456")

True

## NCBI BLAST service test

In [30]:
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
from Bio import SeqIO
# Load the query protein under its own name so that `record` only ever refers
# to the parsed BLAST result.
query_record = SeqIO.read("sample.fasta", format="fasta")
# entrez query builder: https://www.ncbi.nlm.nih.gov/protein/advanced
result = NCBIWWW.qblast("blastp", "nr", query_record.format("fasta"),
                        expect=5.0,
                        hitlist_size=5)
try:
    record = NCBIXML.read(result)
finally:
    # Release the response handle even when the XML cannot be parsed.
    result.close()

In [41]:
# The top hit's definition line is split on " >" (presumably one entry per
# merged identical sequence — confirm against the BLAST XML). A result with no
# alignments yields an empty list instead of raising IndexError.
descs = record.alignments[0].hit_def.split(" >") if record.alignments else []

# First description that does not mention "partial", or None when there is none.
target = next((desc for desc in descs if "partial" not in desc), None)

if target:
    print("Found sequence accession number!")
    # The description is pipe-delimited; the accession is its second field.
    print("Accession:", target.split("|")[1])
else:
    print("404 not found!")

Found sequence accession number!
Accession: YP_010084900.1
