In [52]:
# test uniprot api for domain annotation retrieval
import requests as req
import ast
import pandas as pd

In [43]:
# constants
BASE = "https://www.ebi.ac.uk/proteins/api/proteins"
TYPES = {"DOMAIN",
         "SIGNAL",
         "PROPEP",
         #"TRANSMEM",
         #"TOPO_DOM",
         #"INTRAMEME",
         #"REPEAT",
         #"DNA_BIND",
         #"STRAND",
         #"HELIX",
         #"COILED",
         #"MOTIF",
         #"ACT_SITE",
         #"BINDING",
         #"TURN",
         }  # select types taken from uniprot docs: https://www.uniprot.org/help/sequence_annotation

In [44]:
acc_id = "A8MYU2"
res = req.get(f"{BASE}/{acc_id}")
data = ast.literal_eval(res.text)
data

{'accession': 'A8MYU2',
 'id': 'KCNU1_HUMAN',
 'proteinExistence': 'Evidence at protein level',
 'info': {'type': 'Swiss-Prot',
  'created': '2008-09-02',
  'modified': '2025-06-18',
  'version': 142},
 'organism': {'taxonomy': 9606,
  'names': [{'type': 'scientific', 'value': 'Homo sapiens'},
   {'type': 'common', 'value': 'Human'}],
  'lineage': ['Eukaryota',
   'Metazoa',
   'Chordata',
   'Craniata',
   'Vertebrata',
   'Euteleostomi',
   'Mammalia',
   'Eutheria',
   'Euarchontoglires',
   'Primates',
   'Haplorrhini',
   'Catarrhini',
   'Hominidae',
   'Homo']},
 'secondaryAccession': ['B4DYE4'],
 'protein': {'recommendedName': {'fullName': {'value': 'Potassium channel subfamily U member 1'}},
  'alternativeName': [{'fullName': {'value': 'Calcium-activated potassium channel subunit alpha-3'}},
   {'fullName': {'value': 'Calcium-activated potassium channel, subfamily M subunit alpha-3'}},
   {'fullName': {'value': 'KCa5'}},
   {'fullName': {'value': 'Slowpoke homolog 3'}}]},
 'ge

In [45]:
feats = data["features"]
print(len(feats))
feats

87


[{'type': 'CHAIN',
  'category': 'MOLECULE_PROCESSING',
  'ftId': 'PRO_0000349186',
  'description': 'Potassium channel subfamily U member 1',
  'begin': '1',
  'end': '1149',
  'molecule': ''},
 {'type': 'TOPO_DOM',
  'category': 'TOPOLOGY',
  'description': 'Extracellular',
  'begin': '1',
  'end': '24',
  'molecule': '',
  'evidences': [{'code': 'ECO:0000255'}]},
 {'type': 'TRANSMEM',
  'category': 'TOPOLOGY',
  'description': 'Helical; Name=Segment S0',
  'begin': '25',
  'end': '45',
  'molecule': '',
  'evidences': [{'code': 'ECO:0000255'}]},
 {'type': 'TOPO_DOM',
  'category': 'TOPOLOGY',
  'description': 'Cytoplasmic',
  'begin': '46',
  'end': '101',
  'molecule': '',
  'evidences': [{'code': 'ECO:0000255'}]},
 {'type': 'TRANSMEM',
  'category': 'TOPOLOGY',
  'description': 'Helical; Name=Segment S1',
  'begin': '102',
  'end': '122',
  'molecule': '',
  'evidences': [{'code': 'ECO:0000255'}]},
 {'type': 'TOPO_DOM',
  'category': 'TOPOLOGY',
  'description': 'Extracellular',
 

In [46]:
for elem in feats:
    if elem["type"] in TYPES:
        print(elem)

{'type': 'DOMAIN', 'category': 'DOMAINS_AND_SITES', 'description': 'RCK N-terminal 1', 'begin': '331', 'end': '473', 'molecule': '', 'evidences': [{'code': 'ECO:0000255', 'source': {'name': 'PROSITE-ProRule', 'id': 'PRU00543', 'url': 'https://prosite.expasy.org/unirule/PRU00543'}}]}
{'type': 'DOMAIN', 'category': 'DOMAINS_AND_SITES', 'description': 'RCK N-terminal 2', 'begin': '713', 'end': '884', 'molecule': '', 'evidences': [{'code': 'ECO:0000255', 'source': {'name': 'PROSITE-ProRule', 'id': 'PRU00543', 'url': 'https://prosite.expasy.org/unirule/PRU00543'}}]}


In [47]:
## further info from data


In [56]:
## building dataset
def generate_fragments(acc, fragment_types, multi_frag=False):
    data = ast.literal_eval(req.get(f"{BASE}/{acc}").text)
    parsed_frags = [{"type": e["type"], "start": int(e["begin"]) - 1, "stop": int(e["end"])} for e in data["features"] if e["type"] in fragment_types]
    seq = data["sequence"]["sequence"]

    frags = []
    for f in parsed_frags:
        if f["start"] == 0:
            f_type = "n-term"
        elif f["stop"] == len(seq):
            f_type = "c-term"
        else:
            f_type = "internal"

        frags.append((acc, f_type, seq[f["start"]:f["stop"]], ))

    if multi_frag:
        raise "Not currently Implemented"


    return data["sequence"]["sequence"], frags

generate_fragments(acc_id, TYPES)

('MFQTKLRNETWEDLPKMSCTTEIQAAFILSSFVTFFSGLIILLIFRLIWRSVKKWQIIKGTGIILELFTSGTIARSHVRSLHFQGQFRDHIEMLLSAQTFVGQVLVILVFVLSIGSLIIYFINSADPVGSCSSYEDKTIPIDLVFNAFFSFYFGLRFMAADDKIKFWLEMNSIVDIFTIPPTFISYYLKSNWLGLRFLRALRLLELPQILQILRAIKTSNSVKFSKLLSIILSTWFTAAGFIHLVENSGDPWLKGRNSQNISYFESIYLVMATTSTVGFGDVVAKTSLGRTFIMFFTLGSLILFANYIPEMVELFANKRKYTSSYEALKGKKFIVVCGNITVDSVTAFLRNFLRDKSGEINTEIVFLGETPPSLELETIFKCYLAYTTFISGSAMKWEDLRRVAVESAEACLIIANPLCSDSHAEDISNIMRVLSIKNYDSTTRIIIQILQSHNKVYLPKIPSWNWDTGDNIICFAELKLGFIAQGCLVPGLCTFLTSLFVEQNKKVMPKQTWKKHFLNSMKNKILTQRLSDDFAGMSFPEVARLCFLKMHLLLIAIEYKSLFTDGFCGLILNPPPQVRIRKNTLGFFIAETPKDVRRALFYCSVCHDDVFIPELITNCGCKSRSRQHITVPSVKRMKKCLKGISSRISGQDSPPRVSASTSSISNFTTRTLQHDVEQDSDQLDSSGMFHWCKPTSLDKVTLKRTGKSKYKFRNHIVACVFGDAHSAPMGLRNFVMPLRASNYTRKELKDIVFIGSLDYLQREWRFLWNFPQIYILPGCALYSGDLHAANIEQCSMCAVLSPPPQPSSNQTLVDTEAIMATLTIGSLQIDSSSDPSPSVSEETPGYTNGHNEKSNCRKVPILTELKNPSNIHFIEQLGGLEGSLQETNLHLSTAFSTGTVFSGSFLDSLLATAFYNYHVLELLQMLVTGGVSSQLEQHLDKDKVYGVADSCTSLLSGRNRCKLGLLSLHETILSDVNPRNTFGQLFCGSLDLFGILCV

In [57]:
df = pd.DataFrame(columns=["uniprot", "frag_type", "sequence"])

In [61]:
for f in generate_fragments(acc_id, TYPES)[1]:
    print(f)
    df.add(f)

('A8MYU2', 'internal', 'KKFIVVCGNITVDSVTAFLRNFLRDKSGEINTEIVFLGETPPSLELETIFKCYLAYTTFISGSAMKWEDLRRVAVESAEACLIIANPLCSDSHAEDISNIMRVLSIKNYDSTTRIIIQILQSHNKVYLPKIPSWNWDTGDNII')
('A8MYU2', 'internal', 'RNHIVACVFGDAHSAPMGLRNFVMPLRASNYTRKELKDIVFIGSLDYLQREWRFLWNFPQIYILPGCALYSGDLHAANIEQCSMCAVLSPPPQPSSNQTLVDTEAIMATLTIGSLQIDSSSDPSPSVSEETPGYTNGHNEKSNCRKVPILTELKNPSNIHFIEQLGGLEGSL')


In [60]:
df

Unnamed: 0,uniprot,frag_type,sequence
