In [1]:
import pandas as pd
import requests as req
import ast
import tqdm

In [2]:
# constants
BASE = "https://www.ebi.ac.uk/proteins/api/proteins"
TYPES = {"DOMAIN",
         "SIGNAL",
         "PROPEP",
         #"TRANSMEM",
         #"TOPO_DOM",
         #"INTRAMEME",
         #"REPEAT",
         #"DNA_BIND",
         #"STRAND",
         #"HELIX",
         #"COILED",
         #"MOTIF",
         #"ACT_SITE",
         #"BINDING",
         #"TURN",
         }  # select types taken from uniprot docs: https://www.uniprot.org/help/sequence_annotation

In [3]:
def generate_fragments(acc, fragment_types, multi_frag=False):
    try:
        data = ast.literal_eval(req.get(f"{BASE}/{acc}").text)
        parsed_frags = [{"type": e["type"], "start": int(e["begin"]) - 1, "stop": int(e["end"])} for e in data["features"] if e["type"] in fragment_types]
        seq = data["sequence"]["sequence"]

        frags = []
        for f in parsed_frags:
            if f["start"] == 0:
                f_type = "n-term"
            elif f["stop"] == len(seq):
                f_type = "c-term"
            else:
                f_type = "internal"

            frags.append((acc, f_type, seq[f["start"]:f["stop"]], ))

        if multi_frag:
            raise "Not currently Implemented"


        return frags
    except Exception:
        return None, None, None

In [4]:
# get all uniprot ids
uni_ids = []
with open("../uniprot/uniprot_sprot.fasta", "r") as f:
    for line in f.readlines():
        if line.startswith(">"):
            try:
                uni_ids.append(line.split("|")[1])
            except Exception:
                print(line)

uni_ids

['Q6GZX4',
 'Q6GZX3',
 'Q197F8',
 'Q197F7',
 'Q6GZX2',
 'Q6GZX1',
 'Q197F5',
 'Q6GZX0',
 'Q91G88',
 'Q6GZW9',
 'Q6GZW8',
 'Q197F3',
 'Q197F2',
 'Q6GZW6',
 'Q91G85',
 'Q6GZW5',
 'Q197E9',
 'Q6GZW4',
 'Q6GZW3',
 'Q197E7',
 'Q6GZW2',
 'Q6GZW1',
 'Q6GZW0',
 'Q6GZV8',
 'Q6GZV7',
 'Q6GZV6',
 'Q6GZV5',
 'Q6GZV4',
 'Q197D8',
 'Q6GZV2',
 'Q197D7',
 'Q6GZV1',
 'Q197D5',
 'Q91G70',
 'Q6GZU9',
 'Q6GZU8',
 'Q197D2',
 'Q6GZU7',
 'Q91G67',
 'Q197D0',
 'Q6GZU6',
 'Q6GZU5',
 'Q6GZU4',
 'Q197C8',
 'Q91G65',
 'Q6GZU3',
 'Q6GZU2',
 'Q91G63',
 'Q6GZU1',
 'Q6GZU0',
 'Q197C3',
 'Q6GZT9',
 'Q6GZT7',
 'Q6GZT6',
 'Q197C0',
 'Q91G57',
 'Q6GZT5',
 'Q6GZT4',
 'Q91G56',
 'Q91G55',
 'Q6GZT3',
 'Q197B6',
 'Q6GZN9',
 'Q91G54',
 'Q6GZT1',
 'Q197B5',
 'Q6GZT0',
 'Q6GZS9',
 'Q6GZS8',
 'Q6GZS7',
 'Q91G50',
 'Q197B1',
 'Q6GZS6',
 'Q6GZS5',
 'Q6GZS4',
 'Q197A7',
 'Q6GZS3',
 'Q6GZS2',
 'Q197A6',
 'Q6GZS1',
 'Q67475',
 'Q6GZR9',
 'O55703',
 'Q197A3',
 'Q6GZR8',
 'Q6GZR7',
 'Q6GZR6',
 'O55704',
 'Q6GZR4',
 'O55705',
 'Q196Z8',

In [5]:
len(uni_ids)

573661

In [6]:
fragments = []
for acc in tqdm.tqdm(uni_ids):
    fragments.extend(generate_fragments(acc, TYPES, multi_frag=False))

  1%|          | 4521/573661 [22:06<46:22:36,  3.41it/s]


KeyboardInterrupt: 

In [7]:
len(fragments)

2685

In [19]:
for frag in fragments:
    if frag is None:
        fragments.remove(frag)

In [8]:
for fragment in fragments:
    uni_ids.remove(fragment[0])

ValueError: list.remove(x): x not in list