In [1]:
import pandas as pd
import requests as req
import ast
import tqdm

In [2]:
# constants
BASE = "https://www.ebi.ac.uk/proteins/api/proteins"
TYPES = {"Domain",
         "Signal",
         "Propep",
         #"TRANSMEM",
         #"TOPO_DOM",
         #"INTRAMEME",
         #"REPEAT",
         #"DNA_BIND",
         #"STRAND",
         #"HELIX",
         #"COILED",
         #"MOTIF",
         #"ACT_SITE",
         #"BINDING",
         #"TURN",
         }  # select types taken from uniprot docs: https://www.uniprot.org/help/sequence_annotation

In [3]:
#### Local version
import tqdm
import ijson

In [4]:
def generate_fragments(data, fragment_types, multi_frag=False):
    try:
        parsed_frags = [{"type": e["type"], "start": int(e["location"]["start"]["value"]) - 1, "stop": int(e["location"]["end"]["value"])} for e in record["features"] if e["type"] in fragment_types]
        seq = data["sequence"]["value"]
        acc = data["primaryAccession"]

        frags = []
        for f in parsed_frags:
            if f["start"] == 0:
                f_type = "n-term"
            elif f["stop"] == len(seq):
                f_type = "c-term"
            else:
                f_type = "internal"

            frags.append((acc, f_type, seq[f["start"]:f["stop"]], ))

        if multi_frag:
            raise "Not currently Implemented"

        return frags
    except Exception:
        return None, None, None

In [5]:
fragment_types = TYPES

counter = 0
df = pd.DataFrame(columns=["acc_id", "type", "sequence"])

fragments = []

with open("./data/uniprotkb_reviewed_true_2025_11_07.json", "r") as f:
    for record in ijson.items(f, "results.item"):
        #parsed_frags = [{"type": e["type"], "start": int(e["location"]["start"]["value"]) - 1, "stop": int(e["location"]["end"]["value"])} for e in record["features"] if e["type"] in fragment_types]
        fragments.extend(generate_fragments(record, TYPES))

        if counter % 10000 == 0:
            new_df = pd.DataFrame(data=fragments, columns=["acc_id", "type", "sequence"])
            df = pd.concat([df, new_df], ignore_index=True)
            print(counter)

        if counter % 100000 == 0:
            df.to_csv(f"./data/{counter}_save_point.csv", index=False)

        counter += 1

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000


In [6]:
df.to_csv(f"./data/fragments.csv", index=False)

In [7]:
df

Unnamed: 0,acc_id,type,sequence
0,A0A009IHW8,internal,PEYDLFISHASEDKEDFVRPLAETLQQLGVNVWYDEFTLKVGDSLR...
1,A0A009IHW8,internal,PEYDLFISHASEDKEDFVRPLAETLQQLGVNVWYDEFTLKVGDSLR...
2,A0A023I7E1,n-term,MRFQVIVAAATITMITSYIPGVAS
3,A0A023I7E1,internal,DDLFVPVSNFDPKSIFPEIKHPFEPMYANTENGKIVPTNSWISNLF...
4,A0A024B7W1,internal,SGALWDVPAPKEVKKGETTDGVYRVMTRRLLGSTQVGVGVMQEGVF...
...,...,...,...
10020191,Q8PGV3,internal,IGIVVDSACDLPQDFIQRHNIVVLPISVRIGEAVLADHRDEEATLS...
10020192,Q8PHB6,n-term,MRIEVWQGDITELDVDVIVNAANESLLGGGGVDGAIHRAAGPRLLE...
10020193,Q8PHP7,internal,KPWHLYLLLCRNGSYYAGITNDLERRFQAHLRGTGARYTRANPPVQ...
10020194,Q8PP26,c-term,DDPRYRVEVEVSPRFLAHQSTPDEGRYAFAYSIRIQNAGAVPARLV...


In [8]:
df["type"].value_counts()

type
internal    7427899
n-term      1943762
c-term       617239
Name: count, dtype: int64