In [1]:
import pandas as pd
import ijson
from tqdm import tqdm

In [3]:
# parse json
# Stream the export record by record: ijson.items yields each element of the
# top-level "results" array in turn, and only the accession and the sequence
# string of each record are kept.
protein_data = []

# Opened in binary mode because ijson parses bytes; handing it a text-mode
# handle forces a re-encode of every chunk and is flagged by ijson as deprecated.
with open("../data/uniprotkb_reviewed_true_2025_11_07.json", "rb") as f:
    for record in tqdm(ijson.items(f, "results.item"), desc="Processing proteins"):
        acc_id = record["primaryAccession"]
        sequence = record["sequence"]["value"]
        protein_data.append((acc_id, sequence))

Processing proteins: 573661it [01:06, 8682.27it/s] 


In [4]:
# output fasta for homology reduction
# Each record is a ">" header line followed by the sequence on a single line.
# The accession must follow ">" directly: with a space in between, parsers
# that take the first whitespace-delimited token as the identifier see an
# empty ID, and slicing the header after ">" yields " ACCESSION" instead of
# "ACCESSION". Every record, including the last one, ends with a newline.
fasta = "".join(f">{acc_id}\n{seq}\n" for acc_id, seq in protein_data)

with open("swissprot.fasta", "w") as f:
    f.write(fasta)

In [2]:
# read from redundancy reduced fasta which proteins to keep
representatives = set()
with open("../data/swissprot30.fasta", "r") as f:
    # Iterate the handle directly so the sequence lines are not all loaded
    # into memory at once just to be skipped.
    for line in f:
        if not line.startswith(">"):
            continue
        # Keep the first whitespace-delimited token after ">" as the ID.
        # Unlike slicing off the last character, this does not truncate the
        # ID when the final line lacks a newline, and it tolerates a space
        # after ">", Windows line endings and a trailing description.
        header_tokens = line[1:].split()
        if header_tokens:  # ignore a bare ">" line with no identifier
            representatives.add(header_tokens[0])

print(len(representatives))

75630


In [7]:
# Collect the primary accession of every record in the fragment export
# (presumably a UniProtKB query for reviewed entries flagged as fragments,
# judging by the file name -- confirm against how the file was downloaded).
real_fragments = set()

# Binary mode: ijson parses bytes and flags text-mode handles as deprecated.
with open("../data/uniprotkb_reviewed_true_AND_fragment_tr_2025_12_11.json", "rb") as f:
    for record in tqdm(ijson.items(f, "results.item"), desc="Processing proteins"):
        acc_id = record["primaryAccession"]
        real_fragments.add(acc_id)

Processing proteins: 9280it [00:00, 12980.46it/s]


In [8]:
# Number of distinct accessions collected from the fragment export.
len(real_fragments)


9280

In [9]:
# Size of the representative set as read from the redundancy-reduced FASTA
# (fragments are only removed from it in a later cell).
len(representatives)

75630

In [10]:
# How many representatives remain once fragment accessions are excluded.
len(representatives - real_fragments)

70726

In [11]:
# Fragment accessions that are absent from the representative set.
not_clustered_fragments = real_fragments.difference(representatives)
len(not_clustered_fragments)

4376

In [12]:
# Remove fragment entries from the representatives used for the splits.
# NOTE: this rebinds `representatives`; any cell that compares it against
# `real_fragments` gives a different answer if re-run after this one.
representatives = representatives.difference(real_fragments)

In [13]:
# Fragments that are not in `not_clustered_fragments`, i.e. the fragment
# accessions that did appear in the redundancy-reduced FASTA.
representative_fragments = real_fragments.difference(not_clustered_fragments)
len(representative_fragments)

4904

In [14]:
# split data into train, validation and test
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the non-fragment representatives as the test set.
# The set is sorted before splitting: iteration order of a set of strings
# changes between interpreter sessions (string hashing is randomised), so
# list(representatives) would feed a differently ordered list to the shuffle
# on every kernel restart and random_state alone would not reproduce the split.
train_val, test = train_test_split(sorted(representatives), test_size=0.2, shuffle=True, random_state=42)
print(len(train_val))
print(len(test))

56580
14146


In [16]:
# Carve the validation set out of the remaining data: 20% of train_val,
# with the same fixed seed as the test split.
train, val = train_test_split(
    train_val,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
print(len(train), len(val), sep="\n")

45264
11316


In [18]:
# output to new files for continued processing
# Each file holds one accession per line, with no newline after the last one.
split_files = {
    "../data/train30.txt": train,
    "../data/val30.txt": val,
    "../data/test30.txt": test,
    # representative_fragments is a set; sort it so the file content is
    # identical from run to run instead of following set iteration order.
    "../data/real_fragments30.txt": sorted(representative_fragments),
}

for path, acc_ids in split_files.items():
    with open(path, "w") as f:
        f.write("\n".join(acc_ids))