In [2]:
import pandas as pd
import ijson
from tqdm import tqdm

In [3]:
# parse json
# Stream the UniProtKB export and collect (accession, sequence) pairs, one per
# element of the top-level "results" array.
protein_data = []

# Open in binary mode: ijson parses bytes. A text-mode handle is deprecated by
# ijson and would also make decoding depend on the platform's default encoding.
with open("../data/uniprotkb_reviewed_true_2025_11_07.json", "rb") as f:
    for record in tqdm(ijson.items(f, "results.item"), desc="Processing proteins"):
        acc_id = record["primaryAccession"]
        sequence = record["sequence"]["value"]
        protein_data.append((acc_id, sequence))

Processing proteins: 573661it [01:06, 8682.27it/s] 


In [4]:
# output fasta for homology reduction
# One record per protein: ">" immediately followed by the accession, then the
# sequence on the next line. There is deliberately no space after ">", so tools
# that take the first token after ">" as the sequence ID see the accession.
# Records are streamed to disk (no giant in-memory string) and the file ends
# with a newline.
with open("swissprot.fasta", "w") as f:
    f.writelines(f">{acc_id}\n{seq}\n" for acc_id, seq in protein_data)

In [5]:
# Collect the header lines of the redundancy-reduced FASTA; they identify the
# proteins to keep.
# Note: the text after ">" is stored verbatim, so every entry keeps the line's
# trailing newline and any leading whitespace -- nothing is stripped here.
with open("../data/reduced.fasta", "r") as fasta_file:
    header_lines = [row for row in fasta_file if row.startswith(">")]

representatives = {header[1:] for header in header_lines}

print(len(representatives))

380382


In [6]:
# split data into train, validation and test
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 20% of the representatives as the test set.
# Sort before splitting: the iteration order of a set of strings varies between
# interpreter sessions (string hashing is randomised), so list(representatives)
# would make the split irreproducible across kernel restarts despite the fixed
# random_state.
train_val, test = train_test_split(sorted(representatives), test_size=0.2, shuffle=True, random_state=42)
print(len(train_val))
print(len(test))

304305
76077


In [13]:
# Carve the validation set out of the train+validation pool (20% of it, fixed seed).
train, val = train_test_split(
    train_val,
    random_state=42,
    shuffle=True,
    test_size=0.2,
)
for split in (train, val):
    print(len(split))

243444
60861


In [16]:
# output to new files for continued processing
# Write one accession ID per line for each split.
# IDs are stripped first because they may still carry whitespace (a leading
# space or the trailing newline) from the FASTA header lines they were read
# from. The line separator is then added explicitly instead of relying on
# that leftover newline.
for split_name, split_ids in (("train", train), ("val", val), ("test", test)):
    with open(f"../data/{split_name}.txt", "w") as f:
        f.writelines(f"{acc_id.strip()}\n" for acc_id in split_ids)