In [None]:
import pandas as pd
import random
from tqdm import tqdm
from joblib import Parallel, delayed
from Bio import ExPASy
from Bio import SwissProt

# --- Step 1: Load your CSF proteins ---
csf_df = pd.read_csv("/home/gdallagl/myworkdir/data/ESMSec/protein/Table S6 Proteins found only in the CSF.csv")
csf_proteins = csf_df['protein'].tolist()

# --- Step 2: Generate 800 random UniProt proteins ---
# Here we simulate random proteins, replace with actual UniProt fetching if needed
all_uniprot_ids = ["P12345", "Q67890", "O11111", "P99999", "Q54321"]  # example pool
random_proteins = random.sample(all_uniprot_ids * 200, 800)  # multiply to allow sampling

# Combine with CSF proteins
all_proteins = csf_proteins + random_proteins

# --- Step 3: Function to fetch protein sequence ---
def fetch_sequence(uniprot_id):
    try:
        handle = ExPASy.get_sprot_raw(uniprot_id)
        record = SwissProt.read(handle)
        return record.sequence
    except:
        return None

# --- Step 4: Parallel fetching with tqdm ---
sequences = Parallel(n_jobs=-1)(
    delayed(fetch_sequence)(pid) for pid in tqdm(all_proteins)
)

# Remove proteins with missing sequences
valid_entries = [(pid, seq) for pid, seq in zip(all_proteins, sequences) if seq]
all_proteins, sequences = zip(*valid_entries)

# --- Step 5: Build DataFrame ---
df = pd.DataFrame({
    'protein': all_proteins,
    'sequence': sequences,
    'label': [1 if pid in csf_proteins else 0 for pid in all_proteins]
})

# --- Step 6: Assign balanced train/test/validation sets ---
def assign_set(label):
    # We'll assign roughly equal numbers of 0s and 1s in each set
    r = random.random()
    if r < 0.7:
        return 'train'
    elif r < 0.85:
        return 'validation'
    else:
        return 'test'

df['set'] = df['label'].apply(assign_set)

print(df.head())


  0%|          | 0/1557 [00:00<?, ?it/s]

100%|██████████| 1557/1557 [07:35<00:00,  3.42it/s]


  protein                                           sequence  label  \
0  P22694  MGNAATAKKGSEVESVKEFLAKAKEDFLKKWENPTQNNAGLEDFER...      1   
1  Q8NEV1  MSGPVPSRARVYTDVNTHRPREYWDYESHVVEWGNQDDYQLVRKLG...      1   
2  Q6P3V2  MPANWTSPQKSSALAPEDHGSSYEGSVSFRDVAIDFSREEWRHLDP...      1   
3  Q8N0Y7  MAAYKLVLIRHGESTWNLENRFSCWYDADLSPAGHEEAKRGGQALR...      1   
4  Q9NY65  MRECISVHVGQAGVQIGNACWELFCLEHGIQADGTFDAQASKINDD...      1   

          set  
0       train  
1  validation  
2       train  
3       train  
4       train  


In [5]:

df.to_csv("/home/gdallagl/myworkdir/data/ESMSec/protein/CSF_my_dataset.csv", index=False)

df


Unnamed: 0,protein,sequence,label,set
0,P22694,MGNAATAKKGSEVESVKEFLAKAKEDFLKKWENPTQNNAGLEDFER...,1,train
1,Q8NEV1,MSGPVPSRARVYTDVNTHRPREYWDYESHVVEWGNQDDYQLVRKLG...,1,validation
2,Q6P3V2,MPANWTSPQKSSALAPEDHGSSYEGSVSFRDVAIDFSREEWRHLDP...,1,train
3,Q8N0Y7,MAAYKLVLIRHGESTWNLENRFSCWYDADLSPAGHEEAKRGGQALR...,1,train
4,Q9NY65,MRECISVHVGQAGVQIGNACWELFCLEHGIQADGTFDAQASKINDD...,1,train
...,...,...,...,...
1551,Q67890,MGQNLSTSNPLGFFPDHQLDPAFRANTANPDWDFNPNKDTWPDANK...,0,train
1552,Q54321,MLTMFPFYFHYLFLAFALFLYSLSTASLFFFFFQSLIFLTISRQIP...,0,train
1553,P12345,MALLHSARVLSGVASAFHPGLAAAASARASSWWAHVEMGPPDPILG...,0,train
1554,P99999,MGDVEKGKKIFIMKCSQCHTVEKGGKHKTGPNLHGLFGRKTGQAPG...,0,test
