In [1]:
import pandas as pd

In [3]:
df =pd.read_csv("/home/gdallagl/myworkdir/ESMSec/data/UniProt/uniprotkb_reviewed_cellular_locations.tsv", sep="\t")

# Filter
df = df[
    (df["Organism"] == "Homo sapiens (Human)") 
    #(df["Gene Names (primary)"].notna()) &
    #(df["Length"] >= 100) &
    #(df["Length"] <= 1000) 
    ]

df

Unnamed: 0,Entry,Gene Names (primary),Organism,Length,Subcellular location [CC],Sequence
0,A0A087X1C5,CYP2D7,Homo sapiens (Human),515,SUBCELLULAR LOCATION: Membrane {ECO:0000305}; ...,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...
1,A0A0B4J2F0,PIGBOS1,Homo sapiens (Human),54,SUBCELLULAR LOCATION: Mitochondrion outer memb...,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...
2,A0A0C5B5G6,MT-RNR1,Homo sapiens (Human),16,SUBCELLULAR LOCATION: Secreted {ECO:0000269|Pu...,MRWQEMGYIFYPRKLR
3,A0A0K2S4Q6,CD300H,Homo sapiens (Human),201,SUBCELLULAR LOCATION: [Isoform 1]: Membrane {E...,MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYE...
4,A0A0U1RRE5,NBDY,Homo sapiens (Human),68,"SUBCELLULAR LOCATION: Cytoplasm, P-body {ECO:0...",MGDQPCASGRSTLPPGNAREAKPPKKRCLLAPRWDYPEGTPNGGST...
...,...,...,...,...,...,...
20415,Q9UI25,,Homo sapiens (Human),63,,MEEMSYGENSGTHVGSFSCSPQPSQQMKVLFVGNSFLLTPVLHRQP...
20416,Q9UI54,,Homo sapiens (Human),55,,MESPKCLYSRITVNTAFGTKFSHISFIILFKVFLFPRITISKKTKL...
20417,Q9UI72,,Homo sapiens (Human),69,,MGMALELYWLCGFRSYWPLGTNAENEGNRKENRRQMQSRNERGCNV...
20418,Q9Y3F1,,Homo sapiens (Human),56,,MSLLWTPQILTISFVSYILSLFPSPFPSCYTSCWFETSITTEKELN...


In [4]:
display(df.iloc[[10]]["Subcellular location [CC]"].values)

import re

def extract_locations_as_set(location_string):
    if not isinstance(location_string, str):
        return set()

    # Split by "SUBCELLULAR LOCATION:" to handle multi-protein entries
    sections = re.split(r'SUBCELLULAR LOCATION:\s*', location_string)

    locations = set()

    for section in sections:
        if not section.strip():
            continue

        # Remove protein-specific annotations like "[Capsid protein C]:" at the beginning
        section = re.sub(r'^\[.*?\]:\s*', '', section)

        # Remove "Note=" sections as they contain additional info, not locations
        section = re.sub(r'\.\s*Note=.*?(?=SUBCELLULAR LOCATION:|$)', '', section, flags=re.DOTALL)

        # Split by period to get individual location statements
        statements = re.split(r'\.\s*', section)

        for statement in statements:
            statement = statement.strip()
            if not statement:
                continue

            # Split by semicolon to separate location from type descriptors
            parts = statement.split(';')

            for part in parts:
                part = part.strip()
                
                # Remove everything from { onwards (ECO codes and other annotations)
                # This handles both complete {ECO:...} and incomplete {ECO:... patterns
                part = re.sub(r'\{.*$', '', part).strip()

                # Skip protein type descriptions
                if re.match(r'^(Single-pass|Multi-pass|Peripheral membrane protein|Lumenal side|Cytoplasmic side)', part):
                    continue
                
                # Skip if it starts with ECO: (these are evidence codes that weren't in braces)
                if part.startswith('ECO:'):
                    continue
                
                # Skip if it's just a number or number with }
                if re.match(r'^[\d}]+$', part):
                    continue

                # Handle comma-separated locations like "Secreted, cell wall"
                for loc in part.split(','):
                    loc = loc.strip()

                    # Skip empty strings and standalone type descriptors
                    if not loc:
                        continue
                    if re.match(r'^(Single-pass|Multi-pass).*protein$', loc):
                        continue
                    
                    # Skip pure numbers or ECO codes
                    if re.match(r'^[\d}]+$', loc) or loc.startswith('ECO:'):
                        continue

                    # Add valid location
                    if loc:
                        locations.add(loc)

    # Clean up any remaining empty strings
    locations.discard('')

    return locations

# Apply the function to the column to create a new column 'Locations_Set'
df['location'] = df['Subcellular location [CC]'].apply(extract_locations_as_set)
df['n_location'] = df['location'].apply(len)

# some gens are duplcoated
df['Gene Names (primary)'] = df['Gene Names (primary)'].str.split(";").str[0]


# Display the result for the specific row (iloc[0] since we have a 1-row DF here)
df

array(['SUBCELLULAR LOCATION: Nucleus {ECO:0000269|PubMed:15897886}.'],
      dtype=object)

Unnamed: 0,Entry,Gene Names (primary),Organism,Length,Subcellular location [CC],Sequence,location,n_location
0,A0A087X1C5,CYP2D7,Homo sapiens (Human),515,SUBCELLULAR LOCATION: Membrane {ECO:0000305}; ...,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,"{Mitochondrion, Membrane, Cytoplasm}",3
1,A0A0B4J2F0,PIGBOS1,Homo sapiens (Human),54,SUBCELLULAR LOCATION: Mitochondrion outer memb...,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...,{Mitochondrion outer membrane},1
2,A0A0C5B5G6,MT-RNR1,Homo sapiens (Human),16,SUBCELLULAR LOCATION: Secreted {ECO:0000269|Pu...,MRWQEMGYIFYPRKLR,"{Nucleus, Secreted, Mitochondrion}",3
3,A0A0K2S4Q6,CD300H,Homo sapiens (Human),201,SUBCELLULAR LOCATION: [Isoform 1]: Membrane {E...,MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYE...,"{Secreted, Membrane}",2
4,A0A0U1RRE5,NBDY,Homo sapiens (Human),68,"SUBCELLULAR LOCATION: Cytoplasm, P-body {ECO:0...",MGDQPCASGRSTLPPGNAREAKPPKKRCLLAPRWDYPEGTPNGGST...,"{P-body, Cytoplasm}",2
...,...,...,...,...,...,...,...,...
20415,Q9UI25,,Homo sapiens (Human),63,,MEEMSYGENSGTHVGSFSCSPQPSQQMKVLFVGNSFLLTPVLHRQP...,{},0
20416,Q9UI54,,Homo sapiens (Human),55,,MESPKCLYSRITVNTAFGTKFSHISFIILFKVFLFPRITISKKTKL...,{},0
20417,Q9UI72,,Homo sapiens (Human),69,,MGMALELYWLCGFRSYWPLGTNAENEGNRKENRRQMQSRNERGCNV...,{},0
20418,Q9Y3F1,,Homo sapiens (Human),56,,MSLLWTPQILTISFVSYILSLFPSPFPSCYTSCWFETSITTEKELN...,{},0


In [31]:
#df[df["Subcellular location [CC]"].str.contains("known as junctional membrane complexes", case=False, na=False)]["Subcellular location [CC]"].to_list()

In [5]:
# Get all unique location values across all rows
all_locations = set()
for loc_set in df['location']:
    all_locations.update(loc_set)

# Sort alphabetically for easier reading
all_locations_sorted = sorted(all_locations)

print(f"Total unique locations: {len(all_locations_sorted)}\n")
for loc in all_locations_sorted:
    print(f"  - {loc}")


Total unique locations: 287

  - A band
  - Apical cell membrane
  - Apicolateral cell membrane
  - Autolysosome
  - Autolysosome membrane
  - Basal cell membrane
  - Basolateral cell membrane
  - COPI-coated vesicle
  - COPI-coated vesicle membrane
  - COPII-coated vesicle
  - COPII-coated vesicle membrane
  - Cajal body
  - Cell junction
  - Cell membrane
  - Cell projection
  - Cell surface
  - Cell tip
  - Chromosome
  - Cleavage furrow
  - Cornified envelope
  - Cortical granule
  - Cytolytic granule
  - Cytolytic granule membrane
  - Cytoplasm
  - Cytoplasmic granule
  - Cytoplasmic granule lumen
  - Cytoplasmic granule membrane
  - Cytoplasmic ribonucleoprotein granule
  - Cytoplasmic vesicle
  - Cytoplasmic vesicle lumen
  - Cytoplasmic vesicle membrane
  - DNAJB12 and DNAJB14 in punctate structures within the endoplasmic reticulum membrane
  - DNAJB12 and DNAJC18 in punctate structures within the endoplasmic reticulum membrane
  - DNAJC18 and DNAJB14 in punctate structures wit

In [6]:
# save processed df
df.to_csv("/home/gdallagl/myworkdir/ESMSec/data/UniProt/uniprotkb_reviewed_cellular_locations_formatted.csv", index=False)

# Secreted Proteins (no multiple locations)

With csv, run 02_makedataset.ipynb

In [6]:
df_secreted = df[
    (df["location"].apply(lambda x: len(set(["Secreted"]).intersection(x)) != 0 ))
    ]

df_secreted = df_secreted[["Entry", "Length", "location", "Gene Names (primary)", "n_location", "Sequence"]].rename(columns={"Entry": "protein", "Gene Names (primary)": "gene", "Sequence": "sequence"})
df_secreted["geneset_count"] = 1 # costant (sample with same probavility)
df_secreted["definitive_positive"] = df_secreted["n_location"].apply(lambda x: x == 1) # the False will be sondiered ambigues and thus removeed on the o2_makedataset scirpt

display(df_secreted)
df_secreted["definitive_positive"].value_counts()



Unnamed: 0,protein,Length,location,gene,n_location,sequence,geneset_count,definitive_positive
3,A0A0K2S4Q6,201,"{Secreted, Membrane}",CD300H,2,MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYE...,1,False
17,A0M8Q6,106,"{Secreted, Cell membrane}",IGLC7,2,GQPKAAPSVTLFPPSSEELQANKATLVCLVSDFNPGAVTVAWKADG...,1,False
28,A1E959,279,"{Secreted, Nucleus, Cytoplasm}",ODAM,3,MKIIILLGFLGATLSAPLIPQRLMSASNSNELLLNLNNGQLLPLQL...,1,False
63,A4D1T9,235,"{acrosome, Cytoplasmic vesicle, Secreted, secr...",PRSS37,4,MKYVFYLGVLAGTFFFADSSVQKEDPAPYLVYLKSHFNPCVGVLIK...,1,False
70,A5D8T8,446,"{Secreted, Golgi apparatus, Endoplasmic reticu...",CLEC18A,4,MLHPETSPGRGHLLAVLLALLGTAWAEVWPPQLQEQAPMAGALNRK...,1,False
...,...,...,...,...,...,...,...,...
19911,Q96EE4,140,{Secreted},CCDC126,1,MFFTISRKNMSQKLSLLLLVFGLIWGLMLLHYTFQQPRHQSSVKLR...,1,True
19946,Q96MU5,243,{Secreted},CD300LD-AS1,1,MDELALSFSLTCLLPENRASLSPSQPLSFQCLKAPATLTWEDEKQQ...,1,True
19993,Q9H106,197,{Secreted},SIRPD,1,MPIPASPLHPPLPSLLLYLLLELAGVTHVFHVQQTEMSQTVSTGES...,1,True
20008,Q9H7B7,122,{Secreted},PKD1L1-AS1,1,MGFHFCIWIIFLLPPPCKKCLSPPTMNLRPPKSCGNVFYWVLVLNS...,1,True


definitive_positive
False    922
True     815
Name: count, dtype: int64

In [8]:
print(df_secreted[df_secreted["definitive_positive"] == True].protein.to_list())

['O00175', 'O00187', 'O00292', 'O00300', 'O00339', 'O00585', 'O00622', 'O14791', 'O14793', 'O15041', 'O15123', 'O15130', 'O15204', 'O15232', 'O15444', 'O15467', 'O15520', 'O43240', 'O43320', 'O43827', 'O43854', 'O43915', 'O43927', 'O60258', 'O60383', 'O60565', 'O75462', 'O75610', 'O75629', 'O75636', 'O75888', 'O76061', 'O76076', 'O76093', 'O94907', 'O94919', 'O95388', 'O95390', 'O95393', 'O95399', 'O95407', 'O95445', 'O95460', 'O95715', 'O95750', 'O95813', 'O95965', 'O95972', 'O95998', 'O96009', 'P00709', 'P00738', 'P00739', 'P00740', 'P00742', 'P00746', 'P00747', 'P00748', 'P00749', 'P01011', 'P01019', 'P01033', 'P01034', 'P01036', 'P01037', 'P01127', 'P01178', 'P01185', 'P01189', 'P01213', 'P01215', 'P01222', 'P01225', 'P01229', 'P01236', 'P01241', 'P01242', 'P01258', 'P01270', 'P01275', 'P01282', 'P01286', 'P01308', 'P01344', 'P01350', 'P01562', 'P01563', 'P01567', 'P01568', 'P01570', 'P01571', 'P01574', 'P01579', 'P01588', 'P01591', 'P02647', 'P02652', 'P02655', 'P02671', 'P02675',

In [74]:
df_secreted.to_csv("/home/gdallagl/myworkdir/ESMSec/data/secreted/positive_secreted_proteins.csv", index=False)

# Lysosome (no multiple locations)

With csv, run 02_makedataset.ipynb

In [92]:
df_lyso = df[
    #(df["location"].apply(lambda x: len(set(["Lysosome lumen", "Lysosome membrane", "Lysosome"]).intersection(x)) != 0 )) &
    (df["location"].apply(lambda x: len(set(["Lysosome lumen", "Lysosome"]).intersection(x)) != 0 )) &
    (df["n_location"] == 1) 
    ]

df_lyso = df_lyso[["Entry", "Length", "location", "Gene Names (primary)", "n_location", "Sequence"]].rename(columns={"Entry": "protein", "Gene Names (primary)": "gene", "Sequence": "sequence"})
df_lyso["geneset_count"] = 1 # costant
df_lyso["definitive_positive"] = True # costant

print(df_lyso.shape)
df_lyso.sort_values("gene").head(10)



(38, 8)


Unnamed: 0,protein,Length,location,gene,n_location,sequence,geneset_count,definitive_positive
2714,P13686,325,{Lysosome},ACP5,1,MDMWTALLILQALLLPSLADGATPALRFVAVGDWGGVPNAPFHTAR...,1,True
3126,P20933,346,{Lysosome},AGA,1,MARKSNLPVLLVPFLLCQALVRCSSPLPLVVNTWPFKNATEAAWRA...,1,True
15020,P51689,593,{Lysosome},ARSD,1,MRSAARRGRAAPAARDSLPVLLFLCLLLKTCEPKTANAFKPNILLI...,1,True
10448,Q96EG1,525,{Lysosome},ARSG,1,MGWLFLKVLLAGVSFSGFLYPLVDFCISGKTRGQKPNFVIILADDM...,1,True
15107,Q01459,385,{Lysosome},CTBS,1,MSRPQLRRWRLVSSPPSGVPGLALLALLALLALRLAAGTDCPCPEP...,1,True
2538,P10619,480,{Lysosome},CTSA,1,MIRAAPPPLFLLLLLLLLLVSWASRGEAAPDQDEIQRLPGLAKQPS...,1,True
4617,P53634,463,{Lysosome},CTSC,1,MGAGPSLLLAALLLLLSGDGAVRCDTPANCTYLDLLGTWVFQVGSS...,1,True
13470,Q9UBX1,484,{Lysosome},CTSF,1,MAPWLQLLSLLGLLPGAVAAPAQPRAASFQAWGPPSPELLAPTRFA...,1,True
2393,P09668,335,{Lysosome},CTSH,1,MWATLPLLCAGAWLLGVPVCGAAELCVNSLEKFHFKSWMSKHRKTY...,1,True
4066,P43234,321,{Lysosome},CTSO,1,MDVRALPWLPWLLWLLCRGGGDADSRAPFTPTWPRSREREAAAFRE...,1,True


In [76]:
df_lyso.to_csv("/home/gdallagl/myworkdir/ESMSec/data/secreted/positive_lyso_proteins.csv", index=False)

# Lyso Vs Secreted

No need for 02_makeDataset.ipynb

In [77]:
df_lyso.loc[:, "label"] = 1 # costant
df_secreted.loc[:, "label"] = 0 # costant

data = pd.concat([df_lyso, df_secreted])

# Remove ambigous from secretedS
data = data[data.definitive_positive == True]

data["geneset_count"] = 1 # costant

data

Unnamed: 0,protein,Length,location,gene,n_location,sequence,geneset_count,definitive_positive,label
190,O00115,360,{Lysosome},DNASE2,1,MIPLLLAALLCVPAGALTCYGDSGQPVDWFVVYKLPALRGSGEAAQ...,1,True,1
289,O00462,879,{Lysosome},MANBA,1,MRLHLLLLLALCGAGTTAAELSYSLRGNWSICNGNGSLELPGAVPG...,1,True,1
1096,O60911,334,{Lysosome},CTSV,1,MNLSLVLAAFCLGIASAVPKFDQNLDTKWYQWKATHRRLYGANEEG...,1,True,1
2032,P04066,466,{Lysosome},FUCA1,1,MRAPGMRSRPAGPALLLLLLFLGAAESVRRAQPPRRYTPDWPSLDS...,1,True,1
2119,P05164,745,{Lysosome},MPO,1,MGVPFFSSLRCMVDLGPCWAGGLTAEMKLLLALAGLLAILATPQPS...,1,True,1
...,...,...,...,...,...,...,...,...,...
19911,Q96EE4,140,{Secreted},CCDC126,1,MFFTISRKNMSQKLSLLLLVFGLIWGLMLLHYTFQQPRHQSSVKLR...,1,True,0
19946,Q96MU5,243,{Secreted},CD300LD-AS1,1,MDELALSFSLTCLLPENRASLSPSQPLSFQCLKAPATLTWEDEKQQ...,1,True,0
19993,Q9H106,197,{Secreted},SIRPD,1,MPIPASPLHPPLPSLLLYLLLELAGVTHVFHVQQTEMSQTVSTGES...,1,True,0
20008,Q9H7B7,122,{Secreted},PKD1L1-AS1,1,MGFHFCIWIIFLLPPPCKKCLSPPTMNLRPPKSCGNVFYWVLVLNS...,1,True,0


In [78]:
from sklearn.model_selection import train_test_split

# Stratified split by label
train_clusters, temp_clusters = train_test_split(
    data,
    test_size=0.2,
    stratify=data['label'],
    random_state=42
)
val_clusters, test_clusters = train_test_split(
    temp_clusters,
    test_size=0.5,
    stratify=temp_clusters['label'],
    random_state=42
)

train_prots = train_clusters["protein"].to_list()
val_prots = val_clusters["protein"].to_list()
test_prots = test_clusters["protein"].to_list()

test_prots
data['set'] = ["train" if p in train_prots else "val" if p in val_prots else "test" for p in data["protein"]]
data.loc[data[data.set == "val"].index[1:], "set"] = "test"  # just to have more

# Check split distribution 
print(data.groupby('set')['label'].value_counts().unstack(fill_value=0)) 
display(data)


label    0   1
set           
test   163   7
train  652  30
val      0   1


Unnamed: 0,protein,Length,location,gene,n_location,sequence,geneset_count,definitive_positive,label,set
190,O00115,360,{Lysosome},DNASE2,1,MIPLLLAALLCVPAGALTCYGDSGQPVDWFVVYKLPALRGSGEAAQ...,1,True,1,train
289,O00462,879,{Lysosome},MANBA,1,MRLHLLLLLALCGAGTTAAELSYSLRGNWSICNGNGSLELPGAVPG...,1,True,1,train
1096,O60911,334,{Lysosome},CTSV,1,MNLSLVLAAFCLGIASAVPKFDQNLDTKWYQWKATHRRLYGANEEG...,1,True,1,train
2032,P04066,466,{Lysosome},FUCA1,1,MRAPGMRSRPAGPALLLLLLFLGAAESVRRAQPPRRYTPDWPSLDS...,1,True,1,train
2119,P05164,745,{Lysosome},MPO,1,MGVPFFSSLRCMVDLGPCWAGGLTAEMKLLLALAGLLAILATPQPS...,1,True,1,val
...,...,...,...,...,...,...,...,...,...,...
19911,Q96EE4,140,{Secreted},CCDC126,1,MFFTISRKNMSQKLSLLLLVFGLIWGLMLLHYTFQQPRHQSSVKLR...,1,True,0,train
19946,Q96MU5,243,{Secreted},CD300LD-AS1,1,MDELALSFSLTCLLPENRASLSPSQPLSFQCLKAPATLTWEDEKQQ...,1,True,0,train
19993,Q9H106,197,{Secreted},SIRPD,1,MPIPASPLHPPLPSLLLYLLLELAGVTHVFHVQQTEMSQTVSTGES...,1,True,0,test
20008,Q9H7B7,122,{Secreted},PKD1L1-AS1,1,MGFHFCIWIIFLLPPPCKKCLSPPTMNLRPPKSCGNVFYWVLVLNS...,1,True,0,train


In [79]:
data.to_csv("/home/gdallagl/myworkdir/ESMSec/data/secreted/dataset_lysoVSsecreted.csv", index=False)

# Multi class, from CD-CODE

In [71]:
import pandas as pd

# Load Allowed proteins (reviewed unirpot)
uniprot = pd.read_csv("/home/gdallagl/myworkdir/ESMSec/data/UniProt/human_proteome.tsv", sep="\t")

# only reviewd
uniprot = uniprot[uniprot.Reviewed == "reviewed"]

# Rename
uniprot = uniprot[["Entry", "Sequence"]].rename(columns={"Entry": "protein", "Sequence": "sequence"})

uniprot

Unnamed: 0,protein,sequence
1,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...
7,A0A0B4J2F0,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...
11,A0A0C5B5G6,MRWQEMGYIFYPRKLR
14,A0A0K2S4Q6,MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYE...
16,A0A0U1RRE5,MGDQPCASGRSTLPPGNAREAKPPKKRCLLAPRWDYPEGTPNGGST...
...,...,...
82496,Q9UI25,MEEMSYGENSGTHVGSFSCSPQPSQQMKVLFVGNSFLLTPVLHRQP...
82497,Q9UI54,MESPKCLYSRITVNTAFGTKFSHISFIILFKVFLFPRITISKKTKL...
82498,Q9UI72,MGMALELYWLCGFRSYWPLGTNAENEGNRKENRRQMQSRNERGCNV...
82505,Q9Y3F1,MSLLWTPQILTISFVSYILSLFPSPFPSCYTSCWFETSITTEKELN...


In [72]:
# oad ocndesate locations
df = pd.read_csv("/home/gdallagl/myworkdir/ESMSec/data/UniProt/protein_in_condensates_CD-CODE.tsv", sep="\t")

# Drop proteins in MULTIPLE locations
df = df.drop_duplicates(subset=["uniprotkb_ac"], keep=False)
display(df[df.duplicated(subset=["uniprotkb_ac"], keep=False)].sort_values(by="uniprotkb_ac")) # check for duoslaictes

# condenste with at least 100 proteins
allowed_condensates = df.condensate_name.value_counts()[df.condensate_name.value_counts() >= 50].index.to_list()
print(allowed_condensates)

# Filter
df = df[
    (df.uniprotkb_ac.isin(uniprot.protein.to_list())) & # only allowed prots
    (df.condensate_name.isin(allowed_condensates))  # condensare
]

display(df)



Unnamed: 0,uniprotkb_ac,condensate_id,condensate_name


['Nucleolus', 'Postsynaptic density', 'Presynaptic clusters and postsynaptic densities', 'Centrosome', 'Stress granule', 'Mitochondrial cloud', 'P-body', 'Nuclear speckle', 'Pyrenoid', 'Chromatoid body', 'PML body']


Unnamed: 0,uniprotkb_ac,condensate_id,condensate_name
311,Q96J94,278829DE,Chromatoid body
312,Q9UPY3,278829DE,Chromatoid body
1264,Q14164,B5B9A610,PML body
1265,P27694,B5B9A610,PML body
1266,P61956,B5B9A610,PML body
...,...,...,...
10358,P30837,91857CE7,Nucleolus
10359,P61313,91857CE7,Nucleolus
10476,Q32NC0,91857CE7,Nucleolus
10477,Q86VY4,91857CE7,Nucleolus


In [73]:
#protein	sequence	label	set

condensate_to_int = {
    name: idx 
    for idx, name in enumerate(df['condensate_name'].unique())
}

df['label'] = df['condensate_name'].map(condensate_to_int)


df = df.rename(columns={"uniprotkb_ac": "protein"}).merge(uniprot)

df

Unnamed: 0,protein,condensate_id,condensate_name,label,sequence
0,Q96J94,278829DE,Chromatoid body,0,MTGRARARARGRARGQETAQLVGSTASQQPGYIQPRPQPPPAEGEL...
1,Q9UPY3,278829DE,Chromatoid body,0,MKSPALQPLSMAGLQLMTPASSPMGPFFGLPWQQEAIHDNIYTPRK...
2,Q14164,B5B9A610,PML body,1,MQSTANYLWHTDDLLGQGATASVYKARNKKSGELVAVKVFNTTSYL...
3,P27694,B5B9A610,PML body,1,MVGQLSEGAIAAIMQKGDTNIKPILQVINIRPITTGNSPPRYRLLM...
4,P61956,B5B9A610,PML body,1,MADEKPKEGVKTENNDHINLKVAGQDGSVVQFKIKRHTPLSKLMKA...
...,...,...,...,...,...
2265,P30837,91857CE7,Nucleolus,7,MLRFLAPRLLSLQGRTARYSSAAALPSPILNPDIPYNQLFINNEWQ...
2266,P61313,91857CE7,Nucleolus,7,MGAYKYIQELWRKKQSDVMRFLLRVRCWQYRQLSALHRAPRPTRPD...
2267,Q32NC0,91857CE7,Nucleolus,7,MRQKHYLEAAARGLHDSCPGQARYLLWAYTSSHDDKSTFEETCPYC...
2268,Q86VY4,91857CE7,Nucleolus,7,MSGRSRGRKSSRAKNRGKGRAKARVRPAPDDAPRDPDPSQYQSLGE...


In [74]:
from sklearn.model_selection import train_test_split

# Stratified split by label
train_clusters, temp_clusters = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42
)
val_clusters, test_clusters = train_test_split(
    temp_clusters,
    test_size=0.5,
    stratify=temp_clusters['label'],
    random_state=42
)

train_prots = train_clusters["protein"].to_list()
val_prots = val_clusters["protein"].to_list()
test_prots = test_clusters["protein"].to_list()

test_prots
df['set'] = ["train" if p in train_prots else "val" if p in val_prots else "test" for p in df["protein"]]
df.loc[df[df.set == "val"].index[1:], "set"] = "test"  # just to have more

# Check split distribution 
print(df.groupby('set')['label'].value_counts().unstack(fill_value=0)) 
display(df)


label  0   1   2    3   4    5    6    7
set                                     
test   0  12  14   65  15  177   49  121
train  2  53  57  261  59  709  194  481
val    0   1   0    0   0    0    0    0


Unnamed: 0,protein,condensate_id,condensate_name,label,sequence,set
0,Q96J94,278829DE,Chromatoid body,0,MTGRARARARGRARGQETAQLVGSTASQQPGYIQPRPQPPPAEGEL...,train
1,Q9UPY3,278829DE,Chromatoid body,0,MKSPALQPLSMAGLQLMTPASSPMGPFFGLPWQQEAIHDNIYTPRK...,train
2,Q14164,B5B9A610,PML body,1,MQSTANYLWHTDDLLGQGATASVYKARNKKSGELVAVKVFNTTSYL...,train
3,P27694,B5B9A610,PML body,1,MVGQLSEGAIAAIMQKGDTNIKPILQVINIRPITTGNSPPRYRLLM...,train
4,P61956,B5B9A610,PML body,1,MADEKPKEGVKTENNDHINLKVAGQDGSVVQFKIKRHTPLSKLMKA...,train
...,...,...,...,...,...,...
2265,P30837,91857CE7,Nucleolus,7,MLRFLAPRLLSLQGRTARYSSAAALPSPILNPDIPYNQLFINNEWQ...,train
2266,P61313,91857CE7,Nucleolus,7,MGAYKYIQELWRKKQSDVMRFLLRVRCWQYRQLSALHRAPRPTRPD...,train
2267,Q32NC0,91857CE7,Nucleolus,7,MRQKHYLEAAARGLHDSCPGQARYLLWAYTSSHDDKSTFEETCPYC...,train
2268,Q86VY4,91857CE7,Nucleolus,7,MSGRSRGRKSSRAKNRGKGRAKARVRPAPDDAPRDPDPSQYQSLGE...,train


In [75]:
df[df.duplicated(subset=["protein"], keep=False)].sort_values(by="protein")

Unnamed: 0,protein,condensate_id,condensate_name,label,sequence,set


In [76]:
df.to_csv("/home/gdallagl/myworkdir/ESMSec/data/secreted/dataset_condensate.csv", index=False)