In [69]:
import pandas as pd

In [70]:
df =pd.read_csv("/home/gdallagl/myworkdir/ESMSec/data/UniProt/uniprotkb_reviewed_cellular_locations.tsv", sep="\t")

# Filter
df = df[
    (df["Organism"] == "Homo sapiens (Human)") &
    (df["Gene Names (primary)"].notna()) &
    (df["Length"] >= 100) &
    (df["Length"] <= 1000) 
    ]

df

Unnamed: 0,Entry,Gene Names (primary),Organism,Length,Subcellular location [CC],Sequence
0,A0A087X1C5,CYP2D7,Homo sapiens (Human),515,SUBCELLULAR LOCATION: Membrane {ECO:0000305}; ...,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...
3,A0A0K2S4Q6,CD300H,Homo sapiens (Human),201,SUBCELLULAR LOCATION: [Isoform 1]: Membrane {E...,MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYE...
5,A0A1B0GTW7,CIROP,Homo sapiens (Human),788,SUBCELLULAR LOCATION: Membrane {ECO:0000255}; ...,MLLLLLLLLLLPPLVLRVAASRCLHDETQKSVSLLRPPFSQLPSKS...
6,A0AV02,SLC12A8,Homo sapiens (Human),714,SUBCELLULAR LOCATION: Membrane {ECO:0000305}; ...,MTQMSQVQELFHEAAQQDALAQPQPWWKTQLFMWEPVLFGTWDGVF...
7,A0AV96,RBM47,Homo sapiens (Human),593,SUBCELLULAR LOCATION: Nucleus {ECO:0000269|Pub...,MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...
...,...,...,...,...,...,...
20395,Q9H0A3,TMEM191A,Homo sapiens (Human),160,SUBCELLULAR LOCATION: Membrane {ECO:0000255}; ...,MMNNTDFLMLNNPWNKLCLVSMDFCFPLDFVSNLFWIFASKFIIVT...
20396,Q9H1L0,MIR1-1HG,Homo sapiens (Human),117,,MPSCSCALMAPCGPAAGPAAVERTQQVARGEPGSARGQLQVSPEMS...
20402,Q9H693,C16orf95,Homo sapiens (Human),158,,MRASRSPPSPRRCHHHHEATGAASGAAAGGPGAGCVGLCRLALTPS...
20403,Q9H8Q6,HEXA-AS1,Homo sapiens (Human),139,,MTGKNVYFQSQLEAFHCLQYELFPSRLTINLLVTTHIPFPQTKPHI...


In [71]:
display(df.iloc[[10]]["Subcellular location [CC]"].values)

import re

def extract_locations_as_set(location_string):
    if not isinstance(location_string, str):
        return set()

    # Split by "SUBCELLULAR LOCATION:" to handle multi-protein entries
    sections = re.split(r'SUBCELLULAR LOCATION:\s*', location_string)

    locations = set()

    for section in sections:
        if not section.strip():
            continue

        # Remove protein-specific annotations like "[Capsid protein C]:" at the beginning
        section = re.sub(r'^\[.*?\]:\s*', '', section)

        # Remove "Note=" sections as they contain additional info, not locations
        section = re.sub(r'\.\s*Note=.*?(?=SUBCELLULAR LOCATION:|$)', '', section, flags=re.DOTALL)

        # Split by period to get individual location statements
        statements = re.split(r'\.\s*', section)

        for statement in statements:
            statement = statement.strip()
            if not statement:
                continue

            # Split by semicolon to separate location from type descriptors
            parts = statement.split(';')

            for part in parts:
                part = part.strip()
                
                # Remove everything from { onwards (ECO codes and other annotations)
                # This handles both complete {ECO:...} and incomplete {ECO:... patterns
                part = re.sub(r'\{.*$', '', part).strip()

                # Skip protein type descriptions
                if re.match(r'^(Single-pass|Multi-pass|Peripheral membrane protein|Lumenal side|Cytoplasmic side)', part):
                    continue
                
                # Skip if it starts with ECO: (these are evidence codes that weren't in braces)
                if part.startswith('ECO:'):
                    continue
                
                # Skip if it's just a number or number with }
                if re.match(r'^[\d}]+$', part):
                    continue

                # Handle comma-separated locations like "Secreted, cell wall"
                for loc in part.split(','):
                    loc = loc.strip()

                    # Skip empty strings and standalone type descriptors
                    if not loc:
                        continue
                    if re.match(r'^(Single-pass|Multi-pass).*protein$', loc):
                        continue
                    
                    # Skip pure numbers or ECO codes
                    if re.match(r'^[\d}]+$', loc) or loc.startswith('ECO:'):
                        continue

                    # Add valid location
                    if loc:
                        locations.add(loc)

    # Clean up any remaining empty strings
    locations.discard('')

    return locations

# Apply the function to the column to create a new column 'Locations_Set'
df['location'] = df['Subcellular location [CC]'].apply(extract_locations_as_set)
df['n_location'] = df['location'].apply(len)

# some gens are duplcoated
df['Gene Names (primary)'] = df['Gene Names (primary)'].str.split(";").str[0]


# Display the result for the specific row (iloc[0] since we have a 1-row DF here)
df

array(['SUBCELLULAR LOCATION: Nucleus {ECO:0000305}.'], dtype=object)

Unnamed: 0,Entry,Gene Names (primary),Organism,Length,Subcellular location [CC],Sequence,location,n_location
0,A0A087X1C5,CYP2D7,Homo sapiens (Human),515,SUBCELLULAR LOCATION: Membrane {ECO:0000305}; ...,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,"{Mitochondrion, Cytoplasm, Membrane}",3
3,A0A0K2S4Q6,CD300H,Homo sapiens (Human),201,SUBCELLULAR LOCATION: [Isoform 1]: Membrane {E...,MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYE...,"{Secreted, Membrane}",2
5,A0A1B0GTW7,CIROP,Homo sapiens (Human),788,SUBCELLULAR LOCATION: Membrane {ECO:0000255}; ...,MLLLLLLLLLLPPLVLRVAASRCLHDETQKSVSLLRPPFSQLPSKS...,{Membrane},1
6,A0AV02,SLC12A8,Homo sapiens (Human),714,SUBCELLULAR LOCATION: Membrane {ECO:0000305}; ...,MTQMSQVQELFHEAAQQDALAQPQPWWKTQLFMWEPVLFGTWDGVF...,{Membrane},1
7,A0AV96,RBM47,Homo sapiens (Human),593,SUBCELLULAR LOCATION: Nucleus {ECO:0000269|Pub...,MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...,"{Cytoplasm, Nucleus}",2
...,...,...,...,...,...,...,...,...
20395,Q9H0A3,TMEM191A,Homo sapiens (Human),160,SUBCELLULAR LOCATION: Membrane {ECO:0000255}; ...,MMNNTDFLMLNNPWNKLCLVSMDFCFPLDFVSNLFWIFASKFIIVT...,{Membrane},1
20396,Q9H1L0,MIR1-1HG,Homo sapiens (Human),117,,MPSCSCALMAPCGPAAGPAAVERTQQVARGEPGSARGQLQVSPEMS...,{},0
20402,Q9H693,C16orf95,Homo sapiens (Human),158,,MRASRSPPSPRRCHHHHEATGAASGAAAGGPGAGCVGLCRLALTPS...,{},0
20403,Q9H8Q6,HEXA-AS1,Homo sapiens (Human),139,,MTGKNVYFQSQLEAFHCLQYELFPSRLTINLLVTTHIPFPQTKPHI...,{},0


In [72]:
# Get all unique location values across all rows
all_locations = set()
for loc_set in df['location']:
    all_locations.update(loc_set)

# Sort alphabetically for easier reading
all_locations_sorted = sorted(all_locations)

print(f"Total unique locations: {len(all_locations_sorted)}\n")
for loc in all_locations_sorted:
    print(f"  - {loc}")


Total unique locations: 281

  - A band
  - Apical cell membrane
  - Apicolateral cell membrane
  - Autolysosome
  - Autolysosome membrane
  - Basal cell membrane
  - Basolateral cell membrane
  - COPI-coated vesicle
  - COPI-coated vesicle membrane
  - COPII-coated vesicle
  - COPII-coated vesicle membrane
  - Cajal body
  - Cell junction
  - Cell membrane
  - Cell projection
  - Cell surface
  - Cell tip
  - Chromosome
  - Cleavage furrow
  - Cornified envelope
  - Cortical granule
  - Cytolytic granule
  - Cytolytic granule membrane
  - Cytoplasm
  - Cytoplasmic granule
  - Cytoplasmic granule lumen
  - Cytoplasmic granule membrane
  - Cytoplasmic ribonucleoprotein granule
  - Cytoplasmic vesicle
  - Cytoplasmic vesicle lumen
  - Cytoplasmic vesicle membrane
  - DNAJB12 and DNAJB14 in punctate structures within the endoplasmic reticulum membrane
  - DNAJB12 and DNAJC18 in punctate structures within the endoplasmic reticulum membrane
  - DNAJC18 and DNAJB14 in punctate structures wit

# Secreted Proteins (no multiple locations)

With csv, run 02_makedataset.ipynb

In [73]:
df_secreted = df[
    (df["location"].apply(lambda x: len(set(["Secreted"]).intersection(x)) != 0 ))
    ]

df_secreted = df_secreted[["Entry", "Length", "location", "Gene Names (primary)", "n_location", "Sequence"]].rename(columns={"Entry": "protein", "Gene Names (primary)": "gene", "Sequence": "sequence"})
df_secreted["geneset_count"] = 1 # costant (sample with same probavility)
df_secreted["definitive_positive"] = df_secreted["n_location"].apply(lambda x: x == 1) # the False will be sondiered ambigues and thus removeed on the o2_makedataset scirpt

display(df_secreted)
df_secreted["definitive_positive"].value_counts()



Unnamed: 0,protein,Length,location,gene,n_location,sequence,geneset_count,definitive_positive
3,A0A0K2S4Q6,201,"{Secreted, Membrane}",CD300H,2,MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYE...,1,False
17,A0M8Q6,106,"{Secreted, Cell membrane}",IGLC7,2,GQPKAAPSVTLFPPSSEELQANKATLVCLVSDFNPGAVTVAWKADG...,1,False
28,A1E959,279,"{Secreted, Cytoplasm, Nucleus}",ODAM,3,MKIIILLGFLGATLSAPLIPQRLMSASNSNELLLNLNNGQLLPLQL...,1,False
63,A4D1T9,235,"{Secreted, secretory vesicle, Cytoplasmic vesi...",PRSS37,4,MKYVFYLGVLAGTFFFADSSVQKEDPAPYLVYLKSHFNPCVGVLIK...,1,False
70,A5D8T8,446,"{Endosome, Secreted, Golgi apparatus, Endoplas...",CLEC18A,4,MLHPETSPGRGHLLAVLLALLGTAWAEVWPPQLQEQAPMAGALNRK...,1,False
...,...,...,...,...,...,...,...,...
19911,Q96EE4,140,{Secreted},CCDC126,1,MFFTISRKNMSQKLSLLLLVFGLIWGLMLLHYTFQQPRHQSSVKLR...,1,True
19946,Q96MU5,243,{Secreted},CD300LD-AS1,1,MDELALSFSLTCLLPENRASLSPSQPLSFQCLKAPATLTWEDEKQQ...,1,True
19993,Q9H106,197,{Secreted},SIRPD,1,MPIPASPLHPPLPSLLLYLLLELAGVTHVFHVQQTEMSQTVSTGES...,1,True
20008,Q9H7B7,122,{Secreted},PKD1L1-AS1,1,MGFHFCIWIIFLLPPPCKKCLSPPTMNLRPPKSCGNVFYWVLVLNS...,1,True


definitive_positive
False    922
True     815
Name: count, dtype: int64

In [74]:
df_secreted.to_csv("/home/gdallagl/myworkdir/ESMSec/data/secreted/positive_secreted_proteins.csv", index=False)

# Lysosome (no multiple locations)

With csv, run 02_makedataset.ipynb

In [92]:
df_lyso = df[
    #(df["location"].apply(lambda x: len(set(["Lysosome lumen", "Lysosome membrane", "Lysosome"]).intersection(x)) != 0 )) &
    (df["location"].apply(lambda x: len(set(["Lysosome lumen", "Lysosome"]).intersection(x)) != 0 )) &
    (df["n_location"] == 1) 
    ]

df_lyso = df_lyso[["Entry", "Length", "location", "Gene Names (primary)", "n_location", "Sequence"]].rename(columns={"Entry": "protein", "Gene Names (primary)": "gene", "Sequence": "sequence"})
df_lyso["geneset_count"] = 1 # costant
df_lyso["definitive_positive"] = True # costant

print(df_lyso.shape)
df_lyso.sort_values("gene").head(10)



(38, 8)


Unnamed: 0,protein,Length,location,gene,n_location,sequence,geneset_count,definitive_positive
2714,P13686,325,{Lysosome},ACP5,1,MDMWTALLILQALLLPSLADGATPALRFVAVGDWGGVPNAPFHTAR...,1,True
3126,P20933,346,{Lysosome},AGA,1,MARKSNLPVLLVPFLLCQALVRCSSPLPLVVNTWPFKNATEAAWRA...,1,True
15020,P51689,593,{Lysosome},ARSD,1,MRSAARRGRAAPAARDSLPVLLFLCLLLKTCEPKTANAFKPNILLI...,1,True
10448,Q96EG1,525,{Lysosome},ARSG,1,MGWLFLKVLLAGVSFSGFLYPLVDFCISGKTRGQKPNFVIILADDM...,1,True
15107,Q01459,385,{Lysosome},CTBS,1,MSRPQLRRWRLVSSPPSGVPGLALLALLALLALRLAAGTDCPCPEP...,1,True
2538,P10619,480,{Lysosome},CTSA,1,MIRAAPPPLFLLLLLLLLLVSWASRGEAAPDQDEIQRLPGLAKQPS...,1,True
4617,P53634,463,{Lysosome},CTSC,1,MGAGPSLLLAALLLLLSGDGAVRCDTPANCTYLDLLGTWVFQVGSS...,1,True
13470,Q9UBX1,484,{Lysosome},CTSF,1,MAPWLQLLSLLGLLPGAVAAPAQPRAASFQAWGPPSPELLAPTRFA...,1,True
2393,P09668,335,{Lysosome},CTSH,1,MWATLPLLCAGAWLLGVPVCGAAELCVNSLEKFHFKSWMSKHRKTY...,1,True
4066,P43234,321,{Lysosome},CTSO,1,MDVRALPWLPWLLWLLCRGGGDADSRAPFTPTWPRSREREAAAFRE...,1,True


In [76]:
df_lyso.to_csv("/home/gdallagl/myworkdir/ESMSec/data/secreted/positive_lyso_proteins.csv", index=False)

# Lyso Vs Secreted

No need for 02_makeDataset.ipynb

In [77]:
df_lyso.loc[:, "label"] = 1 # costant
df_secreted.loc[:, "label"] = 0 # costant

data = pd.concat([df_lyso, df_secreted])

# Remove ambigous from secretedS
data = data[data.definitive_positive == True]

data["geneset_count"] = 1 # costant

data

Unnamed: 0,protein,Length,location,gene,n_location,sequence,geneset_count,definitive_positive,label
190,O00115,360,{Lysosome},DNASE2,1,MIPLLLAALLCVPAGALTCYGDSGQPVDWFVVYKLPALRGSGEAAQ...,1,True,1
289,O00462,879,{Lysosome},MANBA,1,MRLHLLLLLALCGAGTTAAELSYSLRGNWSICNGNGSLELPGAVPG...,1,True,1
1096,O60911,334,{Lysosome},CTSV,1,MNLSLVLAAFCLGIASAVPKFDQNLDTKWYQWKATHRRLYGANEEG...,1,True,1
2032,P04066,466,{Lysosome},FUCA1,1,MRAPGMRSRPAGPALLLLLLFLGAAESVRRAQPPRRYTPDWPSLDS...,1,True,1
2119,P05164,745,{Lysosome},MPO,1,MGVPFFSSLRCMVDLGPCWAGGLTAEMKLLLALAGLLAILATPQPS...,1,True,1
...,...,...,...,...,...,...,...,...,...
19911,Q96EE4,140,{Secreted},CCDC126,1,MFFTISRKNMSQKLSLLLLVFGLIWGLMLLHYTFQQPRHQSSVKLR...,1,True,0
19946,Q96MU5,243,{Secreted},CD300LD-AS1,1,MDELALSFSLTCLLPENRASLSPSQPLSFQCLKAPATLTWEDEKQQ...,1,True,0
19993,Q9H106,197,{Secreted},SIRPD,1,MPIPASPLHPPLPSLLLYLLLELAGVTHVFHVQQTEMSQTVSTGES...,1,True,0
20008,Q9H7B7,122,{Secreted},PKD1L1-AS1,1,MGFHFCIWIIFLLPPPCKKCLSPPTMNLRPPKSCGNVFYWVLVLNS...,1,True,0


In [78]:
from sklearn.model_selection import train_test_split

# Stratified split by label
train_clusters, temp_clusters = train_test_split(
    data,
    test_size=0.2,
    stratify=data['label'],
    random_state=42
)
val_clusters, test_clusters = train_test_split(
    temp_clusters,
    test_size=0.5,
    stratify=temp_clusters['label'],
    random_state=42
)

train_prots = train_clusters["protein"].to_list()
val_prots = val_clusters["protein"].to_list()
test_prots = test_clusters["protein"].to_list()

test_prots
data['set'] = ["train" if p in train_prots else "val" if p in val_prots else "test" for p in data["protein"]]
data.loc[data[data.set == "val"].index[1:], "set"] = "test"  # just to have more

# Check split distribution 
print(data.groupby('set')['label'].value_counts().unstack(fill_value=0)) 
display(data)


label    0   1
set           
test   163   7
train  652  30
val      0   1


Unnamed: 0,protein,Length,location,gene,n_location,sequence,geneset_count,definitive_positive,label,set
190,O00115,360,{Lysosome},DNASE2,1,MIPLLLAALLCVPAGALTCYGDSGQPVDWFVVYKLPALRGSGEAAQ...,1,True,1,train
289,O00462,879,{Lysosome},MANBA,1,MRLHLLLLLALCGAGTTAAELSYSLRGNWSICNGNGSLELPGAVPG...,1,True,1,train
1096,O60911,334,{Lysosome},CTSV,1,MNLSLVLAAFCLGIASAVPKFDQNLDTKWYQWKATHRRLYGANEEG...,1,True,1,train
2032,P04066,466,{Lysosome},FUCA1,1,MRAPGMRSRPAGPALLLLLLFLGAAESVRRAQPPRRYTPDWPSLDS...,1,True,1,train
2119,P05164,745,{Lysosome},MPO,1,MGVPFFSSLRCMVDLGPCWAGGLTAEMKLLLALAGLLAILATPQPS...,1,True,1,val
...,...,...,...,...,...,...,...,...,...,...
19911,Q96EE4,140,{Secreted},CCDC126,1,MFFTISRKNMSQKLSLLLLVFGLIWGLMLLHYTFQQPRHQSSVKLR...,1,True,0,train
19946,Q96MU5,243,{Secreted},CD300LD-AS1,1,MDELALSFSLTCLLPENRASLSPSQPLSFQCLKAPATLTWEDEKQQ...,1,True,0,train
19993,Q9H106,197,{Secreted},SIRPD,1,MPIPASPLHPPLPSLLLYLLLELAGVTHVFHVQQTEMSQTVSTGES...,1,True,0,test
20008,Q9H7B7,122,{Secreted},PKD1L1-AS1,1,MGFHFCIWIIFLLPPPCKKCLSPPTMNLRPPKSCGNVFYWVLVLNS...,1,True,0,train


In [79]:
data.to_csv("/home/gdallagl/myworkdir/ESMSec/data/secreted/dataset_lysoVSsecreted.csv", index=False)

# Multi class