# Make Uniprot proteome processed

- Only Human
- Only reviewed
- No Isoforms
- Give gene names to all prots
- Remove proteins with no gene name
- Remove duplicated proteins
- Remove duplicated genes

In [1]:
import pandas as pd

UNIPROT_PATH = "/home/gdallagl/myworkdir/ESMSec/data/UniProt/human_proteome_raw.tsv"
SAVE_PATH = "/home/gdallagl/myworkdir/ESMSec/data/UniProt/human_proteome.tsv"

In [2]:
proteome_uniprot = pd.read_csv(UNIPROT_PATH, sep="\t")
print("Number intial proteins: ", len(proteome_uniprot))


# ONLY Human 
    # csv file alredy fitered by species

# ONLY swissprot
proteome_uniprot = proteome_uniprot[proteome_uniprot.Reviewed == "reviewed"].copy()

# No isofrom, use canonical protein
proteome_uniprot['IsIsoform'] = proteome_uniprot['Entry'].str.contains(r'-\d+$')
n_isoforms = proteome_uniprot['IsIsoform'].sum()
n_total = len(proteome_uniprot)
print(f"Isoforms: {n_isoforms} / {n_total} ({n_isoforms/n_total:.2%})")

# No protein withou gene name
proteome_uniprot["Gene Names (primary)"] = ( #give names
    proteome_uniprot["Gene Names (primary)"]
    .fillna(proteome_uniprot["Gene Names"])
)
proteome_uniprot = proteome_uniprot[pd.notna(proteome_uniprot["Gene Names (primary)"])]

# Choose a single gene name if nore than one
proteome_uniprot["Gene Names (primary)"] = proteome_uniprot["Gene Names (primary)"].str.split(";").str[0].str.split(" ").str[0]

# Remove proteins with same gene name --> mantain longer protein
proteome_uniprot = proteome_uniprot.sort_values("Length", ascending=False).drop_duplicates("Gene Names (primary)", keep="first").copy()

# Define proteins to use in the follwing analasys
ALLOWED_PROTEINS = set(proteome_uniprot.Entry)
print("Number allowed proteins: ", len(ALLOWED_PROTEINS))
print(*list(ALLOWED_PROTEINS)[:10], "...")
display(proteome_uniprot)


# Save
proteome_uniprot.to_csv(SAVE_PATH, index=False)

Number intial proteins:  20420
Isoforms: 0 / 20420 (0.00%)
Number allowed proteins:  20210
Q9BXN6 Q9Y2G5 O00170 Q6ZWT7 Q02410 P33121 Q8NGD2 P08913 O95721 P0CG21 ...


Unnamed: 0,Entry,Reviewed,Gene Names (primary),Gene Names,Organism,Length,Subcellular location [CC],Gene Ontology (cellular component),Sequence,IsIsoform
9928,Q8WZ42,reviewed,TTN,TTN,Homo sapiens (Human),34350,SUBCELLULAR LOCATION: Cytoplasm {ECO:0000305|P...,condensed nuclear chromosome [GO:0000794]; cyt...,MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI...,False
9887,Q8WXI7,reviewed,MUC16,MUC16 CA125,Homo sapiens (Human),14507,SUBCELLULAR LOCATION: Cell membrane; Single-pa...,external side of plasma membrane [GO:0009897];...,MLKPSGLPGSSSPTRSLMTGSRSTKATPEMDSGLTGATLSPKTSTG...,False
18492,Q9H195,reviewed,MUC3B,MUC3B,Homo sapiens (Human),13477,SUBCELLULAR LOCATION: Membrane {ECO:0000305}; ...,Golgi lumen [GO:0005796]; plasma membrane [GO:...,MQLLGLLSILWMLKSSPGATGTLSTATSTSHVTFPRAEATRTALSN...,False
9357,Q8NF91,reviewed,SYNE1,SYNE1 C6orf98 KIAA0796 KIAA1262 KIAA1756 MYNE1,Homo sapiens (Human),8797,SUBCELLULAR LOCATION: Nucleus outer membrane {...,cytoplasm [GO:0005737]; cytoskeleton [GO:00058...,MATSRGASRCPRDIANVMQRLQDEQEIVQKRTFTKWINSHLAKRKP...,False
3124,P20929,reviewed,NEB,NEB,Homo sapiens (Human),8525,"SUBCELLULAR LOCATION: Cytoplasm, myofibril, sa...",actin cytoskeleton [GO:0015629]; cytosol [GO:0...,MADDEDYEEVVEYYTEEVVYEEVPGETITKIYETTTTRTSDYEQSE...,False
...,...,...,...,...,...,...,...,...,...,...
18716,A0A0A0MT78,reviewed,TRBJ2-7,TRBJ2-7,Homo sapiens (Human),15,SUBCELLULAR LOCATION: Cell membrane {ECO:00003...,T cell receptor complex [GO:0042101],SYEQYFGPGTRLTVT,False
18718,A0A0A0MT89,reviewed,IGKJ1,IGKJ1,Homo sapiens (Human),12,SUBCELLULAR LOCATION: Secreted {ECO:0000303|Pu...,extracellular region [GO:0005576]; immunoglobu...,WTFGQGTKVEIK,False
19253,P0DOY5,reviewed,IGHD1-1,IGHD1-1,Homo sapiens (Human),5,SUBCELLULAR LOCATION: Secreted {ECO:0000303|Pu...,extracellular region [GO:0005576]; immunoglobu...,GTTGT,False
19263,P0DPI4,reviewed,TRBD1,TRBD1,Homo sapiens (Human),4,SUBCELLULAR LOCATION: Cell membrane {ECO:00003...,T cell receptor complex [GO:0042101],GTGG,False
