# Build the processed UniProt proteome

- Only Human
- Only reviewed
- No Isoforms
- Give every protein a gene name (fall back to `Gene Names` when the primary gene name is missing)
- Remove proteins with no gene name
- Remove duplicated proteins
- Remove duplicated genes

In [21]:
import pandas as pd

# Directory holding the UniProt files; defined once so the input and output
# paths below cannot drift apart when the data location changes.
UNIPROT_DIR = "/home/gdallagl/myworkdir/ESMSec/data/UniProt"

# Input: raw UniProt export (tab-separated) that gets loaded with pd.read_csv.
UNIPROT_PATH = f"{UNIPROT_DIR}/human_proteome_raw.tsv"
# Output: where the processed proteome table is written.
SAVE_PATH = f"{UNIPROT_DIR}/human_proteome.tsv"

In [24]:
# Load the raw UniProt export (tab-separated).
proteome_uniprot = pd.read_csv(UNIPROT_PATH, sep="\t")
print("Number intial proteins: ", len(proteome_uniprot))


# ONLY human
    # the input file is already filtered by species

# ONLY Swiss-Prot (reviewed) entries
proteome_uniprot = proteome_uniprot[proteome_uniprot.Reviewed == "reviewed"].copy()

# No isoforms, use the canonical protein: an accession ending in "-<number>"
# is flagged as an isoform, reported, and then actually dropped.
# na=False keeps the flag strictly boolean even if an Entry value is missing.
proteome_uniprot["IsIsoform"] = proteome_uniprot["Entry"].str.contains(r"-\d+$", na=False)
n_isoforms = proteome_uniprot["IsIsoform"].sum()
n_total = len(proteome_uniprot)
# Guard the percentage against an empty table (nothing left after filtering).
isoform_fraction = n_isoforms / n_total if n_total else 0.0
print(f"Isoforms: {n_isoforms} / {n_total} ({isoform_fraction:.2%})")
proteome_uniprot = proteome_uniprot[~proteome_uniprot["IsIsoform"]].copy()

# Give every protein a gene name: use the primary name, falling back to the
# full "Gene Names" field when the primary one is missing.
gene_names = proteome_uniprot["Gene Names (primary)"].fillna(proteome_uniprot["Gene Names"])

# Choose a single gene name if more than one is listed
# (first ";"-separated item, then its first space-separated token).
gene_names = gene_names.str.split(";").str[0].str.split(" ").str[0]
proteome_uniprot["Gene Names (primary)"] = gene_names

# No protein without a gene name (missing or empty after the split).
# .copy() makes the result an independent frame, so later column assignments
# do not trigger SettingWithCopyWarning.
has_gene_name = gene_names.notna() & (gene_names != "")
proteome_uniprot = proteome_uniprot[has_gene_name].copy()

# Remove duplicated proteins, then proteins sharing a gene name --> keep the longest.
# A stable sort makes the kept row deterministic when lengths tie.
proteome_uniprot = (
    proteome_uniprot
    .sort_values("Length", ascending=False, kind="mergesort")
    .drop_duplicates("Entry", keep="first")
    .drop_duplicates("Gene Names (primary)", keep="first")
    .copy()
)

# Define proteins to use in the following analysis
ALLOWED_PROTEINS = set(proteome_uniprot.Entry)
print("Number allowed proteins: ", len(ALLOWED_PROTEINS))
print(*list(ALLOWED_PROTEINS)[:10], "...")
display(proteome_uniprot)


# Save (without the pandas index)
proteome_uniprot.to_csv(SAVE_PATH, index=False)

Number intial proteins:  20405
Isoforms: 0 / 20405 (0.00%)
Number allowed proteins:  20210
Q6UX40 Q9BQ90 Q9Y5X0 A0A0C4DH32 Q5BJD5 O14602 Q5SRI9 Q13011 Q8NA29 P52701 ...


Unnamed: 0,Entry,Gene Names (primary),Reviewed,Gene Names,Organism,Length,Protein names,Sequence,IsIsoform
9925,Q8WZ42,TTN,reviewed,TTN,Homo sapiens (Human),34350,Titin (EC 2.7.11.1) (Connectin) (Rhabdomyosarc...,MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI...,False
9884,Q8WXI7,MUC16,reviewed,MUC16 CA125,Homo sapiens (Human),14507,Mucin-16 (MUC-16) (Ovarian cancer-related tumo...,MLKPSGLPGSSSPTRSLMTGSRSTKATPEMDSGLTGATLSPKTSTG...,False
18482,Q9H195,MUC3B,reviewed,MUC3B,Homo sapiens (Human),13477,Mucin-3B (MUC-3B) (Intestinal mucin-3B),MQLLGLLSILWMLKSSPGATGTLSTATSTSHVTFPRAEATRTALSN...,False
9354,Q8NF91,SYNE1,reviewed,SYNE1 C6orf98 KIAA0796 KIAA1262 KIAA1756 MYNE1,Homo sapiens (Human),8797,Nesprin-1 (Enaptin) (KASH domain-containing pr...,MATSRGASRCPRDIANVMQRLQDEQEIVQKRTFTKWINSHLAKRKP...,False
3121,P20929,NEB,reviewed,NEB,Homo sapiens (Human),8525,Nebulin,MADDEDYEEVVEYYTEEVVYEEVPGETITKIYETTTTRTSDYEQSE...,False
...,...,...,...,...,...,...,...,...,...
18720,A0A0J9YX06,TRBJ1-2,reviewed,TRBJ1-2,Homo sapiens (Human),15,T cell receptor beta joining 1-2,NYGYTFGSGTRLTVV,False
18708,A0A0A0MT89,IGKJ1,reviewed,IGKJ1,Homo sapiens (Human),12,Immunoglobulin kappa joining 1,WTFGQGTKVEIK,False
19241,P0DOY5,IGHD1-1,reviewed,IGHD1-1,Homo sapiens (Human),5,Immunoglobulin heavy diversity 1-1,GTTGT,False
19251,P0DPI4,TRBD1,reviewed,TRBD1,Homo sapiens (Human),4,T cell receptor beta diversity 1,GTGG,False
