In [6]:
import re

import pandas as pd

In [7]:
# Load the identifier mapping table.
#
# Each row pairs a UniProt accession ("From") with a STRING ID ("To");
# the file comes from the UniProt ID-mapping site.
mapping = pd.read_csv(
    "idmapping_2024_05_21.tsv",
    delimiter="\t",
)
mapping.head(5)


Unnamed: 0,From,To
0,Q64449,10090.ENSMUSP00000097909
1,Q9R118,10090.ENSMUSP00000006367
2,Q63844,10090.ENSMUSP00000051619
3,O08532,10090.ENSMUSP00000049457
4,Q9QXB9,10090.ENSMUSP00000018568


In [8]:
def extract_string_predictions(uniprot_id, mapping_df=None, table_dir="interaction_tables"):
    """Load one STRING network table and return its edges keyed by UniProt ID.

    Reads ``{table_dir}/{uniprot_id}.tsv`` (tab-separated), translates both
    STRING node IDs to UniProt accessions through the mapping table, and
    returns each edge with its two proteins in alphanumeric order so the pairs
    can be matched against the topsy_turvy prediction table.

    Parameters
    ----------
    uniprot_id : str
        Name (without extension) of the network file to read.
    mapping_df : pandas.DataFrame, optional
        Table with a "From" column (UniProt ID) and a "To" column (STRING ID).
        Defaults to the notebook-level ``mapping`` frame.
    table_dir : str, optional
        Directory holding the per-protein network tables.

    Returns
    -------
    pandas.DataFrame
        Columns ``prot1``, ``prot2`` (ordered so that prot1 <= prot2) and
        ``string_score`` (the table's ``combined_score``). Edges whose nodes
        have no entry in the mapping table are dropped by the inner merges.
    """
    if mapping_df is None:
        # Fall back to the mapping table loaded earlier in the notebook.
        mapping_df = mapping
    # Only the two ID columns are needed; leaving out any others avoids
    # column-name collisions across the two merges.
    id_lookup = mapping_df[["From", "To"]]

    # protein network in tabular format
    network = pd.read_csv(f"{table_dir}/{uniprot_id}.tsv", sep="\t")

    # map STRING ID to Uniprot ID for both ends of every edge
    mapped_network = network
    for node in ("node1", "node2"):
        node_lookup = id_lookup.rename(
            columns={"From": f"{node}_uniprot_id", "To": f"{node}_string_id"}
        )
        mapped_network = mapped_network.merge(
            node_lookup, how="inner", on=f"{node}_string_id"
        )

    # reorder protein columns alphanumerically for easy matching with
    # topsy_turvy prediction table
    first = mapped_network["node1_uniprot_id"]
    second = mapped_network["node2_uniprot_id"]
    already_ordered = first <= second

    interactions = pd.DataFrame({
        "prot1": first.where(already_ordered, second),
        "prot2": second.where(already_ordered, first),
        "string_score": mapped_network["combined_score"],
    })

    return interactions
    

In [9]:
# UniProt accessions whose STRING network tables are combined below.
all_ligands = ["A2ASS6", "P14131", "P48774", "Q3U0V1", "Q8K183", "Q9D892",
               "E9PV24", "P18242", "P50247", "Q5SNZ0", "Q8VC30", "Q9DCD0",
               "O08576", "P19137", "P54729", "Q60823", "Q8VCM7", "Q9EQH3",
               "O08677", "P19157", "P54822", "Q61081", "Q91X72", "Q9ESN6",
               "O09114", "P22599", "P60202", "Q61147", "Q91YI0", "Q9ET01",
               "O35350", "P23591", "P62821", "Q61292", "Q91ZX7", "Q9JJ59",
               "O54941", "P23953", "P97315", "Q61316", "Q921I1", "Q9JLJ2",
               "O88569", "P28665", "Q00623", "Q61730", "Q922D8", "Q9R0P5",
               "O88844", "P29341", "Q00897", "Q6P069", "Q99K28", "Q9WTR5",
               "O88998", "P29699", "Q00898", "Q6P9R2", "Q99KQ4", "Q9WU78",
               "P01027", "P30416", "Q02105", "Q6ZPQ6", "Q99MN9", "Q9WVH9",
               "P01592", "P34914", "Q06890", "Q80TL7", "Q9CXW3", "Q9Z2A0",
               "P10493", "P42227", "Q11011", "Q80XQ2", "Q9D154", "Q9Z2I9",
               "P11276", "P46425", "Q3TIR3", "Q8K0E8", "Q9D1A2"]

# One interaction table per ligand.
dataframes = [extract_string_predictions(ligand) for ligand in all_ligands]

# Leave empty tables out of the concat. The recorded run shows a warning on
# the concat line -- presumably pandas' deprecation of empty/all-NA entries
# in concat (TODO confirm); excluding them is the documented way to keep the
# current result dtypes.
non_empty = [frame for frame in dataframes if not frame.empty]

if non_empty:
    all_interactions = pd.concat(non_empty, ignore_index=True)
else:
    # pd.concat raises on an empty list, so fall back to an empty table that
    # has the columns produced by extract_string_predictions.
    all_interactions = pd.DataFrame(columns=["prot1", "prot2", "string_score"])
print(all_interactions)


      prot1   prot2  string_score
0    A2ASS6  Q9JI91         0.905
1    A2ASS6  Q8VDD5         0.900
2    Q8VDD5  Q9CQ19         0.924
3    Q3THE2  Q8VDD5         0.957
4    A2ASS6  Q3THE2         0.900
..      ...     ...           ...
744  P11276  P35441         0.994
745  P11276  Q8CIZ8         0.979
746  O54890  P35441         0.540
747  O54890  Q8CIZ8         0.868
748  P16460  Q9D1A2         0.489

[749 rows x 3 columns]


  all_interactions = pd.concat(dataframes, ignore_index=True)


In [15]:
# load in topsy_turvy predictions (headerless TSV: columns 0 and 1 are used as
# the protein pair, column 2 as the prediction score)
topsy_turvy = pd.read_csv("../output/2024-05-17-14:27.predictions.tsv", sep="\t", header=None)

# reorder protein columns alphanumerically for easy matching
tt = pd.DataFrame({
    "prot1": topsy_turvy[[0, 1]].min(axis=1),
    "prot2": topsy_turvy[[0, 1]].max(axis=1),
    "tt_score": topsy_turvy[2],
})

# Left merge: every topsy_turvy pair is kept and gets its STRING score
# attached; pairs without a STRING match end up with string_score = NaN.
final = pd.merge(tt, all_interactions, how="left", on=["prot1", "prot2"])

# Export only the pairs that have a STRING score, one row per protein pair.
(
    final[final["string_score"].notna()]
    .drop_duplicates(subset=["prot1", "prot2"])
    .to_csv("STRING_interactions.csv")
)

In [103]:
# load in mouse proteome with subcellular loci
uniprot_annot = pd.read_csv("../subcel_location/uniprotkb_organism_id_10090_2024_05_22.tsv", sep='\t')

# keywords and locations we want to filter
keywords = ["Receptor"]
locations = ["Cell surface", "Cell outer membrane"]

# Join the terms into one alternation per column. Each term is escaped so that
# entries containing regex metacharacters (parentheses, dots, ...) are still
# matched literally.
pattern = '|'.join(re.escape(keyword) for keyword in keywords)
pattern2 = '|'.join(re.escape(location) for location in locations)

# Case-insensitive substring matches; rows with a missing annotation count as
# non-matching (na=False).
has_keyword = uniprot_annot["Keywords"].str.contains(pattern, case=False, na=False)
has_location = uniprot_annot["Subcellular location [CC]"].str.contains(pattern2, case=False, na=False)

# Keep entries that match both a keyword and a location.
membrane_prot = uniprot_annot[has_keyword & has_location]

membrane_prot

Unnamed: 0,Entry,Subcellular location [CC],Keywords
669,O08747,SUBCELLULAR LOCATION: Cell membrane {ECO:00002...,Alternative splicing;Apoptosis;Cell membrane;C...
943,O54689,SUBCELLULAR LOCATION: Cell membrane {ECO:00002...,Cell membrane;Disulfide bond;G-protein coupled...
947,O54709,SUBCELLULAR LOCATION: Cell membrane {ECO:00002...,3D-structure;Adaptive immunity;Alternative spl...
1193,O88307,SUBCELLULAR LOCATION: Golgi apparatus membrane...,Cell membrane;Cleavage on pair of basic residu...
1237,O88536,SUBCELLULAR LOCATION: Cell membrane {ECO:00002...,Cell membrane;Chemotaxis;Disulfide bond;G-prot...
1343,O89026,SUBCELLULAR LOCATION: Cell membrane {ECO:00002...,Cell membrane;Cell projection;Chemotaxis;Devel...
1778,P14206,SUBCELLULAR LOCATION: Cell membrane. Cytoplasm...,3D-structure;Acetylation;Cell membrane;Cytopla...
1812,P15208,SUBCELLULAR LOCATION: Cell membrane {ECO:00002...,3D-structure;ATP-binding;Cell membrane;Cleavag...
1858,P16297,SUBCELLULAR LOCATION: Cell membrane {ECO:00002...,3D-structure;Cell membrane;Disulfide bond;Glyc...
2073,P23818,SUBCELLULAR LOCATION: Cell membrane {ECO:00002...,3D-structure;Cell membrane;Cell projection;Dis...


In [88]:
# filter final df so at least one protein in each interaction pair is in membrane_prot
# create set of uniprot ids for faster lookup
memb_prot_set = set(membrane_prot["Entry"])


def pair_contains_memb_prot(row):
    """Return True if at least one protein of the pair in `row` is in memb_prot_set."""
    return row["prot1"] in memb_prot_set or row["prot2"] in memb_prot_set


# Column-wise equivalent of applying pair_contains_memb_prot to every row:
# avoids a Python-level call per row and also works when `final` is empty.
has_memb_prot = final["prot1"].isin(memb_prot_set) | final["prot2"].isin(memb_prot_set)
final_filtered = final[has_memb_prot].drop_duplicates()

In [101]:
# Display the filtered pairs from highest to lowest STRING score.
final_filtered.sort_values("string_score", ascending=False)

Unnamed: 0,prot1,prot2,tt_score,string_score
149010,O88998,P23818,0.071798,0.928
5250,P15208,Q99KQ4,0.111587,0.408
4377,P01864,P28843,0.000030,
4378,P01867,P28843,0.000023,
4379,P28843,Q921I1,0.461540,
...,...,...,...,...
149021,P23818,P62821,0.415660,
149022,P23818,P29341,0.030342,
149023,P23818,Q9D892,0.599177,
149024,O88844,P23818,0.349388,


In [31]:
# alternate method: mouse cell surface proteome from: https://wlab.ethz.ch/cspa/#abstract
cell_surf_prot_sheet = pd.read_excel("../subcel_location/cell_surface_atlas.xlsx", sheet_name="Table B")

# Drop the "unspecific" and "putative" CSPA categories.
excluded_categories = ["3 - unspecific", "2 - putative"]
in_kept_category = ~cell_surf_prot_sheet["CSPA category"].isin(excluded_categories)

# Keep proteins whose name mentions "receptor" (case-insensitive). na=False
# treats a missing name as a non-match instead of producing an NA mask, which
# boolean indexing would reject.
is_receptor = cell_surf_prot_sheet["UP_Protein_name"].str.contains("receptor", case=False, na=False)

cell_surf_prot_sheet = cell_surf_prot_sheet[in_kept_category & is_receptor]

# Set of IDs from the "ID_link" column, used for fast membership lookups.
surf_prot_ids = set(cell_surf_prot_sheet["ID_link"])



  warn(msg)


In [32]:
def pair_contains_surf_prot(row):
    """Return True if at least one protein of the pair in `row` is in surf_prot_ids."""
    return row["prot1"] in surf_prot_ids or row["prot2"] in surf_prot_ids


# Column-wise equivalent of applying pair_contains_surf_prot to every row:
# avoids a Python-level call per row and also works when `final` is empty.
has_surf_prot = final["prot1"].isin(surf_prot_ids) | final["prot2"].isin(surf_prot_ids)

# Keep one row per protein pair.
final_filtered = final[has_surf_prot].drop_duplicates(subset=["prot1", "prot2"])
final_filtered

Unnamed: 0,prot1,prot2,tt_score,string_score
0,P01864,Q64449,0.265069,
1,P01867,Q64449,0.098694,
2,Q64449,Q921I1,0.156688,
3,O08677,Q64449,0.245704,
4,P01837,Q64449,0.321899,
...,...,...,...,...
245067,Q61730,Q99MJ9,0.007650,
245160,Q61730,Q91ZX6,0.134941,
245253,Q61730,Q8BJ90,0.250010,
245346,Q61598,Q61730,0.065365,


In [39]:
# Rank the pairs by topsy_turvy score (highest first) and renumber the rows so
# the index is the rank. Done without inplace=True so the frame produced by
# the filtering step is not mutated behind the reader's back.
final_filtered = (
    final_filtered
    .sort_values(by="tt_score", ascending=False)
    .reset_index(drop=True)
)

# Show ranks 10-17 and 19-25. Rank 18 is skipped; the notebook does not say
# why -- TODO document the reason.
final_filtered.loc[list(range(10,18)) + list(range(19, 26))]


Unnamed: 0,prot1,prot2,tt_score,string_score
10,P25446,Q91X72,0.748812,
11,O08756,Q61730,0.748693,
12,Q61730,Q921J2,0.748203,
13,Q62351,Q91X72,0.748143,
14,P25446,Q61292,0.748073,
15,P08556,Q61730,0.747705,
16,P32883,Q61730,0.747658,
17,O88844,P08101,0.743391,
19,P01843,P97797,0.738859,
20,Q61730,Q99JI6,0.734045,


In [40]:
# Write the protein pairs in rows 10-17 and 19-25 of final_filtered to CSV.
selected_rows = [*range(10, 18), *range(19, 26)]
final_filtered.loc[selected_rows, ["prot1", "prot2"]].to_csv("../alphafold/to_json/SP(receptors)_tt_10:26.csv")