In [24]:
import pandas as pd

In [78]:
# df contains all known protein interactions with ligands, from interact protein database
df = pd.read_csv("ligands_interactomes_interact.txt", sep="\t", comment="#", header=None)
df = df[[0, 1, 9, 10, 11]]

# modify uniprot id strings for easy mapping
df[0] = df[0].str.replace('uniprotkb:', '', regex=False)
df[1] = df[1].str.replace('uniprotkb:', '', regex=False)
df[0] = df[0].str.replace(r'-1$', '', regex=True) #r indicates raw string, -1 is suffix to remove, $ specifies only if char at end
df[1] = df[1].str.replace(r'-1$', '', regex=True)
df.head()

# df1 contains all ligand/receptor pairs and predicted interactions scores (from topsy turvy)
df1 = pd.read_csv("../output/2024-05-17-14:27.predictions.tsv", sep="\t", header=None)
df1.head()

Unnamed: 0,0,1,2
0,Q64449,P01864,0.265069
1,Q64449,P01867,0.098694
2,Q64449,Q921I1,0.156688
3,Q64449,O08677,0.245704
4,Q64449,P01837,0.321899


In [79]:
# extract protein pairs that are present in both the interact dataset and topsy turvy predictions
merged = pd.merge(df, df1, how='inner', left_on=[0,1], right_on=[0,1])

# want to include ligand-receptor pairs but also receptor-ligand pairs, so flip columns
df1 = df1[[1, 0, 2]]
df1.columns = [0, 1, 2]

# extract receptor-ligand pairs
merged1 = pd.merge(df, df1, how='inner', left_on=[0,1], right_on=[0,1])

# merge receptor-ligand and ligand-receptor pairs
merged_df = pd.concat([merged, merged1], axis=0)

# drop duplicate pairings
df_sorted = pd.DataFrame({
    0: merged_df[[0, 1]].min(axis=1),
    1: merged_df[[0, 1]].max(axis=1),
    2: merged_df[11],
    3: merged_df[2]
})

# Drop duplicate rows based on sorted pairs
df_unique = df_sorted.drop_duplicates()
df_unique

Unnamed: 0,0,1,2,3
0,P62821,Q9EQH3,"psi-mi:""MI:0914""(association)",0.565085
1,Q3UL36,Q9JLJ2,"psi-mi:""MI:0914""(association)",0.041987
2,P01837,P01864,"psi-mi:""MI:0407""(direct interaction)",0.685090
4,Q99KQ4,Q99KQ4,"psi-mi:""MI:0407""(direct interaction)",0.606532
8,Q99K28,Q9QWI6,"psi-mi:""MI:0914""(association)",0.342669
...,...,...,...,...
63,Q6PHU5,Q9EQH3,"psi-mi:""MI:0915""(physical association)",0.005360
65,P07901,P62821,"psi-mi:""MI:0914""(association)",0.300141
69,P14602,P42227,"psi-mi:""MI:0915""(physical association)",0.510673
70,P42227,Q99KX1,"psi-mi:""MI:0915""(physical association)",0.369797


In [83]:
# rename columns and visualize top 20 PPIs by predicted affinites
df_unique.columns = ["protein1", "protein2", "association type", "predicted interaction"]
df_unique.sort_values(by="predicted interaction", ascending=False).head(20)

Unnamed: 0,protein1,protein2,association type,predicted interaction
59,P62821,P63011,"psi-mi:""MI:0403""(colocalization)",0.844074
58,P62821,P62823,"psi-mi:""MI:0403""(colocalization)",0.838359
41,P35283,P62821,"psi-mi:""MI:0914""(association)",0.831864
32,P62821,Q91ZR1,"psi-mi:""MI:0403""(colocalization)",0.826543
75,P29341,Q64012,"psi-mi:""MI:0914""(association)",0.826432
48,P62821,Q6PHN9,"psi-mi:""MI:0914""(association)",0.82509
68,P62821,Q9CZT8,"psi-mi:""MI:0403""(colocalization)",0.819512
42,P62821,Q9D1G1,"psi-mi:""MI:0914""(association)",0.806428
94,P35278,Q9EQH3,"psi-mi:""MI:0914""(association)",0.786812
7,P01837,P01868,"psi-mi:""MI:0407""(direct interaction)",0.737714
