In [2]:
import pandas as pd 
import pickle
import numpy as np
import tensorflow as tf

# Load Protein-Protein Interactions

In [5]:
df_PPI=pd.read_csv('data/PPI.txt',sep=' ')

* We use all interactions in the String DataBase
* We can filter interactions to include specfic confidence score or association types

In [3]:
#df_PPI=df_PPI[df_PPI['combined_score']>300]
#df_PPI=df_PPI[df_PPI['experiments']>0] #==> 40%
#df_PPI=df_PPI[(df_PPI['neighborhood']>0 ) | (df_PPI['cooccurence']>0 ) | (df_PPI['homology']>0 ) | (df_PPI['coexpression']>0 ) | (df_PPI['experiments']>0 ) | (df_PPI['database']>0) | (df_PPI['textmining']>0  ) ]
#df_PPI=df_PPI[(df_PPI['neighborhood_transferred']>0 ) | (df_PPI['coexpression_transferred']>0 ) | (df_PPI['experiments_transferred']>0 ) | (df_PPI['database_transferred']>0 ) | (df_PPI['textmining_transferred']>0 )  ]
#df_PPI=df_PPI[(df_PPI['experiments_transferred']>0 ) | (df_PPI['experiments']>0 )  ] # 4574962


In [7]:
df_PPI.shape

(11759454, 16)

In [8]:
df_PPI.head()

Unnamed: 0,protein1,protein2,neighborhood,neighborhood_transferred,fusion,cooccurence,homology,coexpression,coexpression_transferred,experiments,experiments_transferred,database,database_transferred,textmining,textmining_transferred,combined_score
0,9606.ENSP00000000233,9606.ENSP00000272298,0,0,0,332,0,0,62,0,181,0,0,0,125,490
1,9606.ENSP00000000233,9606.ENSP00000253401,0,0,0,0,0,0,0,0,186,0,0,0,56,198
2,9606.ENSP00000000233,9606.ENSP00000401445,0,0,0,0,0,0,0,0,160,0,0,0,0,159
3,9606.ENSP00000000233,9606.ENSP00000418915,0,0,0,0,0,0,61,0,158,0,0,542,0,606
4,9606.ENSP00000000233,9606.ENSP00000327801,0,0,0,0,0,69,61,0,78,0,0,0,89,167


In [10]:
df_PPI=df_PPI[['protein1', 'protein2']]

trainInteracts=df_PPI.reset_index(drop=True)

trainInteracts.head()

Unnamed: 0,protein1,protein2
0,9606.ENSP00000000233,9606.ENSP00000272298
1,9606.ENSP00000000233,9606.ENSP00000253401
2,9606.ENSP00000000233,9606.ENSP00000401445
3,9606.ENSP00000000233,9606.ENSP00000418915
4,9606.ENSP00000000233,9606.ENSP00000327801


### Load String IDs 

In [12]:
with open('data/pickles/StringIDs.pickle', "rb") as f:    
    identifiers=pickle.load(f)


numProteins=len(identifiers)

proteinIndices={}

for i in range(numProteins):
    proteinIndices[identifiers[i]]=i
    


# Triplets

In [13]:
def generate_triplets():
    numInteracts=len(trainInteracts)
    anchors=np.empty((numInteracts,), dtype=np.int32)
    positives=np.empty((numInteracts,), dtype=np.int32)
    negatives=np.empty((numInteracts,), dtype=np.int32)
    for i,pid, hid in tqdm(trainInteracts.itertuples()):    
        anchorID=proteinIndices[pid]
        anchors[i]=anchorID
        positives[i]=proteinIndices[hid]
        randomNegative=np.random.randint(numProteins)
        negatives[i]=randomNegative
    return anchors, positives,negatives


## Triplet Loss

In [14]:
from keras.layers import concatenate
from keras.layers import *
from keras.layers import Concatenate
import numpy as np
from keras import backend as K
from keras.models import Model
from keras.layers import Embedding, Flatten, Input, merge
from keras.optimizers import Adam

Using TensorFlow backend.


In [17]:
def identity_loss(y_true, y_pred):

    return K.mean(y_pred - 0 * y_true)

def triplet_loss(X):
    positive_item_latent, negative_item_latent, user_latent = X
    loss = 1.0 - K.sigmoid(
    K.sum(user_latent * positive_item_latent, axis=-1, keepdims=True) -
    K.sum(user_latent * negative_item_latent, axis=-1, keepdims=True))
    return loss

# Design the Network

In [21]:
def triplet_nework(num_proteins, embedding_dim):

    positive_protein_id = Input((1, ), name='positive_protein_id')
    negative_protein_id = Input((1, ), name='negative_protein_id')
    anchor_protein_id = Input((1, ), name='anchor_protein_id')

    # Shared embedding layer for all proteins
    shared_embedding_layer = Embedding( num_proteins, embedding_dim, name='embedding', input_length=1)

    positive_protein_embedding = Flatten()(shared_embedding_layer(positive_protein_id))
    negative_protein_embedding = Flatten()(shared_embedding_layer(negative_protein_id))
    anchor_protein_embedding = Flatten()(shared_embedding_layer(anchor_protein_id))
    

    myloss = Lambda(triplet_loss,output_shape=(1, ))([positive_protein_embedding,negative_protein_embedding, anchor_protein_embedding])

    model = Model(
        input=[positive_protein_id, negative_protein_id, anchor_protein_id],
        output=myloss)
    model.compile(loss=identity_loss, optimizer=Adam())

    return model

In [22]:
embedding_dim = 64

In [23]:

model = triplet_nework(numProteins, embedding_dim)

print(model.summary())


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
positive_protein_id (InputLayer (None, 1)            0                                            
__________________________________________________________________________________________________
negative_protein_id (InputLayer (None, 1)            0                                            
__________________________________________________________________________________________________
anchor_protein_id (InputLayer)  (None, 1)            0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 64)        1252224     positive_protein_id[0][0]        
                                                                 negative_protein_id[0][0]        
         



## Train the network


In [24]:
from tqdm import tqdm

In [25]:
aids, pids, nids = generate_triplets()

11759454it [01:12, 162044.72it/s]


In [26]:
triplets = {
    'anchor_protein_id': aids,
    'positive_protein_id': pids,
    'negative_protein_id': nids
}

In [28]:
history=model.fit(triplets,
          np.ones(len(aids)),
          batch_size=1024,
          nb_epoch=3,
          verbose=1,
          shuffle=True)







  


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [30]:
model.save('Saved_Models/trained_model_to_generate_embeddings.h5')