In [1]:
import networkx as nx

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Load Datasets

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

In [4]:
trainDatasetFile='2017/S1_dataset/5939pdataset/idLocation.txt'

In [5]:
testDatasetFile='2017/S1_dataset/920pdataset/test_idLocation_code.txt'

In [6]:
testDataset=pd.read_csv(testDatasetFile,sep='|',header=None)

In [7]:
testDataset.head()

Unnamed: 0,0,1
0,P00505,1 4
1,Q9NP58,1 3 4
2,Q96HD9,1 2
3,O43687,1
4,Q8N7J2,1


In [8]:
trainDataset=pd.read_csv(trainDatasetFile,sep='|',header=None)

In [9]:
trainDataset.head()

Unnamed: 0,0,1
0,O95866,Cell_membrane
1,Q70Z44,Cell_membrane
2,Q5I7T1,Cell_membrane
3,O14514,Cell_membrane
4,Q9H159,Cell_membrane


# pre process lables

In [10]:
trainLables=trainDataset[[1]]
multiplexProteinsIndicesTrain=[]
trainLablesOneHot=np.zeros(shape=(trainLables.shape[0],6))

lableDict={'Cell_membrane':0,'Cytoplasm':1,'ER_Golgi':2,'Mitochondrion':3,'Nucleus':4,'Secreted':5}

for i,row in trainLables.iterrows():
    lables=row.str.split(' ')
    lables=list(lables)
    lables=lables[0]
    lables=lables[:-1]
    if len(lables)> 1:
        multiplexProteinsIndicesTrain.append(i)
    for lable in lables:
        trainLablesOneHot[i,lableDict[lable]]=1


In [11]:
trainLablesOneHot.shape

(5939, 6)

## separate multilex proteins

In [12]:
multiplexProteinsIndicesTrain=np.array(multiplexProteinsIndicesTrain)

trainLablesOneHotMultiplex=trainLablesOneHot[multiplexProteinsIndicesTrain]

# test lables

In [13]:
testLables=testDataset[[1]]
multiplexProteinsIndices=[]
testLablesOneHot=np.zeros(shape=(testLables.shape[0],6))


for i,row in testLables.iterrows():
    lables=row.str.split(' ')
    lables=list(lables)
    lables=lables[0]
    lables=lables[:-1]
    if len(lables)> 1:
        multiplexProteinsIndices.append(i)
    for lable in lables:
        testLablesOneHot[i,int(lable)-1]=1
        


## separate multilex proteins

In [14]:
multiplexProteinsIndices=np.array(multiplexProteinsIndices)

testLablesOneHotMultiplex=testLablesOneHot[multiplexProteinsIndices]

# prepare data for ven diagram

In [15]:
# all_data=np.concatenate((trainLablesOneHot,testLablesOneHot))

# trainLablesOneHot.shape

# all_data.shape

# trainLablesOneHot.shape

# len(trainLablesOneHot)

# [trainLablesOneHot[:,0]==1.0]

# Cell_membrane=pd.DataFrame (np.nonzero([all_data[:,0]==1.0])[1])

# Cell_membrane.to_csv('ven_chart_files/Cell_membraneIDS.csv',index=False,header=False)

# Cytoplasm=pd.DataFrame (np.nonzero([all_data[:,1]==1.0])[1])

# Cytoplasm.to_csv('ven_chart_files/CytoplasmIDS.csv',index=False,header=False)

# ER_Golgi=pd.DataFrame (np.nonzero([all_data[:,2]==1.0])[1])

# ER_Golgi.to_csv('ven_chart_files/ER_Golgi.csv',index=False,header=False)

# Mitochondrion=pd.DataFrame (np.nonzero([all_data[:,3]==1.0])[1])

# Mitochondrion.to_csv('ven_chart_files/Mitochondrion.csv',index=False,header=False)

# Nucleus=pd.DataFrame (np.nonzero([all_data[:,4]==1.0])[1])

# Nucleus.to_csv('ven_chart_files/Nucleus.csv',index=False,header=False)

# Secreted=pd.DataFrame (np.nonzero([all_data[:,5]==1.0])[1])

# Secreted.to_csv('ven_chart_files/Secreted.csv',index=False,header=False)


# map uniprot to String

In [16]:
dfTrain=trainDataset[[0]]

dfTrain.columns=['uniprot_ac']

#mapString2Uniprot=pd.read_csv('../uniprot2string.tsv',sep='\t',skiprows=1,usecols=[1,2])
mapString2Uniprot=pd.read_csv('../data/all_organisms.uniprot_2_string.2018.tsv',sep='\t',skiprows=1,usecols=[1,2])


#mapString2Uniprot.columns=['species', 'uniprot_ac_uniprot_id', 'string_id', 'identity' ,'bit_score']
mapString2Uniprot.columns=['uniprot_ac_uniprot_id', 'string_id']

mapString2Uniprot['uniprot_ac'] = mapString2Uniprot.uniprot_ac_uniprot_id.str.split('|').str[0]

dfTrain=pd.merge(dfTrain,mapString2Uniprot,on=['uniprot_ac'],how='left')


dfTrain.shape

#140 out of 5939 with no string id
len(np.nonzero (pd.isna(dfTrain['string_id']))[0])


#import pickle

# with open('2017/S1_dataset/5939pdataset/dfTrainMapUniprot2String.pickle','wb') as handle:
#     pickle.dump(dfTrain,handle)
    


  return getattr(obj, method)(*args, **kwds)


140

In [17]:
dfTest=testDataset[[0]]

dfTest.columns=['uniprot_ac']


mapString2Uniprot=pd.read_csv('../data/all_organisms.uniprot_2_string.2018.tsv',sep='\t',skiprows=1,usecols=[1,2])


#mapString2Uniprot.columns=['species', 'uniprot_ac_uniprot_id', 'string_id', 'identity' ,'bit_score']
mapString2Uniprot.columns=['uniprot_ac_uniprot_id', 'string_id']

mapString2Uniprot['uniprot_ac'] = mapString2Uniprot.uniprot_ac_uniprot_id.str.split('|').str[0]

dfTest=pd.merge(dfTest,mapString2Uniprot,on=['uniprot_ac'],how='left')


dfTest.shape

#140 out of 5939 with no string id
len(np.nonzero (pd.isna(dfTest['string_id']))[0])


#import pickle

# with open('2017/S1_dataset/5939pdataset/dfTrainMapUniprot2String.pickle','wb') as handle:
#     pickle.dump(dfTrain,handle)
    


18

In [18]:
dfTest.shape

(920, 3)

In [19]:
# import pickle

# with open('2017/S1_dataset/5939pdataset/dfTrainMapUniprot2String.pickle', "rb") as f:
#     dfTrain=pickle.load(f)
    

#  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# load embeddings

In [20]:
import pickle
import tensorflow as tf
from keras import backend as K
from keras.models import load_model


def identity_loss(y_true, y_pred):

    return K.mean(y_pred - 0 * y_true)

#proteins_all_humanScore500
#proteins_all_human
#proteins_all_humanScore300
#proteins_all_humanScore400
with open('../data/pickles/all_human_stringIDs_19566.pickle', "rb") as f:
    proteins=pickle.load(f)


Using TensorFlow backend.


In [21]:
len(proteins)

19566

In [22]:

#generate_embeddings_10epoch_50d_humanAll
#generate_embeddings_10epoch_100d_humanAll_score500
#generate_embeddings_10epoch_100d_humanAll
#generate_embeddings_10epoch_100d_humanAllBatch2028Epoch5==> 0.77
#generate_embeddings_10epoch_100d_humanAll_score500
#generate_embeddings_10epoch_50d_humanScore400Batch1024
#generate_embeddings_5epoch_50d_humanScore400Batch1024Improved ==> 0.75
#generate_embeddings_100epoch_50d_humanScore400Batch1024Improved
#generate_embeddings_50epoch5eoch_50d_humanScore400Batch1024Improved
#generate_embeddings_1epoch10eoch_50d_humanScore300Batch1024Improved
embedding_modebl = load_model('../data/pickles/triplet_embeddings/Final_fineTuned_2017.h5',custom_objects={ 'identity_loss': identity_loss })
#embedding_modebl = load_model('../data/pickles/triplet_embeddings/Final_onlyPPI_19566_64d.h5',custom_objects={ 'identity_loss': identity_loss })


def generate_vector(model, uid):

    vector = model.get_layer('item_embedding').get_weights()[0][uid]
    #vector = model.get_layer('embedding_2').get_weights()[0][uid]
     

    return vector


W0120 14:55:09.523674 139642236327744 deprecation_wrapper.py:119] From /home/ubuntuadmin/anaconda37/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0120 14:55:09.533867 139642236327744 deprecation_wrapper.py:119] From /home/ubuntuadmin/anaconda37/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0120 14:55:09.561540 139642236327744 deprecation_wrapper.py:119] From /home/ubuntuadmin/anaconda37/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0120 14:55:09.561998 139642236327744 deprecation_wrapper.py:119] From /home/ubuntuadmin/anaconda37/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:181: The name tf.ConfigProto is deprecated. Please use tf.com

In [23]:
len(generate_vector(embedding_modebl,383))

64

#  ++++++++++++++++++++++++++++++++++++++++++++++++++

In [24]:
lenMax=len(proteins)



embedding_size=64

testProtein_weights = np.zeros((dfTest.shape[0], embedding_size))

testProtein_weights.shape


(920, 64)

In [25]:
lenMax=len(proteins)



embedding_size=64

trainProtein_weights = np.zeros((dfTrain.shape[0], embedding_size))

trainProtein_weights.shape


(5939, 64)

In [26]:
dfTrain.shape

(5939, 3)

In [27]:
dfTrain.head()

Unnamed: 0,uniprot_ac,uniprot_ac_uniprot_id,string_id
0,O95866,O95866|G6B_HUMAN,9606.ENSP00000364964
1,Q70Z44,Q70Z44|5HT3D_HUMAN,9606.ENSP00000371929
2,Q5I7T1,Q5I7T1|AG10B_HUMAN,9606.ENSP00000310120
3,O14514,O14514|AGRB1_HUMAN,9606.ENSP00000430945
4,Q9H159,Q9H159|CAD19_HUMAN,9606.ENSP00000262150


In [28]:
proteins[0]

'9606.ENSP00000000233'

In [29]:
len(trainProtein_weights[0])

64

In [30]:
c_found=0
for i,row in dfTest.iterrows():
    try:
        protein_id=np.searchsorted(proteins,row['string_id'])
        if protein_id != lenMax:
            c_found += 1
            testProtein_weights[i]=generate_vector(embedding_modebl,protein_id)
        else:
            testProtein_weights[i]=np.random.rand(embedding_size)
    except:
            testProtein_weights[i]=np.random.rand(embedding_size)

    



print('number of found: ',c_found)

testProtein_weights=pd.DataFrame(testProtein_weights)

testProtein_weights.shape


    


number of found:  902


(920, 64)

In [31]:
c_found=0
for i,row in dfTrain.iterrows():
    try:
        protein_id=np.searchsorted(proteins,row['string_id'])
        if protein_id != lenMax:
            c_found += 1
            trainProtein_weights[i]=generate_vector(embedding_modebl,protein_id)
        else:
            trainProtein_weights[i]=np.random.rand(embedding_size)
    except:
            trainProtein_weights[i]=np.random.rand(embedding_size)

    


dfTrain.shape

print('number of found: ',c_found)

trainProtein_weights=pd.DataFrame(trainProtein_weights)

trainProtein_weights.shape


    


number of found:  5799


(5939, 64)

In [32]:

# import pickle
# with open('2017/S1_dataset/5939pdataset/trainProtein_weights_5939_50d.pickle','wb') as handle:
#     pickle.dump(trainProtein_weights,handle)

In [33]:
# import pickle
# with open('2017/S1_dataset/5939pdataset/trainProtein_weights_5939_50d.pickle', "rb") as f:
#     trainProtein_weights=pickle.load(f)

In [34]:
X_train=trainProtein_weights

In [35]:
X_test=testProtein_weights

In [36]:
testProtein_weights.shape

(920, 64)

In [37]:
testProtein_weights=np.array(testProtein_weights)
X_testMultiplx=testProtein_weights[multiplexProteinsIndices]

trainProtein_weights=np.array(trainProtein_weights)
X_trainMultiplx=trainProtein_weights[multiplexProteinsIndicesTrain]


In [38]:
X_trainMultiplx.shape

(913, 64)

In [39]:
include_idx = set(multiplexProteinsIndicesTrain)  #Set is more efficient, but doesn't reorder your elements if that is desireable
mask = np.array([(i in include_idx) for i in range(len(trainProtein_weights))])
X_trainSingleplex=trainProtein_weights[~mask]
y_trainSingleplex=trainLablesOneHot[~mask]

In [40]:
X_train=np.array(X_train)

In [41]:
X_test=np.array(X_test)

y_train=trainLablesOneHot
y_trainMultiplex=trainLablesOneHotMultiplex
y_train.shape

y_test=testLablesOneHot

y_testMultiplex=testLablesOneHotMultiplex

y_test.shape

(920, 6)

# +++++++++++++++++++++++++++++++++++++++++++++

In [42]:
from keras import models
from keras import layers
from sklearn.metrics import label_ranking_average_precision_score
from sklearn.model_selection import StratifiedKFold

In [43]:
network=None

In [44]:
from sklearn.metrics import accuracy_score

In [45]:
def  computeAbsoluteAccuracyPerLable_single(Ytrue,Ypred):
    countSingle=0
    countSingleCorrect=0
    countMultiple=0
    countMultipleCorrect=0
    for i in range(6):
        print(':',len(np.nonzero (Ytrue[:,i])[0]))
    YpredOneHot=np.zeros(shape=(Ypred.shape))
    results=[]
    rows=Ytrue.shape[0]
    for i in range(rows):
        numLables=len(np.nonzero (Ytrue[i])[0])
        ind = np.argpartition(Ypred[i], -numLables)[-numLables:]
        YpredOneHot[i][ind]=1
        if numLables>1:
            countMultiple+=1
            if not any (np.logical_xor (YpredOneHot[i],Ytrue[i])):
                
                countMultipleCorrect+=1
        else:
            countSingle+=1
            if  not any (np.logical_xor (YpredOneHot[i],Ytrue[i])):
                countSingleCorrect+=1
            
    for i in range(6):
        corrects=len(np.nonzero(np.logical_and(Ytrue[:,i], YpredOneHot[:,i]))[0])
        numAllTrues=len(np.nonzero (Ytrue[:,i])[0])
        #this class has no sample
        if  numAllTrues==0:
                continue
        results.append(corrects/numAllTrues)
    print('countSingleCorrect',countSingleCorrect)
    print('countSingleCorrect',countSingle)
    
    return results,(countSingleCorrect/countSingle) ,1

        

In [46]:
def  computeAbsoluteAccuracyPerLable(Ytrue,Ypred):
    countSingle=0
    countSingleCorrect=0
    countMultiple=0
    countMultipleCorrect=0
    for i in range(6):
        print(':',len(np.nonzero (Ytrue[:,i])[0]))
    YpredOneHot=np.zeros(shape=(Ypred.shape))
    results=[]
    rows=Ytrue.shape[0]
    for i in range(rows):
        numLables=len(np.nonzero (Ytrue[i])[0])
        ind = np.argpartition(Ypred[i], -numLables)[-numLables:]
        YpredOneHot[i][ind]=1
        if numLables>1:
            countMultiple+=1
            if not any (np.logical_xor (YpredOneHot[i],Ytrue[i])):
                
                countMultipleCorrect+=1
        else:
            countSingle+=1
            if  not any (np.logical_xor (YpredOneHot[i],Ytrue[i])):
                countSingleCorrect+=1
            
    for i in range(6):
        corrects=len(np.nonzero(np.logical_and(Ytrue[:,i], YpredOneHot[:,i]))[0])
        numAllTrues=len(np.nonzero (Ytrue[:,i])[0])
        #this class has no sample
        if  numAllTrues==0:
                continue
        results.append(corrects/numAllTrues)
    print('countSingleCorrect',countSingleCorrect)
    print('countSingleCorrect',countSingle)
    
    return results,(countSingleCorrect/countSingle) ,(countMultipleCorrect/countMultiple)

        

In [47]:
def  computeAbsoluteAccuracy(Ytrue,Ypred):
    YpredOneHot=np.zeros(shape=(Ypred.shape))
    rows=Ytrue.shape[0]
    for i in range(rows):
        numLables=len(np.nonzero (Ytrue[i])[0])
        ind = np.argpartition(Ypred[i], -numLables)[-numLables:]
        YpredOneHot[i][ind]=1
    return  accuracy_score(Ytrue,YpredOneHot)   

        

In [48]:
from skimage import data, io
from matplotlib import pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Activation,Conv1D,GlobalMaxPooling1D,MaxPooling1D,Dropout,Flatten
import numpy as np
import pandas as pd
from keras import layers
from keras.layers  import  Conv1D,GlobalAveragePooling1D,Dense,Dropout,MaxPooling1D,GlobalMaxPooling1D
from sklearn.model_selection import train_test_split


In [49]:
#X_train=np.expand_dims(X_train,axis=-1)

In [50]:
X_train.shape

(5939, 64)

In [51]:
from keras import optimizers

In [52]:
def buildModelNaive():
    network = models.Sequential()
    
    #network.add(Conv1D(50, 7, activation='relu', input_shape=(100, 1),padding='valid'))
    
    
    network.add(layers.Dense(128, activation='relu', input_shape=(64, )))
    network.add(layers.Dense(64, activation='relu', ))
   
    
    network.add(layers.Dense(6, activation='softmax'))
    opt=optimizers.rmsprop(lr=0.0001)
    #opt=optimizers.Adam (lr=0.0001)
    network.compile(optimizer=opt,
                #loss='binary_crossentropy',
                    loss='categorical_crossentropy',
                metrics=['accuracy'])
    return network

In [53]:
random_seed=1311

In [76]:
def train_and_evaluate_model_lstm_attention(W_train,Y_train,epochs):
    prec_list=[]; reca_list=[]; fscore_list=[] ; fold=0
    skf = StratifiedKFold(n_splits=5, random_state=random_seed,shuffle=True)
    #skf = StratifiedKFold(n_splits=5, random_state=random_seed,shuffle=False)
    AccuracyMultiple=0
    AccuracySingle=0
    all_histories=[]
    Y = [np.argmax(y, axis=None, out=None) for y in Y_train]
    for train_index, test_index in skf.split(W_train,Y):     
        fold+=1
        X_train, X_test = W_train[train_index], W_train[test_index] 
        y_train, y_test = Y_train[train_index], Y_train[test_index]
        model = None # Clearing the NN.
        #model = build_model()
        model = buildModelNaive()
        #earlystopper = EarlyStopping(monitor='val_loss', patience=2, verbose=0)        

        history=model.fit(X_train, y_train, validation_data=(X_test,y_test) ,epochs=epochs, batch_size=16,verbose=1)
        
        all_histories.append(history)
        
        YtestPredicted=model.predict(X_test)
        
    #    avePrec =label_ranking_average_precision_score(y_test, YtestPredicted)
    
        #avePrec =computeAbsoluteAccuracy(y_test, YtestPredicted)
        
        #avePrec,AS,AM=computeAbsoluteAccuracyPerLable(y_test, YtestPredicted)
        avePrec,AS,AM=computeAbsoluteAccuracyPerLable_single(y_test, YtestPredicted)
        AccuracySingle+=AS
        AccuracyMultiple+=AM
        print(avePrec)

        
        
        
    
#         print("Fold {:d}: Precision:{:.2f}% ".format(i,np.mean( avePrec*100)))
        #prec_list.append(np.mean(avePrec))
        prec_list.append(avePrec)
    
#     precission=sum(prec_list)/len(prec_list)*100 
#     print("Final: Precision:{:.2f}% ".format(precission ))
    return prec_list,(AccuracySingle/5),(AccuracyMultiple/5)

In [55]:
#generate_embeddings_10epoch_50d_humanAll    ==> 0.7325441305244833
#generate_embeddings_10epoch_100d_humanAll_score500  ==>0.65
#generate_embeddings_10epoch_100d_humanAll   ==> 0.7324315902459138
#generate_embeddings_10epoch_100d_humanAllBatch2028Epoch5  ==> 0.7359638809890722

re,AccuracySingle,AccuracyMultiple=train_and_evaluate_model_lstm_attention(X_train,y_train,30)


W0120 14:55:30.798146 139642236327744 deprecation.py:323] From /home/ubuntuadmin/anaconda37/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 4748 samples, validate on 1191 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
: 291
: 316
: 115
: 91
: 404
: 164
countSingleCorrect 856
countSingleCorrect 1005
[0.8247422680412371, 0.8132911392405063, 0.7043478260869566, 0.9120879120879121, 0.9257425742574258, 0.7865853658536586]
Train on 4750 samples, validate on 1189 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30


Epoch 30/30
: 291
: 308
: 110
: 94
: 416
: 158
countSingleCorrect 859
countSingleCorrect 1008
[0.8247422680412371, 0.827922077922078, 0.7454545454545455, 0.8085106382978723, 0.9158653846153846, 0.8291139240506329]
Train on 4752 samples, validate on 1187 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
: 291
: 302
: 113
: 90
: 402
: 158
countSingleCorrect 871
countSingleCorrect 1022
[0.9037800687285223, 0.7847682119205298, 0.7345132743362832, 0.7888888888888889, 0.9203980099502488, 0.7911392405063291]
Train on 4753 samples, validate on 1186 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
E

Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
: 290
: 310
: 112
: 94
: 419
: 158
countSingleCorrect 851
countSingleCorrect 998
[0.8620689655172413, 0.8483870967741935, 0.8214285714285714, 0.8191489361702128, 0.8782816229116945, 0.7531645569620253]
Train on 4753 samples, validate on 1186 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
: 290
: 306
: 112
: 93
: 423
: 157
countSingleCorrect 815
countSingleCorrect 993
[0.8724137931034482, 0.8169934640522876, 0.7678571428571429, 0.7741935483870968, 0.9125295508274232, 0.6496815286624203]


In [53]:
#generate_embeddings_10epoch_50d_humanAll    ==> 0.7325441305244833
#generate_embeddings_10epoch_100d_humanAll_score500  ==>0.65
#generate_embeddings_10epoch_100d_humanAll   ==> 0.7324315902459138
#generate_embeddings_10epoch_100d_humanAllBatch2028Epoch5  ==> 0.7359638809890722

re,AccuracySingle,AccuracyMultiple=train_and_evaluate_model_lstm_attention(X_train,y_train,30)


Train on 4748 samples, validate on 1191 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
: 291
: 316
: 113
: 91
: 419
: 165
countSingleCorrect 748
countSingleCorrect 990
[0.7594501718213058, 0.7943037974683544, 0.6106194690265486, 0.7802197802197802, 0.8806682577565632, 0.6424242424242425]
Train on 4750 samples, validate on 1189 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30


Epoch 29/30
Epoch 30/30
: 291
: 308
: 114
: 95
: 411
: 163
countSingleCorrect 750
countSingleCorrect 1004
[0.7903780068728522, 0.724025974025974, 0.5701754385964912, 0.7368421052631579, 0.8613138686131386, 0.7055214723926381]
Train on 4752 samples, validate on 1187 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
: 291
: 302
: 113
: 87
: 405
: 153
countSingleCorrect 753
countSingleCorrect 1025
[0.8006872852233677, 0.7052980132450332, 0.5575221238938053, 0.7586206896551724, 0.8567901234567902, 0.7254901960784313]
Train on 4753 samples, validate on 1186 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
E

In [56]:
AccuracySingle

0.8459249899577841

In [57]:
AccuracyMultiple

0.6941708812268571

In [57]:
results=np.array(re)

In [58]:
lableDict

{'Cell_membrane': 0,
 'Cytoplasm': 1,
 'ER_Golgi': 2,
 'Mitochondrion': 3,
 'Nucleus': 4,
 'Secreted': 5}

In [59]:
np.mean (results,axis=0)

array([0.84376111, 0.79295405, 0.72432639, 0.80307498, 0.90384583,
       0.75869931])

In [60]:
np.mean (results)

0.8044436113732643

In [469]:
# all
np.mean (results)

0.7701797689253354

## Singleplex proteins

In [72]:
re,acc,_=train_and_evaluate_model_lstm_attention(X_trainSingleplex,y_trainSingleplex,20)


Train on 4019 samples, validate on 1007 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
: 252
: 153
: 108
: 83
: 275
: 136
countSingleCorrect 825
countSingleCorrect 1007
[0.8888888888888888, 0.6209150326797386, 0.6944444444444444, 0.8554216867469879, 0.9054545454545454, 0.8161764705882353]
Train on 4019 samples, validate on 1007 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
: 252
: 153
: 108
: 83
: 275
: 136
countSingleCorrect 835
countSingleCorrect 1007
[0.8968253968253969, 0.673202614379085, 0.7037037037037037, 0.8192771084337349, 0.9309090909090909, 0.7794117647058824]
Train on 4020 samples, validate on 10

In [73]:
acc

0.8185347721929833

In [65]:
results=np.array(re)

In [70]:
results[0]

[[0.873015873015873,
  0.6209150326797386,
  0.7222222222222222,
  0.8433734939759037,
  0.8945454545454545,
  0.7941176470588235],
 [0.873015873015873,
  0.6013071895424836,
  0.7129629629629629,
  0.8674698795180723,
  0.9345454545454546,
  0.7720588235294118],
 [0.8611111111111112,
  0.6470588235294118,
  0.7777777777777778,
  0.8674698795180723,
  0.8978102189781022,
  0.7720588235294118],
 [0.8928571428571429,
  0.5460526315789473,
  0.7850467289719626,
  0.8414634146341463,
  0.9197080291970803,
  0.7647058823529411],
 [0.8134920634920635,
  0.6118421052631579,
  0.7850467289719626,
  0.8048780487804879,
  0.9379562043795621,
  0.7352941176470589]]

In [80]:
np.mean (results,axis=0)

# Indipndnt Test Dataset

In [60]:
model = None # Clearing the NN.
#model = build_model()
model = buildModelNaive()

In [61]:
history=model.fit(X_train, y_train, validation_data=(X_test,y_test) ,epochs=30, batch_size=16,verbose=1)

Train on 5939 samples, validate on 920 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [62]:

YtestPredicted=model.predict(X_test)


avePrec=computeAbsoluteAccuracyPerLable(y_test, YtestPredicted)

: 258
: 296
: 225
: 198
: 323
: 112
countSingleCorrect 341
countSingleCorrect 541


In [65]:
avePrec

([0.6705426356589147,
  0.8918918918918919,
  0.5955555555555555,
  0.6111111111111112,
  0.6656346749226006,
  0.6517857142857143],
 0.6303142329020333,
 0.43007915567282323)

In [66]:
np.mean(avePrec[0])

0.6810869305709647

# test Multiplx

In [75]:

YtestPredicted=model.predict(X_testMultiplx)

avePrec=computeAbsoluteAccuracyPerLable(y_testMultiplex, YtestPredicted)

: 132
: 249
: 148
: 83
: 200
: 59
countSingleCorrect 0
countSingleCorrect 0


ZeroDivisionError: division by zero

In [425]:
avePrec

[0.8333333333333334,
 0.8755020080321285,
 0.44594594594594594,
 0.37349397590361444,
 0.725,
 0.6271186440677966]

In [426]:
np.mean(avePrec)

0.6467323178804698

# Test Singleplex

In [77]:
include_idx = set(multiplexProteinsIndices)  #Set is more efficient, but doesn't reorder your elements if that is desireable
mask = np.array([(i in include_idx) for i in range(len(testProtein_weights))])
X_testSingleplex=testProtein_weights[~mask]
y_testSingleplex=testLablesOneHot[~mask]

In [78]:

YtestPredicted=model.predict(X_testSingleplex)

avePrec=computeAbsoluteAccuracyPerLable(y_testSingleplex, YtestPredicted)

: 126
: 47
: 77
: 115
: 123
: 53
countSingleCorrect 341
countSingleCorrect 541


ZeroDivisionError: division by zero

In [429]:
avePrec

[0.7857142857142857,
 0.5319148936170213,
 0.6493506493506493,
 0.6521739130434783,
 0.7073170731707317,
 0.5094339622641509]

In [430]:
np.mean(avePrec)

0.6393174628600529