# Load Datasets

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Training split of the REALoc S1 data; read further down as '|'-separated text without a header row.
trainDatasetFile='../../../data/REALoc/S1_dataset/5939pdataset/idLocation.txt'

In [3]:
# Independent test split of the REALoc S1 data; same '|'-separated layout, but (per the preview
# below) its label column holds numeric location codes rather than location names.
testDatasetFile='../../../data/REALoc/S1_dataset/920pdataset/test_idLocation_code.txt'

In [4]:
# header=None -> columns are the integers 0 (protein accession) and 1 (space-separated location codes).
testDataset=pd.read_csv(testDatasetFile,sep='|',header=None)

In [5]:
# header=None -> columns are the integers 0 (protein accession) and 1 (location name(s)).
trainDataset=pd.read_csv(trainDatasetFile,sep='|',header=None)

In [6]:
testDataset.head()

Unnamed: 0,0,1
0,P00505,1 4
1,Q9NP58,1 3 4
2,Q96HD9,1 2
3,O43687,1
4,Q8N7J2,1


In [7]:
trainDataset.head()

Unnamed: 0,0,1
0,O95866,Cell_membrane
1,Q70Z44,Cell_membrane
2,Q5I7T1,Cell_membrane
3,O14514,Cell_membrane
4,Q9H159,Cell_membrane


# Preprocess labels

In [8]:
# Build the multi-hot label matrix for the training proteins:
# one row per protein, one column per location in lableDict.
trainLables=trainDataset[[1]]

# Location name -> column index used by every label matrix in this notebook.
lableDict={'Cell_membrane':0,'Cytoplasm':1,'ER_Golgi':2,'Mitochondrion':3,'Nucleus':4,'Secreted':5}

# Row labels of proteins annotated with more than one location.
multiplexProteinsIndicesTrain=[]
# Width follows lableDict so the matrix cannot drift from the label set.
trainLablesOneHot=np.zeros(shape=(trainLables.shape[0],len(lableDict)))

# NOTE: `i` is the DataFrame index label and is also used as the row position
# in the NumPy matrix; that is valid for the default 0..n-1 index read_csv
# produces here.
for i,rawLables in trainLables[1].items():
    # A whitespace split discards empty tokens, so the field parses the same
    # whether or not it ends with a separator; no label is dropped blindly.
    lables=str(rawLables).split()
    if len(lables)>1:
        multiplexProteinsIndicesTrain.append(i)
    for lable in lables:
        # An unknown location name raises KeyError instead of being ignored.
        trainLablesOneHot[i,lableDict[lable]]=1


In [9]:
trainLablesOneHot.shape

(5939, 6)

# Test labels

In [10]:
# Build the multi-hot label matrix for the independent test proteins.
# The test file stores locations as 1-based numeric codes, so code c is
# written to column c-1.
testLables=testDataset[[1]]
# Row labels of proteins annotated with more than one location.
multiplexProteinsIndices=[]
# Same width as the training label matrix (one column per lableDict entry).
testLablesOneHot=np.zeros(shape=(testLables.shape[0],len(lableDict)))

# NOTE: `i` is the DataFrame index label and is also used as the row position
# in the NumPy matrix; that is valid for the default 0..n-1 index read_csv
# produces here.
for i,rawLables in testLables[1].items():
    # A whitespace split discards empty tokens, so the field parses the same
    # whether or not it ends with a separator; str() also covers a column
    # that pandas parsed as integers.
    lables=str(rawLables).split()
    if len(lables)>1:
        multiplexProteinsIndices.append(i)
    for lable in lables:
        testLablesOneHot[i,int(lable)-1]=1
        


In [11]:
testLablesOneHot.shape

(920, 6)

# Map UniProt IDs to STRING IDs

* STRING IDs are required for embedding generation

In [12]:
import sys
sys.path.append('../../../utils/')

In [13]:
from  utils import UniprotID_to_StringId

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


### Convert Train IDs

In [14]:
# Accession column only, renamed to 'uniprot_ac', then passed through the
# project ID-mapping helper (the result gains a 'string_id' column, see the preview below).
dfTrain=trainDataset[[0]].rename(columns={0:'uniprot_ac'})
dfTrain=UniprotID_to_StringId(dfTrain)

Not Matched:  140


  return bound(*args, **kwds)


In [15]:
dfTrain.head()

Unnamed: 0,uniprot_ac,uniprot_ac_uniprot_id,string_id
0,O95866,O95866|G6B_HUMAN,9606.ENSP00000364964
1,Q70Z44,Q70Z44|5HT3D_HUMAN,9606.ENSP00000371929
2,Q5I7T1,Q5I7T1|AG10B_HUMAN,9606.ENSP00000310120
3,O14514,O14514|AGRB1_HUMAN,9606.ENSP00000430945
4,Q9H159,Q9H159|CAD19_HUMAN,9606.ENSP00000262150


### Convert Test IDs

In [16]:
# Accession column only, renamed to 'uniprot_ac', then passed through the
# project ID-mapping helper, mirroring the training-set conversion.
dfTest=testDataset[[0]].rename(columns={0:'uniprot_ac'})
dfTest=UniprotID_to_StringId(dfTest)

Not Matched:  18


In [17]:
dfTest.shape

(920, 3)

# Load TripletProt Network and generate embeddings

In [18]:
from  utils import generate_tripletProt_embeddings

### Generate Train Embeddings

In [19]:
dfTrain.head()

Unnamed: 0,uniprot_ac,uniprot_ac_uniprot_id,string_id
0,O95866,O95866|G6B_HUMAN,9606.ENSP00000364964
1,Q70Z44,Q70Z44|5HT3D_HUMAN,9606.ENSP00000371929
2,Q5I7T1,Q5I7T1|AG10B_HUMAN,9606.ENSP00000310120
3,O14514,O14514|AGRB1_HUMAN,9606.ENSP00000430945
4,Q9H159,Q9H159|CAD19_HUMAN,9606.ENSP00000262150


In [20]:
# Project helper from utils. NOTE(review): presumably returns one embedding row per row of
# dfTrain, including proteins without a STRING match — confirm how unmatched proteins are filled.
trainProtein_weights=generate_tripletProt_embeddings(dfTrain)







number of found:  5799


#### Generate Test Embeddings

In [21]:
# Same helper for the test proteins; the recorded shape below is (920, 64), i.e. one
# 64-wide row per test protein even though only 902 were reported as found.
testProtein_weights=generate_tripletProt_embeddings(dfTest)

number of found:  902


In [22]:
testProtein_weights.shape

(920, 64)

#### Prepare Input Data

In [23]:
# Features: the embedding matrices as NumPy arrays (train first, then test).
X_train,X_test=np.array(trainProtein_weights),np.array(testProtein_weights)
# Targets: the multi-hot label matrices built earlier.
y_train,y_test=trainLablesOneHot,testLablesOneHot


#### Evaluation Metric

In [24]:
def computeAbsoluteAccuracyPerLable(Ytrue,Ypred):
    """Score multi-label predictions with a "top-k" decision rule.

    For every sample the number of true labels ``k`` is read from ``Ytrue``
    and the ``k`` highest-scoring classes of ``Ypred`` are taken as the
    predicted label set.

    Parameters
    ----------
    Ytrue : 2-D array, shape (samples, classes)
        Multi-hot ground truth (non-zero means the label is present).
    Ypred : 2-D array, shape (samples, classes)
        Per-class scores; only their ranking within a row matters.

    Returns
    -------
    results : list of float
        For each class that has at least one true sample, the fraction of
        its true samples that were also predicted. Classes without true
        samples are skipped, so the list can be shorter than the number of
        classes.
    singleAccuracy : float
        Fraction of single-label samples whose predicted set matches exactly.
    multipleAccuracy : float
        Fraction of multi-label samples whose predicted set matches exactly.

    Either accuracy is ``nan`` when no sample of that kind exists (instead of
    raising ``ZeroDivisionError``). Samples without any true label are left
    out of both accuracies and get an all-zero prediction.
    """
    Ytrue=np.asarray(Ytrue)
    Ypred=np.asarray(Ypred)
    YpredOneHot=np.zeros(shape=Ypred.shape)

    countSingle=0
    countSingleCorrect=0
    countMultiple=0
    countMultipleCorrect=0

    for i in range(Ytrue.shape[0]):
        numLables=int(np.count_nonzero(Ytrue[i]))
        if numLables==0:
            # With k == 0 the slice [-0:] would select *every* class, so an
            # unlabelled row must be skipped explicitly.
            continue
        # Indices of the numLables largest scores (order among them is irrelevant).
        ind=np.argpartition(Ypred[i],-numLables)[-numLables:]
        YpredOneHot[i,ind]=1
        # Exact match: no position differs between prediction and truth.
        exactMatch=not np.logical_xor(YpredOneHot[i],Ytrue[i]).any()
        if numLables>1:
            countMultiple+=1
            countMultipleCorrect+=exactMatch
        else:
            countSingle+=1
            countSingleCorrect+=exactMatch

    results=[]
    # Iterate over the actual number of label columns rather than a fixed 6.
    for c in range(Ytrue.shape[1]):
        numAllTrues=int(np.count_nonzero(Ytrue[:,c]))
        if numAllTrues==0:
            # this class has no sample
            continue
        corrects=int(np.count_nonzero(np.logical_and(Ytrue[:,c],YpredOneHot[:,c])))
        results.append(corrects/numAllTrues)

    singleAccuracy=countSingleCorrect/countSingle if countSingle else float('nan')
    multipleAccuracy=countMultipleCorrect/countMultiple if countMultiple else float('nan')
    return results,singleAccuracy,multipleAccuracy

        

### Train and Evaluate the model

In [25]:
from sklearn.model_selection import StratifiedKFold

from  model import naive_CNN_classifier

import keras

In [26]:
# Seed given to StratifiedKFold so the cross-validation split is repeatable.
random_seed=1311
# Width of one protein embedding; matches the (920, 64) embedding shape recorded above.
embedding_size=64

In [30]:
# Append a trailing singleton axis: (samples, embedding) -> (samples, embedding, 1).
# Built from trainProtein_weights rather than from X_train itself so that
# re-running this cell cannot stack a second extra axis.
X_train=np.expand_dims(np.array(trainProtein_weights),axis=-1)

In [75]:
def train_and_evaluate_model(W_train,Y_train,epochs,n_splits=5):
    """Cross-validate ``naive_CNN_classifier`` with stratified K-fold splits.

    Parameters
    ----------
    W_train : ndarray
        Model inputs, indexed along axis 0 by sample.
    Y_train : ndarray, shape (samples, classes)
        Multi-hot label matrix.
    epochs : int
        Upper bound on training epochs per fold; early stopping on
        ``val_loss`` (patience 5, best weights restored) usually ends sooner.
    n_splits : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    prec_list : list
        One per-class score list per fold, as returned by
        ``computeAbsoluteAccuracyPerLable``.
    float
        Single-label exact-match accuracy averaged over the folds.
    float
        Multi-label exact-match accuracy averaged over the folds.

    NOTE(review): each held-out fold is used both for early stopping and for
    scoring, so the reported numbers may be optimistic.
    """
    skf=StratifiedKFold(n_splits=n_splits,random_state=random_seed,shuffle=True)
    # StratifiedKFold needs a single class per sample; for multi-hot rows the
    # first active label (lowest column index) is used for stratification.
    stratifyLabels=np.argmax(Y_train,axis=1)

    prec_list=[]
    singleScores=[]
    multipleScores=[]
    for train_index,test_index in skf.split(W_train,stratifyLabels):
        X_fit,X_val=W_train[train_index],W_train[test_index]
        y_fit,y_val=Y_train[train_index],Y_train[test_index]

        # A new network per fold so no weights carry over between folds.
        model=naive_CNN_classifier(len(lableDict),embedding_size)
        early_stop=keras.callbacks.EarlyStopping(monitor='val_loss',min_delta=0,patience=5,verbose=0,mode='auto',restore_best_weights=True)
        model.fit(X_fit,y_fit,validation_data=(X_val,y_val),epochs=epochs,batch_size=8,verbose=1,callbacks=[early_stop])

        perClass,accSingle,accMultiple=computeAbsoluteAccuracyPerLable(y_val,model.predict(X_val))
        prec_list.append(perClass)
        singleScores.append(accSingle)
        multipleScores.append(accMultiple)

    # Average over the folds actually run instead of a literal 5.
    numFolds=len(prec_list)
    return prec_list,(sum(singleScores)/numFolds),(sum(multipleScores)/numFolds)

In [76]:
# Cross-validate on the training set; 200 is only an upper bound on epochs,
# the early-stopping callback inside the function ends each fold earlier (see the log below).
results,AccuracySingle,AccuracyMultiple=train_and_evaluate_model(X_train,y_train,200)


Train on 4751 samples, validate on 1188 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Train on 4751 samples, validate on 1188 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200

Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Train on 4751 samples, validate on 1188 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200

Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Train on 4752 samples, validate on 1187 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200


In [77]:
# Stack the per-fold score lists: one row per fold, one column per class
# (assumes every fold contained every class, otherwise the rows differ in length).
results=np.array(results)

In [78]:
lableDict

{'Cell_membrane': 0,
 'Cytoplasm': 1,
 'ER_Golgi': 2,
 'Mitochondrion': 3,
 'Nucleus': 4,
 'Secreted': 5}

In [38]:
# Per-class score averaged over the folds; column order follows the indices in lableDict.
np.mean (results,axis=0)

array([0.78321365, 0.765912  , 0.65866759, 0.77061711, 0.87988803,
       0.73828593])

In [39]:

# Single summary number: mean over all folds and all classes.
print('Average : ',np.mean (results))

Average :  0.7660973841940704


# Independent Test Dataset

In [81]:
# Fresh, untrained network for the final fit on the full training set.
# embedding_size is passed explicitly, exactly as in train_and_evaluate_model,
# so both models are constructed with the same arguments.
model=naive_CNN_classifier(len(lableDict),embedding_size)

# Append the trailing singleton axis to the test inputs. Built from
# testProtein_weights rather than from X_test itself so that re-running this
# cell cannot stack a second extra axis.
X_test=np.expand_dims(np.array(testProtein_weights),axis=-1)

In [85]:
# Final fit on the whole training set with a fixed 30 epochs and batch size 16
# (the cross-validation above used batch size 8 with early stopping).
# NOTE(review): the independent test set is passed as validation_data; no callback
# is given here, so it is only scored per epoch and recorded in `history`.
history=model.fit(X_train, y_train, validation_data=(X_test,y_test) ,epochs=30, batch_size=16,verbose=0)

In [86]:

# Per-class scores of the final model on the independent test set.
YtestPredicted=model.predict(X_test)

# NOTE: `results` is rebound here — from the cross-validation array to the test-set per-class list.
results,AccuracySingle,AccuracyMultiple=computeAbsoluteAccuracyPerLable(y_test, YtestPredicted)

In [89]:
results

[0.7751937984496124,
 0.8277027027027027,
 0.5422222222222223,
 0.5404040404040404,
 0.7058823529411765,
 0.5535714285714286]

In [91]:
# Unweighted mean of the per-class test-set scores.
print('Average : ',np.mean (results))

Average :  0.6574960908818639
