# Load Datasets

In [1]:
import numpy as np
import pandas as pd

In [2]:
trainDatasetFile='../../../data/REALoc/S1_dataset/5939pdataset/idLocation.txt'

In [3]:
testDatasetFile='../../../data/REALoc/S1_dataset/920pdataset/test_idLocation_code.txt'

In [4]:
testDataset=pd.read_csv(testDatasetFile,sep='|',header=None)

In [5]:
trainDataset=pd.read_csv(trainDatasetFile,sep='|',header=None)

In [6]:
testDataset.head()

Unnamed: 0,0,1
0,P00505,1 4
1,Q9NP58,1 3 4
2,Q96HD9,1 2
3,O43687,1
4,Q8N7J2,1


In [7]:
trainDataset.head()

Unnamed: 0,0,1
0,O95866,Cell_membrane
1,Q70Z44,Cell_membrane
2,Q5I7T1,Cell_membrane
3,O14514,Cell_membrane
4,Q9H159,Cell_membrane


# Preprocess Labels

In [8]:
# Column index of every subcellular-location name in the multi-hot label matrix.
# Defined first so the matrix width below is derived from it instead of a literal 6.
lableDict={'Cell_membrane':0,'Cytoplasm':1,'ER_Golgi':2,'Mitochondrion':3,'Nucleus':4,'Secreted':5}

trainLables=trainDataset[[1]]
# Row positions of proteins annotated with more than one location.
multiplexProteinsIndicesTrain=[]
# One row per protein, one column per location class; 1 marks an annotated location.
trainLablesOneHot=np.zeros(shape=(trainLables.shape[0],len(lableDict)))

# enumerate() yields the positional row number, which is what the NumPy matrix
# needs, regardless of the labels carried by the DataFrame index.
for rowPos,labelField in enumerate(trainLables[1]):
    # str.split() without an argument discards leading/trailing/repeated whitespace.
    # The previous split(' ') followed by [:-1] only worked when every field ended
    # in exactly one space and silently dropped the last label otherwise.
    lables=str(labelField).split()
    if len(lables)> 1:
        multiplexProteinsIndicesTrain.append(rowPos)
    for lable in lables:
        # An unknown location name raises KeyError here rather than being ignored.
        trainLablesOneHot[rowPos,lableDict[lable]]=1


In [9]:
trainLablesOneHot.shape

(5939, 6)

# Test Labels

In [10]:
testLables=testDataset[[1]]
# Row positions of test proteins annotated with more than one location.
multiplexProteinsIndices=[]
# Same column layout as the training label matrix: one column per entry of lableDict.
numClasses=len(lableDict)
testLablesOneHot=np.zeros(shape=(testLables.shape[0],numClasses))


# enumerate() yields the positional row number used to address the NumPy matrix.
for rowPos,labelField in enumerate(testLables[1]):
    # str.split() without an argument discards leading/trailing/repeated whitespace.
    # The previous split(' ') followed by [:-1] only worked when every field ended
    # in exactly one space and silently dropped the last code otherwise.
    lables=str(labelField).split()
    if len(lables)> 1:
        multiplexProteinsIndices.append(rowPos)
    for lable in lables:
        # Test labels are 1-based integer codes; shift them to 0-based columns.
        classColumn=int(lable)-1
        # Reject codes outside 1..numClasses: a code of 0 would otherwise become
        # column -1 and silently mark the last class.
        if not 0<=classColumn<numClasses:
            raise ValueError('label code %r in row %d is outside 1..%d'%(lable,rowPos,numClasses))
        testLablesOneHot[rowPos,classColumn]=1
        


In [11]:
testLablesOneHot.shape

(920, 6)

# Map UniProt IDs to STRING IDs

* STRING IDs are required for embedding generation

In [12]:
import sys
sys.path.append('../../../utils/')

In [14]:
from  utils import UniprotID_to_StringId

### Convert Train IDs

In [15]:
# Keep only the accession column (column 0) of the training table, name it
# 'uniprot_ac', and hand that one-column frame to UniprotID_to_StringId.
dfTrain=UniprotID_to_StringId(
    trainDataset[[0]].rename(columns={0:'uniprot_ac'})
)

Not Matched:  140


  return bound(*args, **kwds)


In [16]:
dfTrain.head()

Unnamed: 0,uniprot_ac,uniprot_ac_uniprot_id,string_id
0,O95866,O95866|G6B_HUMAN,9606.ENSP00000364964
1,Q70Z44,Q70Z44|5HT3D_HUMAN,9606.ENSP00000371929
2,Q5I7T1,Q5I7T1|AG10B_HUMAN,9606.ENSP00000310120
3,O14514,O14514|AGRB1_HUMAN,9606.ENSP00000430945
4,Q9H159,Q9H159|CAD19_HUMAN,9606.ENSP00000262150


### Convert Test IDs

In [17]:
# Keep only the accession column (column 0) of the test table, name it
# 'uniprot_ac', and hand that one-column frame to UniprotID_to_StringId.
dfTest=UniprotID_to_StringId(
    testDataset[[0]].rename(columns={0:'uniprot_ac'})
)

Not Matched:  18


In [18]:
dfTest.shape

(920, 3)

# Load TripletProt Network and generate embeddings

In [19]:
from  utils import generate_tripletProt_embeddings

### Generate Train Embeddings

In [20]:
dfTrain.head()

Unnamed: 0,uniprot_ac,uniprot_ac_uniprot_id,string_id
0,O95866,O95866|G6B_HUMAN,9606.ENSP00000364964
1,Q70Z44,Q70Z44|5HT3D_HUMAN,9606.ENSP00000371929
2,Q5I7T1,Q5I7T1|AG10B_HUMAN,9606.ENSP00000310120
3,O14514,O14514|AGRB1_HUMAN,9606.ENSP00000430945
4,Q9H159,Q9H159|CAD19_HUMAN,9606.ENSP00000262150


In [21]:
trainProtein_weights=generate_tripletProt_embeddings(dfTrain)







number of found:  5799


### Generate Test Embeddings

In [22]:
testProtein_weights=generate_tripletProt_embeddings(dfTest)

number of found:  902


In [23]:
testProtein_weights.shape

(920, 64)

#### Prepare Input Data

In [24]:
# Features: the generated embeddings copied into NumPy arrays.
X_test,X_train=np.array(testProtein_weights),np.array(trainProtein_weights)
# Targets: the multi-hot label matrices built in the label-preprocessing cells.
y_train,y_test=trainLablesOneHot,testLablesOneHot


#### Evaluation Metric

In [25]:
def computeAbsoluteAccuracyPerLable(Ytrue,Ypred):
    """Score top-k multi-label predictions per class and per sample group.

    For every sample the k highest-scoring classes of ``Ypred`` are taken as
    the predicted label set, where k is the number of non-zero entries in the
    matching row of ``Ytrue``. A sample counts as correct only when that
    predicted set equals the true set exactly.

    Parameters
    ----------
    Ytrue : array-like, shape (n_samples, n_classes)
        Multi-hot ground truth; any non-zero entry marks a true label.
    Ypred : array-like, shape (n_samples, n_classes)
        Per-class scores. Ties at the k-th position are broken arbitrarily by
        ``np.argpartition``.

    Returns
    -------
    results : list of float
        For each class that has at least one true sample, the fraction of its
        true samples for which the class was among the predicted labels.
        Classes without true samples are omitted, so the list can be shorter
        than ``n_classes``.
    singleAccuracy : float
        Exact-match accuracy over samples with exactly one true label, or
        ``nan`` when there is no such sample.
    multipleAccuracy : float
        Exact-match accuracy over samples with more than one true label, or
        ``nan`` when there is no such sample.

    Notes
    -----
    Samples without any true label are left out of both groups. Previously
    they were counted as always-wrong single-label samples, because slicing
    with ``[-0:]`` selected every class as predicted.
    """
    Ytrue=np.asarray(Ytrue)
    Ypred=np.asarray(Ypred)
    # Take the class count from the data instead of assuming six classes.
    numRows,numClasses=Ytrue.shape
    YpredOneHot=np.zeros(shape=Ypred.shape)
    countSingle=0
    countSingleCorrect=0
    countMultiple=0
    countMultipleCorrect=0

    for i in range(numRows):
        numLables=np.count_nonzero(Ytrue[i])
        if numLables==0:
            # Nothing to predict for this sample; keep its prediction row empty.
            continue
        # Indices of the numLables largest scores (order among them is irrelevant).
        topK=np.argpartition(Ypred[i],-numLables)[-numLables:]
        YpredOneHot[i,topK]=1
        # Exact match means the predicted and true masks differ nowhere.
        exactMatch=not np.logical_xor(YpredOneHot[i],Ytrue[i]).any()
        if numLables>1:
            countMultiple+=1
            if exactMatch:
                countMultipleCorrect+=1
        else:
            countSingle+=1
            if exactMatch:
                countSingleCorrect+=1

    results=[]
    for c in range(numClasses):
        numAllTrues=np.count_nonzero(Ytrue[:,c])
        # This class has no sample, so its score is undefined and is skipped.
        if numAllTrues==0:
            continue
        corrects=np.count_nonzero(np.logical_and(Ytrue[:,c],YpredOneHot[:,c]))
        results.append(corrects/numAllTrues)

    # Guard the group accuracies: an empty group used to raise ZeroDivisionError
    # and discard the per-class results along with it.
    singleAccuracy=countSingleCorrect/countSingle if countSingle else float('nan')
    multipleAccuracy=countMultipleCorrect/countMultiple if countMultiple else float('nan')
    return results,singleAccuracy,multipleAccuracy

        

### Train and Evaluate the model

In [26]:
from sklearn.model_selection import StratifiedKFold

from  model import naive_CNN_classifier

import keras

In [27]:
# Seed passed to StratifiedKFold inside train_and_evaluate_model.
random_seed = 1311
# Second argument given to naive_CNN_classifier; the test embeddings have 64 columns.
embedding_size = 64

In [30]:
#X_train=np.expand_dims(X_train,axis=-1)

In [28]:
def train_and_evaluate_model(W_train,Y_train,epochs,n_splits=5,early_stopping=False):
    """Cross-validate ``naive_CNN_classifier`` with stratified K-fold splits.

    Parameters
    ----------
    W_train : np.ndarray
        Feature matrix; rows are selected with the fold index arrays.
    Y_train : np.ndarray
        Multi-hot label matrix with one row per row of ``W_train``.
    epochs : int
        Number of epochs passed to ``model.fit`` for every fold.
    n_splits : int, optional
        Number of folds. Defaults to 5, the value that used to be hard-coded
        both in the splitter and in the final averaging.
    early_stopping : bool, optional
        When True, attach an ``EarlyStopping`` callback (monitoring
        ``val_loss``, patience 5, restoring the best weights). Defaults to
        False, which matches the previous behaviour where the callback was
        constructed but never passed to ``fit``.
        NOTE(review): the callback watches the same held-out fold that is
        scored afterwards, so enabling it makes the reported accuracy
        optimistic.

    Returns
    -------
    tuple
        ``(prec_list, meanSingleAccuracy, meanMultipleAccuracy)`` where
        ``prec_list`` holds the per-class result list of every fold and the
        two accuracies are the fold values summed and divided by ``n_splits``.
    """
    skf = StratifiedKFold(n_splits=n_splits, random_state=random_seed, shuffle=True)
    # StratifiedKFold needs a single class per sample, so multi-label rows are
    # stratified by the first (lowest-index) label that is set.
    stratify_on = np.argmax(Y_train, axis=1)

    prec_list = []
    single_scores = []
    multiple_scores = []
    model = None
    for train_index, test_index in skf.split(W_train, stratify_on):
        # Fold-local names avoid shadowing the notebook-level X_train / X_test.
        fold_X_train, fold_X_val = W_train[train_index], W_train[test_index]
        fold_y_train, fold_y_val = Y_train[train_index], Y_train[test_index]

        model = None  # Release the previous fold's network before building a new one.
        model = naive_CNN_classifier(len(lableDict), embedding_size)

        callbacks = None
        if early_stopping:
            # A fresh callback per fold so no state carries over between folds.
            callbacks = [keras.callbacks.EarlyStopping(
                monitor='val_loss', min_delta=0, patience=5, verbose=0,
                mode='auto', restore_best_weights=True)]

        model.fit(fold_X_train, fold_y_train,
                  validation_data=(fold_X_val, fold_y_val),
                  epochs=epochs, batch_size=8, verbose=1, callbacks=callbacks)

        fold_pred = model.predict(fold_X_val)
        per_class, acc_single, acc_multiple = computeAbsoluteAccuracyPerLable(fold_y_val, fold_pred)
        prec_list.append(per_class)
        single_scores.append(acc_single)
        multiple_scores.append(acc_multiple)

    return prec_list, (sum(single_scores)/n_splits), (sum(multiple_scores)/n_splits)

In [29]:
results,AccuracySingle,AccuracyMultiple=train_and_evaluate_model(X_train,y_train,100)


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 4751 samples, validate on 1188 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Train on 4751 samples, validate on 1188 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Train on 4751 samples, validate on 1188 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Train on 4751 samples, validate on 1188 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/



In [30]:
results=np.array(results)

In [31]:
lableDict

{'Cell_membrane': 0,
 'Cytoplasm': 1,
 'ER_Golgi': 2,
 'Mitochondrion': 3,
 'Nucleus': 4,
 'Secreted': 5}

In [32]:
np.mean (results,axis=0)

array([0.79076431, 0.73816467, 0.65686607, 0.75974755, 0.88044519,
       0.73947941])

In [33]:

print('Average : ',np.mean (results))

Average :  0.7609111992654668


# Independent Test Dataset

In [37]:
# Drop whatever `model` was bound to before, then build a new classifier from
# the number of classes in lableDict and the embedding width.
model = None
model = naive_CNN_classifier(len(lableDict), embedding_size)

# Disabled reshape of the test features, kept for reference:
#X_test=np.expand_dims(X_test,axis=-1)

In [38]:
# Fit on the complete training set. The independent test split is supplied as
# validation data; no callbacks are attached and Keras progress output is off.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=30,
    batch_size=16,
    verbose=0,
)

In [39]:

# Score the independent test set with the same metric used in cross-validation.
# Note that this rebinds `results`, `AccuracySingle` and `AccuracyMultiple`,
# which previously held the cross-validation values.
YtestPredicted = model.predict(X_test)
results, AccuracySingle, AccuracyMultiple = computeAbsoluteAccuracyPerLable(
    y_test, YtestPredicted
)

In [40]:
results

[0.748062015503876,
 0.8141891891891891,
 0.5555555555555556,
 0.5757575757575758,
 0.7058823529411765,
 0.6517857142857143]

In [41]:
print('Average : ',np.mean (results))

Average :  0.6752054005388478
