In [1]:
import numpy as np
import pandas as pd

## Load Dataset

In [19]:
import scipy.io
D1 = scipy.io.loadmat('../../../data/MkSVM/dataset_3106.mat',squeeze_me=True)

### Details of the Dataset

In [20]:
type(D1)

dict

In [21]:
D1.keys()

dict_keys(['__header__', '__version__', '__globals__', 'Y_3106', 'label_name', 'protein_list', 'sequence_3106'])

In [22]:
D1['sequence_3106'][0]

'MFRRKLTALDYHNPAGFNCKDETEFRNFIVWLEDQKIRHYKIEDRGNLRNIHSSDWPKFFEKYLRDVNCPFKIQDRQEAIDWLLGLAVRLEYGDNAEKYKDLVPDNSKTADNATKNAEPLINLDVNNPDFKAGVMALANLLQIQRHDDYLVMLKAIRILVQERLTQDAVAKANQTKEGLPVALDKHILGFDTGDAVLNEAAQILRLLHIEELRELQTKINEAIVAVQAIIADPKTDHRLGKVGR'

In [23]:
D1['Y_3106'][0]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype=uint8)

In [24]:
D1['Y_3106']

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]], dtype=uint8)

In [25]:
len(D1['sequence_3106'])

3106

#### Number of multiplex proteins

In [26]:
len(np.nonzero(np.sum(D1['Y_3106'],axis=1)>1)[0])

526

In [27]:
D1['label_name']

array(['Centrosome', 'Cytoplasm', 'Cytoskeleton', 'Endosome',
       'Endoplasmic-Reticulum', 'Extracellular', 'Golgi-Apparatus',
       'Lysosome', 'Microsome', 'Mitochondrion', 'Nucleus', 'Peroxisome',
       'Plasma-Membrane', 'Synapse'], dtype=object)

In [28]:
D1['protein_list'][0]

'Q9Y224'

### Extract Proteins from the Dataset

In [29]:
dfTrain=pd.DataFrame(D1['protein_list'],columns=['uniprot_ac'])

In [30]:
dfTrain.shape

(3106, 1)

## Map Uniprot 2 String

* STring IDs are required for embedding generation

In [31]:
import sys
sys.path.append('../../../utils/')

In [32]:
from  utils import UniprotID_to_StringId

In [33]:
dfTrain.columns=['uniprot_ac']
dfTrain=UniprotID_to_StringId(dfTrain)

Not Matched:  55


In [34]:
dfTrain.shape

(3106, 3)

In [35]:
dfTrain.head()

Unnamed: 0,uniprot_ac,uniprot_ac_uniprot_id,string_id
0,Q9Y224,Q9Y224|RTRAF_HUMAN,9606.ENSP00000261700
1,Q5JTW2,Q5JTW2|CEP78_HUMAN,9606.ENSP00000365782
2,Q7Z460,Q7Z460|CLAP1_HUMAN,9606.ENSP00000263710
3,Q5VT06,Q5VT06|CE350_HUMAN,9606.ENSP00000356579
4,Q5VYK3,Q5VYK3|ECM29_HUMAN,9606.ENSP00000259335


In [36]:
# all proteins belong to human
len(np.nonzero (dfTrain['string_id'].str.contains('9606'))[0])

3106

# Load TripletProt Network and generate embeddings

In [37]:
from  utils import generate_tripletProt_embeddings

### Generate Train Embeddings

In [38]:
trainProtein_weights=generate_tripletProt_embeddings(dfTrain)







number of found:  3051


In [39]:
trainProtein_weights.shape

(3106, 64)

In [40]:
X_train=trainProtein_weights

In [41]:
X_train=np.array(X_train)

In [42]:
y_train=D1['Y_3106']

In [43]:
y_train.shape

(3106, 14)

In [44]:
y_train[0]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype=uint8)

### Train and Evaluate the model

In [45]:
from sklearn.model_selection import StratifiedKFold

from  model import feedForward_classifier

from sklearn.metrics import label_ranking_average_precision_score

import keras


In [58]:
embedding_size=64

class_count=y_train.shape[1]

In [59]:
def train_and_evaluate_model(W_train,Y_train,Kfold_repeat):
    prec_list=[]; reca_list=[]; fscore_list=[] ; fold=0
    random_seed=1311
    all_histories=[]
    Y = [np.argmax(y, axis=None, out=None) for y in Y_train]
    for i in range (Kfold_repeat):
        random_seed=random_seed*i
        skf = StratifiedKFold(n_splits=5, random_state=random_seed,shuffle=True)
        fold=0
        for train_index, test_index in skf.split(W_train,Y):     
            fold+=1
            X_train, X_test = W_train[train_index], W_train[test_index] 
            y_train, y_test = Y_train[train_index], Y_train[test_index]
            model = None # Clearing the NN.
            model=feedForward_classifier(class_count,embedding_size)
            history=model.fit(X_train, y_train ,epochs=50, batch_size=32,verbose=0)

            all_histories.append(history)

            YtestPredicted=model.predict(X_test)

            avePrec =label_ranking_average_precision_score(y_test, YtestPredicted) 

            print("Repeat {:d}: Fold {:d}: Precision:{:.2f}% ".format(i+1,fold,avePrec*100))

            prec_list.append(avePrec) 

    precission=sum(prec_list)/len(prec_list)*100 
    print("Final: Precision:{:.2f}% ".format(precission ))

In [60]:

train_and_evaluate_model(X_train,y_train,Kfold_repeat=5)


Repeat 1: Fold 1: Precision:74.96% 
Repeat 1: Fold 2: Precision:75.70% 
Repeat 1: Fold 3: Precision:79.24% 
Repeat 1: Fold 4: Precision:77.13% 
Repeat 1: Fold 5: Precision:77.94% 
Repeat 2: Fold 1: Precision:75.40% 
Repeat 2: Fold 2: Precision:76.26% 
Repeat 2: Fold 3: Precision:78.86% 
Repeat 2: Fold 4: Precision:78.24% 
Repeat 2: Fold 5: Precision:78.19% 
Repeat 3: Fold 1: Precision:74.95% 
Repeat 3: Fold 2: Precision:76.31% 
Repeat 3: Fold 3: Precision:78.90% 
Repeat 3: Fold 4: Precision:77.21% 
Repeat 3: Fold 5: Precision:78.21% 
Repeat 4: Fold 1: Precision:75.76% 
Repeat 4: Fold 2: Precision:76.14% 
Repeat 4: Fold 3: Precision:79.22% 
Repeat 4: Fold 4: Precision:77.15% 
Repeat 4: Fold 5: Precision:76.91% 
Repeat 5: Fold 1: Precision:75.82% 
Repeat 5: Fold 2: Precision:76.39% 
Repeat 5: Fold 3: Precision:77.98% 
Repeat 5: Fold 4: Precision:77.86% 
Repeat 5: Fold 5: Precision:78.05% 
Final: Precision:77.15% 
