In [1]:
import numpy as np
import pandas as pd

## Load Dataset

In [2]:
import scipy.io
D1 = scipy.io.loadmat('../../../data/MkSVM/dataset_3106.mat',squeeze_me=True)

### Details of the Dataset

In [3]:
type(D1)

dict

In [4]:
D1.keys()

dict_keys(['__header__', '__version__', '__globals__', 'Y_3106', 'label_name', 'protein_list', 'sequence_3106'])

In [5]:
D1['sequence_3106'][0]

'MFRRKLTALDYHNPAGFNCKDETEFRNFIVWLEDQKIRHYKIEDRGNLRNIHSSDWPKFFEKYLRDVNCPFKIQDRQEAIDWLLGLAVRLEYGDNAEKYKDLVPDNSKTADNATKNAEPLINLDVNNPDFKAGVMALANLLQIQRHDDYLVMLKAIRILVQERLTQDAVAKANQTKEGLPVALDKHILGFDTGDAVLNEAAQILRLLHIEELRELQTKINEAIVAVQAIIADPKTDHRLGKVGR'

In [6]:
D1['Y_3106'][0]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype=uint8)

In [7]:
D1['Y_3106']

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]], dtype=uint8)

In [8]:
len(D1['sequence_3106'])

3106

#### Number of multiplex proteins

In [9]:
len(np.nonzero(np.sum(D1['Y_3106'],axis=1)>1)[0])

526

In [10]:
D1['label_name']

array(['Centrosome', 'Cytoplasm', 'Cytoskeleton', 'Endosome',
       'Endoplasmic-Reticulum', 'Extracellular', 'Golgi-Apparatus',
       'Lysosome', 'Microsome', 'Mitochondrion', 'Nucleus', 'Peroxisome',
       'Plasma-Membrane', 'Synapse'], dtype=object)

In [11]:
D1['protein_list'][0]

'Q9Y224'

### Extract Proteins from the Dataset

In [12]:
dfTrain=pd.DataFrame(D1['protein_list'],columns=['uniprot_ac'])

In [13]:
dfTrain.shape

(3106, 1)

## Map Uniprot 2 String

* STring IDs are required for embedding generation

In [14]:
import sys
sys.path.append('../../../utils/')

In [16]:
from  utils import UniprotID_to_StringId

In [17]:
dfTrain.columns=['uniprot_ac']
dfTrain=UniprotID_to_StringId(dfTrain)

Not Matched:  55


  return bound(*args, **kwds)


In [18]:
dfTrain.shape

(3106, 3)

In [19]:
dfTrain.head()

Unnamed: 0,uniprot_ac,uniprot_ac_uniprot_id,string_id
0,Q9Y224,Q9Y224|RTRAF_HUMAN,9606.ENSP00000261700
1,Q5JTW2,Q5JTW2|CEP78_HUMAN,9606.ENSP00000365782
2,Q7Z460,Q7Z460|CLAP1_HUMAN,9606.ENSP00000263710
3,Q5VT06,Q5VT06|CE350_HUMAN,9606.ENSP00000356579
4,Q5VYK3,Q5VYK3|ECM29_HUMAN,9606.ENSP00000259335


In [20]:
# all proteins belong to human
len(np.nonzero (dfTrain['string_id'].str.contains('9606'))[0])

3106

# Load TripletProt Network and generate embeddings

In [21]:
from  utils import generate_tripletProt_embeddings

### Generate Train Embeddings

In [22]:
trainProtein_weights=generate_tripletProt_embeddings(dfTrain)







number of found:  3051


In [23]:
trainProtein_weights.shape

(3106, 64)

In [24]:
X_train=trainProtein_weights

In [25]:
X_train=np.array(X_train)

In [26]:
y_train=D1['Y_3106']

In [27]:
y_train.shape

(3106, 14)

In [28]:
y_train[0]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype=uint8)

### Train and Evaluate the model

In [44]:
from sklearn.model_selection import StratifiedKFold

from  model import naive_CNN_classifier

from sklearn.metrics import label_ranking_average_precision_score

import keras


In [None]:
X_train=np.expand_dims(X_train,axis=-1)

X_train.shape

random_seed=1311

embedding_size=64

class_count=y_train.shape[1]

In [60]:
def train_and_evaluate_model(W_train,Y_train):
    prec_list=[]; reca_list=[]; fscore_list=[] ; fold=0
    skf = StratifiedKFold(n_splits=5, random_state=random_seed)
    
    all_histories=[]
    Y = [np.argmax(y, axis=None, out=None) for y in Y_train]
    for train_index, test_index in skf.split(W_train,Y):     
        fold+=1
        X_train, X_test = W_train[train_index], W_train[test_index] 
        y_train, y_test = Y_train[train_index], Y_train[test_index]
        model = None # Clearing the NN.
        #model = build_model()
        #model = buildModel()
        model=naive_CNN_classifier(class_count,embedding_size)
        erary_stop=keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto', restore_best_weights=True)
        history=model.fit(X_train, y_train, validation_data=(X_test,y_test) ,epochs=100, batch_size=16,verbose=0,callbacks=[erary_stop])
        
        all_histories.append(history)
        
        YtestPredicted=model.predict(X_test)
        
        avePrec =label_ranking_average_precision_score(y_test, YtestPredicted) 
        
        print("Fold {:d}: Precision:{:.2f}% ".format(fold,avePrec*100))
        
        prec_list.append(avePrec) 
    
    precission=sum(prec_list)/len(prec_list)*100 
    print("Final: Precision:{:.2f}% ".format(precission ))

In [61]:
X_train.shape

(3106, 64, 1)

In [62]:

train_and_evaluate_model(X_train,y_train)




Fold 1: Precision:76.75% 
Fold 2: Precision:77.23% 
Fold 3: Precision:76.44% 
Fold 4: Precision:76.96% 
Fold 5: Precision:77.65% 
Final: Precision:77.01% 
