In [1]:
import pandas as pd

import numpy as np

import pickle

# Load Datasets

In [2]:
#Reference of the dataset from DeepGO
# ! tar -xvzf  CAFA3_training_data/data12G/data/train.tar.gz

### Load Biological Process Terms

In [3]:
trainBP_terms=pd.read_pickle('../../../data/DeepGo/train-bp.pkl')

trainBP_terms=trainBP_terms[trainBP_terms['orgs']=='9606']

testBP_terms=pd.read_pickle('../../../data/DeepGo/test-bp.pkl')

testBP_terms=testBP_terms[testBP_terms['orgs']=='9606']

trainBP_terms.columns=['uniprot_ac', 'gos', 'labels', 'ngrams', 'proteins', 'sequences','orgs', 'embeddings']

testBP_terms.columns=['uniprot_ac', 'gos', 'labels', 'ngrams', 'proteins', 'sequences','orgs', 'embeddings']

# load embeddings

In [13]:
from numpy import load

data = load('../../../data/ProtVec_Kmer_baseline/k_mer_l1_norm.npz')

k_mer_l1_norm=data['arr_0']

embeddings=k_mer_l1_norm

accessions=pd.read_csv('../../../data/ProtVec_Kmer_baseline/accessions.txt',header=None)

embedding_size=embeddings.shape[1]
trainProtein_weights = np.zeros((trainBP_terms.shape[0], embedding_size))
testProtein_weights = np.zeros((testBP_terms.shape[0], embedding_size))

for i,prot in enumerate(trainBP_terms['uniprot_ac']):  
    trainProtein_weights[i]=embeddings[accessions.loc[accessions[0]==prot].index[0]]

for i,prot in enumerate(testBP_terms['uniprot_ac']):  
    testProtein_weights[i]=embeddings [accessions.loc[accessions[0]==prot].index[0]]

### Prepare Input Data

In [15]:
trainBP_terms.columns

Index(['uniprot_ac', 'gos', 'labels', 'ngrams', 'proteins', 'sequences',
       'orgs', 'embeddings'],
      dtype='object')

In [26]:
Ytrain=trainBP_terms[['labels']]
Ytrain=Ytrain.labels.apply(pd.Series)
Ytrain=np.array(Ytrain)

X_train=trainProtein_weights
X_train=np.array(X_train)
X_train=np.expand_dims(X_train,axis=-1)

Ytest=testBP_terms[['labels']]
Ytest=Ytest.labels.apply(pd.Series)
Ytest=np.array(Ytest)

X_test=testProtein_weights
X_test=np.array(X_test)
X_test=np.expand_dims(X_test,axis=-1)

### Train and Evaluate the model

In [27]:
import sys
sys.path.append('../../../utils/')

from sklearn.model_selection import StratifiedKFold

from  model import naive_CNN_classifier

import keras

from sklearn.metrics import label_ranking_average_precision_score

from numpy import arange

from sklearn.metrics import f1_score


In [28]:
random_seed=13

In [29]:
num_classes=Ytrain.shape[1]

In [33]:
model=None
model = naive_CNN_classifier(num_classes,embedding_size)

In [34]:
history=model.fit(X_train, Ytrain, validation_data=(X_test,Ytest) ,epochs=50, batch_size=16,verbose=0)
YtestPredicted_raw=model.predict(X_test)
avePrec =label_ranking_average_precision_score(Ytest, YtestPredicted_raw) 

In [35]:
avePrec

0.362192008866414

In [36]:
results={'treshold':[],'Average Precision':[],'F1 (micro)':[],'F1 (macro)':[],'Method':[]}

In [37]:
for treshold in arange(0.05,0.55,0.05):  
    YtestPredicted=None
    YtestPredicted=YtestPredicted_raw.copy()
    results['treshold'].append(treshold)
    results['Average Precision'].append(avePrec)
    YtestPredicted[YtestPredicted>=treshold]=1
    YtestPredicted[YtestPredicted<treshold]=0
    results['F1 (micro)'].append  (f1_score(Ytest, YtestPredicted, average='micro'))
    results['F1 (macro)'].append  (f1_score(Ytest, YtestPredicted, average='macro'))
    results['Method'].append ('Kmers')


  average, "true nor predicted", 'F-score is', len(true_sum)


In [38]:
df_results=pd.DataFrame(results)

In [39]:
df_results.head(50)

Unnamed: 0,treshold,Average Precision,F1 (micro),F1 (macro),Method
0,0.05,0.362192,0.269304,0.100064,Kmers
1,0.1,0.362192,0.32213,0.094687,Kmers
2,0.15,0.362192,0.33886,0.084399,Kmers
3,0.2,0.362192,0.338895,0.075888,Kmers
4,0.25,0.362192,0.330172,0.069433,Kmers
5,0.3,0.362192,0.315682,0.063465,Kmers
6,0.35,0.362192,0.296381,0.057593,Kmers
7,0.4,0.362192,0.276691,0.052838,Kmers
8,0.45,0.362192,0.255384,0.048423,Kmers
9,0.5,0.362192,0.233246,0.044197,Kmers


In [41]:
df_results.to_csv('../../../data/_Outputs/Final_Kmers_BP.csv',index=False)