In [1]:
import pandas as pd

import numpy as np

import pickle

# Load Datasets

In [2]:
# The dataset comes from DeepGO; the command below (left commented out) extracts the training archive.
# ! tar -xvzf  CAFA3_training_data/data12G/data/train.tar.gz

### Load Biological Process Terms

In [3]:
# Column names applied positionally to both DeepGO frames after filtering.
BP_COLUMNS = ['uniprot_ac', 'gos', 'labels', 'ngrams', 'proteins', 'sequences', 'orgs', 'embeddings']


def load_org_9606_terms(pickle_path):
    """Read a DeepGO pickle, keep the rows whose ``orgs`` is ``'9606'`` and rename the columns.

    Parameters
    ----------
    pickle_path : str
        Path of the pickled DataFrame to read.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with its columns renamed to ``BP_COLUMNS``.

    Raises
    ------
    ValueError
        If the frame does not have exactly ``len(BP_COLUMNS)`` columns.
    """
    # NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
    terms = pd.read_pickle(pickle_path)
    # NOTE(review): '9606' is presumably the organism id for human -- confirm against the dataset docs.
    terms = terms[terms['orgs'] == '9606']
    terms.columns = list(BP_COLUMNS)
    return terms


trainBP_terms = load_org_9606_terms('../../../data/DeepGo/train-bp.pkl')

testBP_terms = load_org_9606_terms('../../../data/DeepGo/test-bp.pkl')

# Load DeepGO embeddings

In [5]:
# Number of columns reserved per protein in the embedding matrices below.
embedding_size = 256

# Pre-allocate one all-zero row per protein; a later cell overwrites each row
# with the protein's `embeddings` entry.
n_train_proteins, n_test_proteins = len(trainBP_terms), len(testBP_terms)

trainProtein_weights = np.zeros((n_train_proteins, embedding_size))

testProtein_weights = np.zeros((n_test_proteins, embedding_size))




In [9]:
# Sanity check: display the column labels of the training frame.
trainBP_terms.columns

Index(['uniprot_ac', 'gos', 'labels', 'ngrams', 'proteins', 'sequences',
       'orgs', 'embeddings'],
      dtype='object')

In [27]:
def _copy_embeddings_into(weights, terms):
    """Write every entry of ``terms['embeddings']`` into the matching row of ``weights``.

    Parameters
    ----------
    weights : numpy.ndarray
        Pre-allocated 2-D array with one row per row of ``terms``; modified in place.
    terms : pandas.DataFrame
        Frame holding an ``embeddings`` column (the 8th column of the DeepGO frames).

    Notes
    -----
    The column is addressed by name rather than with ``terms.iloc[i][7]``:
    integer keys on a string-labelled Series are deprecated in pandas, and
    building a full row Series per protein is needlessly slow.
    """
    # Iterating the column yields the rows in positional order, exactly like iloc[i].
    for row, embedding in enumerate(terms['embeddings']):
        weights[row] = embedding


_copy_embeddings_into(trainProtein_weights, trainBP_terms)

_copy_embeddings_into(testProtein_weights, testBP_terms)

In [29]:
def _add_trailing_axis(weights):
    """Return a copy of ``weights`` with an extra axis of length 1 appended.

    The copy is deliberate: the result must not alias the weight matrix, so
    re-filling that matrix later cannot silently change the model input.
    """
    return np.expand_dims(np.array(weights), axis=-1)


def _label_matrix(terms):
    """Stack the per-row ``labels`` entries of ``terms`` into one 2-D array.

    Replaces ``labels.apply(pd.Series)``, which builds one Series per row and
    silently pads ragged rows with NaN.

    NOTE(review): assumes every ``labels`` entry is a sequence of the same
    length -- confirm against the dataset.
    """
    return np.array(terms['labels'].tolist())


X_train = _add_trailing_axis(trainProtein_weights)
Ytrain = _label_matrix(trainBP_terms)

X_test = _add_trailing_axis(testProtein_weights)
Ytest = _label_matrix(testBP_terms)


# +++++++++++++++++++++++++++++++++++++++++++++

### Train and Evaluate the model

In [30]:
import sys
sys.path.append('../../../utils/')

from sklearn.model_selection import StratifiedKFold

from  model import naive_CNN_classifier

import keras

from sklearn.metrics import label_ranking_average_precision_score

from numpy import arange

from sklearn.metrics import f1_score


In [31]:
# Seed used for this run. It was previously assigned but never applied, so it
# had no effect; it now seeds NumPy's global random generator.
# NOTE(review): the backend behind `naive_CNN_classifier` may keep its own
# random generator -- seed that as well if exact reproducibility is required.
random_seed = 13
np.random.seed(random_seed)

In [32]:
# Number of model outputs: the width (second dimension) of the training label matrix.
num_classes=Ytrain.shape[1]

In [33]:
# NOTE(review): rebinding to None first presumably releases a model left over
# from an earlier run of this cell before a new one is built -- confirm.
model=None
# Build the classifier imported from the local `model` module, sized by the
# number of label columns and the embedding width.
model = naive_CNN_classifier(num_classes,embedding_size)

In [34]:
# Train silently; the test split is also passed as validation data, so the
# validation entries in `history` are computed on the test set.
fit_options = dict(validation_data=(X_test, Ytest), epochs=50, batch_size=16, verbose=0)
history = model.fit(X_train, Ytrain, **fit_options)

# Unthresholded model outputs for the test split, scored with label-ranking
# average precision (no threshold needed).
YtestPredicted_raw = model.predict(X_test)
avePrec = label_ranking_average_precision_score(Ytest, YtestPredicted_raw)

In [35]:
# Display the label-ranking average precision obtained on the test split.
avePrec

0.40649887716258626

In [36]:
# Column-oriented accumulator: one independent, initially empty list per
# result column. The keys become the CSV header, so their spelling
# (including 'treshold') is kept as is.
results = {
    column: []
    for column in ('treshold', 'Average Precision', 'F1 (micro)', 'F1 (macro)', 'Method')
}

In [37]:
# Sweep the decision threshold from 0.05 to 0.50 in steps of 0.05 and record
# one row of metrics per threshold in `results`.
for treshold in arange(0.05, 0.55, 0.05):
    # NOTE(review): rebinding to None first presumably lets the previous
    # iteration's array be freed before the new copy is made -- confirm.
    YtestPredicted = None
    YtestPredicted = YtestPredicted_raw.copy()

    results['treshold'].append(treshold)
    # The ranking metric does not depend on the threshold; it is repeated per row.
    results['Average Precision'].append(avePrec)

    # Binarise the copy in place: scores at or above the threshold become 1,
    # the remaining scores become 0.
    at_or_above = YtestPredicted >= treshold
    YtestPredicted[at_or_above] = 1
    YtestPredicted[YtestPredicted < treshold] = 0

    for column, averaging in (('F1 (micro)', 'micro'), ('F1 (macro)', 'macro')):
        results[column].append(f1_score(Ytest, YtestPredicted, average=averaging))
    results['Method'].append('DeepGO')


  average, "true nor predicted", 'F-score is', len(true_sum)


In [38]:
# Turn the column-oriented accumulator into a table (one row per threshold).
df_results = pd.DataFrame(data=results)

In [39]:
# Show up to the first 50 rows of the metrics table.
df_results.head(50)

Unnamed: 0,treshold,Average Precision,F1 (micro),F1 (macro),Method
0,0.05,0.406499,0.322007,0.187423,DeepGO
1,0.1,0.406499,0.368999,0.204128,DeepGO
2,0.15,0.406499,0.391499,0.2096,DeepGO
3,0.2,0.406499,0.403063,0.208598,DeepGO
4,0.25,0.406499,0.407743,0.205537,DeepGO
5,0.3,0.406499,0.408565,0.201766,DeepGO
6,0.35,0.406499,0.405027,0.197203,DeepGO
7,0.4,0.406499,0.39949,0.192764,DeepGO
8,0.45,0.406499,0.388711,0.186258,DeepGO
9,0.5,0.406499,0.37826,0.180334,DeepGO


In [40]:
# Write the metrics table to disk without the index column.
output_csv = '../../../data/_Outputs/Final_DeepGO_BP.csv'
df_results.to_csv(output_csv, index=False)