In [1]:
import pandas as pd

import numpy as np

import pickle

# Load Datasets

In [2]:
#Reference of the dataset from DeepGO
# ! tar -xvzf  CAFA3_training_data/data12G/data/train.tar.gz

### Load Molecular Function Terms

In [3]:
trainMF_terms=pd.read_pickle('../../../data/DeepGo/train-mf.pkl')

trainMF_terms=trainMF_terms[trainMF_terms['orgs']=='9606']

testMF_terms=pd.read_pickle('../../../data/DeepGo/test-mf.pkl')

testMF_terms=testMF_terms[testMF_terms['orgs']=='9606']

trainMF_terms.columns=['uniprot_ac', 'gos', 'labels', 'ngrams', 'proteins', 'sequences','orgs', 'embeddings']

testMF_terms.columns=['uniprot_ac', 'gos', 'labels', 'ngrams', 'proteins', 'sequences','orgs', 'embeddings']

# load embeddings

In [4]:
from numpy import load

data = load('../../../data/ProtVec_Kmer_baseline/k_mer_l1_norm.npz')

k_mer_l1_norm=data['arr_0']

embeddings=k_mer_l1_norm

accessions=pd.read_csv('../../../data/ProtVec_Kmer_baseline/accessions.txt',header=None)

embedding_size=embeddings.shape[1]
trainProtein_weights = np.zeros((trainMF_terms.shape[0], embedding_size))
testProtein_weights = np.zeros((testMF_terms.shape[0], embedding_size))

for i,prot in enumerate(trainMF_terms['uniprot_ac']):  
    trainProtein_weights[i]=embeddings[accessions.loc[accessions[0]==prot].index[0]]

for i,prot in enumerate(testMF_terms['uniprot_ac']):  
    testProtein_weights[i]=embeddings [accessions.loc[accessions[0]==prot].index[0]]

### Prepare Input Data

In [5]:
trainMF_terms.columns

Index(['uniprot_ac', 'gos', 'labels', 'ngrams', 'proteins', 'sequences',
       'orgs', 'embeddings'],
      dtype='object')

In [6]:
Ytrain=trainMF_terms[['labels']]
Ytrain=Ytrain.labels.apply(pd.Series)
Ytrain=np.array(Ytrain)

X_train=trainProtein_weights
X_train=np.array(X_train)
X_train=np.expand_dims(X_train,axis=-1)

Ytest=testMF_terms[['labels']]
Ytest=Ytest.labels.apply(pd.Series)
Ytest=np.array(Ytest)

X_test=testProtein_weights
X_test=np.array(X_test)
X_test=np.expand_dims(X_test,axis=-1)

### Train and Evaluate the model

In [7]:
import sys
sys.path.append('../../../utils/')

from sklearn.model_selection import StratifiedKFold

from  model import naive_CNN_classifier

import keras

from sklearn.metrics import label_ranking_average_precision_score

from numpy import arange

from sklearn.metrics import f1_score


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [8]:
random_seed=13

In [9]:
num_classes=Ytrain.shape[1]

In [10]:
model=None
model = naive_CNN_classifier(num_classes,embedding_size)






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [11]:
history=model.fit(X_train, Ytrain, validation_data=(X_test,Ytest) ,epochs=50, batch_size=16,verbose=0)
YtestPredicted_raw=model.predict(X_test)
avePrec =label_ranking_average_precision_score(Ytest, YtestPredicted_raw) 




In [12]:
avePrec

0.42940313603660907

In [13]:
results={'treshold':[],'Average Precision':[],'F1 (micro)':[],'F1 (macro)':[],'Method':[]}

In [14]:
for treshold in arange(0.05,0.55,0.05):  
    YtestPredicted=None
    YtestPredicted=YtestPredicted_raw.copy()
    results['treshold'].append(treshold)
    results['Average Precision'].append(avePrec)
    YtestPredicted[YtestPredicted>=treshold]=1
    YtestPredicted[YtestPredicted<treshold]=0
    results['F1 (micro)'].append  (f1_score(Ytest, YtestPredicted, average='micro'))
    results['F1 (macro)'].append  (f1_score(Ytest, YtestPredicted, average='macro'))
    results['Method'].append ('Kmers')


  average, "true nor predicted", 'F-score is', len(true_sum)


In [15]:
df_results=pd.DataFrame(results)

In [16]:
df_results.head(50)

Unnamed: 0,treshold,Average Precision,F1 (micro),F1 (macro),Method
0,0.05,0.429403,0.266028,0.059237,Kmers
1,0.1,0.429403,0.331393,0.053667,Kmers
2,0.15,0.429403,0.352304,0.046796,Kmers
3,0.2,0.429403,0.355406,0.041925,Kmers
4,0.25,0.429403,0.352766,0.037438,Kmers
5,0.3,0.429403,0.344828,0.034914,Kmers
6,0.35,0.429403,0.329538,0.032561,Kmers
7,0.4,0.429403,0.316301,0.030733,Kmers
8,0.45,0.429403,0.296672,0.028213,Kmers
9,0.5,0.429403,0.27677,0.025524,Kmers


In [17]:
df_results.to_csv('../../../data/_Outputs/Final_Kmers_MF.csv',index=False)