In [1]:
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score,roc_curve,precision_recall_curve
import pandas as pd
# Directory prefix for the saved model predictions (results_<model>_<species>_50.npy).
PATH_TO_RESULTS='results/'
# Directory prefix for the per-species train/test/valid CSV splits.
PATH_TO_CSV='data/'

In [2]:
def MCC(predictions,Yt,thres=.5):
    """Matthews correlation coefficient of thresholded scores against binary labels.

    Parameters
    ----------
    predictions : array-like of float
        Scores; an entry is a positive call when it is ``>= thres`` and a
        negative call when it is ``< thres``.
    Yt : array-like of 0/1
        Ground-truth labels, aligned element-wise with ``predictions``.
    thres : float, optional
        Decision threshold. Defaults to 0.5, the value that used to be
        hard-coded.

    Returns
    -------
    float
        The MCC in [-1, 1]. Returns 0.0 when the coefficient is undefined,
        i.e. when any row or column of the confusion matrix is empty
        (for example when every score falls on one side of the threshold).
    """
    predictions=np.asarray(predictions)
    Yt=np.asarray(Yt)
    called_pos=predictions>=thres
    called_neg=predictions<thres
    # Counts are converted to Python floats on purpose: with integer labels the
    # sums are fixed-width numpy integers, and the four-factor product below
    # silently wraps around for datasets of only ~10^5 rows, yielding NaN.
    tp=float(np.sum(called_pos*Yt))
    fp=float(np.sum(called_pos*(1-Yt)))
    tn=float(np.sum(called_neg*(1-Yt)))
    fn=float(np.sum(called_neg*Yt))
    denominator=(tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)
    if denominator==0:
        # 0/0 case: follow the usual convention of reporting 0 instead of NaN.
        return 0.0
    mcc=(tp*tn-fp*fn)/np.sqrt(denominator)
    return mcc

In [3]:
def F1_score(predictions,Yt,thres=.5):
    """F1 score of thresholded scores against binary labels.

    Parameters
    ----------
    predictions : array-like of float
        Scores; an entry is a positive call when it is ``>= thres``.
    Yt : array-like of 0/1
        Ground-truth labels, aligned element-wise with ``predictions``.
    thres : float, optional
        Decision threshold. Defaults to 0.5, the value that used to be
        hard-coded.

    Returns
    -------
    float
        ``2*tp / (2*tp + fn + fp)``. Returns 0.0 when that denominator is zero
        (no positive labels and no positive calls) instead of NaN.
    """
    predictions=np.asarray(predictions)
    Yt=np.asarray(Yt)
    called_pos=predictions>=thres
    tp=np.sum(called_pos*Yt)
    fp=np.sum(called_pos*(1-Yt))
    fn=np.sum((predictions<thres)*Yt)
    # True negatives do not enter the F1 formula, so they are not counted.
    denominator=2*tp+fn+fp
    if denominator==0:
        # 0/0 case: nothing to find and nothing called; report 0 rather than NaN.
        return 0.0
    f1=(2*tp)/denominator
    return f1

In [4]:

def ACC_F1_AUC_AP(predictions,Yt):
    """Compute accuracy, F1, ROC-AUC and average precision, each rounded to 3 decimals.

    Entries whose prediction is NaN are dropped (together with their labels)
    before any metric is computed.

    Parameters
    ----------
    predictions : array-like of float
        Scores; accuracy and F1 treat a score ``>= 0.5`` as a positive call.
    Yt : array-like of 0/1
        Ground-truth labels, aligned element-wise with ``predictions``.

    Returns
    -------
    tuple
        ``(acc, f1_score, roc_auc, aver_precision_score)``. ``roc_auc`` and
        ``aver_precision_score`` fall back to 0 when scikit-learn rejects the
        input with ``ValueError``; that 0 is a sentinel, not a real score.
    """
    predictions=np.asarray(predictions)
    Yt=np.asarray(Yt)
    keep=np.logical_not(np.isnan(predictions))
    predictions=predictions[keep]
    Yt=Yt[keep]
    thres=.5
    f1_score=round(F1_score(predictions,Yt),3)
    acc=round(np.sum((predictions>=thres)==Yt)/len(Yt),3)
    # Only ValueError is treated as "metric undefined" (e.g. a single class in
    # Yt). Anything else is a genuine bug and must surface rather than be
    # reported as a score of 0.
    try:
        roc_auc=round(roc_auc_score(Yt,predictions),3)
    except ValueError:
        roc_auc=0
    try:
        aver_precision_score=round(average_precision_score(Yt,predictions),3)
    except ValueError:
        aver_precision_score=0
    return acc,f1_score,roc_auc,aver_precision_score

In [5]:
# Class counts per split, used when generating Table 1.
good_species = ['escherichia_coli', 'mycobacterium_tuberculosis']
name = good_species[0]  # change the index to see the amounts for the other species
test_set = pd.read_csv(f"{PATH_TO_CSV}{name}_test_full.csv")
train_set = pd.read_csv(f"{PATH_TO_CSV}{name}_train_full.csv")
valid_set = pd.read_csv(f"{PATH_TO_CSV}{name}_valid_full.csv")

# Layout: [train sum(dna_binding), train sum(1 - dna_binding),
#          test  sum(dna_binding), test  sum(1 - dna_binding),
#          valid sum(dna_binding), valid sum(1 - dna_binding)]
nums = np.array(
    [
        count
        for split in (train_set, test_set, valid_set)
        for count in (np.sum(split['dna_binding']), np.sum(1 - split['dna_binding']))
    ],
    dtype=float,
)

In [6]:
# Show the six totals in the order they were filled: for train, test, valid in
# turn, sum(dna_binding) followed by sum(1 - dna_binding).
nums

array([ 17546., 217288.,    836.,   7109.,    425.,   2997.])

In [22]:
# Fill the results table: one row per model, columns ordered as
# (accuracy, F1, ROC-AUC, average precision).
good_species = ['escherichia_coli', 'mycobacterium_tuberculosis']
models = ['nn', 'cnn', 'lstm', 'bilstm', 'xgb']
total_results = np.zeros((len(models), 4))
valid = 'Bacillus subtilis'  # not referenced again in the visible cells
name = good_species[1]  # change the index to see the results for the other species
test_set = pd.read_csv(f"{PATH_TO_CSV}{name}_test_50.csv")
Yt = test_set['dna_binding'].values
for i, model in enumerate(models):
    # Saved predictions of this model for the selected species.
    pred = np.load(f'{PATH_TO_RESULTS}results_{model}_{name}_50.npy')
    acc, f1score, auc, aver_precision_score = ACC_F1_AUC_AP(pred, Yt)
    # Write the whole row at once instead of column by column.
    total_results[i, :] = (acc, f1score, auc, aver_precision_score)

In [23]:
# Row labels; they must stay in the same order as the `models` list that was
# used to fill total_results, since rows are matched by position only.
model_names = ["NN", 'CNN', 'LSTM', 'BILSTM', 'XGB']
df_result = pd.DataFrame(
    total_results,
    index=model_names,
    columns=['ACC', 'F1 Score', 'AUC', 'AP'],
)

In [24]:
# Display the metrics table (rows = models, columns = ACC / F1 Score / AUC / AP).
df_result

Unnamed: 0,ACC,F1 Score,AUC,AP
NN,0.958,0.787,0.881,0.641
CNN,0.92,0.641,0.933,0.71
LSTM,0.921,0.628,0.906,0.663
BILSTM,0.825,0.371,0.835,0.347
XGB,0.951,0.768,0.979,0.877


In [21]:
# Render the metrics table as LaTeX tabular source.
# NOTE(review): the stored output of this cell comes from an earlier execution
# (In [21]) and does not match the table displayed by In [24] -- re-run this
# cell after rebuilding df_result before copying the LaTeX anywhere.
df_result.to_latex()

'\\begin{tabular}{lrrrr}\n\\toprule\n{} &    ACC &  F1 Score &  AUC &     AP \\\\\n\\midrule\nNN     &  0.901 &       0.0 &  0.5 &  0.099 \\\\\nLSTM   &  0.901 &       0.0 &  0.5 &  0.099 \\\\\nBILSTM &  0.901 &       0.0 &  0.5 &  0.099 \\\\\nCNN    &  0.901 &       0.0 &  0.5 &  0.099 \\\\\nXGB    &  0.901 &       0.0 &  0.5 &  0.099 \\\\\n\\bottomrule\n\\end{tabular}\n'