In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from keras.layers import Input, Dense, Conv1D, Flatten, MaxPooling1D, Conv2D, MaxPooling2D, AveragePooling2D, Dropout, Reshape, normalization
from keras.models import Model
import keras.backend as K
from keras.layers.recurrent import LSTM
from sklearn import metrics
import random
from tensorflow import keras
from sklearn.model_selection import train_test_split

In [27]:
def precision(y_true, y_pred):
    # Calculates the precision
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def recall(y_true, y_pred):
    # Calculates the recall
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def f1(test_Y, pre_test_y):
    """F1-score"""
    Precision = precision(test_Y, pre_test_y)
    Recall = recall(test_Y, pre_test_y)
    f1 = 2 * ((Precision * Recall) / (Precision + Recall + K.epsilon()))
    return f1 

def TP(test_Y,pre_test_y):
    TP = K.sum(K.round(K.clip(test_Y * pre_test_y, 0, 1)))#TP
    return TP

def FN(test_Y,pre_test_y):
    TP = K.sum(K.round(K.clip(test_Y * pre_test_y, 0, 1)))#TP
    P=K.sum(K.round(K.clip(test_Y, 0, 1)))
    FN = P-TP #FN=P-TP
    return FN

def TN(test_Y,pre_test_y):
    TN=K.sum(K.round(K.clip((test_Y-K.ones_like(test_Y))*(pre_test_y-K.ones_like(pre_test_y)), 0, 1)))#TN
    return TN

def FP(test_Y,pre_test_y):
    N = (-1)*K.sum(K.round(K.clip(test_Y-K.ones_like(test_Y), -1, 0)))#N
    TN=K.sum(K.round(K.clip((test_Y-K.ones_like(test_Y))*(pre_test_y-K.ones_like(pre_test_y)), 0, 1)))#TN
    FP=N-TN
    return FP


METRICS = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
]

def dnn_model(train_X, train_Y, test_X, test_Y, lr, epoch, batch_size):
    train_X = np.expand_dims(train_X, 2)
    test_X = np.expand_dims(test_X, 2)
    inputs = Input(shape = (train_X.shape[1], train_X.shape[2]))
    x = Conv1D(32, kernel_size = 3, strides = 1, padding = 'valid', activation = 'relu')(inputs)
    x = MaxPooling1D(pool_size = 2, strides = 2, padding = 'same')(x)
    x = Flatten()(x)
    x = Dropout(0.5)(x)
    x = Dense(32, activation = 'relu')(x)
    x = Dense(16, activation = 'relu')(x)
    x = Dense(8, activation = 'relu')(x)
    predictions = Dense(1, activation = 'sigmoid')(x)
    model = Model(inputs = inputs, outputs = predictions)
    print("model")
    model.compile(optimizer = 'RMSprop',
                  loss = 'mean_squared_error',
                  metrics = METRICS)
    print("compile")
    model.fit(train_X, train_Y, epochs = epoch, batch_size = 32, validation_data = (test_X, test_Y), shuffle = True)
    model.save('CNN_model.h5')
    pre_test_y = model.predict(test_X, batch_size = 50)
    pre_train_y = model.predict(train_X, batch_size = 50)
    test_auc = metrics.roc_auc_score(test_Y, pre_test_y)
    train_auc = metrics.roc_auc_score(train_Y, pre_train_y)
    print("train_auc: ", train_auc)
    print("test_auc: ", test_auc) 
    return test_auc



In [45]:

data = np.array(pd.read_csv("6_vecs_train.csv"))
pos_number=2994
# NOTE: the number of postive sample in train file
#CNN_model = 'CNN_model.h5'

X1 = data[0:pos_number, 1:]
Y1 = data[0:pos_number, 0]
X2 = data[pos_number:, 1:]
Y2 = data[pos_number:, 0]
X = np.concatenate([X1, X2], 0)
Y = np.concatenate([Y1, Y2], 0)
#Y = Y.reshape((Y.shape[0], -1))
print (X)
print ("X.shape: ", X.shape)
print ("Y.shape: ", Y.shape)

lr = 0.4
epoch = 20
batch_size = 32
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
#kf = KFold(n_splits = 5, shuffle = False)
kf = kf.split(X)

test_aucs = []
for i, (train_fold, validate_fold) in enumerate(kf):
    print("\n\ni: ", i)
    test_auc = dnn_model(X[train_fold], Y[train_fold], X[validate_fold], Y[validate_fold], lr, epoch, batch_size)
    test_aucs.append(test_auc)
w = open("train_Result.txt", "w")
for j in test_aucs: 
    w.write(str(j) + ',')
w.write('\n')
w.write(str(np.mean(test_aucs)) + '\n')
w.close()


[[ 0.20644704 -0.45706296 -0.2562405  ... -1.4206289   0.9298074
  -1.0609802 ]
 [-1.2384602   1.7543772   0.9248372  ... -1.8391443   1.139564
  -0.7542995 ]
 [-0.05409286  0.21270198 -0.27500126 ...  0.64489275  0.15629809
  -0.22529553]
 ...
 [ 0.9653186  -0.08109022 -0.68847555 ...  1.4501451  -0.49036932
  -0.16801585]
 [ 0.41178006 -0.64946824 -0.43891364 ...  0.25375736  0.06554881
  -0.01694088]
 [ 0.910602   -0.45758218 -0.7318825  ...  0.3283448  -0.02965107
  -0.2959396 ]]
X.shape:  (5989, 200)
Y.shape:  (5989,)


i:  0
model
compile
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
train_auc:  0.9653615591650825
test_auc:  0.9449106806676986


i:  1
model
compile
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20


In [41]:

data = np.array(pd.read_csv("6_vecs_test.csv"))
pos_number = 899 # NOTE: the number of postive sample in test file
X1 = data[0:pos_number, 1:]
Y1 = data[0:pos_number, 0]
X2 = data[pos_number:, 1:]
Y2 = data[pos_number:, 0]
X_test = np.concatenate([X1, X2], 0)
Y_test = np.concatenate([Y1, Y2], 0)

print(X1.shape, X2.shape)
print(Y1.shape, Y2.shape)

lr = 0.4
# epoch = 3
batch_size = 32
data = np.array(pd.read_csv("6_vecs_train.csv"))
# pos_number = 3863
# pos_number=2097
pos_number=2994


X1 = data[0:pos_number, 1:]
Y1 = data[0:pos_number, 0]
X2 = data[pos_number:, 1:]
Y2 = data[pos_number:, 0]
X = np.concatenate([X1, X2], 0)
Y = np.concatenate([Y1, Y2], 0)

(899, 200) (898, 200)
(899,) (898,)


In [42]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(4791, 200) (4791,) (1797, 200) (1198,)


In [43]:
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

test_auc = dnn_model(X_train, y_train, X_test, Y_test, lr, 15, 32)

model
compile
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
train_auc:  0.9613534559436543
test_auc:  0.9592822760255765
