# 10 Fold CV

In [1]:
import pandas as pd
from Bio import SeqIO
import os
import random
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Conv1D, Dense, MaxPooling1D, Input, Flatten, LSTM, Dropout, Bidirectional, LeakyReLU, Reshape, Lambda
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.callbacks import EarlyStopping

# performance matrices
from sklearn.metrics import confusion_matrix, matthews_corrcoef, accuracy_score

# plots
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import ModelCheckpoint
from numpy import array
from numpy import argmax

from tensorflow.keras.regularizers import l1, l2


from tensorflow.keras.models import Model
from tensorflow.keras.layers import Concatenate

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler

import imblearn
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, auc
import os 
import pandas as pd
import numpy as np

In [2]:
os.chdir("/home/t326h379/OGP")


df_negative = pd.read_csv('Feature_Extraction_O_linked_Training_Negative_114307_Sites_less.txt',header=None)

df_positive = pd.read_csv('Feature_Extraction_O_linked_Training_Positive_4885_Sites_less.txt',header=None)

Header_name = ["Position","PID","Position_redundant","81 Window sequence","S or T"]

col_of_feature = [i for i in range(1,1025)]

Header_name = Header_name + col_of_feature

df_positive.columns = Header_name
df_negative.columns = Header_name


frames = [df_positive, df_negative]

O_linked_training = pd.concat(frames,ignore_index = True)

df_Train_array = O_linked_training.drop(["Position","PID","Position_redundant","81 Window sequence","S or T"],axis=1)
df_Train_array = np.array(df_Train_array)

X_train_full = df_Train_array

y_train_full = np.array([1]*4885+[0]*114144)

In [3]:
X_train_full.shape

(119029, 1024)

In [4]:
y_train_full.shape

(119029,)

In [5]:
print("Prior")
print(X_train_full.shape)
print(y_train_full.shape)
seed = 42
rus = RandomUnderSampler(random_state = seed)
X_train_full, y_train_full = rus.fit_resample(X_train_full,y_train_full)
print("After")
print(X_train_full.shape)
print(y_train_full.shape)

Prior
(119029, 1024)
(119029,)
After
(9770, 1024)
(9770,)


In [6]:
X_train_full = X_train_full.reshape(X_train_full.shape[0],1024,1)

In [7]:
X_train_full.shape

(9770, 1024, 1)

In [8]:
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, auc

In [9]:
X_train_full, y_train_full = shuffle(X_train_full, y_train_full)

print(X_train_full.shape)
print(y_train_full.shape)


MCC = []
kfold = StratifiedKFold(n_splits=10)
cvscores, auc_scores, sn, sp, acc = list(), list(), list(), list(), list()

for train_index, test_index in kfold.split(X_train_full, y_train_full):
    xtrain, xval = X_train_full[train_index], X_train_full[test_index]
    ytrain, yval = y_train_full[train_index], y_train_full[test_index]      

    X_train_10= xtrain
    Y_train_10=ytrain

    Y_train_10 = tf.keras.utils.to_categorical(Y_train_10,2)
    
    model = Sequential()

    model.add(Input(shape=(1024,1)))

    model.add(Conv1D(filters=64,kernel_size=3,activation='relu',name='Conv_1D_1_add'))
    model.add(MaxPooling1D(pool_size=2,name="MaxPooling1D"))
    model.add(Dropout(0.3))

    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.3))

    model.add(Dense(32,activation='relu',name="Dense_1"))
    model.add(Dropout(0.3))

    model.add(Dense(2,activation='softmax',name="Dense_2"))

        
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),loss="binary_crossentropy",metrics=["accuracy"])

    checkpointer = tf.keras.callbacks.ModelCheckpoint(filepath="ROC_ROC_Premise_Assumption.h5", 
                                    monitor = 'val_accuracy',
                                    verbose=0, 
                                    save_weights_only=False,
                                    save_best_only=True)

    reduce_lr_acc = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', factor=0.001, patience=3, verbose=1, min_delta=1e-4, mode='max')

    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',patience=5,mode='max')

    history = model.fit(X_train_10, Y_train_10,epochs=35,verbose=1,batch_size=256)

    pd.DataFrame(history.history).plot(figsize=(8,5))
    plt.grid(True)
    plt.gca().set_ylim(0,1)
    plt.show()
    plt.savefig('accuracy_loss@@@@@@@@@@@@@@@_curve.png', dpi=300, bbox_inches='tight')

    Y_pred = model.predict(xval)
    Y_pred = (Y_pred > 0.5)
    y_pred = [np.argmax(y, axis=None, out=None) for y in Y_pred]
    y_pred = np.array(y_pred)
    print("Matthews Correlation : ",matthews_corrcoef(yval, y_pred))
    print("Confusion Matrix : \n",confusion_matrix(yval, y_pred))
    print("Accuracy on test set:   ",accuracy_score(yval, y_pred))

    cm = confusion_matrix(yval, y_pred)

    TP = cm[1][1]
    TN = cm[0][0]
    FP = cm[0][1]
    FN = cm[1][0]

    mcc = matthews_corrcoef(yval, y_pred)
    
    acc.append(accuracy_score(yval, y_pred))
    
    cvscores.append(mcc)

    Sensitivity = TP/(TP+FN)

    Specificity = TN/(TN+FP)
    
    sn.append(Sensitivity)
    sp.append(Specificity)

    print("Sensitivity:   ",Sensitivity,"\t","Specificity:   ",Specificity)


    fpr, tpr, _ = roc_curve(yval, y_pred)

    roc_auc_test = auc(fpr,tpr)



    print("Area Under Curve:   ",roc_auc_test)
    auc_scores.append(roc_auc_test)   


    model.summary()
    
    


print("Mean MCC = %.4f + %.4f and Mean AUC = %.4f + %.4f" % (np.mean(cvscores),np.std(cvscores),np.mean(auc_scores),np.std(auc_scores)))
MCC.append(np.mean(cvscores))

print("Mean MCC:   ",np.mean(MCC))
print("Mean Sensitivity = %.4f + %.4f and Mean Specificity = %.4f + %.4f and Mean Accuracy = %.4f + %.4f" % (np.mean(sn),np.std(sn),np.mean(sp),np.std(sp),np.mean(acc),np.std(acc)))

print("\n\n\n\n\n\n\n\n\n\n\n\n\n")

(9770, 1024, 1)
(9770,)
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
Matthews Correlation :  0.6582549656647502
Confusion Matrix : 
 [[409  79]
 [ 88 401]]
Accuracy on test set:    0.8290685772773797
Sensitivity:    0.820040899795501 	 Specificity:    0.8381147540983607
Area Under Curve:    0.8290778269469309
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Conv_1D_1_add (Conv1D)       (None, 1022, 64)          256       
_________________________________________________________________
MaxPooling1D (MaxPooling1D)  (None, 511

Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
Matthews Correlation :  0.6625877874344758
Confusion Matrix : 
 [[438  50]
 [118 371]]
Accuracy on test set:    0.8280450358239508
Sensitivity:    0.7586912065439673 	 Specificity:    0.8975409836065574
Area Under Curve:    0.8281160950752623
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Conv_1D_1_add (Conv1D)       (None, 1022, 64)          256       
_________________________________________________________________
MaxPooling1D (MaxPooling1D)  (None, 511, 64)           0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 511, 64)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 32704)             0         
_________________________________________________________________
dense_1 (Dense)

Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
Matthews Correlation :  0.6521394827358874
Confusion Matrix : 
 [[408  80]
 [ 90 399]]
Accuracy on test set:    0.8259979529170931
Sensitivity:    0.8159509202453987 	 Specificity:    0.8360655737704918
Area Under Curve:    0.8260082470079453
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Conv_1D_1_add (Conv1D)       (None, 1022, 64)          256       
_________________________________________________________________
MaxPooling1D (MaxPooling1D)  (None, 511, 64)           0         
____________________________________________________________

Matthews Correlation :  0.6306121841036593
Confusion Matrix : 
 [[412  76]
 [105 384]]
Accuracy on test set:    0.8147389969293757
Sensitivity:    0.7852760736196319 	 Specificity:    0.8442622950819673
Area Under Curve:    0.8147691843507995
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Conv_1D_1_add (Conv1D)       (None, 1022, 64)          256       
_________________________________________________________________
MaxPooling1D (MaxPooling1D)  (None, 511, 64)           0         
_________________________________________________________________
dropout_12 (Dropout)         (None, 511, 64)           0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 32704)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 256)               8372480   
_________

Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
Matthews Correlation :  0.604479477572185
Confusion Matrix : 
 [[410  79]
 [115 373]]
Accuracy on test set:    0.8014329580348004
Sensitivity:    0.764344262295082 	 Specificity:    0.8384458077709611
Area Under Curve:    0.8013950350330216
Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Conv_1D_1_add (Conv1D)       (None, 1022, 64)          256       
_________________________________________________________________
MaxPooling1D (MaxPooling1D)  (None, 511, 64)           0         
_________________________________________________________________
dropout_18 (Dropout)         (None, 511, 64)         

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
Matthews Correlation :  0.6284419728302509
Confusion Matrix : 
 [[412  77]
 [105 383]]
Accuracy on test set:    0.8137154554759468
Sensitivity:    0.7848360655737705 	 Specificity:    0.8425357873210634
Area Under Curve:    0.813685926447417
Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Conv_1D_1_add (Conv1D)       (None, 1022, 64)          256       
_________________________________________________________________
MaxPooling1D (MaxPooling1D)  (None, 511, 64)           0     

Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
Matthews Correlation :  0.6705967236304332
Confusion Matrix : 
 [[428  61]
 [101 387]]
Accuracy on test set:    0.834186284544524
Sensitivity:    0.7930327868852459 	 Specificity:    0.8752556237218814
Area Under Curve:    0.8341442053035637
Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Conv_1D_1_add (Conv1D)       (None, 1022, 64)          256       
_________________________________________________________________
MaxPooling1D (MaxPooling1D)  (None, 511, 64)           0         
_________________________________________________________________
dropout_27 (Dropout)         (None, 511, 64)           0         
_________________________________________________________________
flatten_9 (Flatten)          (None, 32704)             0         
_________________________________________________________________
dense_9 (Dense) 

# Independent Testing

In [11]:
os.chdir("/home/t326h379/OGP")


df_negative = pd.read_csv('Feature_Extraction_O_linked_Training_Negative_114307_Sites_less.txt',header=None)

df_positive = pd.read_csv('Feature_Extraction_O_linked_Training_Positive_4885_Sites_less.txt',header=None)

Header_name = ["Position","PID","Position_redundant","81 Window sequence","S or T"]

col_of_feature = [i for i in range(1,1025)]

Header_name = Header_name + col_of_feature

df_positive.columns = Header_name
df_negative.columns = Header_name


frames = [df_positive, df_negative]

O_linked_training = pd.concat(frames,ignore_index = True)

df_Train_array = O_linked_training.drop(["Position","PID","Position_redundant","81 Window sequence","S or T"],axis=1)
df_Train_array = np.array(df_Train_array)

X_train_full = df_Train_array

y_train_full = np.array([1]*4885+[0]*114144)

print("Prior")
print(X_train_full.shape)
print(y_train_full.shape)
seed = 42
rus = RandomUnderSampler(random_state = seed)
X_train_full, y_train_full = rus.fit_resample(X_train_full,y_train_full)
print("After")
print(X_train_full.shape)
print(y_train_full.shape)

# Independent Test Dataset
df_negative_test = pd.read_csv('Feature_Extraction_O_linked_Testing_Negative_11466_Sites_less.txt',header=None)

df_positive_test = pd.read_csv('Feature_Extraction_O_linked_Testing_Positive_375_Sites_less.txt',header=None)

Header_name = ["Position","PID","Position_redundant","81 Window sequence","S or T"]

col_of_feature = [i for i in range(1,1025)]

Header_name = Header_name + col_of_feature

df_positive_test.columns = Header_name

df_negative_test.columns = Header_name


frames_test = [df_positive_test, df_negative_test]

O_linked_testing = pd.concat(frames_test,ignore_index = True)

df_Test_array = O_linked_testing.drop(["Position","PID","Position_redundant","81 Window sequence","S or T"],axis=1)
df_Test_array = np.array(df_Test_array)

X_test_full = df_Test_array

y_test_full = np.array([1]*374+[0]*11466)

# Training Starts From Here

from sklearn.metrics import roc_curve, roc_auc_score, classification_report, auc

from imblearn.under_sampling import RandomUnderSampler

# Run the model for Three times and choose the best answer among them
a = random.sample(range(1, 1000000), 3)

for i in a:
    seed = i
    print("Seed : ", seed)

    rus = RandomUnderSampler(random_state = seed)
    X_train, y_train = rus.fit_resample(X_train_full,y_train_full)
    
    X_train =  X_train.reshape( X_train.shape[0],1024,1)

    x_train, x_val, y_train_1, y_val = train_test_split(X_train, y_train,random_state =21, test_size=0.1)

    y_train_1 = tf.keras.utils.to_categorical(y_train_1,2)
    y_val = tf.keras.utils.to_categorical(y_val,2)
    
    model = Sequential()

    model.add(Input(shape=(1024,1)))

    model.add(Conv1D(filters=64,kernel_size=3,activation='relu',name='Conv_1D_1_add'))
    model.add(MaxPooling1D(pool_size=2,name="MaxPooling1D"))
    model.add(Dropout(0.3))

    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.3))

    model.add(Dense(32,activation='relu',name="Dense_1"))
    model.add(Dropout(0.3))

    model.add(Dense(2,activation='softmax',name="Dense_2"))


    model.compile(optimizer=tf.keras.optimizers.Adam(),loss="binary_crossentropy",metrics=["accuracy"])

    checkpointer = tf.keras.callbacks.ModelCheckpoint(filepath="ROC_ROC_Premise_Assumption.h5", 
                                    monitor = 'val_accuracy',
                                    verbose=0, 
                                    save_weights_only=False,
                                    save_best_only=True)

    reduce_lr_acc = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', factor=0.001, patience=5, verbose=1, min_delta=1e-4, mode='max')

    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',patience=5,mode='max')

    history = model.fit(x_train, y_train_1,epochs=400,verbose=1,batch_size=256,
                            callbacks=[checkpointer,reduce_lr_acc, early_stopping],validation_data=(x_val, y_val))
    
    from imblearn.under_sampling import RandomUnderSampler
    
    from imblearn.under_sampling import RandomUnderSampler

    rus = RandomUnderSampler(random_state = seed)
    X_independent, y_independent = rus.fit_resample(X_test_full,y_test_full)
    
    X_independent = X_independent.reshape(X_independent.shape[0],1024,1)
    Y_pred = model.predict(X_independent)
    Y_pred = (Y_pred > 0.5)
    y_pred = [np.argmax(y, axis=None, out=None) for y in Y_pred]
    y_pred = np.array(y_pred)

    confusion = confusion_matrix(y_independent,y_pred)

    print("Matthews Correlation : ",matthews_corrcoef(y_independent, y_pred))
    print("Confusion Matrix : \n",confusion_matrix(y_independent, y_pred))
    print("Accuracy on test set:   ",accuracy_score(y_independent, y_pred))

    cm = confusion_matrix(y_independent, y_pred)

    TP = cm[1][1]
    TN = cm[0][0]
    FP = cm[0][1]
    FN = cm[1][0]

    mcc = matthews_corrcoef(y_independent, y_pred)

    Sensitivity = TP/(TP+FN)

    Specificity = TN/(TN+FP)

    print("Sensitivity:   ",Sensitivity,"\t","Specificity:   ",Specificity)

    print(classification_report(y_independent, y_pred))

    fpr, tpr, _ = roc_curve(y_independent, y_pred)

    roc_auc_test = auc(fpr,tpr)



    print("Area Under Curve:   ",roc_auc_test)


Prior
(119029, 1024)
(119029,)
After
(9770, 1024)
(9770,)
Seed :  841264
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 00014: ReduceLROnPlateau reducing learning rate to 1.0000000474974512e-06.
Matthews Correlation :  0.5525827412536368
Confusion Matrix : 
 [[305  69]
 [ 99 275]]
Accuracy on test set:    0.7754010695187166
Sensitivity:    0.7352941176470589 	 Specificity:    0.8155080213903744
              precision    recall  f1-score   support

           0       0.75      0.82      0.78       374
           1       0.80      0.74      0.77       374

    accuracy                           0.78       748
   macro avg       0.78      0.78      0.78       748
weighted avg       0.78      0.78      0.78       748

Area Under Curve:    0.7754010695187166
Seed :  883746
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoc