In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# oversampling packages
from imblearn.combine import SMOTEENN
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.over_sampling import RandomOverSampler

#optimisation
from bayes_opt import BayesianOptimization

# model building
import keras
import tensorflow
from keras import layers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten, Embedding, SimpleRNN , concatenate, Lambda, Conv1D, MaxPooling1D
from keras.layers import LeakyReLU
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam, SGD, RMSprop, Adadelta, Adagrad, Adamax, Nadam, Ftrl
from keras import metrics
from keras.losses import BinaryCrossentropy
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
from sklearn import linear_model, model_selection

## Loading Data

In [5]:
# Read the four word2vec CSVs in order: train features, test features,
# train labels, test labels.
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/word2vec/train_data_imputed_FINAL.csv',
        '../datasets/word2vec/test_data_imputed_FINAL.csv',
        '../datasets/word2vec/y_train_FINAL.csv',
        '../datasets/word2vec/y_test_FINAL.csv',
    )
)

In [6]:
# Strip the 'Unnamed: 0' column from both label frames
# (presumably the saved row index from the CSV export -- confirm against the export step).
# errors='ignore' keeps this cell re-runnable: on a second execution the column is
# already gone and a plain drop() would raise KeyError.
y_train = y_train.drop(columns=['Unnamed: 0'], errors='ignore')
y_test = y_test.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
# Randomly oversample the training split only, with a fixed seed;
# sampling_strategy=1 requests a 1:1 minority-to-majority ratio after resampling.
ros = RandomOverSampler(random_state=42, sampling_strategy=1)
X_res, y_res = ros.fit_resample(X_train, y_train)

## CNN Model 1

In [5]:
# Hyper-parameter set recorded for CNN Model 1 (raw, un-rounded values).
best_params = dict(
    activation=4,
    batch_size=263.7,
    dropout=0.6181,
    dropout2=0.4978,
    dropout_rate=0.3217,
    dropout_rate1=0.2936,
    epochs=28,
    filters=54,
    kernel=2,
    layers1=3,
    layers2=3,
    learning_rate=0.003667,
    neurons=113.8,
    optimizer=2,
    pool_size=3,
    strides=3,
)

In [6]:
# CNN Model 1: one Conv1D/MaxPooling1D stage, three Dense+Dropout blocks, three plain
# Dense layers, and a single sigmoid unit for the binary target.
# Each sample is fed as a sequence of X_res.shape[1] steps with one channel.
# Dense takes a whole number of units, so the width is written as 113 -- the integer
# part of the 113.8 stored in best_params['neurons'] -- instead of handing Keras a float.
cnn_model = Sequential([
    Conv1D(54, 2, activation='relu', input_shape=(X_res.shape[1], 1)),
    MaxPooling1D(pool_size=3, strides=3),
    Flatten(),
    Dense(113, activation='relu'),
    Dropout(0.3217),
    Dense(113, activation='relu'),
    Dropout(0.3217),
    Dense(113, activation='relu'),
    Dropout(0.3217),
    Dense(113, activation='relu'),
    Dense(113, activation='relu'),
    Dense(113, activation='relu'),
    Dense(1, activation='sigmoid'),
])
cnn_model.compile(
    loss='binary_crossentropy',
    optimizer=RMSprop(learning_rate=0.003667),
    metrics=['accuracy', metrics.Precision(), metrics.Recall(), metrics.AUC()],
)

In [7]:
# Train on the resampled training data: 28 epochs, 264 rows per batch.
cnn_model.fit(x=X_res, y=y_res, batch_size=264, epochs=28)

Epoch 1/28
Epoch 2/28
Epoch 3/28
Epoch 4/28
Epoch 5/28
Epoch 6/28
Epoch 7/28
Epoch 8/28
Epoch 9/28
Epoch 10/28
Epoch 11/28
Epoch 12/28
Epoch 13/28
Epoch 14/28
Epoch 15/28
Epoch 16/28
Epoch 17/28
Epoch 18/28
Epoch 19/28
Epoch 20/28
Epoch 21/28
Epoch 22/28
Epoch 23/28
Epoch 24/28
Epoch 25/28
Epoch 26/28
Epoch 27/28
Epoch 28/28


<keras.callbacks.History at 0x222cf812d60>

In [8]:
# Score the fitted network on the same resampled data it was trained on.
# The result lists the loss first, then the metrics passed to compile().
train_score = cnn_model.evaluate(x=X_res, y=y_res)
print(train_score)

[0.08467860519886017, 0.9699371457099915, 0.9432843327522278, 1.0, 0.9956314563751221]


In [9]:
# Sigmoid outputs for the test rows, then hard labels:
# anything below 0.5 becomes class 0, everything else class 1.
pred = cnn_model.predict(X_test)
y_pred = np.where(np.less(pred, 0.5), 0, 1)



In [10]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[2640,  184],
       [   5,   51]], dtype=int64)

In [11]:
# Test-set accuracy, precision, recall and F1, one value per line in that order.
print(
    *(score(y_test, y_pred) for score in (accuracy_score, precision_score, recall_score, f1_score)),
    sep='\n',
)

0.934375
0.2170212765957447
0.9107142857142857
0.35051546391752575


## CNN Model 2

In [12]:
# Hyper-parameter set recorded for CNN Model 2 (raw, un-rounded values).
best_params = dict(
    activation=3,
    batch_size=266.4,
    dropout=0.9696,
    dropout2=0.7751,
    dropout_rate=0.3758,
    dropout_rate1=0.3579,
    epochs=22,
    filters=60,
    kernel=1,
    layers1=1,
    layers2=1,
    learning_rate=0.003928,
    neurons=119.1,
    optimizer=2,
    pool_size=3,
    strides=1,
)

In [14]:
# CNN Model 2: a kernel-size-1 Conv1D with overlapping max pooling (window 3, stride 1),
# two Dense+Dropout blocks with different dropout rates, and a sigmoid output unit.
# Each sample is fed as a sequence of X_res.shape[1] steps with one channel.
# Dense takes a whole number of units, so the width is written as 119 -- the integer
# part of the 119.1 stored in best_params['neurons'] -- instead of handing Keras a float.
cnn_model = Sequential([
    Conv1D(60, 1, activation='relu', input_shape=(X_res.shape[1], 1)),
    MaxPooling1D(pool_size=3, strides=1),
    Flatten(),
    Dense(119, activation='relu'),
    Dropout(0.3758),
    Dense(119, activation='relu'),
    Dropout(0.3579),
    Dense(1, activation='sigmoid'),
])
cnn_model.compile(
    loss='binary_crossentropy',
    optimizer=RMSprop(learning_rate=0.003928),
    metrics=['accuracy', metrics.Precision(), metrics.Recall(), metrics.AUC()],
)

In [15]:
# Train on the resampled training data: 22 epochs, 266 rows per batch.
cnn_model.fit(x=X_res, y=y_res, batch_size=266, epochs=22)

Epoch 1/22
Epoch 2/22
Epoch 3/22
Epoch 4/22
Epoch 5/22
Epoch 6/22
Epoch 7/22
Epoch 8/22
Epoch 9/22
Epoch 10/22
Epoch 11/22
Epoch 12/22
Epoch 13/22
Epoch 14/22
Epoch 15/22
Epoch 16/22
Epoch 17/22
Epoch 18/22
Epoch 19/22
Epoch 20/22
Epoch 21/22
Epoch 22/22


<keras.callbacks.History at 0x222c0b0fdf0>

In [16]:
# Loss followed by the compiled metrics, measured on the resampled training data.
train_score = cnn_model.evaluate(x=X_res, y=y_res)
print(train_score)

[0.060278505086898804, 0.9777738451957703, 0.9820423722267151, 0.9733462929725647, 0.997482180595398]


In [17]:
# Predict probabilities for the test rows and cut them at 0.5
# (below the cut -> 0, otherwise -> 1).
pred = cnn_model.predict(X_test)
y_pred = np.where(np.less(pred, 0.5), 0, 1)



In [18]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[2755,   69],
       [  16,   40]], dtype=int64)

In [19]:
# Test-set accuracy, precision, recall and F1, one value per line in that order.
print(
    *(score(y_test, y_pred) for score in (accuracy_score, precision_score, recall_score, f1_score)),
    sep='\n',
)

0.9704861111111112
0.3669724770642202
0.7142857142857143
0.4848484848484848


## CNN Model 3

In [20]:
# Hyper-parameter set recorded for CNN Model 3 (raw, un-rounded values).
best_params = dict(
    activation=4,
    batch_size=269,
    dropout=1,
    dropout2=0.556,
    dropout_rate=0.4,
    dropout_rate1=0.3176,
    epochs=29,
    filters=57,
    kernel=2,
    layers1=2,
    layers2=2,
    learning_rate=0.002523,
    neurons=118.6,
    optimizer=2,
    pool_size=3,
    strides=3,
)

In [21]:
# CNN Model 3: one Conv1D/MaxPooling1D stage followed by four Dense+Dropout blocks
# (dropout 0.4 on the first two, 0.3176 on the last two) and a sigmoid output unit.
# Each sample is fed as a sequence of X_res.shape[1] steps with one channel.
# Dense takes a whole number of units, so the width is written as 118 -- the integer
# part of the 118.6 stored in best_params['neurons'] -- instead of handing Keras a float.
cnn_model = Sequential([
    Conv1D(57, 2, activation='relu', input_shape=(X_res.shape[1], 1)),
    MaxPooling1D(pool_size=3, strides=3),
    Flatten(),
    Dense(118, activation='relu'),
    Dropout(0.4),
    Dense(118, activation='relu'),
    Dropout(0.4),
    Dense(118, activation='relu'),
    Dropout(0.3176),
    Dense(118, activation='relu'),
    Dropout(0.3176),
    Dense(1, activation='sigmoid'),
])
cnn_model.compile(
    loss='binary_crossentropy',
    optimizer=RMSprop(learning_rate=0.002523),
    metrics=['accuracy', metrics.Precision(), metrics.Recall(), metrics.AUC()],
)

In [22]:
# Train on the resampled training data: 29 epochs, 269 rows per batch.
cnn_model.fit(x=X_res, y=y_res, batch_size=269, epochs=29)

Epoch 1/29
Epoch 2/29
Epoch 3/29
Epoch 4/29
Epoch 5/29
Epoch 6/29
Epoch 7/29
Epoch 8/29
Epoch 9/29
Epoch 10/29
Epoch 11/29
Epoch 12/29
Epoch 13/29
Epoch 14/29
Epoch 15/29
Epoch 16/29
Epoch 17/29
Epoch 18/29
Epoch 19/29
Epoch 20/29
Epoch 21/29
Epoch 22/29
Epoch 23/29
Epoch 24/29
Epoch 25/29
Epoch 26/29
Epoch 27/29
Epoch 28/29
Epoch 29/29


<keras.callbacks.History at 0x222ddd5e640>

In [23]:
# Loss followed by the compiled metrics, measured on the resampled training data.
train_score = cnn_model.evaluate(x=X_res, y=y_res)
print(train_score)

[0.08944056928157806, 0.970069944858551, 0.9435207843780518, 1.0, 0.9957037568092346]


In [24]:
# Test-set probabilities converted to hard labels with a 0.5 cut
# (below the cut -> 0, otherwise -> 1).
pred = cnn_model.predict(X_test)
y_pred = np.where(np.less(pred, 0.5), 0, 1)



In [25]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[2648,  176],
       [   7,   49]], dtype=int64)

In [26]:
# Test-set accuracy, precision, recall and F1, one value per line in that order.
print(
    *(score(y_test, y_pred) for score in (accuracy_score, precision_score, recall_score, f1_score)),
    sep='\n',
)

0.9364583333333333
0.21777777777777776
0.875
0.34875444839857644


## CNN Model 4

In [5]:
# (lower, upper) bounds for each tunable hyper-parameter.
# NOTE(review): the name says "rnn" but the keys (filters, kernel, pool_size, strides)
# describe a convolutional model -- confirm the name is intentional.
# NOTE(review): 'strides' starts at 0; check how the consumer rounds this value,
# since a pooling stride of 0 is not usable.
params_rnn = dict(
    neurons=(32, 256),
    activation=(0, 6),
    optimizer=(0, 5),
    learning_rate=(0.001, 0.01),
    batch_size=(100, 1000),
    epochs=(10, 30),
    filters=(16, 64),
    kernel=(1, 4),
    pool_size=(1, 4),
    strides=(0, 4),
)

In [9]:
# CNN Model 4: tanh Conv1D (27 filters, kernel 3) with overlapping max pooling
# (window 4, stride 1), then a 256-128-64-32 ReLU stack with 0.2 dropout after
# every Dense layer, ending in a single sigmoid unit.
cnn_model = Sequential([
    Conv1D(27, 3, activation='tanh', input_shape=(X_res.shape[1], 1)),
    MaxPooling1D(pool_size=4, strides=1),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
cnn_model.compile(
    loss='binary_crossentropy',
    optimizer=RMSprop(learning_rate=0.002569),
    metrics=['accuracy', metrics.Precision(), metrics.Recall(), metrics.AUC()],
)

In [10]:
# Train on the resampled training data: 27 epochs, 115 rows per batch.
cnn_model.fit(x=X_res, y=y_res, batch_size=115, epochs=27)

Epoch 1/27
Epoch 2/27
Epoch 3/27
Epoch 4/27
Epoch 5/27
Epoch 6/27
Epoch 7/27
Epoch 8/27
Epoch 9/27
Epoch 10/27
Epoch 11/27
Epoch 12/27
Epoch 13/27
Epoch 14/27
Epoch 15/27
Epoch 16/27
Epoch 17/27
Epoch 18/27
Epoch 19/27
Epoch 20/27
Epoch 21/27
Epoch 22/27
Epoch 23/27
Epoch 24/27
Epoch 25/27
Epoch 26/27
Epoch 27/27


<keras.callbacks.History at 0x210db312fd0>

In [11]:
# Print the layer-by-layer table (layer type, output shape, parameter count) for the current model.
cnn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 261, 27)           108       
                                                                 
 max_pooling1d (MaxPooling1D  (None, 258, 27)          0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 6966)              0         
                                                                 
 dense (Dense)               (None, 256)               1783552   
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                        

In [12]:
# Loss followed by the compiled metrics, measured on the resampled training data.
train_score = cnn_model.evaluate(x=X_res, y=y_res)
print(train_score)

[8.050722681218758e-05, 1.0, 1.0, 1.0, 1.0]


In [15]:
# Train prediction: score the original (non-resampled) training rows and
# threshold at 0.5 (below the cut -> 0, otherwise -> 1).
y_tr_pred = cnn_model.predict(X_train)
y_t_pred = np.where(np.less(y_tr_pred, 0.5), 0, 1)



In [18]:
# Accuracy, precision, recall and F1 on the original training rows, one value per line.
print(
    *(score(y_train, y_t_pred) for score in (accuracy_score, precision_score, recall_score, f1_score)),
    sep='\n',
)

1.0
1.0
1.0
1.0


In [19]:
# Test-set probabilities converted to hard labels with a 0.5 cut
# (below the cut -> 0, otherwise -> 1).
pred = cnn_model.predict(X_test)
y_pred = np.where(np.less(pred, 0.5), 0, 1)



In [20]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[2817,    7],
       [  13,   43]], dtype=int64)

In [21]:
# Test-set accuracy, precision, recall and F1, one value per line in that order.
print(
    *(score(y_test, y_pred) for score in (accuracy_score, precision_score, recall_score, f1_score)),
    sep='\n',
)

0.9930555555555556
0.86
0.7678571428571429
0.8113207547169812


## CNN Model 5

In [34]:
# CNN Model 5: softsign Conv1D (24 filters, kernel 1), max pooling with window 3 and
# stride 4, then a 256-128-64-32 ReLU stack with 0.2 dropout after every Dense layer
# and a single sigmoid unit. Optimised with Adam rather than RMSprop.
cnn_model = Sequential([
    Conv1D(24, 1, activation='softsign', input_shape=(X_res.shape[1], 1)),
    MaxPooling1D(pool_size=3, strides=4),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
cnn_model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.004395),
    metrics=['accuracy', metrics.Precision(), metrics.Recall(), metrics.AUC()],
)

In [35]:
# Train on the resampled training data: 26 epochs, 842 rows per batch.
cnn_model.fit(x=X_res, y=y_res, batch_size=842, epochs=26)

Epoch 1/26
Epoch 2/26
Epoch 3/26
Epoch 4/26
Epoch 5/26
Epoch 6/26
Epoch 7/26
Epoch 8/26
Epoch 9/26
Epoch 10/26
Epoch 11/26
Epoch 12/26
Epoch 13/26
Epoch 14/26
Epoch 15/26
Epoch 16/26
Epoch 17/26
Epoch 18/26
Epoch 19/26
Epoch 20/26
Epoch 21/26
Epoch 22/26
Epoch 23/26
Epoch 24/26
Epoch 25/26
Epoch 26/26


<keras.callbacks.History at 0x222df9cf3d0>

In [36]:
# Loss followed by the compiled metrics, measured on the resampled training data.
train_score = cnn_model.evaluate(x=X_res, y=y_res)
print(train_score)

[0.009963099844753742, 0.9968564510345459, 0.9937521815299988, 1.0, 0.9997864961624146]


In [37]:
# Test-set probabilities converted to hard labels with a 0.5 cut
# (below the cut -> 0, otherwise -> 1).
pred = cnn_model.predict(X_test)
y_pred = np.where(np.less(pred, 0.5), 0, 1)



In [38]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[2773,   51],
       [  13,   43]], dtype=int64)

In [39]:
# Test-set accuracy, precision, recall and F1, one value per line in that order.
print(
    *(score(y_test, y_pred) for score in (accuracy_score, precision_score, recall_score, f1_score)),
    sep='\n',
)

0.9777777777777777
0.4574468085106383
0.7678571428571429
0.5733333333333334


## CNN Model 6

In [40]:
# CNN Model 6: selu Conv1D (53 filters, kernel 2), non-overlapping max pooling
# (window 2, stride 2), then a 256-128-64-32 ReLU stack with 0.2 dropout after
# every Dense layer and a single sigmoid unit.
cnn_model = Sequential([
    Conv1D(53, 2, activation='selu', input_shape=(X_res.shape[1], 1)),
    MaxPooling1D(pool_size=2, strides=2),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
cnn_model.compile(
    loss='binary_crossentropy',
    optimizer=RMSprop(learning_rate=0.006845),
    metrics=['accuracy', metrics.Precision(), metrics.Recall(), metrics.AUC()],
)

In [41]:
# Train on the resampled training data: 25 epochs, 290 rows per batch.
cnn_model.fit(x=X_res, y=y_res, batch_size=290, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x222e64df9d0>

In [42]:
# Loss followed by the compiled metrics, measured on the resampled training data.
train_score = cnn_model.evaluate(x=X_res, y=y_res)
print(train_score)

[0.0007640519179403782, 0.9999557137489319, 0.9999114871025085, 1.0, 0.9999558329582214]


In [43]:
# Test-set probabilities converted to hard labels with a 0.5 cut
# (below the cut -> 0, otherwise -> 1).
pred = cnn_model.predict(X_test)
y_pred = np.where(np.less(pred, 0.5), 0, 1)



In [44]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[2811,   13],
       [  16,   40]], dtype=int64)

In [45]:
# Test-set accuracy, precision, recall and F1, one value per line in that order.
print(
    *(score(y_test, y_pred) for score in (accuracy_score, precision_score, recall_score, f1_score)),
    sep='\n',
)

0.9899305555555555
0.7547169811320755
0.7142857142857143
0.7339449541284405
