In [1]:
import pandas as pd
import spacy
import torch
import numpy as np
import itertools

from sklearn.metrics import f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm as SVM
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

from keras.models import Model
from keras.layers import Dense, Flatten, Dropout, Input
from keras.layers.merge import concatenate
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K

import tensorflow as tf
# TensorFlow session configuration, built through the directly imported `tf`
# module instead of the backend-private `K.tf` alias.
cfg = tf.compat.v1.ConfigProto()

# LOADING THE DATASET
# Forward slashes keep the path portable and avoid the invalid "\C" / "\O"
# escape sequences of the backslash spelling.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
data = pd.read_pickle("./ClassifiedDataset/OriginalDataFeatureExtracted.pkl")

# LOADING THE SPACY MODEL
nlp = spacy.load("en_core_web_trf")

# LOADING THE GLOSSARY
# One glossary entry per line; the context manager guarantees the handle is closed.
with open("./Glossary/NISTIR 7298 Rev3.txt", "r") as f:
    glossary = [line.replace("\n", "") for line in f]

# LOADING SECBERT
from transformers import pipeline

fill_mask_secbert = pipeline(
    "fill-mask",
    model="jackaduma/SecBERT",
    tokenizer="jackaduma/SecBERT"
)
# The CPU is forced here; a GPU alternative would be
# torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device = torch.device("cpu")
tokenizer = fill_mask_secbert.tokenizer
secbert = fill_mask_secbert.model
secbert.to(device)
print("Libraries and models uploaded!")

Libraries and models uploaded!


### Model Selection

In [2]:
# torch.cuda.empty_cache()
# print(torch.cuda.memory_summary(device))

In [3]:
# len(tokenizer.get_vocab())

In [4]:
# FUNCTIONS TO CREATE DIFFERENT MODELS #################################################################################################################################
def create_models_EntPosDep(xtrain, ytrain):
    """Fit every baseline model on the Entities / Dependencies / Parts of Speech features.

    Parameters
    ----------
    xtrain : pandas.DataFrame
        Must provide the columns 'Entities', 'Dependencies' and
        'Parts of Speech'; every cell is iterated as a sequence of values.
    ytrain : array-like
        One label per row of ``xtrain``, in row order.

    Returns
    -------
    tuple
        ``(logreg, svm, gnb, knn, dt, rfc, cnn)``: the six fitted scikit-learn
        estimators followed by the fitted three-channel Keras CNN.
    """
    max_length = 558

    # One flat feature vector per row: entities, then dependencies, then POS tags.
    features = [
        [*entities, *dependencies, *pos_tags]
        for entities, dependencies, pos_tags in zip(
            xtrain['Entities'], xtrain['Dependencies'], xtrain['Parts of Speech']
        )
    ]

    #Logistic Regression on Entity, Pos, Dependency
    logreg = LogisticRegression(random_state=0, max_iter=250)
    logreg.fit(features, ytrain)

    #Support Vector Machines on Entity, Pos, Dependency
    # NOTE(review): SVR is a regressor; callers round its continuous output to
    # obtain labels. Confirm whether a classifier (SVM.SVC) was intended.
    svm = SVM.SVR()
    svm.fit(features, ytrain)

    #Gaussian Naive Bayes on Entity, Pos, Dependency
    gnb = GaussianNB()
    gnb.fit(features, ytrain)

    #K-Nearest Neighbors with k = 5(default) on Entity, Pos, Dependency
    knn = KNeighborsClassifier()
    knn.fit(features, ytrain)

    #Decision Tree on Entity, Pos, Dependency
    dt = tree.DecisionTreeClassifier()
    dt.fit(features, ytrain)

    #Random Forest on Entity, Pos, Dependency
    rfc = RandomForestClassifier(max_depth=2, random_state=0)
    rfc.fit(features, ytrain)

    #CNN on Entity, Pos, Dependency
    # Pad/truncate to exactly max_length so the arrays always match the
    # Input(shape=(max_length,)) layers below, whatever the longest row is.
    x1 = pad_sequences(np.array(xtrain['Entities']), maxlen=max_length)
    x2 = pad_sequences(np.array(xtrain['Dependencies']), maxlen=max_length)
    x3 = pad_sequences(np.array(xtrain['Parts of Speech']), maxlen=max_length)

    # NOTE(review): the embedding table has vocabulary_size / 1000 + 1 rows, so
    # every encoded feature id must be smaller than that — confirm.
    vocab_size = int(len(tokenizer.get_vocab())/1000 + 1)
    # Seeds NumPy only; presumably the Keras initialisers draw from the
    # backend's own generator — TODO confirm if reproducibility is required.
    np.random.seed(7)

    def _channel():
        """Build one embedding -> conv -> dropout -> pool -> flatten branch."""
        channel_input = Input(shape=(max_length,))
        embedded = Embedding(vocab_size, 100)(channel_input)
        convolved = Conv1D(filters=16, kernel_size=4, activation='relu')(embedded)
        dropped = Dropout(0.2)(convolved)
        pooled = MaxPooling1D(pool_size=2)(dropped)
        return channel_input, Flatten()(pooled)

    inputs1, flat1 = _channel()   # channel 1: entities
    inputs2, flat2 = _channel()   # channel 2: dependencies
    inputs3, flat3 = _channel()   # channel 3: parts of speech

    # merge
    merged = concatenate([flat1, flat2, flat3])
    # interpretation
    dense1 = Dense(500, activation='relu')(merged)
    dense2 = Dense(50, activation='relu')(dense1)
    dense3 = Dense(5, activation='relu')(dense2)
    outputs = Dense(1, activation='sigmoid')(dense3)

    cnn = Model(inputs=[inputs1, inputs2, inputs3], outputs=outputs)
    # compile
    cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    cnn.fit([x1,x2,x3], np.array(ytrain), epochs=20, batch_size=50)

    return logreg, svm, gnb, knn, dt, rfc, cnn


def create_model_SecWorSimReq(xtrain,ytrain):
    """Fit every baseline model on the Security Words / Similar Requirements features.

    Parameters
    ----------
    xtrain : pandas.DataFrame
        Must provide the columns 'Security Words' and 'Similar Requirements';
        every cell is iterated as a sequence of values.
    ytrain : array-like
        One label per row of ``xtrain``, in row order.

    Returns
    -------
    tuple
        ``(logsw, svmsw, gnbsw, knnsw, dtsw, rfcsw, cnn_sw)``: the six fitted
        scikit-learn estimators followed by the fitted two-channel Keras CNN.
    """
    max_length = 558

    # One flat feature vector per row: security words, then similar requirements.
    features = [
        [*security_words, *similar_requirements]
        for security_words, similar_requirements in zip(
            xtrain['Security Words'], xtrain['Similar Requirements']
        )
    ]

    #Logistic Regression on Security Words and Similar Requirements
    logsw = LogisticRegression(random_state=0, max_iter=250)
    logsw.fit(features, ytrain)

    #Support Vector Machines on Security Words and Similar Requirements
    # NOTE(review): SVR is a regressor; callers round its continuous output to
    # obtain labels. Confirm whether a classifier (SVM.SVC) was intended.
    svmsw = SVM.SVR()
    svmsw.fit(features, ytrain)

    #Gaussian Naive Bayes on Security Words and Similar Requirements
    gnbsw = GaussianNB()
    gnbsw.fit(features, ytrain)

    #K-Nearest Neighbors with k = 5(default)
    knnsw = KNeighborsClassifier()
    knnsw.fit(features, ytrain)

    #Decision Tree on Security Words and Similar Requirements
    dtsw = tree.DecisionTreeClassifier()
    dtsw.fit(features, ytrain)

    #Random Forest on Security Words and Similar Requirements
    rfcsw = RandomForestClassifier(max_depth=2, random_state=0)
    rfcsw.fit(features, ytrain)

    #CNN on Security Words and Similar Requirements
    # Pad/truncate to exactly max_length so the arrays always match the
    # Input(shape=(max_length,)) layers below, whatever the longest row is.
    x1 = pad_sequences(np.array(xtrain['Security Words']), maxlen=max_length)
    x2 = pad_sequences(np.array(xtrain['Similar Requirements']), maxlen=max_length)

    # NOTE(review): the embedding table has vocabulary_size / 1000 + 1 rows, so
    # every encoded feature id must be smaller than that — confirm.
    vocab_size = int(len(tokenizer.get_vocab())/1000 + 1)
    # Seeds NumPy only; presumably the Keras initialisers draw from the
    # backend's own generator — TODO confirm if reproducibility is required.
    np.random.seed(7)

    def _channel():
        """Build one embedding -> conv -> dropout -> pool -> flatten branch."""
        channel_input = Input(shape=(max_length,))
        embedded = Embedding(vocab_size, 100)(channel_input)
        convolved = Conv1D(filters=32, kernel_size=4, activation='relu')(embedded)
        dropped = Dropout(0.2)(convolved)
        pooled = MaxPooling1D(pool_size=2)(dropped)
        return channel_input, Flatten()(pooled)

    inputs1, flat1 = _channel()   # channel 1: security words
    inputs2, flat2 = _channel()   # channel 2: similar requirements

    # merge
    merged = concatenate([flat1, flat2])
    # interpretation
    dense1 = Dense(500, activation='relu')(merged)
    dense2 = Dense(50, activation='relu')(dense1)
    dense3 = Dense(5, activation='relu')(dense2)
    outputs = Dense(1, activation='sigmoid')(dense3)

    cnn_sw = Model(inputs=[inputs1, inputs2], outputs=outputs)
    # compile
    cnn_sw.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    cnn_sw.fit([x1,x2], np.array(ytrain), epochs=20, batch_size=50)

    return logsw, svmsw, gnbsw, knnsw, dtsw, rfcsw, cnn_sw
    


def create_TL_model(x_train, y_train):
    """Build and fit the transfer-learning network.

    Three embedding/convolution channels (Entities, Dependencies, Parts of
    Speech) are concatenated with the raw 'Secbert Outputs' vector and fed to
    a dense classification head with a single sigmoid unit.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Must provide the columns 'Entities', 'Dependencies',
        'Parts of Speech' and 'Secbert Outputs'.
    y_train : array-like
        One label per row of ``x_train``, in row order.

    Returns
    -------
    keras Model
        The fitted four-input model.
    """
    max_length = 558

    # Length of the SecBERT vector consumed by the fourth input.
    # NOTE(review): 33280 == 99840 / 3 (130 tokens * 768 values = 99840), which
    # suggests every 3rd value is kept; an earlier note mentioned every 5th
    # value (19968) — confirm against the feature-extraction step.
    input_shape = 33280

    # Padding the training set for the CNN. Lengths are pinned to the Input
    # layers declared below so the arrays always match the network.
    x1 = pad_sequences(np.array(x_train["Entities"]), maxlen=max_length)
    x2 = pad_sequences(np.array(x_train["Dependencies"]), maxlen=max_length)
    x3 = pad_sequences(np.array(x_train["Parts of Speech"]), maxlen=max_length)
    # NOTE(review): pad_sequences defaults to dtype='int32'. If 'Secbert Outputs'
    # holds floating-point activations they are truncated to integers here;
    # confirm, and if so pass dtype='float32' both here and in get_tl_results.
    x4 = pad_sequences(np.array(x_train["Secbert Outputs"]), maxlen=input_shape)

    # Release the previous Keras graph and cached CUDA memory before building.
    K.clear_session()
    torch.cuda.empty_cache()

    # NOTE(review): the embedding table has vocabulary_size / 1000 + 1 rows, so
    # every encoded feature id must be smaller than that — confirm.
    vocab_size = int(len(tokenizer.get_vocab())/1000 + 1)

    #Transfer Learning Model
    # Seeds NumPy only; presumably the Keras initialisers draw from the
    # backend's own generator — TODO confirm if reproducibility is required.
    np.random.seed(7)

    def _channel():
        """Build one embedding -> conv -> dropout -> pool -> flatten branch."""
        channel_input = Input(shape=(max_length,))
        embedded = Embedding(vocab_size, 100)(channel_input)
        convolved = Conv1D(filters=32, kernel_size=4, activation='relu')(embedded)
        dropped = Dropout(0.2)(convolved)
        pooled = MaxPooling1D(pool_size=2)(dropped)
        return channel_input, Flatten()(pooled)

    inputs1, flat1 = _channel()   # channel 1: entities
    inputs2, flat2 = _channel()   # channel 2: dependencies
    inputs3, flat3 = _channel()   # channel 3: parts of speech

    # output of secbert model, passed to the dense head without convolution
    inputs4 = Input(shape=(input_shape,))

    # merge the three flattened channels with the SecBERT vector
    merged = concatenate([ flat1, flat2, flat3, inputs4])
    # interpretation
    dense = Dense(500, activation='relu')(merged)
    dense1 = Dense(50, activation='relu')(dense)
    dense2 = Dense(5, activation='relu')(dense1)
    outputs = Dense(1, activation='sigmoid')(dense2)

    tl_cnn = Model(inputs=[inputs1,inputs2,inputs3,inputs4], outputs=outputs)
    # compile
    tl_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    tl_cnn.fit([x1,x2,x3,x4], np.array(y_train), epochs=20, batch_size=20, use_multiprocessing=False)

    return tl_cnn

In [5]:
# models_EntPosDep = create_models_EntPosDep()
# print(models_EntPosDep)
# del models_EntPosDep
# models_SecWorSimReq = create_model_SecWorSimReq()
# print(models_SecWorSimReq)
# del models_SecWorSimReq
# model_tl = create_TL_model()
# print(model_tl)
# del model_tl

In [6]:
# K-Fold Cross-Validation
# def cross_validation(model, _X, _y, _cv=5):
#       '''Function to perform k Folds Cross-Validation
#        Parameters
#        ----------
#       model: Python Class, default=None
#               This is the machine learning algorithm to be used for training.
#       _X: array
#            This is the matrix of features.
#       _y: array
#            This is the target variable.
#       _cv: int, default=5
#           Determines the number of folds for cross-validation.
#        Returns
#        -------
#        The function returns a dictionary containing the metrics 'accuracy', 'precision',
#        'recall', 'f1' for both training set and validation set.
#       '''
#       _scoring = ['accuracy', 'precision', 'recall', 'f1']
#       results = cross_validate(estimator=model,
#                                X=_X,
#                                y=_y,
#                                cv=_cv,
#                                scoring=_scoring,
#                                return_train_score=True)
      
#       return {"Training Accuracy scores": results['train_accuracy'],
#               "Mean Training Accuracy": results['train_accuracy'].mean()*100,
#               "Training Precision scores": results['train_precision'],
#               "Mean Training Precision": results['train_precision'].mean(),
#               "Training Recall scores": results['train_recall'],
#               "Mean Training Recall": results['train_recall'].mean(),
#               "Training F1 scores": results['train_f1'],
#               "Mean Training F1 Score": results['train_f1'].mean(),
#               "Validation Accuracy scores": results['test_accuracy'],
#               "Mean Validation Accuracy": results['test_accuracy'].mean()*100,
#               "Validation Precision scores": results['test_precision'],
#               "Mean Validation Precision": results['test_precision'].mean(),
#               "Validation Recall scores": results['test_recall'],
#               "Mean Validation Recall": results['test_recall'].mean(),
#               "Validation F1 scores": results['test_f1'],
#               "Mean Validation F1 Score": results['test_f1'].mean()
#               }

In [7]:
# for model in create_models_EntPosDep():
#     print("Cross validation on "+ str(model))
#     # _scoring = ['accuracy', 'precision', 'recall', 'f1']
#     feature_cols = ['Entities', 'Dependencies','Parts of Speech']
#     x = data.loc[:, feature_cols].astype(float)
#     dep_var = ["Security Related"]
#     y = data.loc[:, dep_var]
#     results = cross_val_score(model, X = x[:1000], y = y[:1000], cv=2)
#     print(results)
    # results = cross_val_score(estimator=models_EntPosDep[0],
    #                             X=x[:1],
    #                             y=y[:1],
    #                             cv=1,
    #                             scoring=_scoring,
    #                             return_train_score=True)
    # cross_validation(model,data[["Entities_enc","Dependencies_enc","Parts of Speech_enc"]],data[["Security Related"]])

In [8]:
# for model in create_models_EntPosDep()[:5]:
#     print("Cross validation on "+ str(model))
#     feature_cols = ['Entities', 'Dependencies','Parts of Speech']
#     x = data.loc[:, feature_cols].astype(float)
#     dep_var = ["Security Related"]
#     y = data.loc[:, dep_var]
#     results = cross_validate(estimator=model,
#                                X=x,
#                                y=y,
#                                cv=1,
#                                scoring=_scoring,
#                                return_train_score=True)



# for model in create_model_SecWorSimReq()[:5]:
#     print("Cross validation on "+ str(model))
#     feature_cols = ['Security Words','Similar Requirements']
#     x = data.loc[:, feature_cols].astype(float)
#     dep_var = ["Security Related"]
#     y = data.loc[:, dep_var]
#     results = cross_validate(estimator=model,
#                                X=x,
#                                y=y,
#                                cv=1,
#                                scoring=_scoring,
#                                return_train_score=True)


    

In [9]:
# model = create_models_EntPosDep()[0]
# print("Cross validation on "+ str(model))
#     # _scoring = ['accuracy', 'precision', 'recall', 'f1']
# feature_cols = ['Entities', 'Dependencies','Parts of Speech']
# print("Slicing ...")
# x = data.loc[:150, feature_cols]
# temp=np.concatenate((x['Entities'][:100],x['Dependencies'][:100]),axis=0)
# xtrain= np.concatenate((temp,x['Parts of Speech'][:100]),axis=0)
# dep_var = ["Security Related"]
# y = data.loc[:150, dep_var]
# print("Fitting "+ str(model))
# model.fit(xtrain, y[:100])
# temp=np.concatenate((x['Entities'][100:150],x['Dependencies'][100:150]),axis=0)
# xtest= np.concatenate((temp,x['Parts of Speech'][100:150]),axis=0)
# y_pred = model.predict(xtest).round()
# print(f1_score(y[100:150], y_pred))
# print(precision_score(y[100:150], y_pred))
# print(recall_score(y[100:150], y_pred))
# print(accuracy_score(y[100:150], y_pred))

In [10]:
def create_fold_1(data):
    """Build the first cross-validation fold.

    The first 928 security-related rows and the first 1247 other rows form the
    test split; all remaining rows form the training split.

    Returns
    -------
    tuple
        ``(xtrain, xtest, ytrain, ytest)``; each label array holds a run of 1s
        (security-related rows) followed by a run of 0s (other rows).
    """
    label = "Security Related"
    positives = data[data[label] == True]
    negatives = data[data[label] == False]

    def labels_for(frame):
        # Rows were concatenated positives-first, so the labels are 1s then 0s.
        n_pos = len(frame[frame[label] == True])
        n_neg = len(frame[frame[label] == False])
        return np.array([1] * n_pos + [0] * n_neg)

    xtrain = pd.concat([positives[928:], negatives[1247:]])
    xtest = pd.concat([positives[:928], negatives[:1247]])

    return xtrain, xtest, labels_for(xtrain), labels_for(xtest)

def create_fold_2(data):
    """Build the second cross-validation fold.

    Security-related rows 928..1855 and other rows 1247..2492 form the test
    split; all remaining rows form the training split.

    Returns
    -------
    tuple
        ``(xtrain, xtest, ytrain, ytest)``; each label array holds a run of 1s
        (security-related rows) followed by a run of 0s (other rows).
    """
    label = "Security Related"
    positives = data[data[label] == True]
    negatives = data[data[label] == False]

    def labels_for(frame):
        # Rows were concatenated positives-first, so the labels are 1s then 0s.
        n_pos = len(frame[frame[label] == True])
        n_neg = len(frame[frame[label] == False])
        return np.array([1] * n_pos + [0] * n_neg)

    xtrain = pd.concat([
        positives[:928],
        positives[1856:],
        negatives[:1247],
        negatives[2493:],
    ])
    xtest = pd.concat([positives[928:1856], negatives[1247:2493]])

    return xtrain, xtest, labels_for(xtrain), labels_for(xtest)
#--------------------------------------------------
def create_fold_3(data):
    """Build the third cross-validation fold.

    Security-related rows 1856..2783 and other rows 2493..3738 form the test
    split; all remaining rows form the training split.

    Returns
    -------
    tuple
        ``(xtrain, xtest, ytrain, ytest)``; each label array holds a run of 1s
        (security-related rows) followed by a run of 0s (other rows).
    """
    label = "Security Related"
    positives = data[data[label] == True]
    negatives = data[data[label] == False]

    def labels_for(frame):
        # Rows were concatenated positives-first, so the labels are 1s then 0s.
        n_pos = len(frame[frame[label] == True])
        n_neg = len(frame[frame[label] == False])
        return np.array([1] * n_pos + [0] * n_neg)

    xtrain = pd.concat([
        positives[:1856],
        positives[2784:],
        negatives[:2493],
        negatives[3739:],
    ])
    xtest = pd.concat([positives[1856:2784], negatives[2493:3739]])

    return xtrain, xtest, labels_for(xtrain), labels_for(xtest)

def create_fold_4(data):
    """Build the fourth cross-validation fold.

    Security-related rows 2784..3711 and other rows 3739..4984 form the test
    split; all remaining rows form the training split.

    Returns
    -------
    tuple
        ``(xtrain, xtest, ytrain, ytest)``; each label array holds a run of 1s
        (security-related rows) followed by a run of 0s (other rows).
    """
    label = "Security Related"
    positives = data[data[label] == True]
    negatives = data[data[label] == False]

    def labels_for(frame):
        # Rows were concatenated positives-first, so the labels are 1s then 0s.
        n_pos = len(frame[frame[label] == True])
        n_neg = len(frame[frame[label] == False])
        return np.array([1] * n_pos + [0] * n_neg)

    xtrain = pd.concat([
        positives[:2784],
        positives[3712:],
        negatives[:3739],
        negatives[4985:],
    ])
    xtest = pd.concat([positives[2784:3712], negatives[3739:4985]])

    return xtrain, xtest, labels_for(xtrain), labels_for(xtest)

def create_fold_5(data):
    """Build the fifth cross-validation fold.

    The first 3712 security-related rows and the first 4985 other rows form
    the training split; all remaining rows form the test split.

    Returns
    -------
    tuple
        ``(xtrain, xtest, ytrain, ytest)``; each label array holds a run of 1s
        (security-related rows) followed by a run of 0s (other rows).
    """
    label = "Security Related"
    positives = data[data[label] == True]
    negatives = data[data[label] == False]

    def labels_for(frame):
        # Rows were concatenated positives-first, so the labels are 1s then 0s.
        n_pos = len(frame[frame[label] == True])
        n_neg = len(frame[frame[label] == False])
        return np.array([1] * n_pos + [0] * n_neg)

    xtrain = pd.concat([positives[:3712], negatives[:4985]])
    xtest = pd.concat([positives[3712:], negatives[4985:]])

    return xtrain, xtest, labels_for(xtrain), labels_for(xtest)

In [11]:
# xtrain, ytrain, xtest, ytest = create_fold_1(data)
# print("Creating ")
# #Logistic Regression on Security Words and Similar Requirements 
# logsw = LogisticRegression(random_state=0)
# print("Fitting ")
# array = pd.DataFrame()
# array["SecWords&SimReq"] = None
# count = 0
# for row in xtrain[['Security Words','Similar Requirements']].iterrows():
#     tmp = []
#     for value in row[1]['Security Words']:
#         tmp.append(value)
#     for value in row[1]['Similar Requirements']:
#         tmp.append(value)
#     array.at[count,"SecWords&SimReq"] = tmp
#     count += 1
# # xtrain[['Security Words','Similar Requirements']][:10]
# # ytrain[['Security Related']][:1].values.ravel()
# logsw.fit(array["SecWords&SimReq"].to_list(),ytrain['Security Related'].to_list())
# logsw.predict(array["SecWords&SimReq"].to_list())

# logsw.fit([[1000,1500],[2000,300]],[1,0])
# logsw.predict([[2000,1000]])

In [12]:
# xtrain, xtest,ytrain, ytest = create_fold_1(data)
# create_TL_model(xtrain, ytrain)

In [13]:
def get_results(resultsfold, logreg, logsw, svm, svmsw, gnb, gnbsw, knn, knnsw, dt, dtsw, rfc, rfcsw, sec_words_cnn, cnn, xtest, y_test):
    """Evaluate every fitted model on one test split.

    Each model's rounded predictions are written into ``resultsfold`` (one
    column per model, next to the true labels in "Real Prediction") and its
    F1 score and accuracy against ``y_test`` are computed.

    Parameters
    ----------
    resultsfold : pandas.DataFrame
        Updated in place with the columns "Real Prediction", "LR", "LRsw",
        "SVM", "SVMsw", "GNB", "GNBsw", "KNN", "KNNsw", "DT", "DTsw", "RFC",
        "RFCsw", "sw_CNN" and "CNN", in that order.
    logreg, svm, gnb, knn, dt, rfc
        Fitted estimators for the Entities/Dependencies/Parts of Speech features.
    logsw, svmsw, gnbsw, knnsw, dtsw, rfcsw
        Fitted estimators for the Security Words/Similar Requirements features.
    sec_words_cnn, cnn
        Fitted Keras models taking two and three padded inputs respectively.
    xtest : pandas.DataFrame
        Test rows providing the five feature columns named above.
    y_test : array-like
        True labels, one per row of ``xtest``.

    Returns
    -------
    tuple
        ``resultsfold`` followed by the (f1, accuracy) pair of every model in
        the column order listed above (28 scores in total).
    """
    # Must equal the Input(shape=(558,)) used when the CNNs were built, so the
    # padded test arrays always have the width the networks expect.
    max_length = 558

    def _record(column, values):
        """Write ``values`` row by row into ``resultsfold[column]``."""
        resultsfold.loc[0, column] = ""
        for position, value in enumerate(values):
            resultsfold.loc[position, column] = value

    def _flatten(columns):
        """Concatenate the sequences of ``columns`` into one flat vector per row."""
        return [
            [value for cell in cells for value in cell]
            for cells in zip(*(xtest[column] for column in columns))
        ]

    def _padded(column):
        """Return ``xtest[column]`` as a 2-D array of width ``max_length``."""
        return pad_sequences(np.array(xtest[column]), maxlen=max_length)

    def _evaluate(column, y_pred, f1_pred=None):
        """Store ``y_pred`` under ``column`` and return its (f1, accuracy)."""
        _record(column, y_pred)
        f1 = f1_score(y_test, y_pred if f1_pred is None else f1_pred)
        return f1, accuracy_score(y_test, y_pred)

    _record("Real Prediction", y_test)

    #Preparing input for shallow models
    ent_pos_dep = _flatten(['Entities', 'Dependencies', 'Parts of Speech'])
    sec_sim = _flatten(['Security Words', 'Similar Requirements'])

    #Padding the test set for CNNs
    x1 = _padded("Entities")
    x2 = _padded("Dependencies")
    x3 = _padded("Parts of Speech")
    x4 = _padded('Security Words')
    x5 = _padded('Similar Requirements')

    #Logistic Regression
    f1_log, acc_log = _evaluate("LR", logreg.predict(ent_pos_dep).round())
    f1_logsw, acc_logsw = _evaluate("LRsw", logsw.predict(sec_sim).round())

    #Support Vector Machines
    f1_svm, acc_svm = _evaluate("SVM", svm.predict(ent_pos_dep).round())
    svmsw_pred = svmsw.predict(sec_sim).round()
    # NOTE(review): only this F1 score is computed on abs(prediction); the
    # stored column and the accuracy use the signed values. Preserved as-is —
    # confirm whether clipping the prediction to {0, 1} was intended instead.
    f1_svmsw, acc_svmsw = _evaluate("SVMsw", svmsw_pred, f1_pred=abs(svmsw_pred))

    #Gaussian Naive Bayes
    f1_gnb, acc_gnb = _evaluate("GNB", gnb.predict(ent_pos_dep).round())
    f1_gnbsw, acc_gnbsw = _evaluate("GNBsw", gnbsw.predict(sec_sim).round())

    #k-Nearest Neighbors
    f1_knn, acc_knn = _evaluate("KNN", knn.predict(ent_pos_dep).round())
    f1_knnsw, acc_knnsw = _evaluate("KNNsw", knnsw.predict(sec_sim).round())

    #Decision Tree
    f1_dt, acc_dt = _evaluate("DT", dt.predict(ent_pos_dep).round())
    f1_dtsw, acc_dtsw = _evaluate("DTsw", dtsw.predict(sec_sim).round())

    #Random Forest Classifier
    f1_rfc, acc_rfc = _evaluate("RFC", rfc.predict(ent_pos_dep).round())
    f1_rfcsw, acc_rfcsw = _evaluate("RFCsw", rfcsw.predict(sec_sim).round())

    #Security Words CNN
    f1_sw_cnn, acc_sw_cnn = _evaluate("sw_CNN", sec_words_cnn.predict([x4,x5]).round())

    #Disclosure CNN
    f1_cnn, acc_cnn = _evaluate("CNN", cnn.predict([x1,x2,x3]).round())

    return resultsfold, f1_log, acc_log, f1_logsw, acc_logsw, f1_svm, acc_svm, f1_svmsw, acc_svmsw, f1_gnb, acc_gnb, f1_gnbsw, acc_gnbsw, f1_knn, acc_knn, f1_knnsw, acc_knnsw, f1_dt, acc_dt, f1_dtsw, acc_dtsw, f1_rfc, acc_rfc, f1_rfcsw, acc_rfcsw, f1_sw_cnn, acc_sw_cnn, f1_cnn, acc_cnn

def get_tl_results(tl_cnn, resultsfold, xtest, ytest):
    """Score the transfer-learning model on one test fold.

    The "Entities", "Dependencies", "Parts of Speech" and "Secbert Outputs"
    columns of ``xtest`` are each padded with ``pad_sequences`` and passed, in
    that order, to ``tl_cnn.predict``. The rounded predictions are stored in the
    "TL" column of ``resultsfold`` (one row per prediction, starting at row 0)
    and compared against ``ytest``.

    Returns:
        tuple: ``(resultsfold, f1, accuracy)`` where ``resultsfold`` is the same
        frame that was passed in, now carrying the "TL" column.
    """
    feature_columns = ("Entities", "Dependencies", "Parts of Speech", "Secbert Outputs")
    padded_inputs = [pad_sequences(np.array(xtest[name])) for name in feature_columns]

    # Result of TL
    with torch.no_grad():
        predictions = tl_cnn.predict(padded_inputs).round()

    # Seed the column with an empty string first, then write one prediction per row.
    resultsfold.loc[0, "TL"] = ""
    for row, predicted in enumerate(predictions):
        resultsfold.loc[row, "TL"] = predicted

    tl_f1 = f1_score(ytest, predictions)
    tl_accuracy = accuracy_score(ytest, predictions)

    return resultsfold, tl_f1, tl_accuracy


    

In [14]:
# Repeated 5-fold cross-validation of every classifier.
#
# For each of the N repetitions the dataset is reshuffled, the five folds are
# built with create_fold_1 .. create_fold_5, and on every fold the transfer-learning
# model, the Entities/POS/Dependencies models and the Security-Words/Similar-
# Requirements models are trained and scored. The per-fold predictions of one
# repetition are written to 'Results_Test_<repetition>.xlsx'; the fold-averaged
# accuracy and F1 score of each model are stored, one row per repetition, in
# `accuracy` and `f1score`.

# (summary column, label printed in the per-fold log). The first 14 entries follow
# the order in which get_results returns its (f1, accuracy) pairs; "TL" is scored
# separately by get_tl_results and therefore comes last.
MODELS = [
    ("LR", "log"),
    ("LRSW", "log sw"),
    ("SVM", "svm"),
    ("SVMSW", "svm_sw"),
    ("GNB", "Naive"),
    ("GNBSW", "Naive sw"),
    ("KNN", "knn"),
    ("KNNSW", "knn sw"),
    ("DT", "dec tree"),
    ("DTSW", "dec tree sw"),
    ("RFC", "ran forest"),
    ("RFCSW", "ran forest sw"),
    ("SW_CNN", "cnn sw"),
    ("CNN", "cnn"),
    ("TL", "tl"),
]
MODEL_COLUMNS = [column for column, _ in MODELS]

# One train/test split builder per fold, in fold order.
FOLD_BUILDERS = [create_fold_1, create_fold_2, create_fold_3, create_fold_4, create_fold_5]


def _banner(title):
    """Print a dashed section header with blank lines before and after it."""
    print('\n')
    print('------------------------------- ' + title + ' --------------------------------------')
    print('\n')


def _mean(values):
    """Return the arithmetic mean of a non-empty list of scores."""
    return sum(values) / len(values)


# Not referenced anywhere in this cell; retained in case another cell relies on it.
f1score_calc = pd.DataFrame()

# Summary tables: one row per repetition, one float column per model. They are
# rebuilt from the row lists after every repetition, so they never contain
# placeholder strings and keep a numeric dtype.
accuracy = pd.DataFrame(columns=MODEL_COLUMNS, dtype=float)
f1score = pd.DataFrame(columns=MODEL_COLUMNS, dtype=float)
accuracy_rows = []
f1score_rows = []

#----------------------------------------
# Edit N to change the number of repetitions of the 5 fold cross validation
#----------------------------------------
N = 1

for index in range(N):
    # The banner is numbered from 1 so that it matches the fold banners and the
    # 'Results_Test_<n>.xlsx' file written for the same repetition.
    print('\n')
    _banner('Test ' + str(index + 1))
    print('\n')

    # NOTE(review): the shuffle is unseeded (and so is model training), so fold
    # membership and scores differ between runs. Pass random_state=... here if
    # reproducible splits are required.
    data = data.sample(frac=1).reset_index(drop=True)

    fold_frames = []                                      # per-fold prediction tables
    f1_by_model = {column: [] for column in MODEL_COLUMNS}
    acc_by_model = {column: [] for column in MODEL_COLUMNS}

    for fold_number, create_fold in enumerate(FOLD_BUILDERS, start=1):
        x_train, x_test, y_train, y_test = create_fold(data)
        _banner('Fold ' + str(fold_number))

        # Start every fold from a fresh Keras session and release cached GPU memory.
        K.clear_session()
        torch.cuda.empty_cache()
        K.set_session(K.tf.compat.v1.Session(config=cfg))

        _banner('Training Transfer Learning model')
        resultsfold = pd.DataFrame()
        tl_cnn = create_TL_model(x_train, y_train)
        resultsfold, f1_tl, acc_tl = get_tl_results(tl_cnn, resultsfold, x_test, y_test)
        del tl_cnn

        _banner('Training Entities, POS, Dependencies based models')
        logreg, svm, gnb, knn, dt, rfc, cnn = create_models_EntPosDep(x_train, y_train)

        _banner('Training Security Words and Similar Requirements based models')
        logsw, svmsw, gnbsw, knnsw, dtsw, rfcsw, cnn_sw = create_model_SecWorSimReq(x_train, y_train)

        # get_results returns the updated predictions table followed by an
        # (f1, accuracy) pair for each of the 14 non-TL models.
        resultsfold, *pair_scores = get_results(
            resultsfold, logreg, logsw, svm, svmsw, gnb, gnbsw, knn, knnsw,
            dt, dtsw, rfc, rfcsw, cnn_sw, cnn, x_test, y_test,
        )
        del cnn
        del cnn_sw

        expected_scores = 2 * (len(MODELS) - 1)
        if len(pair_scores) != expected_scores:
            raise ValueError(
                "get_results returned " + str(len(pair_scores))
                + " scores, expected " + str(expected_scores)
            )

        fold_frames.append(resultsfold)

        # De-interleave the pairs and append the TL scores so that both lists line
        # up with MODELS.
        fold_f1 = pair_scores[0::2] + [f1_tl]
        fold_acc = pair_scores[1::2] + [acc_tl]

        for (column, label), f1_value, acc_value in zip(MODELS, fold_f1, fold_acc):
            print(label)
            print(str(f1_value))
            print(str(acc_value))
            f1_by_model[column].append(f1_value)
            acc_by_model[column].append(acc_value)

    # Stack the per-fold prediction tables once instead of re-concatenating per fold.
    results = pd.concat(fold_frames, ignore_index=True)
    results.to_excel('Results_Test_'+str(index+1)+'.xlsx',index=False)

    # Average each model's scores over the folds of this repetition.
    accuracy_rows.append({column: _mean(acc_by_model[column]) for column in MODEL_COLUMNS})
    f1score_rows.append({column: _mean(f1_by_model[column]) for column in MODEL_COLUMNS})
    accuracy = pd.DataFrame(accuracy_rows, columns=MODEL_COLUMNS)
    f1score = pd.DataFrame(f1score_rows, columns=MODEL_COLUMNS)





------------------------------- Test 0 --------------------------------------






------------------------------- Fold 1 --------------------------------------




------------------------------- Training Transfer Learning model --------------------------------------


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


------------------------------- Training Entities, POS, Dependencies based models --------------------------------------




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


------------------------------- Training Security Words and Similar Requirements based models --------------------------------------




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
log
0.7394778902503996
0.7751724137931034
log sw
0.6813842482100239
0.7544827586206897
svm
0.7922705314009661
0.8220689655172414
svm_sw
0.7013888888888888
0.7627586206896552
Naive
0.2077205882352941
0.6036781609195402
Naive sw
0.22607110300820418
0.6096551724137931
knn
0.7612687813021702
0.8027586206896552
knn sw
0.6863207547169812
0.7554022988505747
dec tree
0.7617521367521368
0.7949425287356322
dec tree sw
0.6874221668742216
0.7691954022988505
ran forest
0.7542627883650954
0.774712643678161
ran forest sw
0.6986697513013302
0.7604597701149425
cnn sw
0.6954407294832827
0.7696551724137931
cnn
0.7735449735449735
0.8032183908045977
tl
0.8478488982161595
0.8666666666666667


------------------------------- Fold 2 --------------------------------------




-------

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


------------------------------- Training Security Words and Similar Requirements based models --------------------------------------




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
log
0.7346938775510204
0.7727690892364305
log sw
0.6591715976331362
0.735050597976081
svm
0.7800524934383202
0.8072677092916284
svm_sw
0.6906803887935964
0.7511499540018399
Naive
0.29177268871925355
0.6159153633854646
Naive sw
0.2630185348631951
0.6159153633854646
knn
0.7565217391304349
0.7939282428702852
knn sw
0.6759259259259258
0.7424103035878565
dec tree
0.758438818565401
0.7893284268629255
dec tree sw
0.6662576687116564
0.749770009199632
ran forest
0.7340372046254399
0.7566697332106715
ran forest sw
0.683066361556064
0.7451701931922723
cnn sw
0.6791314837153197
0.7552897884084636
cnn
0.7865731462925852
0.8040478380864765
tl
0.8308374930671104
0.859705611775529


------------------------------- Fold 3 --------------------------------------




-----------

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


------------------------------- Training Security Words and Similar Requirements based models --------------------------------------




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
log
0.7473684210526316
0.7792088316467342
log sw
0.676959619952494
0.749770009199632
svm
0.7887771307570144
0.8164673413063478
svm_sw
0.7127103888566454
0.7723091076356946
Naive
0.23209428830462375
0.610395584176633
Naive sw
0.25783348254252464
0.6186752529898804
knn
0.7460055096418733
0.7879484820607175
knn sw
0.6907817969661612
0.7562097516099356
dec tree
0.7700592353257942
0.8035878564857406
dec tree sw
0.6749999999999999
0.7608095676172953
ran forest
0.7610887096774194
0.7819687212511499
ran forest sw
0.6999419616947186
0.7621895124195032
cnn sw
0.0
0.5731370745170193
cnn
0.7912423625254583
0.8114075436982521
tl
0.8366228070175438
0.8629254829806807


------------------------------- Fold 4 --------------------------------------




-----------------------

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


------------------------------- Training Security Words and Similar Requirements based models --------------------------------------




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
log
0.7418839808408727
0.7769089236430543
log sw
0.6824364281490243
0.7529898804047838
svm
0.7955625990491284
0.8219871205151794
svm_sw
0.720554272517321
0.7773689052437902
Naive
0.18975332068311193
0.6071757129714811
Naive sw
0.2677304964539007
0.6200551977920883
knn
0.7429519071310116
0.7861085556577737
knn sw
0.7066282420749279
0.765869365225391
dec tree
0.7706422018348622
0.8045078196872125
dec tree sw
0.6984520123839009
0.7759889604415824
ran forest
0.7591387080620932
0.7787488500459981
ran forest sw
0.7031339031339031
0.7603495860165593
cnn sw
0.7224157955865272
0.7801287948482061
cnn
0.7765843179377014
0.8086476540938362
tl
0.8409433015554441
0.8541858325666973


------------------------------- Fold 5 --------------------------------------




--------

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


------------------------------- Training Security Words and Similar Requirements based models --------------------------------------




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
log
0.7468085106382979
0.781048758049678
log sw
0.6713697824808935
0.7428702851885924
svm
0.7881040892193307
0.8164673413063478
svm_sw
0.6967963386727689
0.7562097516099356
Naive
0.12745098039215688
0.5906163753449862
Naive sw
0.23869801084990958
0.6126954921803128
knn
0.7398328690807799
0.7851885924563018
knn sw
0.6879350348027842
0.7525298988040479
dec tree
0.7436589314624932
0.781508739650414
dec tree sw
0.6598855689764781
0.7539098436062558
ran forest
0.7585863613738179
0.7769089236430543
ran forest sw
0.6913861950941244
0.7511499540018399
cnn sw
0.6868198307134219
0.7617295308187673
cnn
0.0
0.5731370745170193
tl
0.8309178743961353
0.8551057957681693


In [17]:
# Mean F1 score per model, one row per repetition of the cross validation.
# To export the table, append: .to_excel("F1_Results.xlsx")
f1score

Unnamed: 0,LR,LRSW,SVM,SVMSW,GNB,GNBSW,KNN,KNNSW,DT,DTSW,RFC,RFCSW,SW_CNN,CNN,TL
0,0.742047,0.674264,0.788953,0.704426,0.209758,0.25067,0.749316,0.689518,0.76091,0.677403,0.753423,0.69524,0.556762,0.625589,0.837434


In [18]:
# Mean accuracy per model, one row per repetition of the cross validation.
accuracy

Unnamed: 0,LR,LRSW,SVM,SVMSW,GNB,GNBSW,KNN,KNNSW,DT,DTSW,RFC,RFCSW,SW_CNN,CNN,TL
0,0.777022,0.747033,0.816852,0.763959,0.605556,0.615399,0.791186,0.754484,0.794775,0.761935,0.773802,0.755864,0.727988,0.760092,0.859718
