<a href="https://colab.research.google.com/github/Chibueze-20/Automatic-Detection-of-HTTP-injection-Attacks-using-CNN-and-DNN/blob/main/Detection_of_HTTP_Injection_Attacks_using_Deep_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Character-level CNN and DNN on CSIC 2010 and ECML/PKDD 2007

In [None]:
# Mount Google Drive at /content/drive; every dataset and checkpoint path used
# later in this notebook lives under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import keras.losses as lossx
from keras.layers import (Activation, Conv1D, Dense, Dropout, Embedding,
                          Flatten, Input, MaxPooling1D)
from keras.models import Model, Sequential,load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
import sklearn.metrics as met

##Define the helper functions

###Extract dataset into pandas dataframe

In [None]:
def ExtractDataframe(csvdataset_path):
    """Read a CSV dataset into a pandas DataFrame.

    Args:
        csvdataset_path: anything ``pd.read_csv`` accepts as its first
            argument (the notebook passes file paths on the mounted Drive).

    Returns:
        The parsed ``pandas.DataFrame``, using ``read_csv`` defaults.
    """
    dataframe = pd.read_csv(csvdataset_path)
    return dataframe

###Build the tokenizer

In [None]:
def BuildTokenizer(vocabulary,sequence):
    """Create a character-level Keras ``Tokenizer`` with a fixed vocabulary.

    Each distinct character of ``vocabulary`` gets a contiguous index starting
    at 1, in order of first appearance; the out-of-vocabulary token ``'UNK'``
    gets the next index after the last character.

    Repeated characters in ``vocabulary`` are ignored after their first
    occurrence. The vocabulary strings used in this notebook contain ``-``
    twice; with the previous ``len(dict) + 1`` rule that made ``'UNK'`` share
    its index with the last vocabulary character. Indexing unique characters
    keeps every index distinct and makes ``len(tokenizer.word_index)`` equal
    to the largest index.

    NOTE(review): characters that follow a repeated one now get an index one
    lower than before, so models trained with the old mapping (e.g. saved
    ``.h5`` checkpoints) must be retrained before being scored with this
    tokenizer.

    Args:
        vocabulary: iterable of single characters (typically a string).
        sequence: texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` whose ``word_index`` has been replaced by the
        fixed character mapping plus the ``'UNK'`` entry.
    """
    # dict.fromkeys keeps first-seen order while dropping repeated characters.
    unique_chars = list(dict.fromkeys(vocabulary))
    char_dictionary = {char: index for index, char in enumerate(unique_chars, start=1)}
    tokenizer = Tokenizer(num_words=None,char_level=True,oov_token='UNK',lower=False)
    tokenizer.fit_on_texts(sequence)
    # Discard the frequency-based index learned by fit_on_texts in favour of
    # the fixed mapping, so indices are identical across datasets.
    tokenizer.word_index = char_dictionary
    tokenizer.word_index[tokenizer.oov_token] = len(unique_chars) + 1
    return tokenizer

###Get the maximum length of a http query or body

In [None]:
def MaxRequestLength(requests):
    """Return the length of the longest item in ``requests``.

    Args:
        requests: iterable of sized objects (e.g. request strings).

    Returns:
        The largest ``len(item)`` found, or 0 when ``requests`` is empty.
    """
    return max((len(request) for request in requests), default=0)

###Preprocessing and dataset splitting

In [None]:
def PreprocessAndSplit(dataframe,vocabulary,test_split=0,max_length=840,random_state=None):
    """Turn a request dataframe into padded index sequences and one-hot labels.

    Args:
        dataframe: frame with a ``'Parameters'`` column (request text) and a
            ``'Label'`` column (class id, encoded as two classes).
        vocabulary: characters handed to ``BuildTokenizer``.
        test_split: 0 to return the whole dataset unsplit; otherwise the
            ``test_size`` forwarded to ``train_test_split``.
        max_length: number of character positions every sequence is padded
            (with trailing zeros) or cut to. Defaults to 840, the value that
            was previously hard-coded.
        random_state: forwarded to ``train_test_split``; the default ``None``
            keeps the previous unseeded behaviour, pass an int for a
            reproducible split.

    Returns:
        ``[tokenizer, max_length, vocabulary_size, data, targets]`` when
        ``test_split == 0``, otherwise
        ``[tokenizer, max_length, vocabulary_size, X_train, X_test, Y_train, Y_test]``.
        ``vocabulary_size`` is the largest index the tokenizer can emit.
    """
    requests = dataframe['Parameters'].values
    labels = dataframe['Label'].values
    tokenizer = BuildTokenizer(vocabulary, requests)
    character_indexes = tokenizer.texts_to_sequences(requests)
    data = pad_sequences(character_indexes, maxlen=max_length, padding='post')
    targets = to_categorical(labels, num_classes=2)
    # The models build Embedding(vocabulary_size + 1, ...), so what matters is
    # the largest index in use, not the number of dictionary entries (the two
    # differ when the vocabulary string repeats a character).
    vocabulary_size = max(tokenizer.word_index.values())
    if test_split == 0:
        return [tokenizer, max_length, vocabulary_size, data, targets]
    X_train, X_test, Y_train, Y_test = train_test_split(
        data, targets, test_size=test_split, random_state=random_state)
    return [tokenizer, max_length, vocabulary_size, X_train, X_test, Y_train, Y_test]

###Build the CNN model

In [None]:
def BuildCNN(embeddingSize,inputSize,conv_layers,fully_connected_layers,num_classes,dropout_p,optimizer,loss):
    """Assemble and compile the character-level convolutional classifier.

    Args:
        embeddingSize: vocabulary size; used as the embedding output width and
            (plus one, for the padding index 0) as the embedding input size.
        inputSize: length of each padded input sequence.
        conv_layers: iterable of ``(filters, kernel_size, pool_size)`` triples;
            a ``pool_size`` of -1 means no pooling after that convolution.
        fully_connected_layers: widths of the dense layers after flattening.
        num_classes: width of the softmax output layer.
        dropout_p: dropout rate applied after every dense layer.
        optimizer: optimizer passed to ``compile``.
        loss: loss passed to ``compile``.

    Returns:
        The compiled Keras ``Model`` (accuracy is the tracked metric).
    """
    char_embedding = Embedding(embeddingSize + 1, embeddingSize, input_length=inputSize)
    network_input = Input(shape=(inputSize,), name='input', dtype='int64')
    features = char_embedding(network_input)

    # Convolution blocks: Conv1D -> ReLU -> optional max pooling.
    for n_filters, kernel_width, pool_width in conv_layers:
        features = Conv1D(n_filters, kernel_width)(features)
        features = Activation('relu')(features)
        if pool_width == -1:
            continue
        features = MaxPooling1D(pool_size=pool_width)(features)

    features = Flatten()(features)

    # Dense head: every fully connected layer is followed by dropout.
    for units in fully_connected_layers:
        features = Dense(units, activation='relu')(features)
        features = Dropout(dropout_p)(features)

    class_probabilities = Dense(num_classes, activation='softmax')(features)
    network = Model(inputs=network_input, outputs=class_probabilities)
    network.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    return network

###Build the DNN model

In [None]:
def BuildDNN(embeddingSize,inputSize,fully_connected_layers,num_classes,optimizer,loss,dropout_p=0.5):
    """Assemble and compile the fully connected (DNN) classifier.

    Args:
        embeddingSize: vocabulary size; used as the embedding output width and
            (plus one, for the padding index 0) as the embedding input size.
        inputSize: length of each padded input sequence.
        fully_connected_layers: widths of the hidden dense layers.
        num_classes: width of the softmax output layer.
        optimizer: optimizer passed to ``compile``.
        loss: loss passed to ``compile``.
        dropout_p: dropout rate applied after every hidden layer. Defaults to
            0.5, the rate that was previously hard-coded, and mirrors the
            ``dropout_p`` argument of ``BuildCNN``.

    Returns:
        The compiled Keras ``Sequential`` model (accuracy is the tracked metric).
    """
    #model definition
    DNNmodel = Sequential()
    #embedding layer definition
    DNNmodel.add(Embedding(embeddingSize+1,embeddingSize,input_length=inputSize))
    #flatten layer
    DNNmodel.add(Flatten())
    #hidden layers, each followed by dropout
    for nodes in fully_connected_layers:
        DNNmodel.add(Dense(nodes,activation='relu'))
        DNNmodel.add(Dropout(dropout_p))
    #output layer
    DNNmodel.add(Dense(num_classes,activation='softmax'))
    #model
    DNNmodel.compile(optimizer=optimizer,loss=loss,metrics=['accuracy'])
    return DNNmodel

###Evaluate the model

In [None]:
def printEvaluation(model,X,Y):
    """Print F1 score, confusion matrix and classification report for a model.

    Args:
        model: object whose ``predict(X)`` returns per-class scores.
        X: model inputs.
        Y: one-hot encoded true labels, one row per sample.
    """
    # Collapse scores / one-hot rows to class ids once and reuse them.
    predicted_classes = np.argmax(model.predict(X), 1)
    true_classes = np.argmax(Y, 1)
    for metric in (met.f1_score, met.confusion_matrix, met.classification_report):
        print(metric(true_classes, predicted_classes))

## Get dataset, preprocess and build the DNN and CNN models (CSIC)

###extract

In [None]:
# Training pool for the CSIC experiments: the CSIC training file plus the
# CSIC malicious test file.
train1 = ExtractDataframe('/content/drive/My Drive/Dataset/CSIC_dataset_train.csv')
train2 = ExtractDataframe('/content/drive/My Drive/Dataset/CSIC_dataset_test_malicious.csv')
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the two frames the same way (sort=False matches append).
train_df = pd.concat([train1, train2], sort=False)
train_df = train_df.loc[:,['Parameters','Label']]
# shuffle the rows and renumber the index
train_df = train_df.sample(frac=1).reset_index(drop=True)

###Preprocess and split

In [None]:
# Tokenise the CSIC frame and hold out 30% of the rows.
# Result layout: [tokenizer, max_length, vocab_size, X_train, X_test, Y_train, Y_test]
data = PreprocessAndSplit(
    train_df,
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} ",
    test_split=0.3,
)
print("Max query length:", data[1])
print("Vocabulary Size:", data[2])

Max query lenght: 840
Vocabulary Size: 96
[[28 55 87 ...  0  0  0]
 [28 55 87 ...  0  0  0]
 [ 5 18 18 ...  0  0  0]
 ...
 [13 15  4 ...  0  0  0]
 [13 15  4 ...  0  0  0]
 [13 15  4 ...  0  0  0]]
[[1. 0.]
 [1. 0.]
 [0. 1.]
 ...
 [1. 0.]
 [0. 1.]
 [0. 1.]]
[[13 15  4 ...  0  0  0]
 [ 9  4 87 ...  0  0  0]
 [ 9  4 87 ...  0  0  0]
 ...
 [13 15  4 ...  0  0  0]
 [13 15  4 ...  0  0  0]
 [ 9  4 27 ...  0  0  0]]
[[1. 0.]
 [1. 0.]
 [0. 1.]
 ...
 [0. 1.]
 [1. 0.]
 [0. 1.]]


###Create CNN Model 

In [None]:
# Six convolution blocks given as [filters, kernel_size, pool_size]
# (pool_size -1 = no pooling), followed by two dense layers of 1024 units.
CnnModel = BuildCNN(
    embeddingSize=data[2],
    inputSize=data[1],
    conv_layers=[[256, 7, 3], [256, 7, 3], [256, 3, -1], [256, 3, -1], [256, 3, -1], [256, 3, 3]],
    fully_connected_layers=[1024, 1024],
    num_classes=2,
    dropout_p=0.5,
    optimizer='adam',
    loss='categorical_crossentropy',
)
CnnModel.summary()

###Build DNN model

In [None]:
# Three hidden layers of 1024 units, trained with SGD.
DNNmodel = BuildDNN(
    embeddingSize=data[2],
    inputSize=data[1],
    fully_connected_layers=[1024, 1024, 1024],
    num_classes=2,
    optimizer='sgd',
    loss='categorical_crossentropy',
)
DNNmodel.summary()

###Train DNN and CNN model

####Train CNN model

In [None]:
# epochs=1 is the Keras default the original call relied on, spelled out here.
CnnModel.fit(
    x=data[3],
    y=data[5],
    batch_size=128,
    epochs=1,
    validation_data=(data[4], data[6]),
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 24910 samples, validate on 10676 samples
Epoch 1/1


<keras.callbacks.History at 0x7fc1c76b3278>

####Train DNN

In [None]:
# Train the DNN for 10 epochs, validating on the held-out split.
DNNmodel.fit(
    x=data[3],
    y=data[5],
    batch_size=128,
    epochs=10,
    validation_data=(data[4], data[6]),
)

Load Model (For saved models only)

In [None]:
# Replace the in-memory models with checkpoints previously saved on Drive.
# NOTE(review): when this cell runs after the training cells it discards the
# freshly trained CnnModel / DNNmodel, so the evaluation cells that follow
# score the checkpoints instead — confirm that is intended, or skip this cell.
CnnModel = load_model('/content/drive/My Drive/Dataset/cisccnnkeras.h5')
DNNmodel = load_model('/content/drive/My Drive/Dataset/ciscdnnkeras.h5')



###Evaluate DNN and CNN




In [None]:
# Score both CSIC models on the held-out split (data[4] = X_test, data[6] = Y_test).
for eval_label, eval_model in (("CNN", CnnModel), ("DNN", DNNmodel)):
    print(eval_label)
    printEvaluation(eval_model, data[4], data[6])

CNN
0.966796032772747
[[4686   96]
 [ 289 5605]]
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      4782
           1       0.98      0.95      0.97      5894

    accuracy                           0.96     10676
   macro avg       0.96      0.97      0.96     10676
weighted avg       0.96      0.96      0.96     10676

DNN
0.8655397944449165
[[3998  784]
 [ 799 5095]]
              precision    recall  f1-score   support

           0       0.83      0.84      0.83      4782
           1       0.87      0.86      0.87      5894

    accuracy                           0.85     10676
   macro avg       0.85      0.85      0.85     10676
weighted avg       0.85      0.85      0.85     10676



####Test on benign

In [None]:
# Score the CSIC models on the CSIC benign test file (no split, so
# datax1[3] holds the inputs and datax1[4] the one-hot labels).
test_df = ExtractDataframe('/content/drive/My Drive/Dataset/CSIC_dataset_test_benign.csv')[['Parameters', 'Label']]
datax1 = PreprocessAndSplit(
    test_df,
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} ",
)
print("CNN")
printEvaluation(CnnModel, *datax1[3:5])
print("DNN")
printEvaluation(DNNmodel, *datax1[3:5])

CNN
0.0
[[15594   406]
 [    0     0]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     16000
           1       0.00      0.00      0.00         0

    accuracy                           0.97     16000
   macro avg       0.50      0.49      0.49     16000
weighted avg       1.00      0.97      0.99     16000

DNN


  _warn_prf(average, modifier, msg_start, len(result))


0.0
[[13026  2974]
 [    0     0]]
              precision    recall  f1-score   support

           0       1.00      0.81      0.90     16000
           1       0.00      0.00      0.00         0

    accuracy                           0.81     16000
   macro avg       0.50      0.41      0.45     16000
weighted avg       1.00      0.81      0.90     16000



####Test on malicious

In [None]:
# Score the CSIC models on the CSIC malicious test file.
# NOTE(review): this same file is concatenated into train_df earlier in the
# notebook, so models trained in this session have seen part of these rows.
text_df = ExtractDataframe('/content/drive/My Drive/Dataset/CSIC_dataset_test_malicious.csv')[['Parameters', 'Label']]
datat = PreprocessAndSplit(
    text_df,
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} ",
)
print("CNN")
printEvaluation(CnnModel, *datat[3:5])
print("DNN")
printEvaluation(DNNmodel, *datat[3:5])

CNN
0.9770721261816473
[[    0     0]
 [  878 18708]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.96      0.98     19586

    accuracy                           0.96     19586
   macro avg       0.50      0.48      0.49     19586
weighted avg       1.00      0.96      0.98     19586

DNN


  _warn_prf(average, modifier, msg_start, len(result))


0.9276758828360252
[[    0     0]
 [ 2642 16944]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.87      0.93     19586

    accuracy                           0.87     19586
   macro avg       0.50      0.43      0.46     19586
weighted avg       1.00      0.87      0.93     19586



#### Test on ECML/PKDD

In [None]:
# Cross-dataset check: score the CSIC-trained models on dataset.csv
# (the file the ECML/PKDD section below trains on).
tert_df = ExtractDataframe('/content/drive/My Drive/Dataset/dataset.csv')[['Parameters', 'Label']]
data_tr = PreprocessAndSplit(
    tert_df,
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} ",
)
print("CNN")
printEvaluation(CnnModel, *data_tr[3:5])
print("DNN")
printEvaluation(DNNmodel, *data_tr[3:5])

CNN
0.5165671373504563
[[ 5978 22984]
 [ 1221 12932]]
              precision    recall  f1-score   support

           0       0.83      0.21      0.33     28962
           1       0.36      0.91      0.52     14153

    accuracy                           0.44     43115
   macro avg       0.60      0.56      0.42     43115
weighted avg       0.68      0.44      0.39     43115

DNN
0.49465937101200985
[[   60 28902]
 [    5 14148]]
              precision    recall  f1-score   support

           0       0.92      0.00      0.00     28962
           1       0.33      1.00      0.49     14153

    accuracy                           0.33     43115
   macro avg       0.63      0.50      0.25     43115
weighted avg       0.73      0.33      0.17     43115



## Get dataset, preprocess and build CNN and DNN models (ECML/PKDD)

###Extract dataset and preprocess

In [None]:
# Load dataset.csv, keep the two modelling columns, and hold out 30% of the rows.
emctrain_df = ExtractDataframe('/content/drive/My Drive/Dataset/dataset.csv')[['Parameters', 'Label']]
data_emc = PreprocessAndSplit(
    emctrain_df,
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} ",
    test_split=0.3,
)
print("Max query lenght:", data_emc[1])
print("Vocabulary Size:", data_emc[2])

Max query lenght: 840
Vocabulary Size: 96
(30180, 840)
(30180, 2)
(12935, 840)
(12935, 2)


In [None]:
# BALANCE THE DATASET (add extra training/test data)
payload_all = ExtractDataframe("/content/drive/My Drive/Dataset/all/PayloadsAllTheThings_github.csv")
payload_all = payload_all.drop(['index','Type'],axis=1)
payload_all.columns = ['Parameters','label']
payload_all.insert(1,'Label',1)
payload_all = payload_all.drop(['label'],axis=1)
bal_df = emctrain_df.append(payload_all)
bal_df = bal_df.sample(frac=1).reset_index(drop=True)
# bal_df=bal_df.astype({'Label':'int32'})
data_emc=PreprocessAndSplit(bal_df,"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} ",0.3)
print("Max query lenght:",data_emc[1])
print("Vocabulary Size:",data_emc[2])
# print(data_emc[3].shape,data_emc[5],data_emc[4],data_emc[6],sep='\n')

Max query lenght: 840
Vocabulary Size: 96
(40184, 840)
[[1. 0.]
 [0. 1.]
 [0. 1.]
 ...
 [0. 1.]
 [1. 0.]
 [1. 0.]]
[[31 20  5 ...  0  0  0]
 [21 25  9 ...  0  0  0]
 [ 6 46 18 ...  0  0  0]
 ...
 [27 10 38 ...  0  0  0]
 [29  3 13 ...  0  0  0]
 [19 61 13 ...  0  0  0]]
[[0. 1.]
 [0. 1.]
 [1. 0.]
 ...
 [1. 0.]
 [0. 1.]
 [0. 1.]]


###Create CNN Model 

In [None]:
# Same CNN layout as the CSIC model: six [filters, kernel_size, pool_size]
# blocks (pool_size -1 = no pooling) and two dense layers of 1024 units.
EmcCnnModel = BuildCNN(
    embeddingSize=data_emc[2],
    inputSize=data_emc[1],
    conv_layers=[[256, 7, 3], [256, 7, 3], [256, 3, -1], [256, 3, -1], [256, 3, -1], [256, 3, 3]],
    fully_connected_layers=[1024, 1024],
    num_classes=2,
    dropout_p=0.5,
    optimizer='adam',
    loss='categorical_crossentropy',
)
EmcCnnModel.summary()

###Build DNN model

In [None]:
# Three hidden layers of 1024 units, trained with SGD.
EmcDNNmodel = BuildDNN(
    embeddingSize=data_emc[2],
    inputSize=data_emc[1],
    fully_connected_layers=[1024, 1024, 1024],
    num_classes=2,
    optimizer='sgd',
    loss='categorical_crossentropy',
)
EmcDNNmodel.summary()

###Train DNN and CNN model

####Train CNN model

In [None]:
# Train the CNN for 2 epochs, validating on the held-out split.
EmcCnnModel.fit(
    x=data_emc[3],
    y=data_emc[5],
    batch_size=128,
    epochs=2,
    validation_data=(data_emc[4], data_emc[6]),
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f6286287cc0>

####Train DNN

In [None]:
# Train the DNN for 5 epochs, validating on the held-out split.
EmcDNNmodel.fit(
    x=data_emc[3],
    y=data_emc[5],
    batch_size=128,
    epochs=5,
    validation_data=(data_emc[4], data_emc[6]),
)

###Evaluate DNN and CNN




In [None]:
# FOR PREMADE MODELS
# EmcDNNmodel = load_model("/content/drive/My Drive/Dataset/ecmldnnkeras.h5")
# EmcCnnModel = load_model("/content/drive/My Drive/Dataset/ecmlcnnkeras.h5")

# Save both trained models to the current working directory.
# The CNN file name previously ended in ".hs" (typo for ".h5"): Keras picks the
# save format from the suffix, so the two models were not stored the same way.
EmcCnnModel.save("ecml_cnnbalanced.h5")
EmcDNNmodel.save("ecml_dnnbalanced.h5")

In [None]:
# Score both ECML/PKDD models on the held-out split
# (data_emc[4] = X_test, data_emc[6] = Y_test).
emc_holdout = (data_emc[4], data_emc[6])
print("DNN")
printEvaluation(EmcDNNmodel, *emc_holdout)
print("***********************************************************************************")
print("CNN")
printEvaluation(EmcCnnModel, *emc_holdout)

DNN
0.921447156486185
[[8517  212]
 [1056 7437]]
              precision    recall  f1-score   support

           0       0.89      0.98      0.93      8729
           1       0.97      0.88      0.92      8493

    accuracy                           0.93     17222
   macro avg       0.93      0.93      0.93     17222
weighted avg       0.93      0.93      0.93     17222

***********************************************************************************
CNN
0.9596280890628823
[[8718   11]
 [ 649 7844]]
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      8729
           1       1.00      0.92      0.96      8493

    accuracy                           0.96     17222
   macro avg       0.96      0.96      0.96     17222
weighted avg       0.96      0.96      0.96     17222



####Test on valid

In [None]:
# Score the ECML/PKDD models on the "valid" request file.
valid_df = ExtractDataframe('/content/drive/My Drive/Dataset/ValidemclDataset.csv')
valid_df = valid_df.loc[:,['Parameters','Label']]
valid_data = PreprocessAndSplit(valid_df,"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} ")
# Label each report like the sibling evaluation cells do; without the labels
# the two blocks of output cannot be told apart.
print("DNN")
printEvaluation(EmcDNNmodel,valid_data[3],valid_data[4])
print("***********************************************************************************")
print("CNN")
printEvaluation(EmcCnnModel,valid_data[3],valid_data[4])

0.0
[[28437   525]
 [    0     0]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     28962
           1       0.00      0.00      0.00         0

    accuracy                           0.98     28962
   macro avg       0.50      0.49      0.50     28962
weighted avg       1.00      0.98      0.99     28962

***********************************************************************************


  _warn_prf(average, modifier, msg_start, len(result))


0.0
[[28944    18]
 [    0     0]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28962
           1       0.00      0.00      0.00         0

    accuracy                           1.00     28962
   macro avg       0.50      0.50      0.50     28962
weighted avg       1.00      1.00      1.00     28962



####Test on SSI

In [None]:
# Score the ECML/PKDD models on the SSI file.
ssi_df = ExtractDataframe('/content/drive/My Drive/Dataset/SSIemclDataset.csv')[['Parameters', 'Label']]
ssi_data = PreprocessAndSplit(
    ssi_df,
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} ",
)
print("DNN")
printEvaluation(EmcDNNmodel, *ssi_data[3:5])
print("***********************************************************************************")
print("CNN")
printEvaluation(EmcCnnModel, *ssi_data[3:5])

DNN
0.8995157384987894
[[   0    0]
 [ 332 1486]]


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.82      0.90      1818

    accuracy                           0.82      1818
   macro avg       0.50      0.41      0.45      1818
weighted avg       1.00      0.82      0.90      1818

***********************************************************************************
CNN
0.9087635054021609
[[   0    0]
 [ 304 1514]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.83      0.91      1818

    accuracy                           0.83      1818
   macro avg       0.50      0.42      0.45      1818
weighted avg       1.00      0.83      0.91      1818



####Test on SQL Injection

In [None]:
# Score the ECML/PKDD models on the SQL injection file.
sql_df = ExtractDataframe('/content/drive/My Drive/Dataset/SqlInjectionemclDataset.csv')[['Parameters', 'Label']]
sql_data = PreprocessAndSplit(
    sql_df,
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} ",
)
print("DNN")
printEvaluation(EmcDNNmodel, *sql_data[3:5])
print("***********************************************************************************")
print("CNN")
printEvaluation(EmcCnnModel, *sql_data[3:5])

DNN
0.8737322515212982
[[   0    0]
 [ 498 1723]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.78      0.87      2221

    accuracy                           0.78      2221
   macro avg       0.50      0.39      0.44      2221
weighted avg       1.00      0.78      0.87      2221

***********************************************************************************
CNN


  _warn_prf(average, modifier, msg_start, len(result))


0.908867600098256
[[   0    0]
 [ 371 1850]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.83      0.91      2221

    accuracy                           0.83      2221
   macro avg       0.50      0.42      0.45      2221
weighted avg       1.00      0.83      0.91      2221



####Test on Xpath injection

In [None]:
# Score the ECML/PKDD models on the XPath injection file.
xpath_df = ExtractDataframe('/content/drive/My Drive/Dataset/XPathInjectionemclDataset.csv')[['Parameters', 'Label']]
xpath_data = PreprocessAndSplit(
    xpath_df,
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} ",
)
print("DNN")
printEvaluation(EmcDNNmodel, *xpath_data[3:5])
print("***********************************************************************************")
print("CNN")
printEvaluation(EmcCnnModel, *xpath_data[3:5])

DNN
0.9115779189057157
[[   0    0]
 [ 362 1866]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.84      0.91      2228

    accuracy                           0.84      2228
   macro avg       0.50      0.42      0.46      2228
weighted avg       1.00      0.84      0.91      2228

***********************************************************************************
CNN


  _warn_prf(average, modifier, msg_start, len(result))


0.9115779189057157
[[   0    0]
 [ 362 1866]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.84      0.91      2228

    accuracy                           0.84      2228
   macro avg       0.50      0.42      0.46      2228
weighted avg       1.00      0.84      0.91      2228



####Test on Path traversal

In [None]:
# Score the ECML/PKDD models on the path traversal file.
path_df = ExtractDataframe('/content/drive/My Drive/Dataset/PathTransversalemclDataset.csv')[['Parameters', 'Label']]
path_data = PreprocessAndSplit(
    path_df,
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} ",
)
print("DNN")
printEvaluation(EmcDNNmodel, *path_data[3:5])
print("***********************************************************************************")
print("CNN")
printEvaluation(EmcCnnModel, *path_data[3:5])

DNN
0.7644859813084113
[[   0    0]
 [ 756 1227]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.62      0.76      1983

    accuracy                           0.62      1983
   macro avg       0.50      0.31      0.38      1983
weighted avg       1.00      0.62      0.76      1983

***********************************************************************************
CNN


  _warn_prf(average, modifier, msg_start, len(result))


0.9131268840778295
[[   0    0]
 [ 317 1666]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.84      0.91      1983

    accuracy                           0.84      1983
   macro avg       0.50      0.42      0.46      1983
weighted avg       1.00      0.84      0.91      1983



####Test on OS commanding

In [None]:
# Score the ECML/PKDD models on the OS commanding file.
oscom_df = ExtractDataframe('/content/drive/My Drive/Dataset/OsCommandingemclDataset.csv')[['Parameters', 'Label']]
oscom_data = PreprocessAndSplit(
    oscom_df,
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} ",
)
print("DNN")
printEvaluation(EmcDNNmodel, *oscom_data[3:5])
print("***********************************************************************************")
print("CNN")
printEvaluation(EmcCnnModel, *oscom_data[3:5])

DNN
0.9238275955543508
[[   0    0]
 [ 281 1704]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.86      0.92      1985

    accuracy                           0.86      1985
   macro avg       0.50      0.43      0.46      1985
weighted avg       1.00      0.86      0.92      1985

***********************************************************************************
CNN


  _warn_prf(average, modifier, msg_start, len(result))


0.9438680500133014
[[   0    0]
 [ 211 1774]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.89      0.94      1985

    accuracy                           0.89      1985
   macro avg       0.50      0.45      0.47      1985
weighted avg       1.00      0.89      0.94      1985



####Test on LDAP injection

In [None]:
# Score the ECML/PKDD models on the LDAP injection file.
ldap_df = ExtractDataframe('/content/drive/My Drive/Dataset/LdapInjectionemclDataset.csv')[['Parameters', 'Label']]
ldap_data = PreprocessAndSplit(
    ldap_df,
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} ",
)
print("DNN")
printEvaluation(EmcDNNmodel, *ldap_data[3:5])
print("***********************************************************************************")
print("CNN")
printEvaluation(EmcCnnModel, *ldap_data[3:5])

DNN
0.8609339117121715
[[   0    0]
 [ 545 1687]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.76      0.86      2232

    accuracy                           0.76      2232
   macro avg       0.50      0.38      0.43      2232
weighted avg       1.00      0.76      0.86      2232

***********************************************************************************
CNN


  _warn_prf(average, modifier, msg_start, len(result))


0.911219512195122
[[   0    0]
 [ 364 1868]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.84      0.91      2232

    accuracy                           0.84      2232
   macro avg       0.50      0.42      0.46      2232
weighted avg       1.00      0.84      0.91      2232



####Test on XSS

In [None]:
# Score the ECML/PKDD models on the XSS file.
xss_df = ExtractDataframe('/content/drive/My Drive/Dataset/XSSemclDataset.csv')[['Parameters', 'Label']]
xss_data = PreprocessAndSplit(
    xss_df,
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} ",
)
print("DNN")
printEvaluation(EmcDNNmodel, *xss_data[3:5])
print("***********************************************************************************")
print("CNN")
printEvaluation(EmcCnnModel, *xss_data[3:5])

DNN
0.909796314258002
[[   0    0]
 [ 279 1407]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.83      0.91      1686

    accuracy                           0.83      1686
   macro avg       0.50      0.42      0.45      1686
weighted avg       1.00      0.83      0.91      1686

***********************************************************************************
CNN


  _warn_prf(average, modifier, msg_start, len(result))


0.9108527131782946
[[   0    0]
 [ 276 1410]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.84      0.91      1686

    accuracy                           0.84      1686
   macro avg       0.50      0.42      0.46      1686
weighted avg       1.00      0.84      0.91      1686



####Test on CSIC benign

In [None]:
# Cross-dataset check: score the ECML/PKDD models on the CSIC benign test file.
bencsic_df = ExtractDataframe('/content/drive/My Drive/Dataset/CSIC_dataset_test_benign.csv')[['Parameters', 'Label']]
data_csicx1 = PreprocessAndSplit(
    bencsic_df,
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} ",
)
print("DNN")
printEvaluation(EmcDNNmodel, *data_csicx1[3:5])
print("CNN")
printEvaluation(EmcCnnModel, *data_csicx1[3:5])

DNN
0.0
[[  156 15844]
 [    0     0]]
              precision    recall  f1-score   support

           0       1.00      0.01      0.02     16000
           1       0.00      0.00      0.00         0

    accuracy                           0.01     16000
   macro avg       0.50      0.00      0.01     16000
weighted avg       1.00      0.01      0.02     16000

CNN


  _warn_prf(average, modifier, msg_start, len(result))


0.0
[[11664  4336]
 [    0     0]]
              precision    recall  f1-score   support

           0       1.00      0.73      0.84     16000
           1       0.00      0.00      0.00         0

    accuracy                           0.73     16000
   macro avg       0.50      0.36      0.42     16000
weighted avg       1.00      0.73      0.84     16000



####Test on CSIC malicious

In [None]:
# Cross-dataset check: score the ECML/PKDD models on the CSIC malicious test file.
malcsic_df = ExtractDataframe('/content/drive/My Drive/Dataset/CSIC_dataset_test_malicious.csv')[['Parameters', 'Label']]
data_csict = PreprocessAndSplit(
    malcsic_df,
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} ",
)
print("DNN")
printEvaluation(EmcDNNmodel, *data_csict[3:5])
print("CNN")
printEvaluation(EmcCnnModel, *data_csict[3:5])

DNN
0.9649359228431762
[[    0     0]
 [ 1327 18259]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.93      0.96     19586

    accuracy                           0.93     19586
   macro avg       0.50      0.47      0.48     19586
weighted avg       1.00      0.93      0.96     19586

CNN


  _warn_prf(average, modifier, msg_start, len(result))


0.6124964579200907
[[    0     0]
 [10940  8646]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.44      0.61     19586

    accuracy                           0.44     19586
   macro avg       0.50      0.22      0.31     19586
weighted avg       1.00      0.44      0.61     19586



In [None]:
# BALANCE THE DATASET
# payload_all = ExtractDataframe("/content/drive/My Drive/Dataset/all/PayloadsAllTheThings_github.csv")
# # payload_all = payload_all.drop(['index','Type'],axis=1)
# # payload_all.columns = ['Parameters','label']
# # payload_all.insert(1,'Label',1)
# # payload_all = payload_all.drop(['label'],axis=1)
# # data_emc1=PreprocessAndSplit(payload_all,"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} ")
# print("DNN")
# printEvaluation(EmcDNNmodel,data_emc1[3],data_emc1[4])
# # EmcDNNmodel.evaluate(data_csict[3],data_csict[4])
# print("CNN")
# printEvaluation(EmcCnnModel,data_emc1[3],data_emc1[4])
# # EmcCnnModel.evaluate(data_csict[3],data_csict[4])
# len(payload_all.Type.unique())

####Test on mixed csic

In [None]:
# Mixed CSIC evaluation set: the CSIC training file plus the CSIC malicious test file.
test1c = ExtractDataframe('/content/drive/My Drive/Dataset/CSIC_dataset_train.csv')
test2c = ExtractDataframe('/content/drive/My Drive/Dataset/CSIC_dataset_test_malicious.csv')
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the two frames the same way (sort=False matches append).
test_dfc = pd.concat([test1c, test2c], sort=False)
test_dfc = test_dfc.loc[:,['Parameters','Label']]
# shuffle the rows and renumber the index
test_dfc = test_dfc.sample(frac=1).reset_index(drop=True)

In [None]:
# Tokenise the mixed CSIC frame without splitting
# (data_mixc[3] = inputs, data_mixc[4] = one-hot labels).
data_mixc = PreprocessAndSplit(
    dataframe=test_dfc,
    vocabulary="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} ",
)

In [None]:
# Score the ECML/PKDD models on the mixed CSIC set.
for eval_label, eval_model in (("CNN", EmcCnnModel), ("DNN", EmcDNNmodel)):
    print(eval_label)
    printEvaluation(eval_model, data_mixc[3], data_mixc[4])

CNN
0.5385329619312906
[[11976  4024]
 [10886  8700]]
              precision    recall  f1-score   support

           0       0.52      0.75      0.62     16000
           1       0.68      0.44      0.54     19586

    accuracy                           0.58     35586
   macro avg       0.60      0.60      0.58     35586
weighted avg       0.61      0.58      0.57     35586

DNN
0.6757253174806945
[[ 2060 13940]
 [ 2479 17107]]
              precision    recall  f1-score   support

           0       0.45      0.13      0.20     16000
           1       0.55      0.87      0.68     19586

    accuracy                           0.54     35586
   macro avg       0.50      0.50      0.44     35586
weighted avg       0.51      0.54      0.46     35586



In [None]:
# printEvaluation(EmcCnnModel,data_mixc[3],data_mixc[4])

In [None]:
# CNN = load_model('/content/drive/My Drive/ecml_cnnbalanced.hs')
# DNN = load_model('/content/drive/My Drive/ecml_dnnbalanced.h5')

# print("CNN")
# printEvaluation(CNN,data_mixc[3],data_mixc[4])
# print("DNN")
# printEvaluation(DNN,data_mixc[3],data_mixc[4])

In [None]:
# cm = ExtractDataframe('/content/drive/My Drive/Dataset/all/command.csv')
# ldap = ExtractDataframe('/content/drive/My Drive/Dataset/all/ldap.csv')
# sqli = ExtractDataframe('/content/drive/My Drive/Dataset/all/sqli.csv')
# xss = ExtractDataframe('/content/drive/My Drive/Dataset/all/xss.csv')

In [None]:
# xss = xss[xss.Label=='1']
# xss.Type.unique()

In [None]:
  # all = xss.append(cm,ignore_index=True)
  # all = all.append(ldap,ignore_index=True)
  # all= all.append(sqli,ignore_index=True)
  

In [None]:
# all = all.sample(frac=1).reset_index(drop=True)
# all.to_csv('/content/drive/My Drive/Dataset/all/PayloadsAllTheThings_github.csv',index=False)