In [1]:
import re

import tensorflow as tf
from tensorflow import keras
import csv

from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Embedding, Bidirectional,LSTM,Dense,Dropout


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

def loadData(file):
    with open(file, 'r', encoding="utf8") as f:
        data = f.readlines()
    result = []
    for d in data:
        d = d.strip()
        if (len(d) > 0):
            result.append(d)
    return result
# Load preprocessed text data
bad_requests = loadData('anomalousRequestTest.txt')
good_requests = loadData('normalRequestTraining.txt')

# Combine data
all_requests = bad_requests + good_requests

# Create labels
yBad = [1] * len(bad_requests)
yGood = [0] * len(good_requests)
y = yBad + yGood

# TF-IDF vectorization
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3))
X = vectorizer.fit_transform(all_requests)

# Convert TF-IDF vectors to sequences of word indices
X_indices = []
for tfidf_vector in X:
    word_indices = np.nonzero(tfidf_vector)[1]  # Extract non-zero indices
    X_indices.append(word_indices)

# Pad sequences to the same length
max_length = max(len(seq) for seq in X_indices)
X_padded = tf.keras.preprocessing.sequence.pad_sequences(X_indices, maxlen=max_length)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=21)



In [3]:
# Model architecture
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)


model = Sequential()
model.add(Dense(64, input_dim=421))
model.add(Activation('relu'))
model.add(Dense(64))
model.add(Activation('relu'))

model.add(Dense(32))
model.add(Activation('relu'))

model.add(Dense(10))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))



model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()
#print(len(vectorizer.get_feature_names_out()) + 1)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [6]:
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))
# Train the model
model_json = model.to_json()
with open("modelMLPjs.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save("modelMLP.h5")
print("Saved model to disk")

Epoch 1/50
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.5910 - loss: 0.6764 - val_accuracy: 0.5889 - val_loss: 0.6771
Epoch 2/50
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.5903 - loss: 0.6766 - val_accuracy: 0.5889 - val_loss: 0.6771
Epoch 3/50
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.5900 - loss: 0.6768 - val_accuracy: 0.5889 - val_loss: 0.6771
Epoch 4/50
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.5892 - loss: 0.6769 - val_accuracy: 0.5889 - val_loss: 0.6772
Epoch 5/50
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5870 - loss: 0.6779 - val_accuracy: 0.5889 - val_loss: 0.6771
Epoch 6/50
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.5900 - loss: 0.6767 - val_accuracy: 0.5889 - val_loss: 0.6771
Epoch 7/50
[1m1



Saved model to disk


In [7]:
# load json and create model
import tensorflow
import json
from keras.models import model_from_json
import keras_metrics
from data_utils import Data

loaded_model = tensorflow.keras.models.load_model("modelMLP.h5")


print("Loaded model from disk")
 
# evaluate loaded model on test data
loaded_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tensorflow.keras.metrics.Precision(), tensorflow.keras.metrics.Recall()])



Loaded model from disk


In [23]:
"""from sklearn.metrics import classification_report
import numpy as np

model = loaded_model
y_test = test_labels
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
y_pred = model.predict(test_inputs)
y_pred = np.argmax(y_pred,axis=-1)
#print(classification_report(Y_test, y_pred))
class_mapping = {0: "not anomaly", 1: "anomaly"}

# Map integer labels to class names
classified_labels = [class_mapping[label] for label in y_pred]

# Now, classified_labels contains the class names for each sample, classified as "anomaly" or "not anomaly"
print(classified_labels)
# for ECMLPKDD
# valid = 0
# xss =  1
# sqlinjection = 2
# ldapinjection  = 3
# xpathinjection  = 4
# pathtransversal = 5
# oscommanding  = 6
# ssi = 7

# for  CISC
# valid = 0
# malicious = 1

# for Morzeux_HttpParamsDataset
# valid = 0
# sqli  = 1
# xss   = 2
# path-traversal = 3
# cmdi = 4"""

ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense_40" is incompatible with the layer: expected axis -1 of input shape to have value 421, but received input with shape (32, 400)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(32, 400), dtype=int64)
  • training=False
  • mask=None

In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
# Load the text file
# Make predictions
predictions = loaded_model.predict(X_test)

# Convert predictions to class labels
class_mapping = {0: "not anomalous", 1: "anomalous"}
y_pred_labels = [class_mapping[int(pred)] for pred in predictions]

label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)
y_pred_encoded = label_encoder.fit_transform(y_pred_labels)

loss, accuracy = model.evaluate(X_test, y_test)

print(f'Test loss: {loss}')
print(f'Test accuracy: {accuracy}')

# Print predictions
#print(y_pred_labels)

with open("MLP_predictions.txt", "w") as file:
    for pred_class in y_pred_labels:
        file.write(pred_class + "\n")



[1m382/382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m 28/382[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.6246 - loss: 0.6646  

  y_pred_labels = [class_mapping[int(pred)] for pred in predictions]


[1m382/382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5943 - loss: 0.6751
Test loss: 0.6771368384361267
Test accuracy: 0.588880717754364
