In [1]:
import re

import tensorflow as tf
from tensorflow import keras
import csv

from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Embedding, Bidirectional,LSTM,Dense,Dropout

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

def loadData(file):
    with open(file, 'r', encoding="utf8") as f:
        data = f.readlines()
    result = []
    for d in data:
        d = d.strip()
        if (len(d) > 0):
            result.append(d)
    return result
# Load preprocessed text data
bad_requests = loadData('anomalousRequestTest.txt')
good_requests = loadData('normalRequestTraining.txt')

# Combine data
all_requests = bad_requests + good_requests

# Create labels
yBad = [1] * len(bad_requests)
yGood = [0] * len(good_requests)
y = yBad + yGood

# TF-IDF vectorization
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3))
X = vectorizer.fit_transform(all_requests)

# Convert TF-IDF vectors to sequences of word indices
X_indices = []
for tfidf_vector in X:
    word_indices = np.nonzero(tfidf_vector)[1]  # Extract non-zero indices
    X_indices.append(word_indices)

# Pad sequences to the same length
max_length = max(len(seq) for seq in X_indices)
X_padded = tf.keras.preprocessing.sequence.pad_sequences(X_indices, maxlen=max_length)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=21)



In [3]:
# Model architecture
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(vectorizer.get_feature_names_out()) + 1, output_dim=64, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1)
])

# Compile the model
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])
model.summary()

In [5]:
model.fit(X_train, y_train, epochs=30, validation_data=(X_test, y_test))
# Train the model
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save("modelRNN2.h5")
print("Saved model to disk")

Epoch 1/30
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m654s[0m 420ms/step - accuracy: 0.7755 - loss: 0.3865 - val_accuracy: 0.9873 - val_loss: 0.0445
Epoch 2/30
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m674s[0m 441ms/step - accuracy: 0.9948 - loss: 0.0240 - val_accuracy: 0.9921 - val_loss: 0.0311
Epoch 3/30
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m685s[0m 449ms/step - accuracy: 0.9989 - loss: 0.0063 - val_accuracy: 0.9921 - val_loss: 0.0375
Epoch 4/30
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m666s[0m 436ms/step - accuracy: 0.9994 - loss: 0.0027 - val_accuracy: 0.9917 - val_loss: 0.0443
Epoch 5/30
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m665s[0m 436ms/step - accuracy: 0.9996 - loss: 0.0022 - val_accuracy: 0.9924 - val_loss: 0.0332
Epoch 6/30
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m725s[0m 475ms/step - accuracy: 0.9996 - loss: 0.0016 - val_accuracy: 0.9924 - val_loss:



Saved model to disk


In [4]:
# load json and create model
import tensorflow
import json
from keras.models import model_from_json
import keras_metrics
from data_utils import Data



loaded_model = tensorflow.keras.models.load_model("modelRNN2.h5")
# Load the model from JSON


# evaluate loaded model on test data
loaded_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tensorflow.keras.metrics.Precision(), tensorflow.keras.metrics.Recall()])




In [5]:

import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

# Assuming loaded_model is already defined and X_test is available

# Make predictions
predictions = loaded_model.predict(X_test)

# Normalize predictions
min_value = np.min(predictions)
max_value = np.max(predictions)
normalized_predictions = (predictions - min_value) / (max_value - min_value)

# Define threshold for classification
threshold = 0.5  # Adjust the threshold as needed

# Classify predictions as "normal" or "anomalous"
classified_predictions = ["not anomalous" if pred < threshold else "anomalous" for pred in normalized_predictions]

# Convert string labels to numerical labels
label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)
y_pred_encoded = label_encoder.fit_transform(classified_predictions)

# Now you can use y_test_encoded and y_pred_encoded in precision_score
loss, accuracy = model.evaluate(X_test, y_test)

print(f'Test loss: {loss}')
print(f'Test accuracy: {accuracy}')

# Write classified predictions to a text file
with open("RNN_predictions.txt", "w") as file:
    for pred_class in classified_predictions:
        file.write(pred_class + "\n")

[1m382/382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 98ms/step
[1m382/382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 90ms/step - accuracy: 0.5939 - loss: 0.6931
Test loss: 0.6930913329124451
Test accuracy: 0.5886350870132446
