In [1]:
import re

import tensorflow as tf
from tensorflow import keras
import csv

from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Embedding, Bidirectional,LSTM,Dense,Dropout


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

def loadData(file):
    with open(file, 'r', encoding="utf8") as f:
        data = f.readlines()
    result = []
    for d in data:
        d = d.strip()
        if (len(d) > 0):
            result.append(d)
    return result
# Load preprocessed text data
bad_requests = loadData('anomalousRequestTest.txt')
good_requests = loadData('normalRequestTraining.txt')

# Combine data
all_requests = bad_requests + good_requests

# Create labels
yBad = [1] * len(bad_requests)
yGood = [0] * len(good_requests)
y = yBad + yGood

# TF-IDF vectorization
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3))
X = vectorizer.fit_transform(all_requests)

# Convert TF-IDF vectors to sequences of word indices
X_indices = []
for tfidf_vector in X:
    word_indices = np.nonzero(tfidf_vector)[1]  # Extract non-zero indices
    X_indices.append(word_indices)

# Pad sequences to the same length
max_length = max(len(seq) for seq in X_indices)
X_padded = tf.keras.preprocessing.sequence.pad_sequences(X_indices, maxlen=max_length)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=21)



In [11]:
# Model architecture
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import Flatten
from keras.layers import Conv1D
from keras.layers import MaxPooling1D

X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)


import tensorflow as tf
from tensorflow.keras import layers, models

# Assuming vectorizer is your Tokenizer
input_dim = len(vectorizer.get_feature_names_out()) + 1
output_dim = 64


# Replace the Conv2D layers with Conv1D layers
model = Sequential()      # initilaizing the Sequential nature for CNN model
# Adding the embedding layer which will take in maximum of 450 words as input and provide a 32 dimensional output of those words which belong in the top_words dictionary
model.add(Embedding(input_dim=len(vectorizer.get_feature_names_out()) + 1, output_dim=64, mask_zero=True))
model.add(Conv1D(32, 3, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()
#print(len(vectorizer.get_feature_names_out()) + 1)


In [8]:
model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))
# Train the model
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save("modelCNN.h5")
print("Saved model to disk")

Epoch 1/30
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 50ms/step - accuracy: 0.9056 - loss: 0.1979 - val_accuracy: 0.9893 - val_loss: 0.0300
Epoch 2/30
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 50ms/step - accuracy: 0.9988 - loss: 0.0052 - val_accuracy: 0.9925 - val_loss: 0.0252
Epoch 3/30
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 54ms/step - accuracy: 0.9999 - loss: 2.4701e-04 - val_accuracy: 0.9930 - val_loss: 0.0315
Epoch 4/30
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 51ms/step - accuracy: 1.0000 - loss: 1.7076e-05 - val_accuracy: 0.9930 - val_loss: 0.0335
Epoch 5/30
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 52ms/step - accuracy: 1.0000 - loss: 3.9424e-06 - val_accuracy: 0.9930 - val_loss: 0.0359
Epoch 6/30
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 53ms/step - accuracy: 1.0000 - loss: 1.5608e-06 - val_accuracy: 0.9930 - val_l



Saved model to disk


In [7]:
# load json and create model
import tensorflow
import json
from keras.models import model_from_json
import keras_metrics
from data_utils import Data



loaded_model = tensorflow.keras.models.load_model("modelCNN.h5")
# Load the model from JSON



print("Loaded model from disk")
 
# evaluate loaded model on test data
loaded_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tensorflow.keras.metrics.Precision(), tensorflow.keras.metrics.Recall()])




Loaded model from disk


In [9]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
# Load the text file
# Make predictions
predictions = loaded_model.predict(X_test)

# Convert predictions to class labels
class_mapping = {0: "not anomalous", 1: "anomalous"}
y_pred_labels = [class_mapping[int(pred)] for pred in predictions]

label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)
y_pred_encoded = label_encoder.fit_transform(y_pred_labels)

loss, accuracy = model.evaluate(X_test, y_test)

print(f'Test loss: {loss}')
print(f'Test accuracy: {accuracy}')
# Print predictions
#print(y_pred_labels)

with open("CNN_predictions.txt", "w") as file:
    for pred_class in y_pred_labels:
        file.write(pred_class + "\n")



[1m382/382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step
[1m 10/382[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 6ms/step - accuracy: 0.9925 - loss: 0.0490      

  y_pred_labels = [class_mapping[int(pred)] for pred in predictions]


[1m382/382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9916 - loss: 0.0692
Test loss: 0.059420906007289886
Test accuracy: 0.9928764700889587
