<a href="https://colab.research.google.com/github/Al-Jazzazi/Final_Project_ML/blob/main/LSTM_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Fetch the latest published version of the intrusion dataset from Kaggle;
# the call returns the local directory that holds the downloaded files.
path = kagglehub.dataset_download(
    handle="chethuhn/network-intrusion-dataset",
)

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/chethuhn/network-intrusion-dataset/versions/1


In [2]:
# File names of the dataset CSVs to load. Their order here is the order in
# which they are read and concatenated further down, so it determines the row
# order of the combined dataframe.
all_files = [
    "Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "Wednesday-workingHours.pcap_ISCX.csv",
    "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "Monday-WorkingHours.pcap_ISCX.csv",
    "Tuesday-WorkingHours.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"
]

In [3]:
import os
import pandas as pd
import numpy as np

# Read every listed CSV from the download directory, keeping one dataframe per
# file in the same order as `all_files`.
dataframes = [
    pd.read_csv(os.path.join(path, csv_name))
    for csv_name in all_files
]

In [4]:
# Stack the per-file frames row-wise into a single frame with a fresh 0..n-1 index
df = pd.concat(objs=dataframes, axis=0, ignore_index=True)

In [5]:
# Data cleaning: treat +/-infinity as missing, then discard every row that
# contains at least one missing value.
df = df.replace([np.inf, -np.inf], np.nan).dropna()

In [6]:
# Separate the label column from the rest: every other column becomes a
# feature in `X`, the label column becomes the target `y`.
X = df.drop(columns=' Label').to_numpy(copy=True)
y = df[' Label'].to_numpy(copy=True)

In [7]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
# Encode the string class labels as integer codes; the codes index into
# `label_encoder.classes_`.
# The encoder is fitted on the dataframe column rather than on `y` itself so
# that re-running this cell is idempotent: fitting on an already-encoded `y`
# would replace `classes_` with integers and lose the class names that the
# reporting cells further down pass as `target_names`.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df[' Label'].to_numpy())

In [8]:
# Standardize features: learn the per-column statistics first, then apply the
# rescaling to the same matrix.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split further down, so test-set statistics influence the training features —
# consider fitting on the training portion only.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [9]:
from tensorflow.keras.utils import to_categorical

num_sections = 100
shuffle_seed = 42      # fixed seed so the section order (and thus the split) is reproducible
train_fraction = 0.8   # share of rows that goes to the training set

# Cut the rows into contiguous sections and shuffle whole sections rather than
# individual rows. Features and labels are split identically so they stay aligned.
x_sections = np.array_split(X_scaled, num_sections)
y_sections = np.array_split(y, num_sections)

# Seeded permutation of the section indices (previously an unseeded shuffle,
# which produced a different train/test split on every run).
indices = np.random.default_rng(shuffle_seed).permutation(num_sections)

# Shuffle x and y sections using the same order
x_sections = [x_sections[i] for i in indices]
y_sections = [y_sections[i] for i in indices]

x_flattened = np.vstack(x_sections)
y_flattened = np.hstack(y_sections)

# Compute the cut point from the array that is actually being split
split_index = int(train_fraction * len(x_flattened))

# Add a length-1 middle axis: (rows, features) -> (rows, 1, features), the
# 3-D layout the LSTM input below is built from.
X_reshaped = x_flattened.reshape((x_flattened.shape[0], 1, x_flattened.shape[1]))
# NOTE(review): 15 must match both the number of encoded classes and the width
# of the model's output layer.
Y_onehot = to_categorical(y_flattened, num_classes=15)
# Sequential split
X_train, X_test = X_reshaped[:split_index], X_reshaped[split_index:]
y_train, y_test = Y_onehot[:split_index], Y_onehot[split_index:]

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense

# Size the network from the prepared arrays so it always matches the data:
# (timesteps, features) for the input and the one-hot width for the output.
sequence_shape = (X_train.shape[1], X_train.shape[2])
num_output_classes = y_train.shape[1]

model = Sequential([
    # An explicit Input layer replaces `input_shape=` on the first layer, which
    # Keras warns about for Sequential models.
    Input(shape=sequence_shape),
    LSTM(64, return_sequences=False),  # LSTM layer (returns only the last step's output)
    Dense(32, activation='relu'),  # Dense hidden layer
    Dense(num_output_classes, activation='softmax'),  # Output layer: one unit per class
])

# Compile the model; categorical cross-entropy matches the one-hot targets
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()


  super().__init__(**kwargs)


In [None]:
# Fit the network. The held-out split is supplied as validation data, so every
# epoch also reports loss and accuracy on it.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_data=(X_test, y_test),
)

In [None]:
# Score the trained model on the held-out split; the result unpacks into the
# loss followed by the single compiled metric (accuracy).
loss, accuracy = model.evaluate(x=X_test, y=y_test, verbose=2)
print(f"Test Accuracy: {accuracy*100:.2f}%")

In [None]:
# Predict class scores for the held-out rows and keep, for each row, the index
# of the highest-scoring class.
y_pred = model.predict(X_test).argmax(axis=1)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# `y_test` is one-hot encoded while `y_pred` holds class indices; sklearn
# rejects that mix of target types, so collapse the one-hot rows back to class
# indices first.
y_test_labels = np.argmax(y_test, axis=1)
# Passing `labels` keeps the report rows aligned with `target_names` even when
# a class is missing from the test split; `zero_division=0` reports 0 instead
# of warning for classes without predictions or samples.
all_class_ids = np.arange(len(label_encoder.classes_))
print(classification_report(
    y_test_labels,
    y_pred,
    labels=all_class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
))


In [None]:
# Confusion matrix
# `y_test` is one-hot encoded, which sklearn cannot compare against the class
# indices in `y_pred`, so it is collapsed to class indices first. Every class
# is requested explicitly so that row/column i always corresponds to class i,
# even if a class does not occur in the test split.
y_test_labels = np.argmax(y_test, axis=1)
cm = confusion_matrix(y_test_labels, y_pred, labels=np.arange(y_test.shape[1]))
print("Confusion Matrix:")
print(cm)

In [None]:
# Calculate FPR and FNR
# Per-class one-vs-rest counts derived from the confusion matrix
# (rows = actual class, columns = predicted class).
TP = np.diag(cm)                   # True Positives
FP = cm.sum(axis=0) - TP           # False Positives
FN = cm.sum(axis=1) - TP           # False Negatives
TN = cm.sum() - (FP + FN + TP)     # True Negatives

# False Positive Rate and False Negative Rate.
# A class with no samples (or no negatives) gives a 0/0 rate; it is left as
# NaN ("undefined") and the accompanying NumPy runtime warning is silenced.
with np.errstate(divide='ignore', invalid='ignore'):
    FPR = FP / (FP + TN)
    FNR = FN / (FN + TP)

# Convert to plain Python floats before printing so the list does not render
# as `np.float64(...)` entries on NumPy 2.
print(f"False Positive Rates: {np.round(FPR, 5).tolist()}")
print(f"False Negative Rates: {np.round(FNR, 5).tolist()}")

In [None]:
# `y_test` is one-hot encoded, so np.unique on it would only ever find the two
# values 0 and 1; count the distinct class indices instead.
y_test_labels = np.argmax(y_test, axis=1)
print("The number of attacks predicted by the model: ", len(np.unique(y_pred)))
print("The number of attacks exist in dataset: ", len(np.unique(y_test_labels)))


In [None]:
# Per-class sample counts: class names come from the raw label column, counts
# from the integer-encoded `y`. Both np.unique results are sorted, which is
# what pairs each name with its count.
labels = np.unique(df[' Label'])
unique_elements, counts = np.unique(y, return_counts=True)
for label, _code, count in zip(labels, unique_elements, counts):
    print(f"Element {label}: {count} occurrences")

**How does the Intrusion Detection System perform?**
Without regard to whether it identified the specific attack type correctly, how well does the model distinguish attacks from benign traffic?

In [None]:
# Boolean masks that are True for class index 0 (the class the messages
# further down refer to as BENIGN).
# `y_test` is one-hot encoded, so it is collapsed to class indices first;
# comparing the 2-D one-hot array with 0 directly would produce a 2-D mask
# that no longer lines up with the 1-D `I_pred`.
I_pred = y_pred == 0
I_test = np.argmax(y_test, axis=1) == 0


In [None]:
from sklearn.metrics import precision_score, accuracy_score, f1_score

# Binary metrics on the boolean masks (True is treated as the positive class)
precision = precision_score(y_true=I_test, y_pred=I_pred)
accuracy = accuracy_score(y_true=I_test, y_pred=I_pred)
f1 = f1_score(y_true=I_test, y_pred=I_pred)

# One line per metric, four decimals each
print(
    f"Precision: {precision:.4f}",
    f"Accuracy: {accuracy:.4f}",
    f"F1-Score: {f1:.4f}",
    sep="\n",
)

In [None]:

# "Positive" here means a True mask value:
#   false positives = predicted True although the actual mask is False
#   false negatives = predicted False although the actual mask is True
# Each count is divided by the number of predictions of that kind.
false_positives = (I_pred & ~I_test).sum() / I_pred.sum()
false_negatives = (~I_pred & I_test).sum() / (~I_pred).sum()
print("false positives percent amongst all positives (ie predicting BENIGN when actual is attack) ", false_positives)
print("false negatives percent amongst all negatives (ie predicting Attack when actual is benign) ", false_negatives)


In [None]:
import matplotlib.pyplot as plt


# Plot training history: accuracy in the left panel, loss in the right one
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# (train key, validation key, train legend, validation legend, title, y-axis label)
panels = (
    ('accuracy', 'val_accuracy', 'Train Accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Train Loss', 'Validation Loss', 'Model Loss', 'Loss'),
)

for ax, (train_key, val_key, train_label, val_label, title, y_label) in zip(axes, panels):
    # Training and validation curves share one panel
    ax.plot(history.history[train_key], label=train_label)
    ax.plot(history.history[val_key], label=val_label)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()

fig.tight_layout()
plt.show()