In [None]:
from google.colab import drive

# Mount Google Drive; later cells read from and write to paths under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [None]:
# CSV file on the mounted Drive that is loaded into `df`.
path = "/content/drive/MyDrive/MP-2/archive/UNSW_NB15_training-set.csv"

# Read the whole file into memory with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=path)


In [None]:
# Count null cells per column, then add the per-column counts into one grand total.
missing_total = df.isna().sum().sum()
print("Missing values:", missing_total)



Missing values: 0


In [None]:
# Columns with `object` dtype; these are the ones encoded to integers further down.
object_columns_frame = df.select_dtypes(include=['object'])
categorical_cols = object_columns_frame.columns
categorical_cols


Index(['proto', 'service', 'state', 'attack_cat'], dtype='object')

In [None]:
# Integer-encode every categorical column in place.
#
# A separate LabelEncoder is fitted per column and kept in `label_encoders`,
# keyed by column name. Re-fitting one shared encoder would discard every
# mapping except the last, making it impossible to decode a column later
# (label_encoders[col].inverse_transform(...)) or to apply the same mapping
# to another split of the data (label_encoders[col].transform(...)).
label_encoders = {}

for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# After the loop `encoder` still refers to the encoder fitted on the last
# categorical column, matching the state this cell left behind before.


In [None]:
# Binary target column.
target_col = "label"

# Columns that must not be offered to the model as predictors:
#   - "attack_cat" is the attack-category annotation that accompanies `label`,
#     so it gives the target away;
#   - "id" is a row counter, not a property of the traffic.
# The mutual-information selector further down picked both of them when they
# were left in, which is the signature of target leakage.
leaky_cols = ["id", "attack_cat"]

# Only drop the leaky columns that are actually present, so the cell also works
# on a frame from which they were already removed.
cols_to_drop = [target_col] + [c for c in leaky_cols if c in df.columns]

X1 = df.drop(columns=cols_to_drop)
y = df[target_col]


In [None]:
# Standardise every column of X1 and wrap the resulting array back into a
# DataFrame that carries the original column names.
# NOTE: `scaler` and `X_scaled` are both reassigned after feature selection
# further down, so this result is only visible until that cell runs.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X1), columns=X1.columns)

In [None]:
# Display the feature matrix built above for a quick visual check.
X1

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat
0,1,0.000011,117,0,4,2,0,496,0,90909.090200,...,1,1,2,0,0,0,1,2,0,6
1,2,0.000008,117,0,4,2,0,1762,0,125000.000300,...,1,1,2,0,0,0,1,2,0,6
2,3,0.000005,117,0,4,2,0,1068,0,200000.005100,...,1,1,3,0,0,0,1,3,0,6
3,4,0.000006,117,0,4,2,0,900,0,166666.660800,...,2,1,3,0,0,0,2,3,0,6
4,5,0.000010,117,0,4,2,0,2126,0,100000.002500,...,2,1,3,0,0,0,2,3,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82327,82328,0.000005,117,0,4,2,0,104,0,200000.005100,...,1,1,2,0,0,0,2,1,0,6
82328,82329,1.106101,111,0,3,20,8,18062,354,24.410067,...,1,1,1,0,0,0,3,2,0,6
82329,82330,0.000000,6,0,4,1,0,46,0,0.000000,...,1,1,1,0,0,0,1,1,1,6
82330,82331,0.000000,6,0,4,1,0,46,0,0.000000,...,1,1,1,0,0,0,1,1,1,6


In [None]:
# Feature Selection (Mutual Information)
from functools import partial

from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Define the number of features to select
K = 18

# mutual_info_classif is randomised, so without a fixed seed the scores (and
# therefore the selected columns) can differ from run to run.
MI_RANDOM_STATE = 42

# Initialize the selector with a seeded scoring function so the selection is
# reproducible after a kernel restart.
selector = SelectKBest(
    score_func=partial(mutual_info_classif, random_state=MI_RANDOM_STATE),
    k=K,
)

# Perform feature selection on the unscaled feature matrix
X_selected_array = selector.fit_transform(X1, y)

# Get the names of the selected features (boolean mask over X1's columns)
selected_features = X1.columns[selector.get_support()]

# Update X to contain only the selected features, as a DataFrame
X = pd.DataFrame(X_selected_array, columns=selected_features)

print("Selected Features:")
print(selected_features)
print("Shape of X after feature selection:", X.shape)

Selected Features:
Index(['id', 'dur', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl',
       'sload', 'dload', 'sinpkt', 'dinpkt', 'tcprtt', 'synack', 'smean',
       'dmean', 'ct_state_ttl', 'attack_cat'],
      dtype='object')
Shape of X after feature selection: (82332, 18)


In [None]:
# Fit a fresh scaler on the selected features only, then standardise them.
scaler = StandardScaler().fit(X)
standardised_values = scaler.transform(X)

# Re-attach the selected column names to the scaled array.
X_scaled = pd.DataFrame(standardised_values, columns=selected_features)

In [None]:
# Write the scaled features and the target to Drive, both without the index column.
features_csv = "/content/drive/MyDrive/UNSW_X_preprocessed.csv"
labels_csv = "/content/drive/MyDrive/UNSW_y.csv"

for frame, destination in ((X_scaled, features_csv), (y, labels_csv)):
    frame.to_csv(destination, index=False)

print("Preprocessing DONE!")


Preprocessing DONE!


In [None]:
!pip install seaborn matplotlib scikit-learn




In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve
import numpy as np


In [None]:
# Confusion matrix of the true labels against the predicted labels.
# `y_test` and `preds` must already exist in the kernel; they are not created in this cell.
cm = confusion_matrix(y_test, preds)
class_names = ["Normal", "Attack"]

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set(title="Confusion Matrix - IDS", xlabel="Predicted", ylabel="True")
plt.show()


NameError: name 'confusion_matrix' is not defined

In [None]:
# Score X_test with global_model; both must already exist in the kernel.
pred_probs = global_model.predict(X_test)
# NOTE(review): column 1 is assumed to hold the positive-class score — confirm
# against the model's output layout.
attack_scores = pred_probs[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, attack_scores)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(7, 6))
ax.plot(fpr, tpr, lw=2, label=f"ROC AUC = {roc_auc:.4f}")
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], linestyle="--", color="gray")
ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curve - Intrusion Detection",
)
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Precision/recall pairs computed from column 1 of `pred_probs`, which must
# already exist in the kernel. Note that this rebinds `thresholds`.
precision, recall, thresholds = precision_recall_curve(y_test, pred_probs[:, 1])

fig, ax = plt.subplots(figsize=(7, 6))
ax.plot(recall, precision, lw=2)
ax.set(xlabel="Recall", ylabel="Precision", title="Precision–Recall Curve - IDS")
ax.grid(True)
plt.show()


In [None]:
# Bar chart of how many samples were predicted as each class.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` — confirm against the installed version.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=preds, palette="viridis", ax=ax)

# Relabel the two bar positions with human-readable class names.
ax.set_xticks([0, 1])
ax.set_xticklabels(["Normal", "Attack"])
ax.set(title="Prediction Distribution - IDS", xlabel="Predicted Class", ylabel="Count")
plt.show()


In [None]:
# Count of each value in results_df["action"]; `results_df` must already exist in the kernel.
action_series = results_df["action"]

fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=action_series, palette="magma", ax=ax)
ax.set(title="IPS Actions: BLOCK vs ALLOW", xlabel="Action", ylabel="Count")
plt.show()


In [None]:
# Keep only the rows whose true label is 1 (attack samples).
is_true_attack = results_df["true"] == 1
attack_df = results_df[is_true_attack]

# Count the action recorded for each of those attack rows.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=attack_df["action"], palette="coolwarm", ax=ax)
ax.set(
    title="IPS Effectiveness: What happened to true attacks?",
    xlabel="Action",
    ylabel="Count",
)
plt.show()


In [None]:
# False positives: true label 0 but predicted 1.
truly_normal = results_df["true"] == 0
flagged_as_attack = results_df["pred"] == 1
fp_df = results_df[truly_normal & flagged_as_attack]

# Histogram over the row index of those samples, i.e. where in results_df they occur.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(fp_df.index, bins=30, color="red", ax=ax)
ax.set(
    title="False Positives - Normal Traffic Blocked",
    xlabel="Sample Index",
    ylabel="Count",
)
plt.show()


In [None]:
# False negatives: true label 1 but predicted 0.
truly_attack = results_df["true"] == 1
predicted_normal = results_df["pred"] == 0
fn_df = results_df[truly_attack & predicted_normal]

# Histogram over the row index of those samples, i.e. where in results_df they occur.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(fn_df.index, bins=30, color="blue", ax=ax)
ax.set(
    title="False Negatives - Attacks that Bypassed IDS",
    xlabel="Sample Index",
    ylabel="Count",
)
plt.show()
