In [1]:
# This script performs network intrusion detection using the UNSW-NB15 dataset.
# It includes data loading, preprocessing, feature selection, model training, and saving.
# Models used: Random Forest, Isolation Forest, and Autoencoder.

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import joblib

In [3]:
# Load the feature descriptions
features = pd.read_csv("datasets/NUSW-NB15_features.csv", encoding='latin1')
features

Unnamed: 0,No.,Name,Type,Description
0,1,srcip,nominal,Source IP address
1,2,sport,integer,Source port number
2,3,dstip,nominal,Destination IP address
3,4,dsport,integer,Destination port number
4,5,proto,nominal,Transaction protocol
5,6,state,nominal,Indicates to the state and its dependent proto...
6,7,dur,Float,Record total duration
7,8,sbytes,Integer,Source to destination transaction bytes
8,9,dbytes,Integer,Destination to source transaction bytes
9,10,sttl,Integer,Source to destination time to live value


In [4]:
# Load the training dataset
train_data = pd.read_csv("datasets/UNSW_NB15_training-set.csv")
train_data

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.087490,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,175337,0.000009,udp,dns,INT,2,0,114,0,111111.107200,...,13,24,0,0,0,24,24,0,Generic,1
175337,175338,0.505762,tcp,-,FIN,10,8,620,354,33.612649,...,1,2,0,0,0,1,1,0,Shellcode,1
175338,175339,0.000009,udp,dns,INT,2,0,114,0,111111.107200,...,3,13,0,0,0,3,12,0,Generic,1
175339,175340,0.000009,udp,dns,INT,2,0,114,0,111111.107200,...,14,30,0,0,0,30,30,0,Generic,1


In [5]:
# Load the testing dataset
test_data = pd.read_csv("datasets/UNSW_NB15_testing-set.csv")
test_data

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.000011,udp,-,INT,2,0,496,0,90909.090200,...,1,2,0,0,0,1,2,0,Normal,0
1,2,0.000008,udp,-,INT,2,0,1762,0,125000.000300,...,1,2,0,0,0,1,2,0,Normal,0
2,3,0.000005,udp,-,INT,2,0,1068,0,200000.005100,...,1,3,0,0,0,1,3,0,Normal,0
3,4,0.000006,udp,-,INT,2,0,900,0,166666.660800,...,1,3,0,0,0,2,3,0,Normal,0
4,5,0.000010,udp,-,INT,2,0,2126,0,100000.002500,...,1,3,0,0,0,2,3,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82327,82328,0.000005,udp,-,INT,2,0,104,0,200000.005100,...,1,2,0,0,0,2,1,0,Normal,0
82328,82329,1.106101,tcp,-,FIN,20,8,18062,354,24.410067,...,1,1,0,0,0,3,2,0,Normal,0
82329,82330,0.000000,arp,-,INT,1,0,46,0,0.000000,...,1,1,0,0,0,1,1,1,Normal,0
82330,82331,0.000000,arp,-,INT,1,0,46,0,0.000000,...,1,1,0,0,0,1,1,1,Normal,0


In [6]:
# checking if there are any missing values in the training data

print(train_data.isnull().sum())

id                   0
dur                  0
proto                0
service              0
state                0
spkts                0
dpkts                0
sbytes               0
dbytes               0
rate                 0
sttl                 0
dttl                 0
sload                0
dload                0
sloss                0
dloss                0
sinpkt               0
dinpkt               0
sjit                 0
djit                 0
swin                 0
stcpb                0
dtcpb                0
dwin                 0
tcprtt               0
synack               0
ackdat               0
smean                0
dmean                0
trans_depth          0
response_body_len    0
ct_srv_src           0
ct_state_ttl         0
ct_dst_ltm           0
ct_src_dport_ltm     0
ct_dst_sport_ltm     0
ct_dst_src_ltm       0
is_ftp_login         0
ct_ftp_cmd           0
ct_flw_http_mthd     0
ct_src_ltm           0
ct_srv_dst           0
is_sm_ips_ports      0
attack_cat 

In [7]:
#identifying categorical and numerical columns for preprocessing
categorical_cols = train_data.select_dtypes(include=['object']).columns
numerical_cols = train_data.select_dtypes(include=['int64', 'float64']).columns
print("Categorical columns: ", categorical_cols)
print("Numerical columns: ", numerical_cols)

Categorical columns:  Index(['proto', 'service', 'state', 'attack_cat'], dtype='object')
Numerical columns:  Index(['id', 'dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl',
       'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit',
       'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat',
       'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src',
       'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd',
       'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'label'],
      dtype='object')


In [8]:
# Encoding categorical data using LabelEncoder
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    train_data[col] = encoders[col].fit_transform(train_data[col])
    # Handle unknown categories in test data
    test_data[col] = test_data[col].map(lambda x: encoders[col].classes_.tolist().index(x) if x in encoders[col].classes_ else -1)

In [9]:
# Scaling Numerical features to [0, 1] range using MinMaxScaler
scaler = MinMaxScaler()
train_data[numerical_cols] = scaler.fit_transform(train_data[numerical_cols])
test_data[numerical_cols] = scaler.transform(test_data[numerical_cols])

In [10]:
# Selecting important features and removing unimportant features
X = train_data.drop(columns=["id", "attack_cat", "label"])  # Drop the target column
y = train_data["attack_cat"]

# Test different values of K
for k in [15, 20, 25, 30, 35, 40, 42]:
    selector = SelectKBest(score_func=f_classif, k=k)
    X_selected = selector.fit_transform(X, y)
    model = RandomForestClassifier()
    score = cross_val_score(model, X_selected, y, cv=5, scoring="accuracy").mean()
    print(f"K={k}, Accuracy={score:.4f}")


K=15, Accuracy=0.6923
K=20, Accuracy=0.7177
K=25, Accuracy=0.7596
K=30, Accuracy=0.7622
K=35, Accuracy=0.7631
K=40, Accuracy=0.7644
K=42, Accuracy=0.7666


In [11]:
# Selecting the K with best accuracy and selecting the top K features based on f_classif scoring
# Chosing the features based on cross-validation accuracy
X = train_data.drop(columns=["id", "attack_cat", "label"])  # Drop the target column
y = train_data["attack_cat"]

selector = SelectKBest(score_func=f_classif, k=42)
X_selected = selector.fit_transform(X, y)
selected_features = np.array(X.columns)[selector.get_support()]
print("Selected Features:", selected_features.tolist())


Selected Features: ['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports']


In [12]:
X_train = train_data[selected_features]
y_train = train_data["attack_cat"]
X_test = test_data[selected_features]
y_test = test_data["attack_cat"]

print("Train set size:", X_train.shape)
print("Test set size:", X_test.shape)


Train set size: (175341, 42)
Test set size: (82332, 42)


In [13]:
from sklearn.cluster import KMeans
from sklearn.metrics import (
    adjusted_rand_score,
    normalized_mutual_info_score,
    f1_score
)
from scipy.stats import mode
import numpy as np

# --------------------------
# Step 1: Prepare Data
# --------------------------
X_cluster = train_data.drop(columns=["id", "label", "attack_cat"])  # Keep attack_cat for eval  # Keep attack_cat for eval
y_true = train_data["attack_cat"]  # Already label-encoded

n_clusters = len(np.unique(y_true))

# --------------------------
# Step 2: KMeans Clustering
# --------------------------
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
y_pred = kmeans.fit_predict(X_cluster)

# --------------------------
# Step 3: Map cluster IDs to true attack_cat using majority voting
# --------------------------
label_map = {}
for i in range(n_clusters):
    cluster_indices = (y_pred == i)
    if np.any(cluster_indices):
        label_map[i] = mode(y_true[cluster_indices], keepdims=True).mode[0]

# Remap predicted clusters to labels
y_pred_mapped = np.array([label_map[cluster] for cluster in y_pred])

# --------------------------
# Step 4: Evaluation
# --------------------------
ari = adjusted_rand_score(y_true, y_pred)
nmi = normalized_mutual_info_score(y_true, y_pred)

micro_f1 = f1_score(y_true, y_pred_mapped, average='micro')
macro_f1 = f1_score(y_true, y_pred_mapped, average='macro')
weighted_f1 = f1_score(y_true, y_pred_mapped, average='weighted')

print(f"Adjusted Rand Index (ARI): {ari:.4f}")
print(f"Normalized Mutual Information (NMI): {nmi:.4f}")
print(f"Micro F1 Score: {micro_f1:.4f}")
print(f"Macro F1 Score: {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")


[WinError 2] The system cannot find the file specified
  File "C:\Users\kdp46\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\kdp46\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\kdp46\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\kdp46\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Adjusted Rand Index (ARI): 0.1963
Normalized Mutual Information (NMI): 0.2194
Micro F1 Score: 0.5199
Macro F1 Score: 0.1775
Weighted F1 Score: 0.4351


In [14]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, f1_score
from scipy.stats import mode
import numpy as np

# --------------------------
# Step 1: Prepare Data
# --------------------------
X_cluster = train_data.drop(columns=["id", "label", "attack_cat"])  # Keep attack_cat for eval
y_true = train_data["attack_cat"]  # Already label-encoded

# --------------------------
# Step 2: GMM Clustering
# --------------------------
gmm = GaussianMixture(n_components=len(np.unique(y_true)), random_state=42)
y_pred = gmm.fit_predict(X_cluster)

# --------------------------
# Step 3: Map GMM Clusters to True Labels
# --------------------------
label_map = {}
for i in np.unique(y_pred):
    cluster_indices = (y_pred == i)
    if np.any(cluster_indices):
        label_map[i] = mode(y_true[cluster_indices], keepdims=True).mode[0]

# Remap predicted clusters to labels
y_pred_mapped = np.array([label_map[cluster] for cluster in y_pred])

# --------------------------
# Step 4: Evaluation
# --------------------------
ari = adjusted_rand_score(y_true, y_pred)
nmi = normalized_mutual_info_score(y_true, y_pred)

micro_f1 = f1_score(y_true, y_pred_mapped, average='micro')
macro_f1 = f1_score(y_true, y_pred_mapped, average='macro')
weighted_f1 = f1_score(y_true, y_pred_mapped, average='weighted')

print(f"Adjusted Rand Index (ARI): {ari:.4f}")
print(f"Normalized Mutual Information (NMI): {nmi:.4f}")
print(f"Micro F1 Score: {micro_f1:.4f}")
print(f"Macro F1 Score: {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")


Adjusted Rand Index (ARI): 0.3960
Normalized Mutual Information (NMI): 0.4259
Micro F1 Score: 0.6416
Macro F1 Score: 0.2555
Weighted F1 Score: 0.5896


In [15]:
# Install MiniSom if not already installed
# !pip install minisom

from minisom import MiniSom
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, f1_score
from scipy.stats import mode
import numpy as np

# --------------------------
# Step 1: Prepare Data
# --------------------------
X_cluster = train_data.drop(columns=["id", "label", "attack_cat"]) # Keep attack_cat for evaluation
y_true = train_data["attack_cat"]  # Already label-encoded

# Normalize data as SOM is distance-based
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_cluster)

# --------------------------
# Step 2: Train SOM
# --------------------------
# Grid size — heuristic: sqrt(5 * sqrt(n_samples))
n_samples = X_scaled.shape[0]
som_size = int(np.sqrt(5 * np.sqrt(n_samples)))

som = MiniSom(x=som_size, y=som_size, input_len=X_scaled.shape[1], sigma=1.0, learning_rate=0.5, random_seed=42)
som.random_weights_init(X_scaled)
som.train(X_scaled, num_iteration=100)

# --------------------------
# Step 3: Map Data to SOM Nodes
# --------------------------
# Each input is mapped to a (x, y) coordinate — we'll flatten that to a single cluster id
win_map = np.array([som.winner(x) for x in X_scaled])
y_pred = np.array([coord[0] * som_size + coord[1] for coord in win_map])

# --------------------------
# Step 4: Map SOM Clusters to True Labels
# --------------------------
label_map = {}
for i in np.unique(y_pred):
    cluster_indices = (y_pred == i)
    if np.any(cluster_indices):
        label_map[i] = mode(y_true[cluster_indices], keepdims=True).mode[0]

# Remap predicted clusters to labels
y_pred_mapped = np.array([label_map[cluster] for cluster in y_pred])

# --------------------------
# Step 5: Evaluation
# --------------------------
ari = adjusted_rand_score(y_true, y_pred)
nmi = normalized_mutual_info_score(y_true, y_pred)

micro_f1 = f1_score(y_true, y_pred_mapped, average='micro')
macro_f1 = f1_score(y_true, y_pred_mapped, average='macro')
weighted_f1 = f1_score(y_true, y_pred_mapped, average='weighted')

print(f"Adjusted Rand Index (ARI): {ari:.4f}")
print(f"Normalized Mutual Information (NMI): {nmi:.4f}")
print(f"Micro F1 Score: {micro_f1:.4f}")
print(f"Macro F1 Score: {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")


Adjusted Rand Index (ARI): 0.0038
Normalized Mutual Information (NMI): 0.2517
Micro F1 Score: 0.7588
Macro F1 Score: 0.4174
Weighted F1 Score: 0.7449


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# --------------------------
# Step 1: Prepare Data
# --------------------------
X = train_data.drop(columns=["id", "label", "attack_cat"])  # Use all features except label
y_true = train_data["attack_cat"]             # True labels

# Normalize features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# --------------------------
# Step 2: Train Random Forest Classifier
# --------------------------
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_scaled, y_true)

# --------------------------
# Step 3: Predict and Evaluate
# --------------------------
y_pred = rf_model.predict(X_scaled)

# Calculate F1 Scores
micro_f1 = f1_score(y_true, y_pred, average='micro')
macro_f1 = f1_score(y_true, y_pred, average='macro')
weighted_f1 = f1_score(y_true, y_pred, average='weighted')

# Print the evaluation metrics
print("Random Forest Classifier Evaluation:")
print(f"Micro F1 Score: {micro_f1:.4f}")
print(f"Macro F1 Score: {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")


Random Forest Classifier Evaluation:
Micro F1 Score: 0.9075
Macro F1 Score: 0.7944
Weighted F1 Score: 0.9029


In [17]:
import xgboost as xgb
from sklearn.metrics import f1_score

# Prepare data
X_train = train_data.drop(columns=["id", "label", "attack_cat"])  # Remove 'attack_cat'
y_train = train_data["attack_cat"]  # This will be your target variable
X_test = test_data.drop(columns=["id", "label", "attack_cat"])
y_test = test_data["attack_cat"]

# Convert data into DMatrix, XGBoost's internal format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define parameters for XGBoost
params = {
    "objective": "multi:softmax",  # Multiclass classification
    "num_class": len(y_train.unique()),  # Number of unique classes in the target variable
    "eval_metric": "mlogloss",  # Multi-class log loss
    "max_depth": 6,  # Maximum depth of a tree
    "eta": 0.3,  # Learning rate
    "subsample": 0.8,  # Subsample ratio
    "colsample_bytree": 0.8  # Subsample ratio of columns when constructing each tree
}

# Train the model
num_round = 100  # Number of boosting rounds
bst = xgb.train(params, dtrain, num_round)

# Predict using the model
y_pred = bst.predict(dtest)

# Convert predictions to integers
y_pred = y_pred.astype(int)

# Evaluate the model using F1 scores
micro_f1 = f1_score(y_test, y_pred, average="micro")
macro_f1 = f1_score(y_test, y_pred, average="macro")
weighted_f1 = f1_score(y_test, y_pred, average="weighted")

# Display the F1 scores
print(f"Micro F1 Score: {micro_f1:.4f}")
print(f"Macro F1 Score: {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")


Micro F1 Score: 0.7615
Macro F1 Score: 0.5163
Weighted F1 Score: 0.7800


In [18]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# Prepare data
X_train = train_data.drop(columns=["id", "label", "attack_cat"])  # Remove 'attack_cat'
y_train = train_data["attack_cat"]  # This will be your target variable
X_test = test_data.drop(columns=["id", "label", "attack_cat"])
y_test = test_data["attack_cat"]

# Initialize SVM with One-vs-Rest (OvR) strategy
svm = SVC(decision_function_shape='ovr')

# Train the model
svm.fit(X_train, y_train)

# Predict using the model
y_pred = svm.predict(X_test)

# Evaluate the model using F1 scores
micro_f1 = f1_score(y_test, y_pred, average="micro")
macro_f1 = f1_score(y_test, y_pred, average="macro")
weighted_f1 = f1_score(y_test, y_pred, average="weighted")

# Display the F1 scores
print(f"Micro F1 Score: {micro_f1:.4f}")
print(f"Macro F1 Score: {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")


Micro F1 Score: 0.6113
Macro F1 Score: 0.2432
Weighted F1 Score: 0.6166


In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

# Prepare data
X_train = train_data.drop(columns=["id", "label", "attack_cat"])  # Remove 'attack_cat'
y_train = train_data["attack_cat"]  # This will be your target variable
X_test = test_data.drop(columns=["id", "label", "attack_cat"])
y_test = test_data["attack_cat"]

# Initialize KNN with a chosen number of neighbors (k)
knn = KNeighborsClassifier(n_neighbors=5)  # You can tweak the number of neighbors as needed

# Train the model
knn.fit(X_train, y_train)

# Predict using the model
y_pred = knn.predict(X_test)

# Evaluate the model using F1 scores
micro_f1 = f1_score(y_test, y_pred, average="micro")
macro_f1 = f1_score(y_test, y_pred, average="macro")
weighted_f1 = f1_score(y_test, y_pred, average="weighted")

# Display the F1 scores
print(f"Micro F1 Score: {micro_f1:.4f}")
print(f"Macro F1 Score: {macro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")


Micro F1 Score: 0.7108
Macro F1 Score: 0.4150
Weighted F1 Score: 0.7396


In [21]:
# rf: Random Forest model
# scaler: StandardScaler or similar
# selected_features: list of selected feature names
# encoders: dictionary of {column_name: LabelEncoder}

# Save the Random Forest model
joblib.dump(rf_model, 'random_forest_model.joblib')

# Save the scaler
joblib.dump(scaler, 'scaler.joblib')

# Save the selected features
joblib.dump(selected_features, 'selected_features.joblib')

# Save each label encoder
for col, encoder in encoders.items():
    joblib.dump(encoder, f'{col}_encoder.joblib')