In [None]:
# Data cleaning
import pandas as pd
import numpy as np

# Data preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
import joblib

# Train model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import pickle

# Classifier
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

The number of records in the Dataset is 175,341 records from the different types, attack and normal.

attack_cat: This dataset has nine types of attacks, namely, Fuzzers, Analysis, Backdoors, DoS, Exploits, Generic, Reconnaissance, Shellcode and Worms.

Label: 0 for normal and 1 for attack records

In [46]:
initial_data = pd.read_csv('UNSW_NB15.csv')
initial_data.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0


## Data Preprocessing

### Check missing values

In [47]:
initial_data.shape

(175341, 45)

In [48]:
data_to_use = initial_data.dropna()
data_to_use.shape

(175341, 45)

In [49]:
X = data_to_use.drop(axis=1, columns=['attack_cat'])
X = X.drop(axis=1, columns=['label'])

y1 = data_to_use['label'].values
y2 = data_to_use['attack_cat'].values

### Check imbalanced issue on y

In [50]:
def data_ratio(y):
    unique, count = np.unique(y, return_counts=True)
    ratio = round(count[0]/count[1], 1)
    return f'{ratio}:1 ({count[0]}/{count[1]})'

In [51]:
print('The class ratio for y1:', data_ratio(y1))

The class ratio for y1: 0.5:1 (56000/119341)


### Transform training and testing data

In [52]:
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns

In [53]:
numerical_cols

Index(['id', 'dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl',
       'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit',
       'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat',
       'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src',
       'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd',
       'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports'],
      dtype='object')

In [54]:
categorical_cols

Index(['proto', 'service', 'state'], dtype='object')

In [55]:
X = X[numerical_cols]
X.shape

(175341, 40)

### Load Data

In [56]:
X_train, X_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.3, random_state=1)

### Recursive Feature Elimination 

In [57]:
rf_model = RandomForestClassifier(random_state=123, n_estimators=100)
rfe = RFE(estimator=rf_model, n_features_to_select=10)
rfe.fit(X_train, y1_train)

selected_columns_mask = rfe.support_
selected_columns = X_train.columns[selected_columns_mask]

print("Selected columns after RFE:", selected_columns)

Selected columns after RFE: Index(['id', 'dur', 'sbytes', 'rate', 'sttl', 'sload', 'dload', 'dmean',
       'ct_state_ttl', 'ct_srv_dst'],
      dtype='object')


In [58]:
X_rfe_train = rfe.transform(X_train)
X_rfe_test = rfe.transform(X_test)

### Standardization

In [59]:
scaler = StandardScaler()
scaler.fit(X_rfe_train)
joblib.dump(scaler, 'col_transformer.pkl')

['col_transformer.pkl']

In [60]:
X_train_scaled = scaler.transform(X_rfe_train)
X_test_scaled = scaler.transform(X_rfe_test)

In [61]:
X_train_scaled.shape

(122738, 10)

In [62]:
X_test_scaled.shape

(52603, 10)

### Apply Label Encoder

In [64]:
target_trans = LabelEncoder()
target_trans.fit(y1_train)

joblib.dump(target_trans, 'label_encoder.pkl')

['label_encoder.pkl']

In [65]:
y1_train_scaled = target_trans.transform(y1_train)
y1_test_scaled = target_trans.transform(y1_test)

### Synthetic Minority Over-Sampling Technique 

In [67]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y1_train_scaled)

### Principle Component Analysis

In [None]:
pca = PCA(n_components=0.95)
pca.fit(X_resampled)

joblib.dump(pca, 'pca_model.pkl')

['pca_model.pkl']

In [70]:
X_pca_train = pca.transform(X_resampled)
X_pca_test = pca.transform(X_test_scaled)

## Neural Network

In [71]:
rf_model = RandomForestClassifier(random_state=123)

In [None]:
cv_fit_time_mean_list = []
cv_accuracy_mean_list = []

test_accuracy_list = []


rf_model.fit(X_pca_train, y_resampled) 

with open("random_forest_model.pkl", "wb") as f:
    pickle.dump(rf_model, f)



# ===== Step 1: Get Predictions from Random Forest Model =====
rf_predictions_train = rf_model.predict(X_pca_train)  
rf_predictions_test = rf_model.predict(X_pca_test)    

# Convert to NumPy arrays
rf_predictions_train = np.array(rf_predictions_train).reshape(-1, 1)
rf_predictions_test = np.array(rf_predictions_test).reshape(-1, 1)

# Append RF predictions to the feature set
X_train_final = np.hstack((X_pca_train, rf_predictions_train))
X_test_final = np.hstack((X_pca_test, rf_predictions_test))



# ===== Step 2: Cross-validation with Neural Network =====

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
scoring = ['accuracy', 'precision_micro', 'recall_micro', 'f1_micro']

for train_index, val_index in cv.split(X_train_final, y_resampled):
    X_train_cv, X_val_cv = X_train_final[train_index], X_train_final[val_index]
    y_train_cv, y_val_cv = y_resampled[train_index], y_resampled[val_index]

    # Neural Network Model
    model = Sequential([
        Dense(128, activation='relu', input_shape=(X_train_final.shape[1],)),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(9, activation='softmax')  # 9 attack classes
    ])

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    history = model.fit(X_train_cv, y_train_cv, epochs=20, batch_size=32, verbose=0, validation_data=(X_val_cv, y_val_cv))

    y_val_pred = np.argmax(model.predict(X_val_cv), axis=1)

    cv_accuracy_mean_list.append(accuracy_score(y_val_cv, y_val_pred))

cv_fit_time_mean = np.mean(cv_fit_time_mean_list)
cv_accuracy_mean = np.mean(cv_accuracy_mean_list)



# ===== Step 3: Train Final Model on Entire Training Data =====

model.fit(X_train_final, y_resampled, epochs=20, batch_size=32, verbose=1)

model.save("attack_classification_model.h5")



# ===== Step 4: Evaluate on Test Data =====

y_pred_class = np.argmax(model.predict(X_test_final), axis=1)

accuracy_ontest = accuracy_score(y1_test_scaled, y_pred_class)

test_accuracy_list.append(accuracy_ontest)

print(f"Test Accuracy: {accuracy_ontest:.4f}")

[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
Epoch 1/20
[1m5222/5222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step - accuracy: 1.0000 - loss: 7.5247e-08
Epoch 2/20
[1m5222/5222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 1.0000 - loss: 2.5134e-04
Epoch 3/20
[1m5222/5222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 1.0000 - loss: 1.5721e-04
Epoch 4/20
[1m5222/5222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step - accuracy: 1.0000 - loss: 3.8025e-05
Epoch 5/20
[1m5222/5222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step - accuracy: 1.0000



[1m1644/1644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Test Accuracy: 0.9863
