# Intrusion Detection Mini Project
# Dataset: UNSW-NB15
# Models: Decision Tree, Random Forest, Random Forest (GridSearchCV)


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

## 1. Load Data


In [2]:

training_df = pd.read_csv('../.venv/UNSW_NB15_training-set.csv')
testing_df = pd.read_csv('../.venv/UNSW_NB15_testing-set.csv')


## 2. Prepare Features and Labels


In [3]:
X = training_df.drop(columns=['label', 'attack_cat'])
y = training_df['label']

# One-hot encode categorical features
X = pd.get_dummies(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


## 3. Baseline Models


In [4]:

# Decision Tree
clf_dt = DecisionTreeClassifier(max_depth=50, random_state=42)
clf_dt.fit(X_train, y_train)

# Random Forest
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=15, random_state=42)
clf_rf.fit(X_train, y_train)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,15
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## 4. Improved Model with Pipeline + GridSearchCV


In [5]:
categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(exclude=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [10, 20, None],
    'clf__min_samples_split': [2, 5],
    'clf__class_weight': [None, 'balanced']
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X, y)


Fitting 3 folds for each of 24 candidates, totalling 72 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'clf__class_weight': [None, 'balanced'], 'clf__max_depth': [10, 20, ...], 'clf__min_samples_split': [2, 5], 'clf__n_estimators': [100, 200]}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## 5. Evaluation


In [7]:

# --- Test Set Evaluation ---
real_X = pd.get_dummies(testing_df.drop(columns=['label', 'attack_cat']))
real_y = testing_df['label']
real_X = real_X.reindex(columns=X_train.columns, fill_value=0)

print("Decision Tree (Test Set):")
print(classification_report(real_y, clf_dt.predict(real_X)))

print("Random Forest (Test Set):")
print(classification_report(real_y, clf_rf.predict(real_X)))

print("GridSearchCV Best Model (Test Set):")
print(classification_report(real_y, grid_search.predict(real_X)))



Decision Tree (Test Set):
              precision    recall  f1-score   support

           0       0.48      0.69      0.57     37000
           1       0.61      0.39      0.48     45332

    accuracy                           0.53     82332
   macro avg       0.54      0.54      0.52     82332
weighted avg       0.55      0.53      0.52     82332

Random Forest (Test Set):
              precision    recall  f1-score   support

           0       0.64      0.66      0.65     37000
           1       0.72      0.70      0.71     45332

    accuracy                           0.68     82332
   macro avg       0.68      0.68      0.68     82332
weighted avg       0.68      0.68      0.68     82332

GridSearchCV Best Model (Test Set):
              precision    recall  f1-score   support

           0       0.80      0.65      0.72     37000
           1       0.75      0.87      0.81     45332

    accuracy                           0.77     82332
   macro avg       0.78      0.76      0