In [1]:
#1
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
#1.5: Define base data paths for NSL-KDD and other datasets
nsl_path = "/content/drive/MyDrive/NIDS_Project/NIDS_Data/NSL_KDD"
unsw_path = "/content/drive/MyDrive/NIDS_Project/NIDS_Data/UNSW_NB15"
hybrid_path = "/content/drive/MyDrive/NIDS_Project/NIDS_Data/hybrid"

In [3]:
#2 → Replace with:
# Install ARFF parser only if needed (avoids slowing down runtime)
try:
    import arff
except ImportError:
    !pip install -q liac-arff

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for liac-arff (setup.py) ... [?25l[?25hdone


In [4]:
#3
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import arff
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
import joblib
import pickle

warnings.filterwarnings('ignore')



In [5]:
#4
nsl_extract_folder = "/content/drive/MyDrive/NIDS_Project/Raw_Datasets/NSL_KDD_Raw"

In [6]:
#5 List extracted files
nsl_extracted_files = os.listdir(nsl_extract_folder)
print("Extracted Files:", nsl_extracted_files)

Extracted Files: ['nsl-kdd', 'KDDTest+.arff', 'KDDTest+.txt', 'KDDTest-21.arff', 'KDDTest1.jpg', 'KDDTest-21.txt', 'index.html', 'KDDTrain1.jpg', 'KDDTrain+_20Percent.txt', 'KDDTrain+_20Percent.arff', 'KDDTrain+.arff', 'KDDTrain+.txt', 'KDDTrain+.csv']


In [7]:
#6: Convert KDDTrain+.arff to CSV if needed (for KDDTrain+ only)

import os
import pandas as pd

# Correct full paths based on your file structure
arff_path = "/content/drive/MyDrive/NIDS_Project/Raw_Datasets/NSL_KDD_Raw/KDDTrain+.arff"
csv_path  = "/content/drive/MyDrive/NIDS_Project/Raw_Datasets/NSL_KDD_Raw/KDDTrain+.csv"

# Only convert if CSV doesn't already exist
if not os.path.exists(csv_path):
    if not os.path.exists(arff_path):
        raise FileNotFoundError(f"[!] ARFF file not found: {arff_path}")

    with open(arff_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Locate the start of the data section
    start_idx = next(i for i, line in enumerate(lines) if line.strip().lower() == '@data')
    data_lines = lines[start_idx + 1:]

    # Parse data lines, skipping comments and blanks
    parsed_data = [line.strip().split(',') for line in data_lines if line.strip() and not line.startswith('%')]

    # Save to CSV (no headers)
    pd.DataFrame(parsed_data).to_csv(csv_path, index=False, header=False)
    print(f"[✓] ARFF converted and saved to CSV: {csv_path}")
else:
    print(f"[i] CSV already exists → skipping conversion: {csv_path}")

[i] CSV already exists → skipping conversion: /content/drive/MyDrive/NIDS_Project/Raw_Datasets/NSL_KDD_Raw/KDDTrain+.csv


In [8]:
#7 Read CSV (now it has proper headers)
csv_path = "/content/drive/MyDrive/NIDS_Project/Raw_Datasets/NSL_KDD_Raw/KDDTrain+.csv"

nsl_df = pd.read_csv(csv_path)

print("First few rows:")
print(nsl_df.head())
print(f"{nsl_df.shape[0]} rows × {nsl_df.shape[1]} columns")

First few rows:
   0  tcp ftp_data   SF  491   0.1  0.2  0.3  0.4  0.5  ...   25  0.17  0.03  \
0  0  udp    other   SF  146     0    0    0    0    0  ...    1  0.00  0.60   
1  0  tcp  private   S0    0     0    0    0    0    0  ...   26  0.10  0.05   
2  0  tcp     http   SF  232  8153    0    0    0    0  ...  255  1.00  0.00   
3  0  tcp     http   SF  199   420    0    0    0    0  ...  255  1.00  0.00   
4  0  tcp  private  REJ    0     0    0    0    0    0  ...   19  0.07  0.07   

   0.17.1  0.00.6  0.00.7  0.00.8  0.05  0.00.9   normal  
0    0.88    0.00    0.00    0.00   0.0    0.00   normal  
1    0.00    0.00    1.00    1.00   0.0    0.00  anomaly  
2    0.03    0.04    0.03    0.01   0.0    0.01   normal  
3    0.00    0.00    0.00    0.00   0.0    0.00   normal  
4    0.00    0.00    0.00    0.00   1.0    1.00  anomaly  

[5 rows x 42 columns]
125972 rows × 42 columns


In [9]:
#8 Full column names for NSL-KDD
nsl_column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
    'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
    'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
    'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'label'
]

# Assign correct column names
nsl_df.columns = nsl_column_names

# Display first few rows to confirm
print(nsl_df.head())

   duration protocol_type  service flag  src_bytes  dst_bytes  land  \
0         0           udp    other   SF        146          0     0   
1         0           tcp  private   S0          0          0     0   
2         0           tcp     http   SF        232       8153     0   
3         0           tcp     http   SF        199        420     0   
4         0           tcp  private  REJ          0          0     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                   1   
1               0       0    0  ...                  26   
2               0       0    0  ...                 255   
3               0       0    0  ...                 255   
4               0       0    0  ...                  19   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                    0.00                    0.60   
1                    0.10                    0.05   
2                    1.00                    0.00   
3     

In [10]:
#10 → Replace with:
# Add missing legacy columns if absent – maintain NSL-KDD compatibility
for col in nsl_column_names:
    if col not in nsl_df.columns:
        print(f"Adding missing column: {col}")
        nsl_df[col] = 0

In [11]:
#11 Encode categorical features
nsl_categorical_cols = ['protocol_type', 'service', 'flag']
nsl_label_encoders = {}

for col in nsl_categorical_cols:
    le = LabelEncoder()
    nsl_df[col] = le.fit_transform(nsl_df[col])
    nsl_label_encoders[col] = le

print("Categorical columns encoded.")

Categorical columns encoded.


In [12]:
#12 → Replace with:
# Normalize target column (binary: normal=0, attack=1)
nsl_df['label'] = nsl_df['label'].str.strip().str.lower().apply(lambda x: 0 if x == 'normal' else 1).astype(int)
print("Target encoded. Distribution:\n", nsl_df['label'].value_counts())

Target encoded. Distribution:
 label
0    67342
1    58630
Name: count, dtype: int64


In [13]:
#13
nsl_X = nsl_df.drop(columns=['label'])
nsl_y = nsl_df['label']

nsl_X_train, nsl_X_test, nsl_y_train, nsl_y_test = train_test_split(
    nsl_X, nsl_y, test_size=0.3, random_state=42
)

print("Train/Test Split:")
print("Train X:", nsl_X_train.shape, " | Train y:", nsl_y_train.shape)
print("Test X :", nsl_X_test.shape, " | Test y :", nsl_y_test.shape)

Train/Test Split:
Train X: (88180, 41)  | Train y: (88180,)
Test X : (37792, 41)  | Test y : (37792,)


In [14]:
#14 → Replace with:
# Normalize and save NSL-KDD data
nsl_output_path = "/content/drive/MyDrive/NIDS_Project/NIDS_Data/NSL_KDD"
os.makedirs(nsl_output_path, exist_ok=True)

nsl_scaler = MinMaxScaler()
nsl_X_train_scaled = nsl_scaler.fit_transform(nsl_X_train)
nsl_X_test_scaled = nsl_scaler.transform(nsl_X_test)

joblib.dump(nsl_scaler, os.path.join(nsl_output_path, "nslkdd_scaler.pkl"))
pd.DataFrame(nsl_X_train_scaled).to_csv(os.path.join(nsl_output_path, "nslkdd_X_train.csv"), index=False)
pd.DataFrame(nsl_X_test_scaled).to_csv(os.path.join(nsl_output_path, "nslkdd_X_test.csv"), index=False)
pd.Series(nsl_y_train).to_csv(os.path.join(nsl_output_path, "nslkdd_y_train.csv"), index=False)
pd.Series(nsl_y_test).to_csv(os.path.join(nsl_output_path, "nslkdd_y_test.csv"), index=False)

# UNSW

In [15]:
# 15: Load UNSW-NB15 Parquet files
import pandas as pd
# Define UNSW file paths
unsw_train_path = "/content/drive/MyDrive/NIDS_Project/Raw_Datasets/UNSW_NB15_Raw/UNSW_NB15_training-set.parquet"
unsw_test_path = "/content/drive/MyDrive/NIDS_Project/Raw_Datasets/UNSW_NB15_Raw/UNSW_NB15_testing-set.parquet"
# Load data using pandas
unsw_train_df = pd.read_parquet(unsw_train_path)
unsw_test_df = pd.read_parquet(unsw_test_path)
# Quick shape check
print("Training shape:", unsw_train_df.shape)
print("Testing shape:", unsw_test_df.shape)
# Peek at column names
unsw_train_df.head()

Training shape: (175341, 36)
Testing shape: (82332, 36)


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
0,0.121478,tcp,-,FIN,6,4,258,172,74.087486,14158.942383,...,0,0,1,1,0,0,0,0,Normal,0
1,0.649902,tcp,-,FIN,14,38,734,42014,78.473373,8395.112305,...,0,0,1,1,0,0,0,0,Normal,0
2,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,1572.271851,...,0,0,1,1,0,0,0,0,Normal,0
3,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,2740.178955,...,0,0,1,1,1,1,0,0,Normal,0
4,0.449454,tcp,-,FIN,10,6,534,268,33.373825,8561.499023,...,0,0,2,1,0,0,0,0,Normal,0


In [16]:
#16 Preview the first few records of both datasets
print("Training Data Preview:")
display(unsw_train_df.head())

print("Testing Data Preview:")
display(unsw_test_df.head())
# Check the shape of each dataset
print("Training set shape:", unsw_train_df.shape)
print("Testing set shape:", unsw_test_df.shape)
# Check for missing values
print("Missing values in training set:")
display(unsw_train_df.isnull().sum().sort_values(ascending=False).head(10))

print("Missing values in testing set:")
display(unsw_test_df.isnull().sum().sort_values(ascending=False).head(10))
# Review data types
print("Data types in training set:")
display(unsw_train_df.dtypes.value_counts())
# Investigate target variable distribution
if 'label' in unsw_train_df.columns:
    print("Binary class distribution (label column):")
    display(unsw_train_df['label'].value_counts())

if 'attack_cat' in unsw_train_df.columns:
    print("Multiclass attack type distribution (attack_cat column):")
    display(unsw_train_df['attack_cat'].value_counts())
# Display all column names
print("UNSW Column names:")
print(unsw_train_df.columns.tolist())

Training Data Preview:


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
0,0.121478,tcp,-,FIN,6,4,258,172,74.087486,14158.942383,...,0,0,1,1,0,0,0,0,Normal,0
1,0.649902,tcp,-,FIN,14,38,734,42014,78.473373,8395.112305,...,0,0,1,1,0,0,0,0,Normal,0
2,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,1572.271851,...,0,0,1,1,0,0,0,0,Normal,0
3,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,2740.178955,...,0,0,1,1,1,1,0,0,Normal,0
4,0.449454,tcp,-,FIN,10,6,534,268,33.373825,8561.499023,...,0,0,2,1,0,0,0,0,Normal,0


Testing Data Preview:


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
0,1.1e-05,udp,-,INT,2,0,496,0,90909.09375,180363632.0,...,0,0,1,1,0,0,0,0,Normal,0
1,8e-06,udp,-,INT,2,0,1762,0,125000.0,881000000.0,...,0,0,1,1,0,0,0,0,Normal,0
2,5e-06,udp,-,INT,2,0,1068,0,200000.0,854400000.0,...,0,0,1,1,0,0,0,0,Normal,0
3,6e-06,udp,-,INT,2,0,900,0,166666.65625,600000000.0,...,0,0,2,1,0,0,0,0,Normal,0
4,1e-05,udp,-,INT,2,0,2126,0,100000.0,850400000.0,...,0,0,2,1,0,0,0,0,Normal,0


Training set shape: (175341, 36)
Testing set shape: (82332, 36)
Missing values in training set:


Unnamed: 0,0
dur,0
proto,0
service,0
state,0
spkts,0
dpkts,0
sbytes,0
dbytes,0
rate,0
sload,0


Missing values in testing set:


Unnamed: 0,0
dur,0
proto,0
service,0
state,0
spkts,0
dpkts,0
sbytes,0
dbytes,0
rate,0
sload,0


Data types in training set:


Unnamed: 0,count
float32,11
int16,9
int8,7
int32,3
int64,2
category,1
category,1
category,1
category,1


Binary class distribution (label column):


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,119341
0,56000


Multiclass attack type distribution (attack_cat column):


Unnamed: 0_level_0,count
attack_cat,Unnamed: 1_level_1
Normal,56000
Generic,40000
Exploits,33393
Fuzzers,18184
DoS,12264
Reconnaissance,10491
Analysis,2000
Backdoor,1746
Shellcode,1133
Worms,130


UNSW Column names:
['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'is_sm_ips_ports', 'attack_cat', 'label']


In [17]:
# 17: Drop labels + define features
unsw_cols_to_drop = ['id', 'label', 'attack_cat'] if 'id' in unsw_train_df.columns else ['label', 'attack_cat']
unsw_X_train = unsw_train_df.drop(columns=unsw_cols_to_drop, errors='ignore')
unsw_y_train = unsw_train_df['label']
unsw_X_test = unsw_test_df.drop(columns=unsw_cols_to_drop, errors='ignore')
unsw_y_test = unsw_test_df['label']

# Always convert labels to binary
unsw_y_train = unsw_y_train.apply(lambda x: 0 if str(x).lower() == 'normal' or x == 0 else 1).astype(np.int32)
unsw_y_test = unsw_y_test.apply(lambda x: 0 if str(x).lower() == 'normal' or x == 0 else 1).astype(np.int32)

In [18]:
from sklearn.preprocessing import LabelEncoder

unsw_categorical_cols = unsw_X_train.select_dtypes(include='object').columns.tolist()
print("Encoding these columns:", unsw_categorical_cols)

unsw_label_encoders = {}
for col in unsw_categorical_cols:
    le = LabelEncoder()
    unsw_X_train[col] = le.fit_transform(unsw_X_train[col].astype(str))
    unsw_X_test[col] = le.transform(unsw_X_test[col].astype(str))
    unsw_label_encoders[col] = le

Encoding these columns: []


In [19]:
#20 Step X+5: Check for object (string) columns in training data
print("Columns with object (string) type in training data:")
print(unsw_X_train.select_dtypes(include=['object']).columns.tolist())

# Optional preview of problematic values (to identify misplacements)
for col in unsw_X_train.select_dtypes(include=['object']).columns:
    print(f"\nUnique values in column '{col}':")
    print(unsw_X_train[col].unique())

Columns with object (string) type in training data:
[]


In [20]:
#22 → Replace with:
# Correlation check — helpful for SHAP & feature selection
corr_df = unsw_train_df.copy()
corr_df = corr_df.drop(columns=['id', 'attack_cat'], errors='ignore')

for col in ['proto', 'service', 'state']:
    if corr_df[col].dtype == 'object':
        corr_df[col] = LabelEncoder().fit_transform(corr_df[col].astype(str))

corr_df = corr_df.select_dtypes(include='number').loc[:, lambda df: df.nunique() > 1]
correlations = corr_df.corr(numeric_only=True)['label'].sort_values(ascending=False)

print("Top correlated features with label:")
display(correlations.head(15))

Top correlated features with label:


Unnamed: 0,label
label,1.0
ct_dst_sport_ltm,0.357213
rate,0.337979
ct_src_dport_ltm,0.305579
sload,0.18287
ackdat,0.097364
tcprtt,0.081584
synack,0.058299
dur,0.036175
sbytes,0.018576


In [21]:
# 23–24: Prepare encoded versions of X for scaling

from sklearn.preprocessing import MinMaxScaler

# Ensure both are proper numeric types (convert 'category' too)
unsw_X_train_encoded = unsw_X_train.copy()
unsw_X_test_encoded = unsw_X_test.copy()

for df in [unsw_X_train_encoded, unsw_X_test_encoded]:
    for col in df.columns:
        if df[col].dtype.name in ['object', 'category']:
            try:
                df[col] = df[col].astype(float)
            except:
                df[col], _ = pd.factorize(df[col])
                print(f"[i] Factorised column: {col}")

# Scale using MinMaxScaler
unsw_scaler = MinMaxScaler()

unsw_X_train_encoded = pd.DataFrame(
    unsw_scaler.fit_transform(unsw_X_train_encoded),
    columns=unsw_X_train_encoded.columns
)

unsw_X_test_encoded = pd.DataFrame(
    unsw_scaler.transform(unsw_X_test_encoded),
    columns=unsw_X_test_encoded.columns
)

[i] Factorised column: proto
[i] Factorised column: service
[i] Factorised column: state
[i] Factorised column: proto
[i] Factorised column: service
[i] Factorised column: state


In [22]:
# 26: Final conversion and saving
# Ensure all data is fully numeric before conversion
assert unsw_X_train_encoded.select_dtypes(include='object').empty, "Train data still contains object columns!"
assert unsw_X_test_encoded.select_dtypes(include='object').empty, "Test data still contains object columns!"

# Convert to NumPy arrays
X_train_final = unsw_X_train_encoded.to_numpy()
X_test_final = unsw_X_test_encoded.to_numpy()
y_train_final = unsw_y_train.to_numpy()
y_test_final = unsw_y_test.to_numpy()

# Save all processed data
unsw_save_path = "/content/drive/MyDrive/NIDS_Project/NIDS_Data/UNSW_NB15"
os.makedirs(unsw_save_path, exist_ok=True)

np.save(f"{unsw_save_path}/unsw_X_train_final.npy", X_train_final)
np.save(f"{unsw_save_path}/unsw_X_test_final.npy", X_test_final)
np.save(f"{unsw_save_path}/unsw_y_train_final.npy", y_train_final)
np.save(f"{unsw_save_path}/unsw_y_test_final.npy", y_test_final)

# Optional: Save pickled DataFrame and columns
unsw_X_train_encoded.to_pickle(f"{unsw_save_path}/unsw_X_train_encoded.pkl")
pd.Series(y_train_final).to_pickle(f"{unsw_save_path}/unsw_y_train_encoded.pkl")

with open(f"{unsw_save_path}/unsw_feature_names.pkl", "wb") as f:
    pickle.dump(list(unsw_X_train_encoded.columns), f)

joblib.dump(unsw_scaler, f"{unsw_save_path}/unsw_scaler.pkl")

print("All files saved successfully to:", unsw_save_path)

All files saved successfully to: /content/drive/MyDrive/NIDS_Project/NIDS_Data/UNSW_NB15
