# Network Anomaly Detection 
## 02. Data Preprocessing

This phase aims to transform the cleaned data into a model-ready dataset that ML/DL models can learn from.


### 1. Setup

In [None]:
import os
from pathlib import Path
import json
import joblib
import numpy as np
import pandas as pd

# Input file and output locations (all given as relative paths)
RAW_PATH = Path("../../data/raw/nsl_kdd/KDDTrain+_20Percent.txt")
PROC_DIR = Path("../../data/processed/nsl_kdd")
RESULTS_DIR = Path("../../results")
MODEL_DIR = Path("../../models/nsl_kdd")

# Create every output directory up front; directories that already exist are left alone
for out_dir in (PROC_DIR, RESULTS_DIR, MODEL_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# Echo the resolved configuration so it is visible in the notebook output
print("Paths ready:")
for tag, location in (
    (" RAW:", RAW_PATH),
    (" PROC_DIR:", PROC_DIR),
    (" RESULTS_DIR:", RESULTS_DIR),
    (" MODEL_DIR:", MODEL_DIR),
):
    print(tag, location)


Paths ready:
 RAW: ..\..\data\raw\nsl_kdd\KDDTrain+_20Percent.txt
 PROC_DIR: ..\..\data\processed\nsl_kdd
 RESULTS_DIR: ..\..\results
 MODEL_DIR: ..\..\models\nsl_kdd


### 2. Load dataset

In [None]:
# Column names are supplied explicitly (43 in total); with header=None pandas
# treats the first line of the file as data rather than as a header row.
columns = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files",
    "num_outbound_cmds", "is_host_login", "is_guest_login",
    "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label", "difficulty",
]

df = pd.read_csv(RAW_PATH, header=None, names=columns)
print("Loaded df from file:", RAW_PATH, "shape:", df.shape)

# Preview the first five rows
df.head()


Loaded df from file: ..\..\data\raw\nsl_kdd\KDDTrain+_20Percent.txt shape: (25192, 43)


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,difficulty
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19


### 3. Handle dtypes

In [None]:
# Derive the binary target 'is_attack': 0 when the label is 'normal', 1 otherwise.
# The guard makes the cell safe to re-run: an existing column is not recomputed.
if 'is_attack' not in df.columns:
    # Vectorised string operations replace the per-row Python lambda. The label is
    # stringified and stripped of surrounding whitespace before the comparison,
    # and the result is stored as int64, matching what the lambda produced.
    df['is_attack'] = (
        df['label'].astype(str).str.strip().ne('normal').astype('int64')
    )

print("Class distribution (is_attack):")
print(df['is_attack'].value_counts())


Columns normalized. Example columns: ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot']
Class distribution (is_attack):
is_attack
0    13449
1    11743
Name: count, dtype: int64


In [None]:
# Down-cast float columns that only hold whole numbers to int64.
num_cols = df.select_dtypes(include=['int64','float64']).columns.tolist()
for col in num_cols:
    if not pd.api.types.is_float_dtype(df[col]):
        continue
    # `% 1 == 0` is False for NaN and for +/-inf, so a column containing missing or
    # infinite values is left as float instead of failing inside astype('int64').
    # (Dropping NaN before the test let such columns through to the cast.)
    if (df[col] % 1 == 0).all():
        df[col] = df[col].astype('int64')

# The cast never adds or removes columns, so the count taken before the loop still holds
print("Numeric columns (post-fix):", len(num_cols))

Numeric columns (post-fix): 40


### 4. Feature selection

In [21]:
# Columns excluded from the modelling frame: the raw 'label' string and 'difficulty'.
# Both remain available in `df`; 'is_attack' stays in df_proc for now and is
# separated out as the target further down.
cols_to_drop = ['label', 'difficulty']
df_proc = df.drop(cols_to_drop, axis="columns").copy()

print("Dropped columns:", cols_to_drop)
print("Processed df shape:", df_proc.shape)


Dropped columns: ['label', 'difficulty']
Processed df shape: (25192, 42)


### 5. Categorical encoding

In [None]:
# Relative frequency of every 'service' value, as a plain dict: {service: share of rows}
service_freq = df_proc['service'].value_counts(normalize=True).to_dict()

# Encoding, expressed as one chain:
#   1) add 'service_freq' (each row gets the share of its service value),
#   2) one-hot encode 'protocol_type' and 'flag' (all levels kept, none dropped),
#   3) remove the original 'service' column, leaving only its frequency encoding.
df_proc = (
    pd.get_dummies(
        df_proc.assign(service_freq=df_proc['service'].map(service_freq)),
        columns=['protocol_type','flag'],
        prefix=['proto','flag'],
    )
    .drop(columns=['service'])
)

# Persist the mapping so the same encoding can be re-applied later
service_map_path = PROC_DIR / "service_freq_map.joblib"
joblib.dump(service_freq, service_map_path)
print("Service frequency mapping saved to:", service_map_path)

# Quick look at the result
print("After encoding, shape:", df_proc.shape)
print("Example columns:", df_proc.columns.tolist()[:30])


Service frequency mapping saved to: ..\..\data\processed\nsl_kdd\service_freq_map.joblib
After encoding, shape: (25192, 54)
Example columns: ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count']


### 6. Build X and y

In [None]:
# Report any columns still stored with object dtype. Nothing is removed here;
# the list is only printed and is expected to be empty after encoding.
non_numeric = list(df_proc.select_dtypes(include='object').columns)
print("Non-numeric cols (should be none):", non_numeric)

# Split a copy of the processed frame: popping 'is_attack' yields the target array
# and leaves every remaining column, in its original order, as the feature frame.
X = df_proc.copy()
y = X.pop('is_attack').values

print("Final X shape:", X.shape)
print("Final y shape:", y.shape)
print("Feature sample:")
display(X.head())


Non-numeric cols (should be none): []
Final X shape: (25192, 53)
Final y shape: (25192,)
Feature sample:


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,491,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,True,False
1,0,146,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,True,False
2,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,True,False,False,False,False,False
3,0,232,8153,0,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,True,False
4,0,199,420,0,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,True,False


### 7. Saving

In [25]:
# Persist the unscaled feature matrix (float64) and the target vector (int64)
X_array = X.values.astype(np.float64)
y_array = y.astype(np.int64)
np.save(PROC_DIR / "X.npy", X_array)
np.save(PROC_DIR / "y.npy", y_array)

# Column order of X, stored both as JSON and as a joblib dump so array columns
# can be mapped back to feature names later
feature_names = list(X.columns)
(PROC_DIR / "feature_names.json").write_text(json.dumps(feature_names))
joblib.dump(feature_names, PROC_DIR / "feature_names.joblib")
# (the service frequency mapping was already written by the encoding cell)

# CSV copy of the full processed frame (target column included) for manual inspection
df_proc.to_csv(PROC_DIR / "processed_nslkdd_20pct.csv", index=False)

print("Saved:")
for saved_name in ("X.npy", "y.npy", "feature_names.json", "processed_nslkdd_20pct.csv"):
    print(" - ", PROC_DIR / saved_name)


Saved:
 -  ..\..\data\processed\nsl_kdd\X.npy
 -  ..\..\data\processed\nsl_kdd\y.npy
 -  ..\..\data\processed\nsl_kdd\feature_names.json
 -  ..\..\data\processed\nsl_kdd\processed_nslkdd_20pct.csv


### 8. Checking

In [26]:
# Sanity checks on the saved artefacts.
# Each array is read from disk exactly once and reused for every check below.
X_loaded = np.load(PROC_DIR / "X.npy")
y_loaded = np.load(PROC_DIR / "y.npy")

print("X shape:", X_loaded.shape)
print("y shape:", y_loaded.shape)
print("dtype:", X_loaded.dtype)

# What is on disk must agree with what is in memory
assert X_loaded.shape == X.shape, "X.npy does not match the in-memory feature frame"
assert y_loaded.shape[0] == X_loaded.shape[0], "X.npy and y.npy have different row counts"
assert X_loaded.shape[1] == len(feature_names), "feature_names length differs from X.npy columns"

# class balance; .tolist() yields plain Python ints so the dict prints as
# {0: ..., 1: ...} regardless of the NumPy version's scalar repr
unique, counts = np.unique(y_loaded, return_counts=True)
print("Class distribution (y):", dict(zip(unique.tolist(), counts.tolist())))

# a quick glance at feature value ranges (first 10 features), taken from the
# in-memory frame X so integer columns keep their integer formatting
print("\nFeature ranges (first 10):")
for col in feature_names[:10]:
    col_values = X[col]
    print(f"{col}: min={col_values.min()}, median={col_values.median()}, max={col_values.max()}")


X shape: (25192, 53)
y shape: (25192,)
dtype: float64
Class distribution (y): {0: 13449, 1: 11743}

Feature ranges (first 10):
duration: min=0, median=0.0, max=42862
src_bytes: min=0, median=44.0, max=381709090
dst_bytes: min=0, median=0.0, max=5151385
land: min=0, median=0.0, max=1
wrong_fragment: min=0, median=0.0, max=3
urgent: min=0, median=0.0, max=1
hot: min=0, median=0.0, max=77
num_failed_logins: min=0, median=0.0, max=4
logged_in: min=0, median=0.0, max=1
num_compromised: min=0, median=0.0, max=884
