# Network Anomaly Detection 
## 02. Data Preprocessing

This phase aim to transform the cleaned data into a model-ready dataset so that ML/DL can learn


#### 1. Setup

In [26]:
import pandas as pd
import numpy as np

from pathlib import Path
from sklearn.preprocessing import StandardScaler
import joblib


#### 2. Load processed dataset

In [27]:
DATA_DIR = Path("../data/raw/nsl_kdd")

TRAIN_PATH = DATA_DIR / "KDDTrain+.txt"
TEST_PATH  = DATA_DIR / "KDDTest+.txt"

columns = [
    "duration","protocol_type","service","flag","src_bytes","dst_bytes","land",
    "wrong_fragment","urgent","hot","num_failed_logins","logged_in","num_compromised",
    "root_shell","su_attempted","num_root","num_file_creations","num_shells",
    "num_access_files","num_outbound_cmds","is_host_login","is_guest_login",
    "count","srv_count","serror_rate","srv_serror_rate","rerror_rate",
    "srv_rerror_rate","same_srv_rate","diff_srv_rate","srv_diff_host_rate",
    "dst_host_count","dst_host_srv_count","dst_host_same_srv_rate",
    "dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate",
    "dst_host_srv_serror_rate","dst_host_rerror_rate",
    "dst_host_srv_rerror_rate","label","difficulty"
]

df_train = pd.read_csv(TRAIN_PATH, names=columns)
df_test  = pd.read_csv(TEST_PATH, names=columns)

print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)


Train shape: (125973, 43)
Test shape: (22544, 43)


#### 3. Create binary label

In [28]:
df_train['is_attack'] = (df_train['label'] != 'normal').astype(int)
df_test['is_attack']  = (df_test['label']  != 'normal').astype(int)

print("Train label distribution:")
print(df_train['is_attack'].value_counts())

print("\nTest label distribution:")
print(df_test['is_attack'].value_counts())


Train label distribution:
is_attack
0    67343
1    58630
Name: count, dtype: int64

Test label distribution:
is_attack
1    12833
0     9711
Name: count, dtype: int64


#### 4. Separate 'Normal' traffic 

In [29]:
df_train_normal = df_train[df_train['label'] == 'normal']

print("Normal-only train shape:", df_train_normal.shape)


Normal-only train shape: (67343, 44)


#### 5. Drop non-feature columns

In [30]:
drop_cols = ['label', 'difficulty', 'is_attack']

feature_cols = [c for c in df_train.columns if c not in drop_cols]

df_train_normal = df_train_normal[feature_cols]
df_test_features = df_test[feature_cols]
y_test = df_test['is_attack'].values


#### 6. Log transform skewed network features

In [31]:
skewed_cols = ['duration', 'src_bytes', 'dst_bytes']

for col in skewed_cols:
    df_train_normal[col] = np.log1p(df_train_normal[col])
    df_test_features[col] = np.log1p(df_test_features[col])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_features[col] = np.log1p(df_test_features[col])


#### 7. Features encoding

In [32]:
categorical_cols = ['protocol_type', 'service', 'flag']

df_all = pd.concat([df_train_normal, df_test_features], axis=0)

df_all_encoded = pd.get_dummies(
    df_all,
    columns=categorical_cols,
    drop_first=False
)

X_train_encoded = df_all_encoded.iloc[:len(df_train_normal)]
X_test_encoded  = df_all_encoded.iloc[len(df_train_normal):]


#### 8. Features Scaling

In [33]:
scaler = StandardScaler()

scaler.fit(X_train_encoded)

X_train_scaled = scaler.transform(X_train_encoded)
X_test_scaled  = scaler.transform(X_test_encoded)


#### 9. Saving processed data

In [34]:
PROC_DIR = Path("../data/processed")
PROC_DIR.mkdir(exist_ok=True)

np.save(PROC_DIR / "X_train_normal.npy", X_train_scaled.astype(np.float32))
np.save(PROC_DIR / "X_test.npy", X_test_scaled.astype(np.float32))
np.save(PROC_DIR / "y_test.npy", y_test.astype(np.int64))

joblib.dump(scaler, PROC_DIR / "scaler.joblib")

print("Saved processed data:")
print("X_train_normal:", X_train_scaled.shape)
print("X_test:", X_test_scaled.shape)
print("y_test:", y_test.shape)

#Sanity checks
print("Any NaN in train:", np.isnan(X_train_scaled).any())
print("Any NaN in test:", np.isnan(X_test_scaled).any())


Saved processed data:
X_train_normal: (67343, 118)
X_test: (22544, 118)
y_test: (22544,)
Any NaN in train: False
Any NaN in test: False


#### 10. Summary
- Training and test datasets were processed separately to prevent data leakage.
- Only normal traffic from the training set was used to fit encoders and scalers.
- Logarithmic transformation was applied to highly skewed network traffic features.
- All categorical features were one-hot encoded to preserve semantic meaning.
- The resulting processed datasets are suitable for unsupervised and semi-supervised anomaly detection models.

