In [18]:
# MSc Cyber Security Project — Pratham Shrestha
# Notebook 02 — Preprocessing 

# --- Imports and setup ---
# importing all the main libraries I need for data preparation
from pathlib import Path
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE  # for balancing attack/benign classes

# --- Reproducibility ---
# a single seed reused by every step that takes a random_state
RANDOM_STATE = 42

# --- Paths ---
# project root is one level up when the kernel runs inside notebooks/,
# otherwise it is the current working directory
BASE = Path(".." if Path.cwd().name == "notebooks" else ".")
RAW_CSV = BASE.joinpath("dataset", "raw", "ALLFLOWMETER_HIKARI2021.csv")
PROC_DIR = BASE.joinpath("dataset", "processed")
# create the output folder (and any missing parents); no error if it already exists
PROC_DIR.mkdir(parents=True, exist_ok=True)


In [20]:
# --- Load the dataset ---
# read the HIKARI-2021 CSV; low_memory=False lets pandas infer each column's
# dtype from the whole file instead of chunk by chunk
df = pd.read_csv(RAW_CSV, low_memory=False)
print("Raw dataset shape:", df.shape)

# stop early if the target column is missing (0=normal, 1=attack)
assert "Label" in df, "Expected a 'Label' column in the dataset."


Raw dataset shape: (555278, 88)


In [22]:
# --- Drop leakage and irrelevant columns ---
# candidate columns that either leak label info or don't contribute to model learning
LEAKY_OR_ID = [
    # duplicates the label info (causes leakage)
    "traffic_category",
    "attack_cat", "attack_category", "label_name",
    # other variations of the same label
    "LabelNum", "label_num",
    # identifiers or hostnames (high cardinality)
    "uid", "originh", "responh",
    # leftover index columns
    "Unnamed: 0", "Unnamed: 0.1",
]

# keep the list order, but only the names actually present in this dataframe
to_drop = [name for name in LEAKY_OR_ID if name in df.columns]
if to_drop:
    df = df.drop(columns=to_drop)
print("Dropped columns:", to_drop or "None")


Dropped columns: ['traffic_category', 'uid', 'originh', 'responh', 'Unnamed: 0', 'Unnamed: 0.1']


In [24]:
# --- Remove duplicate rows ---
# exact duplicate rows could otherwise end up on both sides of the
# train/test split, which would leak test data into training
before = df.shape[0]
df = df.drop_duplicates()
print(f"Removed {before - len(df)} duplicate rows. New size: {len(df)}")


Removed 36358 duplicate rows. New size: 518920


In [26]:
# --- Separate features and labels ---
# everything except 'Label' is a candidate feature; 'Label' (cast to int) is the target
X_all = df.drop(columns=["Label"])
y = df["Label"].astype(int)

# keep only the numeric columns to simplify preprocessing
num_cols = X_all.select_dtypes(include=[np.number]).columns.tolist()
X = X_all.loc[:, num_cols].copy()

print(f"Kept {len(num_cols)} numeric features out of {X_all.shape[1]} total columns.")


Kept 81 numeric features out of 81 total columns.


In [28]:
# --- Train-test split ---
# 80/20 split, stratified on y so both sets keep the same class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=RANDOM_STATE,
)

print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)
# class shares as percentages, rounded to two decimals
print("\nTraining class balance (%):\n", y_train.value_counts(normalize=True).mul(100).round(2))
print("\nTesting class balance (%):\n", y_test.value_counts(normalize=True).mul(100).round(2))


Train shape: (415136, 81)
Test shape : (103784, 81)

Training class balance (%):
 Label
0    92.77
1     7.23
Name: proportion, dtype: float64

Testing class balance (%):
 Label
0    92.77
1     7.23
Name: proportion, dtype: float64


In [30]:
# --- Scaling and imputing numeric features ---
# two-step pipeline: median imputation of missing values, then standard scaling.
# It is fitted on the training split only, so no test-set statistics leak in.
preprocessor = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("scale", StandardScaler()),
    ]
).fit(X_train)

# apply the same fitted preprocessor to both splits (train first, then test)
X_train_proc, X_test_proc = (
    preprocessor.transform(split) for split in (X_train, X_test)
)

print("After preprocessing:", X_train_proc.shape, X_test_proc.shape)


After preprocessing: (415136, 81) (103784, 81)


In [36]:
# --- Balancing with SMOTE (on training set only) ---
# moderate oversampling: aim for roughly 70:30 benign:attack rather than 50:50
TARGET_RATIO = 0.30 / 0.70  # minority/majority ratio, ≈ 0.43

smote = SMOTE(sampling_strategy=TARGET_RATIO, random_state=RANDOM_STATE)
# only the (already preprocessed) training split is resampled
X_train_bal, y_train_bal = smote.fit_resample(X_train_proc, y_train)

print("Balanced training shape:", X_train_bal.shape)
print("\nClass distribution after SMOTE (train):")

# per-class counts and their share of the balanced training set, side by side
value_counts = pd.Series(y_train_bal).value_counts()
percentages = value_counts.div(len(y_train_bal)).mul(100).round(2)
summary_df = value_counts.to_frame("Count").assign(Percentage=percentages)

print(summary_df)




Balanced training shape: (550162, 81)

Class distribution after SMOTE (train):
        Count  Percentage
Label                    
0      385114        70.0
1      165048        30.0


In [38]:
# --- Save processed data and objects ---
# feature matrices go to .npy files
for npy_name, matrix in (
    ("X_train_bal_70_30.npy", X_train_bal),
    ("X_test_proc.npy", X_test_proc),
):
    np.save(PROC_DIR / npy_name, matrix)

# labels go to single-column CSVs named "Label", without the index
for csv_name, labels in (
    ("y_train_bal_70_30.csv", y_train_bal),
    ("y_test.csv", y_test),
):
    pd.Series(labels, name="Label").to_csv(PROC_DIR / csv_name, index=False)

# fitted imputer+scaler pipeline, saved for the next notebook
joblib.dump(preprocessor, PROC_DIR / "preprocessor.joblib")

print("Saved all processed files to:", PROC_DIR)


Saved all processed files to: ..\dataset\processed
