In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Preprocess the raw connection records into a model-ready table:
# normalise column names, derive a binary attack target, clean the numeric
# features, one-hot encode the categorical features, and write the result
# to CSV.

raw_path = "../data/raw/basic.csv"
out_path = "../data/processed/basic_processed.csv"
# Derive the output directory from out_path so the two cannot drift apart.
Path(out_path).parent.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(raw_path)

# standardise column names
df.columns = [c.strip().lower().replace('-', '_') for c in df.columns]

# convert categorical to category type
# Keep only the categorical columns that are actually present, and reuse this
# filtered list everywhere below; passing an absent column to get_dummies
# would raise a KeyError.
cat_cols = [c for c in ["protocol_type", "service", "flag"] if c in df.columns]
df = df.astype({c: "category" for c in cat_cols})

# add binary attack flag (1=attack, 0=normal)
# Strip stray whitespace before comparing, and refuse to continue on missing
# labels: NaN != "normal" evaluates to True, which would silently mark
# unlabelled rows as attacks.
labels = df["label"].str.strip().str.lower()
n_missing = int(labels.isna().sum())
if n_missing:
    raise ValueError(
        f"{n_missing} row(s) have a missing label; cannot derive is_attack"
    )
df["is_attack"] = (labels != "normal").astype(int)

# numeric columns
# Select by dtype rather than by exclusion so an unexpected text column does
# not end up in the median imputation below.
excluded = set(cat_cols) | {"label", "is_attack"}
num_cols = [
    c for c in df.select_dtypes(include="number").columns if c not in excluded
]

# replace infinities and fill missing values
# Infinities become NaN first so they are imputed with the column median too
# (and do not distort that median).
num_clean = df[num_cols].replace([np.inf, -np.inf], np.nan)
df[num_cols] = num_clean.fillna(num_clean.median())

# one-hot encode categoricals
# drop_first=True drops one indicator per categorical column.
df_enc = pd.get_dummies(df, columns=cat_cols, drop_first=True)

print("Processed shape:", df_enc.shape)
print("Attack distribution:")
print(df_enc["is_attack"].value_counts())

# save
df_enc.to_csv(out_path, index=False)
print("Saved to", out_path)
df_enc.head()


Processed shape: (25192, 85)
Attack distribution:
is_attack
0    13449
1    11743
Name: count, dtype: int64
Saved to ../data/processed/basic_processed.csv


Unnamed: 0,duration,src_bytes,dst_bytes,count,srv_count,serror_rate,label,is_attack,protocol_type_tcp,protocol_type_udp,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,491.0,0.0,2.0,2.0,0.0,normal,0,True,False,...,False,False,False,False,False,False,False,False,True,False
1,0.0,146.0,0.0,13.0,1.0,0.0,normal,0,False,True,...,False,False,False,False,False,False,False,False,True,False
2,0.0,0.0,0.0,123.0,6.0,1.0,neptune,1,True,False,...,False,False,False,False,True,False,False,False,False,False
3,0.0,232.0,8153.0,5.0,5.0,0.2,normal,0,True,False,...,False,False,False,False,False,False,False,False,True,False
4,0.0,199.0,420.0,30.0,32.0,0.0,normal,0,True,False,...,False,False,False,False,False,False,False,False,True,False
