In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Input and output locations, relative to the notebook's working directory.
raw_path = "../data/raw/cicids2017_cleaned.csv"
out_path = "../data/processed/cicids_processed.csv"
# Create the output folder from out_path itself so the directory can never drift
# out of sync with the file that is written at the end of this cell.
Path(out_path).parent.mkdir(parents=True, exist_ok=True)

# read the dataset
# NOTE(review): the recorded run of this cell reports 1,048,575 rows, which is
# exactly Excel's worksheet row limit minus one header row - confirm the raw CSV
# was not truncated by a spreadsheet round-trip.
df = pd.read_csv(raw_path)

# unify the label column
# The label may be exported as "Attack Type" or "Label" (checked in that order);
# headers are compared with surrounding whitespace stripped so " Label" still matches.
# "attack_type" is accepted last so an already-renamed frame passes through unchanged.
label_candidates = ("Attack Type", "Label", "attack_type")
label_col = next(
    (col for name in label_candidates for col in df.columns if str(col).strip() == name),
    None,
)
if label_col is None:
    # Fail here with a clear message instead of a bare KeyError: 'attack_type' below.
    raise KeyError(f"No label column found; expected one of {label_candidates}")
df = df.rename(columns={label_col: "attack_type"})

# create binary flag: 1=attack, 0=normal
# "Normal Traffic" and "BENIGN" both denote normal flows; without the second
# value, a frame labelled via the "Label" fallback would be flagged 100% attack.
# Rows whose label is missing are not recognised as normal and are flagged 1.
normal_labels = {"normal traffic", "benign"}
label_norm = df["attack_type"].astype(str).str.strip().str.lower()
df["is_attack"] = (~label_norm.isin(normal_labels)).astype(int)

# handle infinities and missing values
# Treat +/-inf as missing so they are imputed together with genuine NaNs.
df = df.replace([np.inf, -np.inf], np.nan)

# Impute numeric feature columns only: a median is undefined for text columns
# (Series.median raises on them) and the label/flag columns must stay untouched.
excluded_cols = {"attack_type", "is_attack"}
num_cols = [
    c for c in df.select_dtypes(include=[np.number]).columns
    if c not in excluded_cols
]
if num_cols:
    col_medians = df[num_cols].median()  # NaNs are skipped when computing medians
    df[num_cols] = df[num_cols].fillna(col_medians)

    # A column with no finite values has a NaN median and cannot be filled;
    # report it instead of silently writing NaNs into the processed file.
    has_missing = df[num_cols].isna().any()
    unfilled = list(has_missing[has_missing].index)
    if unfilled:
        print("Warning: columns left with missing values (no finite data):", unfilled)

# Gather the summary figures first; they do not depend on the file being written.
processed_shape = df.shape
attack_counts = df["is_attack"].value_counts()

# Persist the processed frame (row index is not written to the CSV).
df.to_csv(out_path, index=False)

# Report what was produced and where it went.
print("Processed shape:", processed_shape)
print("Attack distribution:")
print(attack_counts)
print("Saved to", out_path)

# Last expression of the cell: rich preview of the first five rows.
df.head(5)


Processed shape: (1048575, 54)
Attack distribution:
is_attack
0    818574
1    230001
Name: count, dtype: int64
Saved to ../data/processed/cicids_processed.csv


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Length of Fwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Max,Active Min,Idle Mean,Idle Max,Idle Min,attack_type,is_attack
0,22,1266342,41,2664,456,0,64.97561,109.864573,976,0,...,24,32,0.0,0,0,0.0,0,0,Normal Traffic,0
1,22,1319353,41,2664,456,0,64.97561,109.864573,976,0,...,24,32,0.0,0,0,0.0,0,0,Normal Traffic,0
2,22,160,1,0,0,0,0.0,0.0,0,0,...,0,32,0.0,0,0,0.0,0,0,Normal Traffic,0
3,22,1303488,41,2728,456,0,66.536585,110.129945,976,0,...,24,32,0.0,0,0,0.0,0,0,Normal Traffic,0
4,35396,77,1,0,0,0,0.0,0.0,0,0,...,0,32,0.0,0,0,0.0,0,0,Normal Traffic,0
