In [3]:
import pandas as pd
import glob

# Name of the combined CSV that a later cell writes into the same folder.
# It must never be treated as a raw input: on a re-run it would be read back
# in alongside the per-day captures and every row would be ingested twice.
COMBINED_CSV_NAME = "combined_cleaned.csv"

# glob returns matches in arbitrary, filesystem-dependent order. Sorting makes
# the row order of the combined frame (and so every seeded train/test split
# derived from it) the same on every machine and every run.
files = sorted(
    path for path in glob.glob("dataset/*.csv")
    if not path.endswith(COMBINED_CSV_NAME)
)

print(files)  # check if files are detected

# pd.concat on an empty list fails with an opaque "No objects to concatenate";
# fail early with a message that points at the real problem instead.
if not files:
    raise FileNotFoundError("No input CSV files matched 'dataset/*.csv'")

# Read every capture file and stack them row-wise under a fresh 0..n-1 index.
df_list = [pd.read_csv(file) for file in files]
df = pd.concat(df_list, ignore_index=True)

print(df.shape)


['dataset\\Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv', 'dataset\\Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv', 'dataset\\Friday-WorkingHours-Morning.pcap_ISCX.csv', 'dataset\\Monday-WorkingHours.pcap_ISCX.csv', 'dataset\\Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv', 'dataset\\Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv', 'dataset\\Tuesday-WorkingHours.pcap_ISCX.csv', 'dataset\\Wednesday-workingHours.pcap_ISCX.csv']
(2830743, 79)


In [None]:
# Normalise the header: remove leading/trailing whitespace from column names.
df.columns = df.columns.str.strip()

# Apply the same trimming to the values of every object-dtype (string) column.
text_columns = df.select_dtypes(include='object').columns
for name in text_columns:
    trimmed = df[name].str.strip()
    df[name] = trimmed

# Write the combined frame to a single CSV, without the index column.
# NOTE(review): this snapshot is taken before the later cells repair the
# label text and drop rows with inf/NaN values.
df.to_csv("dataset/combined_cleaned.csv", index=False)

In [None]:
# Sanity check: print the (rows, columns) tuple of the combined frame.
print(df.shape)

In [10]:
# Show which columns have object dtype, i.e. the ones that hold strings
# and would need encoding before being used as numeric input.
print(df.select_dtypes(include='object').columns)

Index(['Label'], dtype='object')


In [12]:
# Trim stray whitespace from the header and from the class labels.
df.columns = df.columns.str.strip()
stripped_labels = df['Label'].str.strip()
df['Label'] = stripped_labels

# Inspect the distinct label values and the frame dimensions.
label_values = df['Label'].unique()
print(label_values)
print(df.shape)

['BENIGN' 'DDoS' 'PortScan' 'Bot' 'Infiltration'
 'Web Attack � Brute Force' 'Web Attack � XSS'
 'Web Attack � Sql Injection' 'FTP-Patator' 'SSH-Patator' 'DoS slowloris'
 'DoS Slowhttptest' 'DoS Hulk' 'DoS GoldenEye' 'Heartbleed']
(2830743, 79)


In [14]:
# Some "Web Attack" labels contain the replacement character '�' (presumably
# a dash that was lost to an encoding problem). Swap it for a plain hyphen,
# matching it literally rather than as a regular expression.
repaired_labels = df['Label'].str.replace('�', '-', regex=False)
df['Label'] = repaired_labels

# Confirm the label set after the repair.
print(df['Label'].unique())


['BENIGN' 'DDoS' 'PortScan' 'Bot' 'Infiltration'
 'Web Attack - Brute Force' 'Web Attack - XSS'
 'Web Attack - Sql Injection' 'FTP-Patator' 'SSH-Patator' 'DoS slowloris'
 'DoS Slowhttptest' 'DoS Hulk' 'DoS GoldenEye' 'Heartbleed']


In [18]:
import numpy as np

# Restrict the check to numeric columns.
numeric_df = df.select_dtypes(include=[np.number])

# Count +/-inf cells and NaN cells over all numeric columns
# (per-column sums, then a grand total).
infinite_total = np.isinf(numeric_df).sum().sum()
missing_total = numeric_df.isnull().sum().sum()

print("Total Infinite Values:", infinite_total)
print("Total Missing Values:", missing_total)


Total Infinite Values: 4376
Total Missing Values: 1358


In [20]:
# Treat +/-inf as missing, then discard every row that has any missing value.
# Both steps modify `df` in place.
non_finite = [np.inf, -np.inf]
df.replace(non_finite, np.nan, inplace=True)
df.dropna(inplace=True)

# Rebuild the numeric view and verify nothing infinite or missing is left.
numeric_df = df.select_dtypes(include=[np.number])
remaining_inf = np.isinf(numeric_df).sum().sum()
remaining_nan = numeric_df.isnull().sum().sum()
print("After Cleaning - Infinite:", remaining_inf)
print("After Cleaning - Missing:", remaining_nan)

After Cleaning - Infinite: 0
After Cleaning - Missing: 0


In [35]:
# Work on a copy so the string-labelled frame `df` stays untouched.
df_binary = df.copy()

# Binary target: 0 for "BENIGN", 1 for every other label.
# A vectorised comparison replaces the per-row Python lambda that was run
# through .apply(); the values are identical. The dtype is pinned to int64
# (what the apply() version produced) so it does not vary with the platform's
# default integer width.
df_binary['Label'] = (df_binary['Label'] != "BENIGN").astype("int64")

# Show the class balance, then persist the binary dataset without the index.
print(df_binary['Label'].value_counts())
df_binary.to_parquet("dataset/binary_dataset.parquet", index=False)

Label
0    2271320
1     556556
Name: count, dtype: int64


In [24]:
# Class balance of the binary target as fractions of all rows
# (normalize=True) rather than raw counts.
df_binary['Label'].value_counts(normalize=True)

Label
0    0.803189
1    0.196811
Name: proportion, dtype: float64

In [26]:
# Separate the binary frame into the target column and the feature matrix.
y = df_binary["Label"]
X = df_binary.drop(columns="Label")

In [37]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the string-labelled frame `df` stays untouched.
df_multiclass = df.copy()

# Learn the label vocabulary, then replace each label string by its class id.
le = LabelEncoder()
le.fit(df_multiclass['Label'])
df_multiclass['Label'] = le.transform(df_multiclass['Label'])

# Print the label -> id mapping the encoder learned.
label_to_id = {
    name: code
    for name, code in zip(le.classes_, le.transform(le.classes_))
}
print(label_to_id)

{'BENIGN': 0, 'Bot': 1, 'DDoS': 2, 'DoS GoldenEye': 3, 'DoS Hulk': 4, 'DoS Slowhttptest': 5, 'DoS slowloris': 6, 'FTP-Patator': 7, 'Heartbleed': 8, 'Infiltration': 9, 'PortScan': 10, 'SSH-Patator': 11, 'Web Attack - Brute Force': 12, 'Web Attack - Sql Injection': 13, 'Web Attack - XSS': 14}


In [39]:
# Persist df_multiclass as Parquet, omitting the index.
df_multiclass.to_parquet("dataset/multiclass_dataset.parquet", index=False)

In [41]:
# Row count per attack class.
class_counts = df['Label'].value_counts()
print(class_counts)

# Classes with fewer than this many rows are considered rare.
threshold = 1000

rare_classes = class_counts[class_counts < threshold].index

# Work on a copy so the original labels in `df` stay untouched.
df_merged = df.copy()

# Merge all rare classes into a single "Other_Attack" category.
# isin() + mask() relabels the column in one vectorised pass instead of
# calling a Python lambda (with an Index membership test) once per row.
is_rare = df_merged['Label'].isin(rare_classes)
df_merged['Label'] = df_merged['Label'].mask(is_rare, "Other_Attack")

print(df_merged['Label'].value_counts())

Label
BENIGN                        2271320
DoS Hulk                       230124
PortScan                       158804
DDoS                           128025
DoS GoldenEye                   10293
FTP-Patator                      7935
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1956
Web Attack - Brute Force         1507
Web Attack - XSS                  652
Infiltration                       36
Web Attack - Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64
Label
BENIGN                      2271320
DoS Hulk                     230124
PortScan                     158804
DDoS                         128025
DoS GoldenEye                 10293
FTP-Patator                    7935
SSH-Patator                    5897
DoS slowloris                  5796
DoS Slowhttptest               5499
Bot                            1956
Web Attack - Brute Force       1

In [43]:
from sklearn.preprocessing import LabelEncoder

# Encode the merged label strings as class ids.
# NOTE(review): this rebinds `le`, replacing the encoder fitted for
# df_multiclass in the earlier cell.
le = LabelEncoder()
encoded_labels = le.fit_transform(df_merged['Label'])
df_merged['Label'] = encoded_labels

# Print the label -> id mapping the encoder learned.
class_ids = le.transform(le.classes_)
print(dict(zip(le.classes_, class_ids)))

{'BENIGN': 0, 'Bot': 1, 'DDoS': 2, 'DoS GoldenEye': 3, 'DoS Hulk': 4, 'DoS Slowhttptest': 5, 'DoS slowloris': 6, 'FTP-Patator': 7, 'Other_Attack': 8, 'PortScan': 9, 'SSH-Patator': 10, 'Web Attack - Brute Force': 11}


In [47]:
# Save the new dataset.
# The rare attacks were merged into one class, producing another dataset;
# it is written to disk so that it can be used in other notebooks.
df_merged.to_parquet("dataset/multiclass_merged_dataset.parquet", index=False)

In [48]:
from sklearn.model_selection import train_test_split
import numpy as np

# 70/30 train/test split with a fixed seed, stratified on the target `y`.
split_options = {
    "test_size": 0.3,
    "random_state": 42,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [51]:
# Persist the four partitions of the split as .npy files.
for path, part in (
    ("dataset/X_train.npy", X_train),
    ("dataset/X_test.npy", X_test),
    ("dataset/y_train.npy", y_train),
    ("dataset/y_test.npy", y_test),
):
    np.save(path, part)


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the binary dataset written earlier.
df_binary = pd.read_parquet("dataset/binary_dataset.parquet")

# Separate the target column from the features.
y_bin = df_binary["Label"]
X_bin = df_binary.drop(columns="Label")

# 70/30 split with a fixed seed, stratified on the target.
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
    X_bin, y_bin, test_size=0.3, random_state=42, stratify=y_bin
)

# Write each partition to disk as a NumPy array, in the listed order.
binary_outputs = {
    "dataset/X_train_binary.npy": X_train_bin,
    "dataset/X_test_binary.npy": X_test_bin,
    "dataset/y_train_binary.npy": y_train_bin,
    "dataset/y_test_binary.npy": y_test_bin,
}
for path, part in binary_outputs.items():
    np.save(path, part.to_numpy())

print("Binary split saved.")


Binary split saved.


In [3]:
# Load the merged multiclass dataset written earlier.
df_multi = pd.read_parquet("dataset/multiclass_merged_dataset.parquet")

# Separate the target column from the features.
y_multi = df_multi["Label"]
X_multi = df_multi.drop(columns="Label")

# 70/30 split with a fixed seed, stratified on the target.
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.3, random_state=42, stratify=y_multi
)

# Write each partition to disk as a NumPy array, in the listed order.
multiclass_outputs = {
    "dataset/X_train_multi.npy": X_train_multi,
    "dataset/X_test_multi.npy": X_test_multi,
    "dataset/y_train_multi.npy": y_train_multi,
    "dataset/y_test_multi.npy": y_test_multi,
}
for path, part in multiclass_outputs.items():
    np.save(path, part.to_numpy())

print("Multiclass split saved.")


Multiclass split saved.


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Reload the persisted binary dataset.
# NOTE(review): this rebinds `df`, the name earlier cells use for the full
# string-labelled frame; re-running those cells afterwards would operate on
# the binary data instead.
df = pd.read_parquet("dataset/binary_dataset.parquet")

# Verify that only the two expected label values are present.
binary_label_values = df["Label"].unique()
print("Unique labels in binary dataset:", binary_label_values)


Unique labels in binary dataset: [0 1]


In [7]:
# Separate the target column from the features.
y = df["Label"]
X = df.drop(columns="Label")

# 70/30 split with a fixed seed, stratified on the target.
split_options = {"test_size": 0.3, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Check which label values ended up in the test partition.
test_label_values = np.unique(y_test)
print("Binary test unique:", test_label_values)


Binary test unique: [0 1]


In [9]:
# Persist the binary split as .npy arrays.
# .to_numpy() is used instead of .values to match the other split-saving
# cells in this notebook; it is also the accessor pandas recommends.
# NOTE(review): these are the same paths the earlier "Binary split saved."
# cell writes, so this overwrites those files.
np.save("dataset/X_train_binary.npy", X_train.to_numpy())
np.save("dataset/X_test_binary.npy", X_test.to_numpy())
np.save("dataset/y_train_binary.npy", y_train.to_numpy())
np.save("dataset/y_test_binary.npy", y_test.to_numpy())
