%% [markdown]<br>
# Sybil Attack Detection with Multiple Classifiers<br>
This code evaluates the performance of different classifiers on the Sybil Attack dataset.

%% [module imports]

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

%% [markdown]<br>
## Data Extraction<br>
Load the dataset.

%% [data loading]

In [2]:
file_path_full_training_set = "../Sybil Attack Detection/New folder/KDDTrain+.txt"
file_path_test = "../Sybil Attack Detection/New folder/KDDTest+.txt"

Load the datasets

In [3]:
df = pd.read_csv(file_path_full_training_set)
test_df = pd.read_csv(file_path_test)

%% [markdown]<br>
## Data Preprocessing<br>
Add column names and create binary attack flags.

%% [data preprocessing]<br>
Add column names

In [4]:
columns = [
    "duration",
    "protocol_type",
    "service",
    "flag",
    "src_bytes",
    "dst_bytes",
    "land",
    "wrong_fragment",
    "urgent",
    "hot",
    "num_failed_logins",
    "logged_in",
    "num_compromised",
    "root_shell",
    "su_attempted",
    "num_root",
    "num_file_creations",
    "num_shells",
    "num_access_files",
    "num_outbound_cmds",
    "is_host_login",
    "is_guest_login",
    "count",
    "srv_count",
    "serror_rate",
    "srv_serror_rate",
    "rerror_rate",
    "srv_rerror_rate",
    "same_srv_rate",
    "diff_srv_rate",
    "srv_diff_host_rate",
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
    "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
    "attack",
    "level",
]

In [5]:
df.columns = columns
test_df.columns = columns

Create binary attack flags

In [6]:
df["attack_flag"] = df["attack"].map(lambda a: 0 if a == "normal" else 1)
test_df["attack_flag"] = test_df["attack"].map(lambda a: 0 if a == "normal" else 1)

%% [markdown]<br>
## Feature Engineering<br>
Encode categorical features and prepare the dataset for modeling.

%% [feature engineering]<br>
One-hot encode categorical features

In [7]:
features_to_encode = ["protocol_type", "service", "flag"]
encoded = pd.get_dummies(df[features_to_encode])
test_encoded_base = pd.get_dummies(test_df[features_to_encode])

Handle column differences between train and test sets

In [8]:
test_index = np.arange(len(test_df.index))
column_diffs = list(set(encoded.columns.values) - set(test_encoded_base.columns.values))
diff_df = pd.DataFrame(0, index=test_index, columns=column_diffs)

Reorder columns to match

In [9]:
column_order = encoded.columns.to_list()
test_encoded_temp = test_encoded_base.join(diff_df)
test_final = test_encoded_temp[column_order].fillna(0)

Get numeric features

In [10]:
numeric_features = ["duration", "src_bytes", "dst_bytes"]
to_fit = encoded.join(df[numeric_features])
test_set = test_final.join(test_df[numeric_features])

Create target variable

In [11]:
binary_y = df["attack_flag"]
test_binary_y = test_df["attack_flag"]

Split the dataset into training and validation sets

In [12]:
binary_train_X, binary_val_X, binary_train_y, binary_val_y = train_test_split(
    to_fit, binary_y, test_size=0.2, random_state=42
)

%% [markdown]<br>
## Model Training and Evaluation<br>
Train and evaluate multiple classifiers.

%% [model evaluation]<br>
Define the models to evaluate

In [13]:
models = {
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
}

Store the accuracy results

In [14]:
accuracy_results = {}

Train and evaluate each model

In [15]:
for model_name, model in models.items():
    # Fit the model
    model.fit(binary_train_X, binary_train_y)

    # Make predictions
    predictions = model.predict(binary_val_X)

    # Calculate accuracy
    accuracy

NameError: name 'accuracy' is not defined