In [None]:
# Standard library
import itertools
import random
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

# Third-party: numerics and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Model imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import svm

# Processing imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

**Data extraction**
Here we load our dataset.

In [None]:
# Data extraction
# The NSL-KDD .txt files are comma-separated and carry no header row, so they
# must be read with header=None. Without it pandas uses the first record as
# column labels, and that record is lost once the real names are assigned.
file_path_20_percent = '../Sybil Attack Detection/New folder/nsl-kdd/KDDTrain+_20Percent.txt'
# NOTE(review): the "full training set" path below points at KDDTest+.txt, the
# same file as file_path_test, so training and test data are identical.
# Presumably this should reference KDDTrain+.txt - confirm the real location.
file_path_full_training_set = '../Sybil Attack Detection/New folder/KDDTest+.txt'
file_path_test = '../Sybil Attack Detection/New folder/KDDTest+.txt'

df = pd.read_csv(file_path_full_training_set, header=None)
test_df = pd.read_csv(file_path_test, header=None)

In [None]:
# Column names, in file order: the per-connection features followed by the
# 'attack' label and the 'level' column.
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'attack', 'level',
]

# Both frames share the same layout, so label them with the same list.
for frame in (df, test_df):
    frame.columns = columns

# sanity check
df.head(10)

**Data transformations**
I am adding a column that encodes 'normal' values as 0 and any other value as 1. We will use this as the target for a simple binary model that identifies any attack.

In [None]:
# Binary target: 0 where the label is 'normal', 1 for every other label
is_attack = df['attack'].ne('normal').astype('int64')
test_attack = test_df['attack'].ne('normal').astype('int64')

df['attack_flag'] = is_attack
test_df['attack_flag'] = test_attack

# view the result
df.head()

I classify each of the attacks according to attack type for a more granular prediction model.

In [None]:
# lists to hold our attack classifications
# Attack type classification
# The strings must match the values of the `attack` column exactly: any label
# that is not listed here is treated as class 0 (Normal) by map_attack.
# 'loadmodule', 'httptunnel' and 'xlock' were previously misspelled
# ('loadmdoule', 'http_tunnel', 'xclock') and therefore counted as Normal.
dos_attacks = ['apache2','back','land','neptune','mailbomb','pod','processtable','smurf','teardrop','udpstorm','worm']
probe_attacks = ['ipsweep','mscan','nmap','portsweep','saint','satan']
U2R = ['buffer_overflow','loadmodule','perl','ps','rootkit','sqlattack','xterm']
Sybil = ['ftp_write','guess_passwd','httptunnel','imap','multihop','named','phf','sendmail',
         'snmpgetattack','snmpguess','spy','warezclient','warezmaster','xlock','xsnoop']
attack_labels = ['Normal','DoS','Probe','U2R','Sybil']

# Label -> class index, built once from the lists above. Indices line up with
# attack_labels (1 = DoS, 2 = Probe, 3 = U2R, 4 = Sybil).
_ATTACK_CLASS = {
    label: class_id
    for class_id, group in enumerate((dos_attacks, probe_attacks, U2R, Sybil), start=1)
    for label in group
}

def map_attack(attack):
    """Return the class index for a raw attack label.

    Parameters
    ----------
    attack : str
        Value taken from the `attack` column.

    Returns
    -------
    int
        1 for DoS, 2 for Probe, 3 for U2R, 4 for Sybil, and 0 for anything
        else (including 'normal' and labels not present in any list).
    """
    return _ATTACK_CLASS.get(attack, 0)

# Multi-class target: translate each raw label into its class index
attack_map = df['attack'].map(map_attack)
test_attack_map = test_df['attack'].map(map_attack)

df['attack_map'] = attack_map
test_df['attack_map'] = test_attack_map

# view the result
df.head()

**Data profiling**
Here is a table of attack counts by protocol.

In [None]:
# attack vs MCS protocols: one row per attack label, one column per protocol
attack_vs_protocol = pd.crosstab(index=df['attack'], columns=df['protocol_type'])
attack_vs_protocol

**Visual Representation of Dataset**
Let's see how things are distributed.

In [None]:
# Visualization helper function
def bake_pies(data_list, labels, legend_title="Flags"):
    """Draw one pie chart per data set, side by side, with shared colours.

    Parameters
    ----------
    data_list : sequence of pandas.Series
        Each series provides the wedge sizes; its index provides the
        categories listed in that pie's legend.
    labels : sequence of str
        Title of each pie, in the same order as ``data_list``.
    legend_title : str, optional
        Heading shown above every legend. Defaults to "Flags", the title
        that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        1-D array holding the Axes of each pie, in input order.

    Raises
    ------
    ValueError
        If ``data_list`` is empty or ``labels`` has fewer entries than
        ``data_list``.
    """
    list_length = len(data_list)
    if list_length == 0:
        raise ValueError("bake_pies needs at least one data set")
    if len(labels) < list_length:
        raise ValueError("bake_pies needs one label per data set")

    color_cycle = itertools.cycle(sns.color_palette())
    cdict = {}

    # squeeze=False always yields a 2-D array of Axes, even for a single pie;
    # flattening it keeps positional indexing valid for any number of pies.
    _, axs = plt.subplots(1, list_length, figsize=(18,10), tight_layout=False,
                          squeeze=False)
    axs = axs.ravel()
    plt.subplots_adjust(wspace=1/list_length)

    for ax, data_set, label in zip(axs, data_list, labels):
        # A category keeps the colour it was first assigned, so it is drawn
        # in the same colour in every pie. The palette is cycled, so colours
        # repeat once there are more categories than palette entries.
        for value in np.unique(data_set.index):
            if value not in cdict:
                cdict[value] = next(color_cycle)

        wedges, texts = ax.pie(data_set,
                               colors=[cdict[v] for v in data_set.index])
        ax.legend(wedges, data_set.index,
                  title=legend_title,
                  loc="center left",
                  bbox_to_anchor=(1, 0, 0.5, 1))
        ax.set_title(label)
    return axs

In [None]:
# Protocol distribution visualization: one pie of attack labels per protocol
protocol_names = ['icmp', 'tcp', 'udp']
icmp_attacks, tcp_attacks, udp_attacks = (
    attack_vs_protocol[name] for name in protocol_names
)
bake_pies([icmp_attacks, tcp_attacks, udp_attacks], protocol_names)
plt.show()

Our monitoring strategy analyses a continuous series of network connections across protocols and is capable of learning the subtle distinctions between threats and legitimate traffic. Historical traffic is fed to the machine learning model so that it can classify Sybil and other attacks (DDoS, Probe, U2R). This helps to identify recurring patterns of Sybil, DDoS, U2R and Probe attacks and to locate them in a long-term traffic chain.

In [None]:
# Flag distribution visualization: normal traffic next to attack traffic
normal_flags = df.loc[df['attack_flag'] == 0, 'flag'].value_counts()
attack_flags = df.loc[df['attack_flag'] == 1, 'flag'].value_counts()

flag_axs = bake_pies([normal_flags, attack_flags], ['normal','attack'])
plt.show()

In [None]:
# Service distribution visualization: normal traffic next to attack traffic
normal_services = df.loc[df['attack_flag'] == 0, 'service'].value_counts()
attack_services = df.loc[df['attack_flag'] == 1, 'service'].value_counts()

service_axs = bake_pies([normal_services, attack_services], ['normalMCS','attack'])
plt.show()

Our data show that the bulk of normal traffic is http, while attack traffic is spread all over the MCS services. Sybil attacks are searching for many different paths into MCS systems; some well traveled and some not.



**Feature engineering**


In [None]:
# Feature engineering
# One-hot encode the categorical columns of both frames.
features_to_encode = ['protocol_type', 'service', 'flag']
encoded = pd.get_dummies(df[features_to_encode])
test_encoded_base = pd.get_dummies(test_df[features_to_encode])

# Align the test dummies to the training layout in one step: dummy columns
# absent from the test data are added as all-zero, columns never seen in
# training are dropped, and the order follows `encoded`. Unlike joining a
# hand-built zero frame, this does not rely on test_df having a 0..n-1 index.
column_order = encoded.columns.to_list()
test_final = test_encoded_base.reindex(columns=column_order, fill_value=0)

# Add the numeric features that are used unencoded.
numeric_features = ['duration', 'src_bytes', 'dst_bytes']
to_fit = encoded.join(df[numeric_features])
test_set = test_final.join(test_df[numeric_features])

# The target vectors (binary_y, multi_y, test_binary_y, test_multi_y) are
# defined in the train/validation split cell that follows, so they are not
# repeated here.

I divide the data into training and validation sets to start, once for the binary classification and once for the multi-class classification.

In [None]:
# Prepare training and testing sets
binary_y = df['attack_flag']
multi_y = df['attack_map']
test_binary_y = test_df['attack_flag']
test_multi_y = test_df['attack_map']

# Fixed seed so the split, and every score derived from it, is the same on
# each run. test_size=0.6 holds out 60% of the rows for validation.
SPLIT_SEED = 42

binary_train_X, binary_val_X, binary_train_y, binary_val_y = train_test_split(
    to_fit, binary_y, test_size=0.6, random_state=SPLIT_SEED)
multi_train_X, multi_val_X, multi_train_y, multi_val_y = train_test_split(
    to_fit, multi_y, test_size=0.6, random_state=SPLIT_SEED)

**Model fitting**
Based on the nature of the data, decision trees are a good starting point for building out predictive models. In this case we'll use a random forest to build and combine multiple trees. 

In [None]:
# Model fitting and evaluation
# Seeded so the fitted forest and its score are repeatable between runs.
binary_model = RandomForestClassifier(random_state=42)
binary_model.fit(binary_train_X, binary_train_y)
binary_predictions = binary_model.predict(binary_val_X)

# accuracy_score takes (y_true, y_pred); the value is the same either way,
# but the conventional order avoids mistakes when swapping in other metrics.
base_rf_score = accuracy_score(binary_val_y, binary_predictions)
base_rf_score

The baseline random forest reaches about 99% accuracy on the validation split.


Compare with different models using `cross_val_score`.

In [None]:
# Candidate models, configured lightly so the comparison runs quickly.
# The randomised estimators are seeded so the scores are repeatable.
# NOTE(review): the features are passed unscaled; that may explain weak
# logistic-regression scores and slow linear-SVC fits - consider adding a
# scaler before drawing conclusions.
models = [
    RandomForestClassifier(n_estimators=50, random_state=42),  # 50 trees
    LogisticRegression(max_iter=500),  # iteration cap of 500
    SVC(kernel="linear", C=1.0),
    DecisionTreeClassifier(max_depth=10, random_state=42),  # depth-limited tree
]

# One (model_name, fold_index, accuracy) record per cross-validation fold.
model_comps = []

# 3-fold cross-validation on the binary training split, using all CPU cores.
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(
        model, binary_train_X, binary_train_y, scoring="accuracy", cv=3, n_jobs=-1
    )
    model_comps.extend(
        (model_name, fold, accuracy) for fold, accuracy in enumerate(accuracies)
    )

In [None]:
# One row per (model, fold); reshape to one column per model for the boxplot
result_df = pd.DataFrame(model_comps, columns=['model_name', 'count', 'accuracy'])
accuracy_by_model = result_df.pivot(index='count', columns='model_name', values='accuracy')
accuracy_by_model.boxplot(rot=45)
plt.show()

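Random forest and K-nearest neighbours perform well, but logistic regression gives low accuracy. (Note: K-nearest neighbours is not among the models in the comparison cell above; either add `KNeighborsClassifier` to `models` or update this remark.)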

**Confusion Matrix**
Now we summarize the performance of a classification algorithm with a confusion matrix.

In [None]:
# Confusion Matrix Analysis
def add_predictions(data_set, predictions, y):
    """Attach predictions to their source rows and summarise the errors.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Frame whose index contains the index of ``y``.
    predictions : array-like
        Predicted labels, positionally aligned with ``y``.
    y : pandas.Series
        True labels; its index selects the rows of ``data_set`` to keep.

    Returns
    -------
    dict
        'data'             rows of ``data_set`` present in ``y`` with added
                           'predicted' and 'actual' columns,
        'confusion_matrix' confusion matrix of actual vs. predicted,
        'errors'           rows where actual != predicted,
        'non_zeros'        'errors' restricted to columns that contain at
                           least one non-zero value,
        'false_positives'  rows of 'non_zeros' whose actual label is 0,
        'false_negatives'  rows of 'non_zeros' whose actual label is 1.
    """
    prediction_series = pd.Series(predictions, index=y.index)
    labelled = data_set.assign(predicted=prediction_series, actual=y)
    # Rows outside y receive NaN in both added columns through index
    # alignment. Drop on those two columns only, so rows with missing values
    # in unrelated columns are not silently removed from the evaluation.
    original_data = labelled.dropna(subset=['predicted', 'actual'])
    conf_matrix = confusion_matrix(original_data['actual'], original_data['predicted'])

    base_errors = original_data[original_data['actual'] != original_data['predicted']]
    non_zeros = base_errors.loc[:, (base_errors != 0).any(axis=0)]
    # Build the masks from base_errors: the 'actual' column is removed from
    # non_zeros when every error has actual == 0 (or there are no errors),
    # which made non_zeros.actual raise AttributeError.
    false_positives = non_zeros.loc[base_errors['actual'] == 0]
    false_negatives = non_zeros.loc[base_errors['actual'] == 1]

    return {
        'data': original_data,
        'confusion_matrix': conf_matrix,
        'errors': base_errors,
        'non_zeros': non_zeros,
        'false_positives': false_positives,
        'false_negatives': false_negatives
    }

In [None]:
# Confusion matrix of the random forest on the validation split
binary_prediction_data = add_predictions(df, binary_predictions, binary_val_y)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    data=binary_prediction_data['confusion_matrix'],
    annot=True,
    fmt='d',
    cmap="YlGnBu",
    xticklabels=['Predicted Normal','Predicted Attack'],
    yticklabels=['Actual Normal','Actual Attack'],
    ax=ax,
)
ax.set_title('Random Forest Confusion Matrix')
plt.show()

In [None]:
# SVM specific analysis
# NOTE(review): the features are passed unscaled; a linear SVC may fit slowly
# and score lower than it would on standardised inputs - confirm before
# comparing it with the tree-based models.
svm_model = svm.SVC(kernel='linear', C=1.0)
svm_model.fit(binary_train_X, binary_train_y)
svm_predictions = svm_model.predict(binary_val_X)

# accuracy_score takes (y_true, y_pred). The score was previously computed
# but never shown, so report it alongside the confusion matrix.
svm_score = accuracy_score(binary_val_y, svm_predictions)
print(f"SVM validation accuracy: {svm_score:.4f}")

svm_prediction_data = add_predictions(df, svm_predictions, binary_val_y)

plt.figure(figsize=(8, 6))
sns.heatmap(data=svm_prediction_data['confusion_matrix'],
            xticklabels=['Predicted Normal', 'Predicted Attack'],
            yticklabels=['Actual Normal', 'Actual Attack'],
            cmap="YlGnBu",
            fmt='d',
            annot=True)
plt.title('SVM Confusion Matrix')
plt.show()