# **Fares Ahmed Moustafa Ahmed**
## *F.ahmed2270@nu.edu.eg*

In [None]:
import pandas as pd

train_path = "C:/Fares Data/University/Samsung X LifeMakers Internship/SVM & Naive Bayes/NSL_KDD_Train.csv"
test_path  = "C:/Fares Data/University/Samsung X LifeMakers Internship/SVM & Naive Bayes/NSL_KDD_Train.csv"

# **Preparing The Dataset to Start Working On it**

In [3]:
# standard NSL-KDD column names
column_names = [
    "duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment","urgent",
    "hot","num_failed_logins","logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds","is_host_login","is_guest_login",
    "count","srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate",
    "dst_host_diff_srv_rate","dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate",
    "dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate","label"
]

# load data with proper column names
train = pd.read_csv(train_path, header=None, names=column_names)
test  = pd.read_csv(test_path, header=None, names=column_names)

train.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [4]:
# convert labels to binary
def to_binary_label(x):
    return "normal" if str(x).lower().startswith("normal") else "attack"

train["label"] = train["label"].apply(to_binary_label)
test["label"]  = test["label"].apply(to_binary_label)

train["label"].unique()

array(['normal', 'attack'], dtype=object)

In [5]:
# drop useless column
train = train.drop(columns=["num_outbound_cmds"])
test  = test.drop(columns=["num_outbound_cmds"])

In [6]:
# counting top 20 features so we can choose only few number of values to encode so that the dataset won't be too complicated cause the srevice column got 70+ values
print("Top 20 services in TRAIN:")
print(train["service"].value_counts().head(20))

Top 20 services in TRAIN:
service
http         40338
private      21853
domain_u      9043
smtp          7313
ftp_data      6860
eco_i         4586
other         4359
ecr_i         3077
telnet        2353
finger        1767
ftp           1754
auth           955
Z39_50         862
uucp           780
courier        734
bgp            710
whois          693
uucp_path      689
iso_tsap       687
time           654
Name: count, dtype: int64


In [7]:
# Counting to
top_services = train["service"].value_counts().nlargest(11).index.tolist()
top_services

['http',
 'private',
 'domain_u',
 'smtp',
 'ftp_data',
 'eco_i',
 'other',
 'ecr_i',
 'telnet',
 'finger',
 'ftp']

In [8]:
# categorical columns to encode
cat_cols = ["protocol_type", "service", "flag"]

# one-hot encode them
train_enc = pd.get_dummies(train, columns=cat_cols, drop_first=True)
test_enc  = pd.get_dummies(test, columns=cat_cols, drop_first=True)

# align train and test so they have the same columns
train_enc, test_enc = train_enc.align(test_enc, join="left", axis=1, fill_value=0)

# save encoded versions
train_enc.to_csv("NSL_KDD_Train_step3.csv", index=False)
test_enc.to_csv("NSL_KDD_Test_step3.csv", index=False)

print("Train shape:", train_enc.shape)
print("Test shape:", test_enc.shape)

Train shape: (125973, 119)
Test shape: (125973, 119)


In [9]:
# check duplicates
print("Duplicates in train:", train_enc.duplicated().sum())
print("Duplicates in test:", test_enc.duplicated().sum())

# drop duplicates if any
train_enc = train_enc.drop_duplicates()
test_enc  = test_enc.drop_duplicates()

Duplicates in train: 9
Duplicates in test: 9


In [10]:
# check for missing values
print("Missing values in train:", train_enc.isnull().sum().sum())
print("Missing values in test:", test_enc.isnull().sum().sum())


Missing values in train: 0
Missing values in test: 0


In [11]:
!pip install scikit-learn




In [12]:
from sklearn.preprocessing import StandardScaler

# split features and labels
X_train = train_enc.drop("label", axis=1)
y_train = train_enc["label"]

X_test = test_enc.drop("label", axis=1)
y_test = test_enc["label"]

# scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)


X_train_scaled shape: (125964, 118)
X_test_scaled shape: (125964, 118)


In [13]:
train_enc.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,491,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,True,False
1,0,146,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,True,False
2,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,True,False,False,False,False,False
3,0,232,8153,0,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,True,False
4,0,199,420,0,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,True,False


# **Training The SVM Model**

In [18]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

kernels = ["linear", "poly", "rbf", "sigmoid"]


for k in kernels:
    print(f"\n==== Kernel: {k} ====")
    model = SVC(kernel=k)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred, target_names=["attack","normal"]))


==== Kernel: linear ====
Accuracy: 0.977144263440348
              precision    recall  f1-score   support

      attack       0.98      0.97      0.98     58621
      normal       0.98      0.98      0.98     67343

    accuracy                           0.98    125964
   macro avg       0.98      0.98      0.98    125964
weighted avg       0.98      0.98      0.98    125964


==== Kernel: poly ====
Accuracy: 0.9860515702899241
              precision    recall  f1-score   support

      attack       0.99      0.98      0.98     58621
      normal       0.98      0.99      0.99     67343

    accuracy                           0.99    125964
   macro avg       0.99      0.99      0.99    125964
weighted avg       0.99      0.99      0.99    125964


==== Kernel: rbf ====
Accuracy: 0.9930138769807246
              precision    recall  f1-score   support

      attack       0.99      0.99      0.99     58621
      normal       0.99      0.99      0.99     67343

    accuracy           

In [19]:
from sklearn.model_selection import train_test_split

# take a fixed subset (e.g., 20,000 rows) for tuning
X_sub, _, y_sub, _ = train_test_split(
    X_train_scaled, y_train,
    train_size=20000,
    stratify=y_train,   # keeps class balance
    random_state=42     # ensures reproducibility
)

print("Subset shape:", X_sub.shape, y_sub.shape)

Subset shape: (20000, 118) (20000,)


In [20]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "C": [0.1, 1, 10],
    "gamma": ["scale", "auto"],
    "kernel": ["linear", "poly", "rbf", "sigmoid"]
}

grid = GridSearchCV(SVC(), param_grid, cv=3, verbose=2, n_jobs=-1)
grid.fit(X_sub, y_sub)


print("\nBest parameters from Grid Search:", grid.best_params_)
print("Best score:", grid.best_score_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits

Best parameters from Grid Search: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best score: 0.990450042454373


# **Training Naive Bayes Model**

In [14]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score

# train
nb = GaussianNB()
nb.fit(X_train_scaled, y_train)

# predict
y_pred_nb = nb.predict(X_test_scaled)

# evaluate
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb, target_names=["attack","normal"]))

Naive Bayes Accuracy: 0.8406528849512559
              precision    recall  f1-score   support

      attack       1.00      0.66      0.79     58621
      normal       0.77      1.00      0.87     67343

    accuracy                           0.84    125964
   macro avg       0.88      0.83      0.83    125964
weighted avg       0.88      0.84      0.83    125964



In [15]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

# parameter grid for GaussianNB
param_grid = {
    "var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}

grid_nb = GridSearchCV(
    GaussianNB(),
    param_grid,
    cv=3,         # 3-fold cross validation
    verbose=2,
    n_jobs=-1
)

grid_nb.fit(X_train_scaled, y_train)

print("Best parameters (Naive Bayes):", grid_nb.best_params_)
print("Best CV score:", grid_nb.best_score_)


Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best parameters (Naive Bayes): {'var_smoothing': 1e-05}
Best CV score: 0.867724111650948


# The Randomized search didnt work with cause it took so much time in both models