In [None]:
import kagglehub
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import pandas as pd
from lightgbm import LGBMClassifier

path = kagglehub.dataset_download("chethuhn/network-intrusion-dataset")

print("Path to dataset files:", path)
path = str(path)

print(os.listdir(path))

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\abdul\.cache\kagglehub\datasets\chethuhn\network-intrusion-dataset\versions\1
['Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv', 'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv', 'Friday-WorkingHours-Morning.pcap_ISCX.csv', 'Monday-WorkingHours.pcap_ISCX.csv', 'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv', 'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv', 'Tuesday-WorkingHours.pcap_ISCX.csv', 'Wednesday-workingHours.pcap_ISCX.csv']


In [2]:
ddos_data = pd.read_csv(path + "/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv")
normal_data = pd.read_csv(path + "/Monday-WorkingHours.pcap_ISCX.csv")

df = pd.concat([ddos_data, normal_data], ignore_index=True)
df.columns = df.columns.str.strip()

df.replace([np.inf, -np.inf], np.nan, inplace=True)


df.dropna(inplace=True)


In [3]:
selected_features = [
    "Total Fwd Packets", "Total Backward Packets", "Total Length of Fwd Packets",
    "Total Length of Bwd Packets", "Fwd Packet Length Mean", "Bwd Packet Length Mean",
    "Packet Length Mean", "Flow Bytes/s", "Flow Packets/s", "Fwd Packets/s", "Bwd Packets/s",
    "Subflow Fwd Packets", "Subflow Bwd Packets", "Flow Duration", "Flow IAT Mean",
    "Fwd IAT Mean", "Bwd IAT Mean", "Active Mean", "Idle Mean", "SYN Flag Count",
    "ACK Flag Count", "FIN Flag Count", "RST Flag Count", "PSH Flag Count",
    "Init_Win_bytes_forward", "Init_Win_bytes_backward", "min_seg_size_forward"
]

X = df[selected_features]
y = df["Label"].apply(lambda x: 1 if "DDoS" in str(x) else 0)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
lgbm_model = LGBMClassifier(n_estimators=200, boosting_type='gbdt', random_state=42)

In [5]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
model = LogisticRegression()

In [6]:
model.fit(X_train, y_train)

In [7]:
rf_model.fit(X_train, y_train)

In [8]:
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [9]:
lgbm_model.fit(X_train, y_train)

found 0 physical cores < 1
  File "c:\Users\abdul\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


[LightGBM] [Info] Number of positive: 102453, number of negative: 501700
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025813 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5383
[LightGBM] [Info] Number of data points in the train set: 604153, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169581 -> initscore=-1.588598
[LightGBM] [Info] Start training from score -1.588598


In [10]:
y_pred = model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)
y_pred_lgbm = lgbm_model.predict(X_test)



In [11]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9843

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    125467
           1       0.98      0.93      0.95     25572

    accuracy                           0.98    151039
   macro avg       0.98      0.96      0.97    151039
weighted avg       0.98      0.98      0.98    151039


Confusion Matrix:
[[125008    459]
 [  1907  23665]]


In [12]:
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.9998940670952535
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    125467
           1       1.00      1.00      1.00     25572

    accuracy                           1.00    151039
   macro avg       1.00      1.00      1.00    151039
weighted avg       1.00      1.00      1.00    151039



In [13]:
print("LightGBM Accuracy:", accuracy_score(y_test, y_pred_lgbm))
print(classification_report(y_test, y_pred_lgbm))

LightGBM Accuracy: 0.9999404127410801
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    125467
           1       1.00      1.00      1.00     25572

    accuracy                           1.00    151039
   macro avg       1.00      1.00      1.00    151039
weighted avg       1.00      1.00      1.00    151039



In [14]:
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))


XGBoost Accuracy: 0.9999536543541734
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    125467
           1       1.00      1.00      1.00     25572

    accuracy                           1.00    151039
   macro avg       1.00      1.00      1.00    151039
weighted avg       1.00      1.00      1.00    151039

