In [1]:
import pandas as pd

# Read the modelling table from raw_scaled.csv into a DataFrame.
# NOTE(review): this file was previously described as the "balanced" dataset,
# but the recorded label counts are roughly 4:1 (class 0 vs class 1) --
# confirm that this is the intended input file.
df = pd.read_csv(filepath_or_buffer="raw_scaled.csv")

# Sanity check: (rows, columns) followed by the number of rows per label value.
print(df.shape, df["Label"].value_counts(), sep="\n")


(2829385, 28)
Label
0    2272688
1     556697
Name: count, dtype: int64


In [2]:
# Show the column index of the loaded frame (feature names plus "Label").
df.columns

Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Length of Fwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Min',
       'Fwd IAT Min', 'Fwd Header Length', 'Bwd Header Length',
       'Bwd Packets/s', 'Min Packet Length', 'PSH Flag Count',
       'ACK Flag Count', 'URG Flag Count', 'Init_Win_bytes_forward',
       'Init_Win_bytes_backward', 'act_data_pkt_fwd', 'min_seg_size_forward',
       'Active Max', 'Label'],
      dtype='object')

In [3]:
from sklearn.model_selection import train_test_split

# Target vector is the "Label" column; the feature matrix is every other column.
y = df["Label"]
X = df.drop(columns="Label")

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both partitions, and the fixed seed makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [4]:
# Display the feature frame (df without the "Label" column) as the cell output.
X


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Length of Fwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Min,Flow Bytes/s,...,Bwd Packets/s,Min Packet Length,PSH Flag Count,ACK Flag Count,URG Flag Count,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Max
0,54865,3,2,12,6,6,6.0,0,0,4.000000e+06,...,0.000000,6,0,1,0,33,0,1,20,0
1,55054,109,1,6,6,6,6.0,6,6,1.100917e+05,...,9174.311927,6,0,1,1,29,256,0,20,0
2,55055,52,1,6,6,6,6.0,6,6,2.307692e+05,...,19230.769230,6,0,1,1,29,256,0,20,0
3,46236,34,1,6,6,6,6.0,6,6,3.529412e+05,...,29411.764710,6,0,1,1,31,329,0,20,0
4,54863,3,2,12,6,6,6.0,0,0,4.000000e+06,...,0.000000,6,0,1,0,32,0,1,20,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2829380,53,32215,4,112,28,28,28.0,76,76,8.194940e+03,...,62.082881,28,0,0,0,0,0,3,20,0
2829381,53,324,2,84,42,42,42.0,181,181,1.376543e+06,...,6172.839506,42,0,0,0,0,0,1,20,0
2829382,58030,82,2,31,31,0,15.5,6,6,4.512195e+05,...,12195.121950,0,0,1,0,1006,0,0,32,0
2829383,53,1048635,6,192,32,32,32.0,128,128,4.272221e+02,...,1.907241,32,0,0,0,0,0,5,20,0


In [6]:
# Display the target Series (the "Label" column of df) as the cell output.
y

0          0
1          0
2          0
3          0
4          0
          ..
2829380    0
2829381    0
2829382    0
2829383    0
2829384    0
Name: Label, Length: 2829385, dtype: int64

## DECISION TREE

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Train a single decision tree on the training partition. Only the seed is
# set; every other hyperparameter is left at the library default.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict on the held-out rows. `y_pred` is overwritten by each model cell.
y_pred = dt_model.predict(X_test)

# Per-class precision / recall / F1 (4 decimals), then the raw confusion matrix.
print(
    "=== Decision Tree ===",
    classification_report(y_true=y_test, y_pred=y_pred, digits=4),
    sep="\n",
)
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))


=== Decision Tree ===
              precision    recall  f1-score   support

           0     0.9992    0.9993    0.9993    454538
           1     0.9971    0.9968    0.9970    111339

    accuracy                         0.9988    565877
   macro avg     0.9982    0.9981    0.9981    565877
weighted avg     0.9988    0.9988    0.9988    565877

Confusion Matrix:
 [[454218    320]
 [   353 110986]]


## RANDOM FOREST

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Train a 100-tree random forest on the training partition with a fixed seed.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(
    X_train, y_train
)

# Predict on the held-out rows. `y_pred` is overwritten by each model cell.
y_pred = rf_model.predict(X_test)

# Per-class precision / recall / F1 (4 decimals), then the raw confusion matrix.
print(
    "=== Random Forest ===",
    classification_report(y_true=y_test, y_pred=y_pred, digits=4),
    sep="\n",
)
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))


=== Random Forest ===
              precision    recall  f1-score   support

           0     0.9994    0.9994    0.9994    454538
           1     0.9975    0.9974    0.9974    111339

    accuracy                         0.9990    565877
   macro avg     0.9984    0.9984    0.9984    565877
weighted avg     0.9990    0.9990    0.9990    565877

Confusion Matrix:
 [[454261    277]
 [   292 111047]]


## XGBoost

In [9]:
from xgboost import XGBClassifier

# Train a gradient-boosted tree classifier on the training partition.
# `use_label_encoder` is intentionally not passed: the installed XGBoost no
# longer uses it and only emits a "Parameters: { "use_label_encoder" } are not
# used." warning when it is supplied. The labels are already integer 0/1.
xg_model = XGBClassifier(eval_metric='logloss', random_state=42)
xg_model.fit(X_train, y_train)

# Predict on the held-out rows. `y_pred` is overwritten by each model cell.
y_pred = xg_model.predict(X_test)

# Per-class precision / recall / F1 (4 decimals), then the raw confusion matrix.
print("=== XGBoost ===")
print(classification_report(y_test, y_pred, digits=4))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.



=== XGBoost ===
              precision    recall  f1-score   support

           0     0.9997    0.9994    0.9996    454538
           1     0.9976    0.9988    0.9982    111339

    accuracy                         0.9993    565877
   macro avg     0.9986    0.9991    0.9989    565877
weighted avg     0.9993    0.9993    0.9993    565877

Confusion Matrix:
 [[454268    270]
 [   132 111207]]


In [10]:
import joblib

# Persist the fitted XGBoost classifier (`xg_model`) to disk with joblib.
# The call's return value (the list of written file names) is the cell output.
joblib.dump(value=xg_model, filename="model_raw_scaled.pkl")

['model_raw_scaled.pkl']