In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import warnings

In [2]:
import os
print(os.getcwd())

d:\VII\ANT\model


In [3]:
# --- 1. Load Data ---
# IMPORTANT: Replace 'your_dataset.csv' with the actual path to your CSE-CIC-IDS2018 dataset file.
try:
    df = pd.read_csv('../data\combined_realistic.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: Dataset file not found. Please update the file path.")
    exit()

Dataset loaded successfully.


#### Filtered Dataset CSV, taken from 30 features that were decided in further cells

In [4]:
# The list of top 30 features you identified
top_30_features = [
    'Init Fwd Win Byts', 'Fwd Seg Size Min', 'Dst Port', 'Fwd Header Len', 'Flow IAT Min', 
    'Flow Duration', 'Fwd IAT Max', 'Fwd IAT Min', 'Fwd Pkts/s', 'Flow Pkts/s', 
    'Fwd IAT Tot', 'Flow IAT Max', 'Fwd Pkt Len Mean', 'Fwd IAT Mean', 'Flow IAT Mean', 
    'Bwd Pkts/s', 'Pkt Len Mean', 'TotLen Fwd Pkts', 'Flow Byts/s', 'Init Bwd Win Byts', 
    'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Bwd Pkt Len Max', 'Pkt Len Var', 
    'Fwd Seg Size Avg', 'Tot Fwd Pkts', 'Bwd Header Len', 'Pkt Len Max', 
    'Subflow Fwd Byts', 'Subflow Fwd Pkts'
]

# Create the final list of columns to keep
columns_to_keep = top_30_features + ['Label']


In [5]:
# --- 2. Filter the DataFrame ---
print("Filtering the existing DataFrame in memory...")

# Best practice: Clean column names to avoid errors
df.columns = df.columns.str.strip()

# Create the new subset DataFrame by selecting only the columns you need
df_subset = df[columns_to_keep]

print("New subset DataFrame created successfully.")

# --- 3. Save the New, Smaller Dataset ---
filtered_filename = '../data/filtered_dataset.csv'
print(f"Saving the new dataset to '{filtered_filename}'...")

# Using index=False prevents pandas from writing an unnecessary row index
df_subset.to_csv(filtered_filename, index=False)

print("\nScript finished successfully! 🎉")
print(f"Your new file '{filtered_filename}' is ready to use.")

Filtering the existing DataFrame in memory...
New subset DataFrame created successfully.
Saving the new dataset to '../data/filtered_dataset.csv'...

Script finished successfully! 🎉
Your new file '../data/filtered_dataset.csv' is ready to use.


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8192732 entries, 0 to 8229038
Data columns (total 79 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Dst Port           int64  
 1   Protocol           int64  
 2   Flow Duration      int64  
 3   Tot Fwd Pkts       int64  
 4   Tot Bwd Pkts       int64  
 5   TotLen Fwd Pkts    int64  
 6   TotLen Bwd Pkts    float64
 7   Fwd Pkt Len Max    int64  
 8   Fwd Pkt Len Min    int64  
 9   Fwd Pkt Len Mean   float64
 10  Fwd Pkt Len Std    float64
 11  Bwd Pkt Len Max    int64  
 12  Bwd Pkt Len Min    int64  
 13  Bwd Pkt Len Mean   float64
 14  Bwd Pkt Len Std    float64
 15  Flow Byts/s        float64
 16  Flow Pkts/s        float64
 17  Flow IAT Mean      float64
 18  Flow IAT Std       float64
 19  Flow IAT Max       float64
 20  Flow IAT Min       float64
 21  Fwd IAT Tot        float64
 22  Fwd IAT Mean       float64
 23  Fwd IAT Std        float64
 24  Fwd IAT Max        float64
 25  Fwd IAT Min        floa

In [5]:
# --- 2. Data Cleaning & Preprocessing ---
print("Cleaning and preprocessing data...")

# Drop the timestamp column as it's not a useful feature in its raw format
if 'Timestamp' in df.columns:
    df = df.drop(columns=['Timestamp'])

# Clean column names (strip leading/trailing spaces)
df.columns = df.columns.str.strip()

# The dataset is known to have infinity and NaN values.
# Replace infinite values with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)
# Drop rows with NaN values. For this dataset, this is a safe approach.
df.dropna(inplace=True)

Cleaning and preprocessing data...


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8192732 entries, 0 to 8229038
Data columns (total 79 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Dst Port           int64  
 1   Protocol           int64  
 2   Flow Duration      int64  
 3   Tot Fwd Pkts       int64  
 4   Tot Bwd Pkts       int64  
 5   TotLen Fwd Pkts    int64  
 6   TotLen Bwd Pkts    float64
 7   Fwd Pkt Len Max    int64  
 8   Fwd Pkt Len Min    int64  
 9   Fwd Pkt Len Mean   float64
 10  Fwd Pkt Len Std    float64
 11  Bwd Pkt Len Max    int64  
 12  Bwd Pkt Len Min    int64  
 13  Bwd Pkt Len Mean   float64
 14  Bwd Pkt Len Std    float64
 15  Flow Byts/s        float64
 16  Flow Pkts/s        float64
 17  Flow IAT Mean      float64
 18  Flow IAT Std       float64
 19  Flow IAT Max       float64
 20  Flow IAT Min       float64
 21  Fwd IAT Tot        float64
 22  Fwd IAT Mean       float64
 23  Fwd IAT Std        float64
 24  Fwd IAT Max        float64
 25  Fwd IAT Min        floa

#### Checking on how the Flags are in the data

In [13]:
cols = [
    "FIN Flag Cnt",
    "SYN Flag Cnt",
    "RST Flag Cnt",
    "PSH Flag Cnt",
    "ACK Flag Cnt",
    "URG Flag Cnt",
    "CWE Flag Count",
    "ECE Flag Cnt"
]

df_flags = df[cols]

In [14]:
df_flags

Unnamed: 0,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt
0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0
4,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...
8229034,0,0,1,1,0,0,0,1
8229035,0,0,1,1,0,0,0,1
8229036,0,0,1,1,0,0,0,1
8229037,0,0,1,1,0,0,0,1


#### Feature Selection of top 30 models based on feature importance using RF

In [10]:
# --- 3. Feature and Label Preparation ---
print("Preparing features and labels...")
X = df.drop(columns=['Label'])
y = df['Label']

# Encode the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# --- 4. Train a Model for Feature Importance ---
print("Training a RandomForest model to find important features...")
# We don't need to scale the data for a RandomForest's feature importance
model = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
model.fit(X, y_encoded)

Preparing features and labels...
Training a RandomForest model to find important features...


0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [11]:
# --- 5. Get and Print Top Features ---
print("Extracting top 30 features...")

# Create a dataframe of features and their importance scores
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': importances
}).sort_values(by='importance', ascending=False)

# Get the top 30 feature names
top_30_features = feature_importance_df.head(30)['feature'].tolist()

print("\n--- Top 30 Most Important Features ---")
# Print the list in a format that's easy to copy and paste
print(repr(top_30_features))

print("\nScript finished successfully! 🎉")

Extracting top 30 features...

--- Top 30 Most Important Features ---
['Init Fwd Win Byts', 'Fwd Seg Size Min', 'Dst Port', 'Fwd Header Len', 'Flow IAT Min', 'Flow Duration', 'Fwd IAT Max', 'Fwd IAT Min', 'Fwd Pkts/s', 'Flow Pkts/s', 'Fwd IAT Tot', 'Flow IAT Max', 'Fwd Pkt Len Mean', 'Fwd IAT Mean', 'Flow IAT Mean', 'Bwd Pkts/s', 'Pkt Len Mean', 'TotLen Fwd Pkts', 'Flow Byts/s', 'Init Bwd Win Byts', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Bwd Pkt Len Max', 'Pkt Len Var', 'Fwd Seg Size Avg', 'Tot Fwd Pkts', 'Bwd Header Len', 'Pkt Len Max', 'Subflow Fwd Byts', 'Subflow Fwd Pkts']

Script finished successfully! 🎉


#### Model Prep

In [15]:
# --- 3. Feature Selection & Label Prep ---
# Using a representative list of top 30 features from this dataset.
# Replace this with the exact list from your feature selection script if different.
top_30_features = ['Init Fwd Win Byts', 'Fwd Seg Size Min', 'Dst Port', 
'Fwd Header Len', 'Flow IAT Min', 'Flow Duration', 'Fwd IAT Max', 'Fwd IAT Min', 
'Fwd Pkts/s', 'Flow Pkts/s', 'Fwd IAT Tot', 'Flow IAT Max', 'Fwd Pkt Len Mean', 
'Fwd IAT Mean', 'Flow IAT Mean', 'Bwd Pkts/s', 'Pkt Len Mean', 'TotLen Fwd Pkts', 
'Flow Byts/s', 'Init Bwd Win Byts', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 
'Bwd Pkt Len Max', 'Pkt Len Var', 'Fwd Seg Size Avg', 'Tot Fwd Pkts', 'Bwd Header Len',
 'Pkt Len Max', 'Subflow Fwd Byts', 'Subflow Fwd Pkts']


# Ensure all selected features are present in the dataframe
top_30_features = [f for f in top_30_features if f in df.columns]
print(f"Using {len(top_30_features)} available features for modeling.")

X = df[top_30_features]
y = df['Label']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

Using 30 available features for modeling.


In [16]:
# --- 4. Split and Scale Data ---
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
# --- 5. Initialize Models ---
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42)
}

model_results = {}

In [25]:
# # --- 6. Train and Evaluate Each Model ---
# for name, model in models.items():
#     print(f"\n--- Training {name} ---")
#     model.fit(X_train_scaled, y_train)
    
#     print(f"--- Evaluating {name} ---")
#     y_pred = model.predict(X_test_scaled)
    
#     accuracy = accuracy_score(y_test, y_pred)
#     model_results[name] = accuracy
    
#     print(f"\nAccuracy for {name}: {accuracy:.4f}")
#     print(f"Classification Report for {name}:")
#     print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# --- 6. Train and Evaluate Each Model ---
for name, model in models.items():
    print(f"\n--- Training {name} ---")
    model.fit(X_train_scaled, y_train)
    
    print(f"--- Evaluating {name} ---")
    
    # 1. Calculate Training Accuracy (NEW)
    y_train_pred = model.predict(X_train_scaled)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    
    # 2. Calculate Testing Accuracy (Original)
    y_test_pred = model.predict(X_test_scaled)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    model_results[name] = test_accuracy # We still judge the model on its test accuracy
    
    # 3. Print both (NEW)
    print(f"\nTraining Accuracy for {name}: {train_accuracy:.4f}")
    print(f"Testing Accuracy for {name}: {test_accuracy:.4f}")
    
    print(f"Classification Report for {name} (on Test Data):")
    print(classification_report(y_test, y_test_pred, target_names=label_encoder.classes_))


--- Training Logistic Regression ---
--- Evaluating Logistic Regression ---

Training Accuracy for Logistic Regression: 0.9439
Testing Accuracy for Logistic Regression: 0.9438
Classification Report for Logistic Regression (on Test Data):
                          precision    recall  f1-score   support

                  Benign       0.95      0.99      0.97   1215429
                     Bot       0.82      0.50      0.62     57238
        DDOS attack-HOIC       0.98      1.00      0.99    137203
        DoS attacks-Hulk       0.99      1.00      0.99     92382
DoS attacks-SlowHTTPTest       0.65      0.55      0.60     27978
          FTP-BruteForce       0.71      0.78      0.74     38671
           Infilteration       0.26      0.00      0.01     32128
          SSH-Bruteforce       0.99      1.00      0.99     37518

                accuracy                           0.94   1638547
               macro avg       0.79      0.73      0.74   1638547
            weighted avg       0.

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- Evaluating XGBoost ---

Training Accuracy for XGBoost: 0.9697
Testing Accuracy for XGBoost: 0.9696
Classification Report for XGBoost (on Test Data):
                          precision    recall  f1-score   support

                  Benign       0.98      1.00      0.99   1215429
                     Bot       1.00      1.00      1.00     57238
        DDOS attack-HOIC       1.00      1.00      1.00    137203
        DoS attacks-Hulk       1.00      1.00      1.00     92382
DoS attacks-SlowHTTPTest       0.77      0.51      0.62     27978
          FTP-BruteForce       0.72      0.89      0.79     38671
           Infilteration       0.55      0.03      0.06     32128
          SSH-Bruteforce       1.00      1.00      1.00     37518

                accuracy                           0.97   1638547
               macro avg       0.88      0.80      0.81   1638547
            weighted avg       0.96      0.97      0.96   1638547


--- Training LightGBM ---
[LightGBM] [Info] Auto-ch




Training Accuracy for LightGBM: 0.9716
Testing Accuracy for LightGBM: 0.9715
Classification Report for LightGBM (on Test Data):
                          precision    recall  f1-score   support

                  Benign       0.97      1.00      0.99   1215429
                     Bot       1.00      1.00      1.00     57238
        DDOS attack-HOIC       1.00      1.00      1.00    137203
        DoS attacks-Hulk       1.00      1.00      1.00     92382
DoS attacks-SlowHTTPTest       0.91      0.53      0.66     27978
          FTP-BruteForce       0.74      0.96      0.83     38671
           Infilteration       0.59      0.03      0.05     32128
          SSH-Bruteforce       1.00      1.00      1.00     37518

                accuracy                           0.97   1638547
               macro avg       0.90      0.81      0.82   1638547
            weighted avg       0.97      0.97      0.96   1638547



In [26]:
# --- 7. Compare Models and Save the Best One ---
print("\n--- Model Comparison Summary ---")
results_df = pd.DataFrame.from_dict(model_results, orient='index', columns=['Accuracy'])
print(results_df.sort_values(by='Accuracy', ascending=False))

best_model_name = max(model_results, key=model_results.get)
best_model = models[best_model_name]

print(f"\nBest performing model is: {best_model_name} with an accuracy of {model_results[best_model_name]:.4f}")


--- Model Comparison Summary ---
                     Accuracy
LightGBM             0.971511
XGBoost              0.969581
Random Forest        0.964376
Logistic Regression  0.943799

Best performing model is: LightGBM with an accuracy of 0.9715


In [27]:
print("Saving the best model, scaler, and encoder...")
joblib.dump(best_model, 'best_detection_model.joblib')
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(label_encoder, 'label_encoder.joblib')

print("\nScript finished successfully! 🎉")
print("The best model has been saved as 'best_detection_model.joblib'.")

Saving the best model, scaler, and encoder...

Script finished successfully! 🎉
The best model has been saved as 'best_detection_model.joblib'.


#### Since classes were imbalanced, we try with SMOTE now

In [2]:
import pandas as pd
from imblearn.over_sampling import SMOTE  # Import SMOTE
import numpy as np

In [3]:
filtered_df = pd.read_csv('../data/filtered_dataset.csv')
print("Filtered dataset loaded successfully.")

Filtered dataset loaded successfully.


In [4]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8229039 entries, 0 to 8229038
Data columns (total 31 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Init Fwd Win Byts  int64  
 1   Fwd Seg Size Min   int64  
 2   Dst Port           int64  
 3   Fwd Header Len     int64  
 4   Flow IAT Min       float64
 5   Flow Duration      int64  
 6   Fwd IAT Max        float64
 7   Fwd IAT Min        float64
 8   Fwd Pkts/s         float64
 9   Flow Pkts/s        float64
 10  Fwd IAT Tot        float64
 11  Flow IAT Max       float64
 12  Fwd Pkt Len Mean   float64
 13  Fwd IAT Mean       float64
 14  Flow IAT Mean      float64
 15  Bwd Pkts/s         float64
 16  Pkt Len Mean       float64
 17  TotLen Fwd Pkts    int64  
 18  Flow Byts/s        float64
 19  Init Bwd Win Byts  int64  
 20  Bwd Pkt Len Mean   float64
 21  Bwd Pkt Len Std    float64
 22  Bwd Pkt Len Max    int64  
 23  Pkt Len Var        float64
 24  Fwd Seg Size Avg   float64
 25  Tot Fwd Pkts      

In [5]:
filtered_df.head()

Unnamed: 0,Init Fwd Win Byts,Fwd Seg Size Min,Dst Port,Fwd Header Len,Flow IAT Min,Flow Duration,Fwd IAT Max,Fwd IAT Min,Fwd Pkts/s,Flow Pkts/s,...,Bwd Pkt Len Std,Bwd Pkt Len Max,Pkt Len Var,Fwd Seg Size Avg,Tot Fwd Pkts,Bwd Header Len,Pkt Len Max,Subflow Fwd Byts,Subflow Fwd Pkts,Label
0,-1,0,0,0,56320761.0,112641719,56320958.0,56320761.0,0.026633,0.026633,...,0.0,0,0.0,0.0,3,0,0,0,3,Benign
1,-1,0,0,0,56320652.0,112641466,56320814.0,56320652.0,0.026633,0.026633,...,0.0,0,0.0,0.0,3,0,0,0,3,Benign
2,-1,0,0,0,56319098.0,112638623,56319525.0,56319098.0,0.026634,0.026634,...,0.0,0,0.0,0.0,3,0,0,0,3,Benign
3,65535,32,22,488,22.0,6453966,673900.0,229740.0,2.324152,3.873587,...,371.677892,976,77192.153846,82.6,15,328,976,1239,15,Benign
4,5808,32,22,456,21.0,8804066,1928102.0,246924.0,1.590174,2.839597,...,362.249864,976,78267.353846,81.642857,14,360,976,1143,14,Benign


In [6]:
filtered_df['Label'].value_counts()

Label
Benign                      6112151
DDOS attack-HOIC             686012
DoS attacks-Hulk             461912
Bot                          286191
FTP-BruteForce               193360
SSH-Bruteforce               187589
Infilteration                161934
DoS attacks-SlowHTTPTest     139890
Name: count, dtype: int64

In [7]:
filtered_df.columns = filtered_df.columns.str.strip()

# The dataset is known to have infinity and NaN values.
# Replace infinite values with NaN
filtered_df.replace([np.inf, -np.inf], np.nan, inplace=True)
# Drop rows with NaN values. For this dataset, this is a safe approach.
filtered_df.dropna(inplace=True)

In [8]:
X = filtered_df.drop(columns=["Label"])
y = filtered_df['Label']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [9]:
print("Balanced training script started...")
# --- 4. Split and Scale Data ---
# Split data BEFORE applying SMOTE
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Balanced training script started...


In [10]:
from imblearn.under_sampling import RandomUnderSampler # Import the undersampler
from imblearn.pipeline import Pipeline # Import the pipeline tool
from collections import Counter

In [11]:
# # --- 5. Apply SMOTE to the Training Data ---
# print("Applying SMOTE to the training data to handle imbalance...")
# smote = SMOTE(random_state=42)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# print("SMOTE applied. Training set is now balanced.")

# --- 5. Define and Apply the Hybrid Sampling Strategy ---
# print("Applying a hybrid undersampling and oversampling strategy...")

# # Define the undersampling strategy. 
# # This will reduce the 'Benign' class to have 500,000 samples. Adjust as needed.
# # We first need to find the numerical label for 'Benign'
# benign_label_numeric = label_encoder.transform(['Benign'])[0]
# under_sampler = RandomUnderSampler(sampling_strategy={benign_label_numeric: 500000}, random_state=42)

# # Define the SMOTE strategy. It will oversample all other classes to match the new majority.
# over_sampler = SMOTE(random_state=42)

# # Create a pipeline to apply the steps in sequence
# pipeline = Pipeline(steps=[('u', under_sampler), ('o', over_sampler)])

# # Apply the pipeline to the training data
# X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train_scaled, y_train)

# print("Hybrid sampling applied. Training set is now smaller and balanced.")
# print(f"New training set shape: {X_train_resampled.shape}")

# --- 5. Define and Apply the Corrected Dynamic Hybrid Sampling Strategy ---
print("Applying a dynamic hybrid sampling strategy...")

# Get the counts of each class in the training data
class_counts = Counter(y_train)

# --- THE FIX STARTS HERE ---
# 1. Find the numeric label of the most common class (which is 'Benign')
majority_class_label_numeric = class_counts.most_common(1)[0][0]
print(f"The majority class has the numeric label: {majority_class_label_numeric}")

# 2. Find the size of the second-largest class
second_largest_class_size = sorted(class_counts.values(), reverse=True)[1]

# 3. Set the undersampling target for the majority class
majority_class_target_size = second_largest_class_size * 3
print(f"Second largest class has {second_largest_class_size} samples.")
print(f"Setting majority class target to {majority_class_target_size} samples.")
# --- THE FIX ENDS HERE ---


# Define the undersampling and oversampling strategies using the new variables
under_sampler = RandomUnderSampler(sampling_strategy={majority_class_label_numeric: majority_class_target_size}, random_state=42)
over_sampler = SMOTE(random_state=42)

# Create and apply the pipeline
pipeline = Pipeline(steps=[('u', under_sampler), ('o', over_sampler)])
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train_scaled, y_train)

print("Hybrid sampling applied. Training set is now smaller and balanced.")
print(f"New training set shape: {X_train_resampled.shape}")
print(f"Class distribution after resampling: {Counter(y_train_resampled)}")



Applying a dynamic hybrid sampling strategy...
The majority class has the numeric label: 0
Second largest class has 548809 samples.
Setting majority class target to 1646427 samples.
Hybrid sampling applied. Training set is now smaller and balanced.
New training set shape: (13171416, 30)
Class distribution after resampling: Counter({np.int64(0): 1646427, np.int64(1): 1646427, np.int64(2): 1646427, np.int64(3): 1646427, np.int64(4): 1646427, np.int64(5): 1646427, np.int64(6): 1646427, np.int64(7): 1646427})


In [12]:
# --- 4. Initialize Models ---
# models = {
#     "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
#     "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
#     "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
#     "LightGBM": LGBMClassifier(random_state=42)
# }
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),

    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        max_depth=20,         # <--- LIMITS tree depth to control size
        min_samples_leaf=5,   # <--- Prevents overfitting to small groups
        random_state=42,
        n_jobs=-1
    ),

    "XGBoost": XGBClassifier(
        use_label_encoder=False,
        eval_metric='mlogloss',
        max_depth=15,         # <--- LIMITS tree depth
        random_state=42
    ),

    "LightGBM": LGBMClassifier(
        max_depth=15,         # <--- LIMITS tree depth
        num_leaves=60,        # <--- Another way to control complexity
        random_state=42
    )
}

model_results = {}

# --- 5. Train and Evaluate Each Model on Balanced Data ---
for name, model in models.items():
    print(f"\n--- Training {name} on Balanced Data ---")
    # Train on the NEW, resampled data
    model.fit(X_train_resampled, y_train_resampled)
    
    print(f"--- Evaluating {name} on Original Test Data ---")
    # Evaluate on the ORIGINAL, unbalanced test data
    y_pred = model.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, y_pred)
    model_results[name] = accuracy
    
    print(f"\nAccuracy for {name}: {accuracy:.4f}")
    print(f"Classification Report for {name}:")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


--- Training Logistic Regression on Balanced Data ---


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


--- Evaluating Logistic Regression on Original Test Data ---

Accuracy for Logistic Regression: 0.5733
Classification Report for Logistic Regression:
                          precision    recall  f1-score   support

                  Benign       0.99      0.45      0.62   1215429
                     Bot       0.62      1.00      0.76     57238
        DDOS attack-HOIC       0.97      1.00      0.99    137203
        DoS attacks-Hulk       0.98      1.00      0.99     92382
DoS attacks-SlowHTTPTest       0.64      0.55      0.59     27978
          FTP-BruteForce       0.71      0.78      0.74     38671
           Infilteration       0.04      0.77      0.07     32128
          SSH-Bruteforce       0.97      1.00      0.99     37518

                accuracy                           0.57   1638547
               macro avg       0.74      0.82      0.72   1638547
            weighted avg       0.94      0.57      0.67   1638547


--- Training Random Forest on Balanced Data ---
--- Ev

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- Evaluating XGBoost on Original Test Data ---

Accuracy for XGBoost: 0.8520
Classification Report for XGBoost:
                          precision    recall  f1-score   support

                  Benign       0.99      0.82      0.90   1215429
                     Bot       1.00      1.00      1.00     57238
        DDOS attack-HOIC       1.00      1.00      1.00    137203
        DoS attacks-Hulk       1.00      1.00      1.00     92382
DoS attacks-SlowHTTPTest       0.76      0.52      0.62     27978
          FTP-BruteForce       0.72      0.88      0.79     38671
           Infilteration       0.09      0.70      0.17     32128
          SSH-Bruteforce       1.00      1.00      1.00     37518

                accuracy                           0.85   1638547
               macro avg       0.82      0.87      0.81   1638547
            weighted avg       0.96      0.85      0.90   1638547


--- Training LightGBM on Balanced Data ---
[LightGBM] [Info] Auto-choosing row-wise multi-




Accuracy for LightGBM: 0.8721
Classification Report for LightGBM:
                          precision    recall  f1-score   support

                  Benign       0.99      0.84      0.91   1215429
                     Bot       1.00      1.00      1.00     57238
        DDOS attack-HOIC       1.00      1.00      1.00    137203
        DoS attacks-Hulk       1.00      1.00      1.00     92382
DoS attacks-SlowHTTPTest       0.93      0.53      0.68     27978
          FTP-BruteForce       0.74      0.97      0.84     38671
           Infilteration       0.12      0.81      0.21     32128
          SSH-Bruteforce       1.00      1.00      1.00     37518

                accuracy                           0.87   1638547
               macro avg       0.85      0.89      0.83   1638547
            weighted avg       0.97      0.87      0.91   1638547



In [13]:
# --- 6. Compare Models and Save the Best One ---
print("\n--- Model Comparison Summary (Trained on Balanced Data) ---")
results_df = pd.DataFrame.from_dict(model_results, orient='index', columns=['Accuracy'])
print(results_df.sort_values(by='Accuracy', ascending=False))

best_model_name = max(model_results, key=model_results.get)
best_model = models[best_model_name]

print(f"\nBest performing model is: {best_model_name} with an accuracy of {model_results[best_model_name]:.4f}")

print("Saving the best model, scaler, and encoder...")
joblib.dump(best_model, 'best_balanced_model_2nd.joblib')
joblib.dump(scaler, 'scaler_balance_2nd.joblib')
joblib.dump(label_encoder, 'label_encoder_balanced_2nd.joblib')

print("\nScript finished successfully! 🎉")
print("The best model has been saved as 'best_balanced_model.joblib'.")


--- Model Comparison Summary (Trained on Balanced Data) ---
                     Accuracy
LightGBM             0.872061
XGBoost              0.852024
Random Forest        0.808711
Logistic Regression  0.573322

Best performing model is: LightGBM with an accuracy of 0.8721
Saving the best model, scaler, and encoder...

Script finished successfully! 🎉
The best model has been saved as 'best_balanced_model.joblib'.


#### Unsupervised Isolation Forest

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import warnings
from sklearn.metrics import classification_report, confusion_matrix, f1_score # Import f1_score


In [2]:
# --- 1. Load Your Filtered Data ---
try:
    df = pd.read_csv('../data/filtered_dataset.csv')
    print("Filtered dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'filtered_dataset.csv' not found. Please create it first.")
    exit()


Filtered dataset loaded successfully.


In [3]:
# --- THE FIX STARTS HERE ---
# 1. Replace infinite values with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)
print("Replaced infinite values with NaN.")

# 2. Drop all rows that now contain NaN
df.dropna(inplace=True)
print("Dropped rows with NaN values. Data is now clean.")
# --- THE FIX ENDS HERE ---

Replaced infinite values with NaN.
Dropped rows with NaN values. Data is now clean.


In [4]:
# --- 2. Prepare Data for Unsupervised Training ---
print("Preparing data...")
X = df.drop(columns=['Label'])
y = df['Label']

# Create a training set with ONLY 'Benign' (normal) data
X_train_full_benign = X[y == 'Benign']

# Create a smaller sample of benign data for the slower models
X_train_sample_benign = X_train_full_benign.sample(n=100000, random_state=42)
print(f"Using full benign set for Isolation Forest: {len(X_train_full_benign)} samples.")
print(f"Using smaller benign sample for LOF and One-Class SVM: {len(X_train_sample_benign)} samples.")

# The test set is the entire dataset
X_test = X
y_test = y


Preparing data...
Using full benign set for Isolation Forest: 6077145 samples.
Using smaller benign sample for LOF and One-Class SVM: 100000 samples.


In [5]:
# --- 3. Scale the Data ---
# We fit the scaler ONLY on the full normal training data to learn the 'normal' distribution
scaler = StandardScaler()
scaler.fit(X_train_full_benign)

# Transform all datasets
X_train_full_benign_scaled = scaler.transform(X_train_full_benign)
X_train_sample_benign_scaled = scaler.transform(X_train_sample_benign)
X_test_scaled = scaler.transform(X_test)

##### This is to check the percentage of the benign attacks in the dataset

In [6]:
# --- 2. Calculate the Percentage ---

# Get the total number of rows
total_samples = len(df)

# Get the counts of each class in the 'Label' column
label_counts = df['Label'].value_counts()

# Get the number of 'Benign' (normal) samples
benign_samples = label_counts['Benign']

# The number of anomalies is everything that isn't 'Benign'
anomaly_samples = total_samples - benign_samples

# Calculate the contamination ratio (percentage of anomalies)
contamination_ratio = anomaly_samples / total_samples

# --- 3. Print the Results ---
print("\n--- Dataset Analysis ---")
print(f"Total Samples: {total_samples}")
print(f"Normal ('Benign') Samples: {benign_samples}")
print(f"Anomaly (Attack) Samples: {anomaly_samples}")
print(f"\nExact Anomaly Percentage (Contamination): {contamination_ratio:.4f}")


--- Dataset Analysis ---
Total Samples: 8192732
Normal ('Benign') Samples: 6077145
Anomaly (Attack) Samples: 2115587

Exact Anomaly Percentage (Contamination): 0.2582


##### Train model

In [None]:
# # # --- 4. Initialize Models OLD ---
# # models = {
# #     "Isolation Forest": {
# #         "model": IsolationForest(n_estimators=100, contamination='auto', random_state=42, n_jobs=-1),
# #         "train_data": X_train_full_benign_scaled
# #     },
# #     "Local Outlier Factor": {
# #         # novelty=True is essential for predicting on new data
# #         "model": LocalOutlierFactor(n_neighbors=20, contamination='auto', novelty=True, n_jobs=-1),
# #         "train_data": X_train_sample_benign_scaled
# #     },
# #     "One-Class SVM": {
# #         "model": OneClassSVM(nu=0.01, kernel="rbf", gamma='auto'),
# #         "train_data": X_train_sample_benign_scaled
# #     }
# # }

# # The actual anomaly ratio in your data is ~0.25 (2,115,587 / 8,192,732)
# known_contamination = 0.25 

# models = {
#     "Isolation Forest": {
#         # FIX: Change contamination from 'auto' to the known ratio to increase sensitivity.
#         "model": IsolationForest(
#             n_estimators=100, 
#             contamination=known_contamination, #<-- THE FIX
#             random_state=42, 
#             n_jobs=-1
#         ),
#         "train_data": X_train_full_benign_scaled
#     },
#     "Local Outlier Factor": {
#         # TUNE: Adjust n_neighbors to fine-tune the precision/recall balance.
#         # Start with a larger value like 30 to potentially reduce false positives.
#         "model": LocalOutlierFactor(
#             n_neighbors=30, #<-- TUNE THIS
#             contamination=known_contamination, 
#             novelty=True, 
#             n_jobs=-1
#         ),
#         "train_data": X_train_sample_benign_scaled
#     },
#     "One-Class SVM": {
#         # FIX: Increase 'nu' to make the model less conservative and catch more anomalies.
#         "model": OneClassSVM(
#             nu=known_contamination, #<-- THE FIX
#             kernel="rbf", 
#             gamma='auto'
#         ),
#         "train_data": X_train_sample_benign_scaled
#     }
# }

New Code for better

In [8]:
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import make_pipeline

In [9]:
# Known contamination ratio in dataset
known_contamination = 0.2582  

# =======================
# Isolation Forest (IF)
# =======================
isolation_forest_params = {
    "n_estimators": 300,      # more trees for stability
    "max_samples": 0.5,       # sample half the data per tree
    "contamination": known_contamination,
    "bootstrap": True,        # improves variance
    "random_state": 42,
    "n_jobs": -1
}

# =======================
# Local Outlier Factor (LOF)
# =======================
lof_params = {
    "n_neighbors": 50,         # <-- try 20, 30, 50, 100 in sweeps
    "contamination": known_contamination,
    "novelty": True,           # needed for predicting on new data
    "n_jobs": -1
}

# =======================
# One-Class SVM (OCSVM)
# =======================
# Subsample benign training data to keep memory manageable
sample_size = 100_000
if len(X_train_sample_benign_scaled) > sample_size:
    X_ocsvm_train = X_train_sample_benign_scaled.sample(n=sample_size, random_state=42)
else:
    X_ocsvm_train = X_train_sample_benign_scaled

# Option A: Simple linear kernel
ocsvm_linear = OneClassSVM(
    nu=known_contamination,
    kernel="linear"
)

# Option B: RBF kernel approximation + linear OCSVM
rbf_feature = RBFSampler(gamma=1, n_components=300, random_state=42)
ocsvm_rbf_approx = make_pipeline(
    rbf_feature,
    OneClassSVM(nu=known_contamination, kernel="linear")
)

# =======================
# Collect models
# =======================
models = {
    "Isolation Forest": {
        "model": IsolationForest(**isolation_forest_params),
        "train_data": X_train_full_benign_scaled
    },
    "Local Outlier Factor": {
        "model": LocalOutlierFactor(**lof_params),
        "train_data": X_train_sample_benign_scaled
    },
    "One-Class SVM (Linear)": {
        "model": ocsvm_linear,
        "train_data": X_ocsvm_train
    },
    "One-Class SVM (RBF Approx)": {
        "model": ocsvm_rbf_approx,
        "train_data": X_ocsvm_train
    }
}

In [None]:
# # --- 5. Train and Evaluate Each Model ---
# # Convert true labels to the 1/-1 format for evaluation
# y_test_mapped = y_test.apply(lambda x: 1 if x == 'Benign' else -1)

# for name, model_info in models.items():
#     print(f"\n--- Training {name} ---")
#     model_instance = model_info["model"]
#     train_data = model_info["train_data"]
    
#     model_instance.fit(train_data)
    
#     print(f"--- Evaluating {name} ---")
#     y_pred = model_instance.predict(X_test_scaled)
    
#     print(f"\nAnomaly Detection Report for {name}:")
#     print(classification_report(y_test_mapped, y_pred, target_names=['Anomaly (-1)', 'Normal (1)']))
#     print(f"Confusion Matrix for {name}:\n", confusion_matrix(y_test_mapped, y_pred))

# print("\nScript finished successfully! 🎉")

# --- 5. Train, Evaluate, and Find the Best Model ---
y_test_mapped = y_test.apply(lambda x: 1 if x == 'Benign' else -1)

# Variables to track the best model
best_f1_score = -1
best_model_name = ""
best_model_to_save = None

for name, model_info in models.items():
    print(f"\n--- Training {name} ---")
    model_instance = model_info["model"]
    train_data = model_info["train_data"]
    
    model_instance.fit(train_data)
    
    print(f"--- Evaluating {name} ---")
    y_pred = model_instance.predict(X_test_scaled)
    
    # --- NEW LOGIC: TRACK THE BEST MODEL ---
    # Calculate F1-score specifically for the anomaly class (-1)
    current_f1 = f1_score(y_test_mapped, y_pred, pos_label=-1)
    
    if current_f1 > best_f1_score:
        best_f1_score = current_f1
        best_model_name = name
        best_model_to_save = model_instance
        print(f"*** New best model found: {name} with Anomaly F1-Score: {current_f1:.4f} ***")
    # --- END OF NEW LOGIC ---

    print(f"\nAnomaly Detection Report for {name}:")
    print(classification_report(y_test_mapped, y_pred, target_names=['Anomaly (-1)', 'Normal (1)']))
    print(f"Confusion Matrix for {name}:\n", confusion_matrix(y_test_mapped, y_pred))


--- Training Isolation Forest ---
--- Evaluating Isolation Forest ---
*** New best model found: Isolation Forest with Anomaly F1-Score: 0.3390 ***

Anomaly Detection Report for Isolation Forest:
              precision    recall  f1-score   support

Anomaly (-1)       0.32      0.36      0.34   2115587
  Normal (1)       0.77      0.74      0.75   6077145

    accuracy                           0.64   8192732
   macro avg       0.55      0.55      0.55   8192732
weighted avg       0.65      0.64      0.65   8192732

Confusion Matrix for Isolation Forest:
 [[ 751936 1363651]
 [1569119 4508026]]

--- Training Local Outlier Factor ---
--- Evaluating Local Outlier Factor ---
*** New best model found: Local Outlier Factor with Anomaly F1-Score: 0.6701 ***

Anomaly Detection Report for Local Outlier Factor:
              precision    recall  f1-score   support

Anomaly (-1)       0.54      0.88      0.67   2115587
  Normal (1)       0.95      0.74      0.83   6077145

    accuracy          

In [None]:
# --- 6. Save the Final Best Model ---
print("\n--- Evaluation Complete ---")
if best_model_to_save:
    print(f"The best model is '{best_model_name}' with an Anomaly F1-Score of {best_f1_score:.4f}")
    
    print("\nSaving the best unsupervised model and its scaler...")
    joblib.dump(best_model_to_save, 'best_unsupervised_model.joblib')
    joblib.dump(scaler, 'unsupervised_scaler.joblib')
    print("Files saved successfully!")
else:
    print("No model was saved as no evaluation was successful.")

print("\nScript finished successfully! 🎉")