In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 📥 Step 1: Load the dataset
df = pd.read_csv("custom_sdn_dataset_large.csv")  # Use your actual file name here

# 🏷 Step 2: Encode the labels (BENIGN=0, DDoS=1)
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# 🧹 Step 3: Drop IP address columns and highly predictive features
df = df.drop(columns=['src_ip', 'dst_ip', 'packet_count', 'byte_count'])

# 🔉 Step 4: Add slight noise to 'duration' to simulate real-world variance
# Coerce to numeric first so stray non-numeric entries become 0 instead of
# breaking the in-place addition below.
np.random.seed(42)
df['duration'] = pd.to_numeric(df['duration'], errors='coerce').fillna(0)
df['duration'] += np.random.normal(0, 0.05, size=df.shape[0])

# 🎯 Step 5: Flip 1% of labels to introduce realistic label noise
y = df['label'].astype(int).copy()
# The `1 - y` flip below is only meaningful for labels encoded as 0/1.
assert set(y.unique()) <= {0, 1}, "label flipping expects binary 0/1 labels"
flip_indices = np.random.choice(y.index, size=int(0.01 * len(y)), replace=False)
y.loc[flip_indices] = 1 - y.loc[flip_indices]  # Flip 0 to 1 and 1 to 0
# NOTE(review): the flip is applied before the split, so the test labels are
# noisy as well and the metrics below are measured against noisy ground truth.

# 🔎 Step 6: Extract features
X = df.drop(columns=['label'])

# ✂️ Step 7: Train-Test Split (done BEFORE scaling so that no test-set
# statistics leak into the preprocessing step)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 📏 Step 8: Normalize features
# The scaler learns mean/std from the training split only; the test split is
# transformed with those same training statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# 🤖 Step 9: Train Random Forest and Decision Tree classifiers
ml_models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

# 🧪 Step 10: Evaluate and display results
for name, model in ml_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"\n🔍 {name}")
    print(f"Accuracy:  {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall:    {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score:  {f1_score(y_test, y_pred):.4f}")



🔍 Random Forest
Accuracy:  0.9850
Precision: 0.9872
Recall:    0.9747
F1 Score:  0.9809

🔍 Decision Tree
Accuracy:  0.9850
Precision: 0.9872
Recall:    0.9747
F1 Score:  0.9809


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 📥 Load dataset
df = pd.read_csv("custom_sdn_dataset_large.csv")

# 🏷 Encode labels (BENIGN=0, DDoS=1)
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# 🧹 Drop IPs and highly predictive features
df = df.drop(columns=['src_ip', 'dst_ip', 'packet_count', 'byte_count'])

# 🔉 Add small noise to 'duration'
# Non-numeric entries are coerced to NaN and then replaced with 0 before the
# Gaussian noise is added.
np.random.seed(42)
df['duration'] = pd.to_numeric(df['duration'], errors='coerce').fillna(0)
df['duration'] += np.random.normal(0, 0.02, size=df.shape[0])

# 🎯 Flip 2% of labels to simulate noise
y = df['label'].astype(int).copy()
# The `1 - y` flip below is only meaningful for labels encoded as 0/1.
assert set(y.unique()) <= {0, 1}, "label flipping expects binary 0/1 labels"
flip_indices = np.random.choice(y.index, size=int(0.02 * len(y)), replace=False)
y.loc[flip_indices] = 1 - y.loc[flip_indices]
# NOTE(review): the flip is applied before the split, so the test labels are
# noisy as well and the metrics below are measured against noisy ground truth.

# 📊 Features
X = df.drop(columns=['label'])

# ✂️ Train-test split (done BEFORE scaling so that no test-set statistics
# leak into the preprocessing step)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 📏 Scaling
# The scaler learns mean/std from the training split only; the test split is
# transformed with those same training statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# 🤖 Define models
models = {
    "Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "SVM (Linear Kernel)": SVC(kernel='linear', probability=True, random_state=42)
}

# 🧪 Train and evaluate
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"\n🔍 {name}")
    print(f"Accuracy:  {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall:    {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score:  {f1_score(y_test, y_pred):.4f}")



🔍 Naive Bayes
Accuracy:  0.9550
Precision: 0.9167
Recall:    0.9747
F1 Score:  0.9448

🔍 Logistic Regression
Accuracy:  0.9750
Precision: 0.9625
Recall:    0.9747
F1 Score:  0.9686

🔍 SVM (Linear Kernel)
Accuracy:  0.9750
Precision: 0.9625
Recall:    0.9747
F1 Score:  0.9686
