<a href="https://colab.research.google.com/github/Arashi283/AIRepoOne/blob/main/DDoS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ================== CICDDoS2019 DDoS Detection Pipeline ==================
# Train DecisionTree, RandomForest, XGBoost, LightGBM
# ========================================================================

import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import SMOTE
from google.colab import files
import io
import os

# ================== Step 1: Dataset Upload ==================
# Reuse "CICDDoS2019.csv" from the working directory when it is already there;
# otherwise ask for an upload through the Colab file picker.
if os.path.exists("CICDDoS2019.csv"):
    filename = "CICDDoS2019.csv"
else:
    print("Upload your CICDDoS2019 CSV file:")
    uploaded = files.upload()
    if not uploaded:
        # If the upload returns no entries (e.g. it was cancelled), fail with a
        # clear message instead of an opaque IndexError on the first-key lookup.
        raise FileNotFoundError("No file was uploaded. Please upload the CICDDoS2019 CSV file.")
    # The first uploaded entry is used as the dataset.
    filename = next(iter(uploaded))

print(f"Using dataset: {filename}")

# ================== Step 2: Data Loading and Preprocessing ==================
df = pd.read_csv(filename)

# Drop unnamed or irrelevant columns
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Identify label column (commonly ' Label' or 'Label'); the first column whose
# name contains "label" (case-insensitive) wins.
label_candidates = [col for col in df.columns if 'label' in col.lower()]
if not label_candidates:
    raise KeyError(f"No label column found. Available columns: {df.columns.tolist()}")
label_col = label_candidates[0]

# Drop rows with NaN or inf. This must happen BEFORE the label is cast to str:
# otherwise a missing label becomes the string "nan", survives dropna and is
# then encoded as an attack (1).
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

# Encode target (Normal = 0, Attack = 1): any label containing "BENIGN"
# (case-insensitive, surrounding whitespace removed) is 0, everything else is 1.
df[label_col] = df[label_col].astype(str).str.strip()
df[label_col] = np.where(df[label_col].str.contains("BENIGN", case=False), 0, 1)

# Encode non-numeric columns. Values are cast to str first so that object
# columns holding a mix of numbers and strings can still be encoded.
for col in df.select_dtypes(include=['object']).columns:
    if col != label_col:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Split features and labels
X = df.drop(columns=[label_col])
y = df[label_col]

# Train/test split (stratified, 80/20). The split is done on the raw features
# FIRST so that the scaler below only ever sees training rows; fitting it on
# the full dataset would leak test-set statistics into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale numeric data: fit on the training split, apply the same transform to test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
del X_train_raw, X_test_raw  # raw copies are no longer needed

# Kept so that any later code referring to X_scaled keeps working; it is the
# full feature matrix transformed with the training-split statistics.
X_scaled = scaler.transform(X)

# ================== Step 3: Balance Data Using SMOTE ==================
# Only the training split is resampled here; X_test / y_test are not touched.
print("Applying SMOTE to balance classes...")
sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)

# Show how many samples each class has after resampling.
print("Balanced class distribution:", pd.Series(y_train_bal).value_counts(), sep="\n")

# ================== Step 4: Define Models ==================
# Display name -> unfitted estimator. Insertion order is the order in which
# the models are trained and reported. All estimators use random_state=42.
models = {}
models["Decision Tree"] = DecisionTreeClassifier(random_state=42)
models["Random Forest"] = RandomForestClassifier(random_state=42, n_estimators=200)
models["XGBoost"] = XGBClassifier(
    n_estimators=300,
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
)
models["LightGBM"] = LGBMClassifier(random_state=42, n_estimators=300)

# ================== Step 5: Training with Iterations ==================
# For every model: fit `epochs` times on the balanced training data, record the
# training accuracy of each pass, then evaluate once on the test split and
# collect accuracy, ROC-AUC, ROC curve points and feature importances.
epochs = 10
results, roc_data, feature_importances = [], {}, {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    start_time = time.time()
    epoch_accuracies = []

    # NOTE(review): every pass calls fit() again on the same data with the same
    # estimator settings, so the per-epoch accuracy presumably stays flat, and
    # total_time below covers all `epochs` fits plus the train-set predictions
    # -- confirm this is intended.
    for i in range(epochs):
        epoch_start = time.time()
        model.fit(X_train_bal, y_train_bal)
        acc = accuracy_score(y_train_bal, model.predict(X_train_bal))
        epoch_accuracies.append(acc)
        elapsed = time.time() - epoch_start
        print(f"Epoch {i+1}/{epochs} - Accuracy: {acc:.4f} - Time: {elapsed:.2f}s")

    total_time = time.time() - start_time

    # Test-set evaluation with the model from the last pass.
    y_proba_test = model.predict_proba(X_test)[:, 1]  # probability of class 1
    y_pred_test = model.predict(X_test)
    roc_auc = roc_auc_score(y_test, y_proba_test)
    acc_test = accuracy_score(y_test, y_pred_test)
    results.append([name, acc_test, roc_auc, total_time])

    # ROC curve points, kept for the combined plot drawn later.
    fpr, tpr, _ = roc_curve(y_test, y_proba_test)
    roc_data[name] = (fpr, tpr, roc_auc)

    # Feature importance (only for estimators that expose it).
    if hasattr(model, "feature_importances_"):
        feature_importances[name] = model.feature_importances_

    # Accuracy vs Epochs, one figure per model.
    epoch_axis = range(1, epochs + 1)
    plt.figure()
    plt.plot(epoch_axis, epoch_accuracies, marker='o')
    plt.xlabel("Epoch")
    plt.ylabel("Training Accuracy")
    plt.title(f"Accuracy vs Epochs - {name}")
    plt.grid(True)
    plt.show()

# ================== Step 6: Display Results ==================
# One row per trained model, in training order.
summary_columns = ["Model", "Test Accuracy", "ROC-AUC", "Training Time (s)"]
results_df = pd.DataFrame(results, columns=summary_columns)
print("\nFinal Model Performance:", results_df, sep="\n")

# ================== Step 7: ROC-AUC Curves ==================
# Overlay the ROC curve of every model on a single figure.
plt.figure(figsize=(8, 6))
for name in roc_data:
    fpr, tpr, auc_val = roc_data[name]
    plt.plot(fpr, tpr, label=f"{name} (AUC={auc_val:.4f})")

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC-AUC Curves for All Models")
plt.legend()
plt.grid(True)
plt.show()

# ================== Step 8: Feature Importance ==================
# One horizontal bar chart per model showing its 15 highest importances,
# labelled with the feature column names.
for name, importances in feature_importances.items():
    ranked = pd.Series(importances, index=X.columns).sort_values(ascending=False)
    feat_imp = ranked.head(15)
    plt.figure(figsize=(8, 5))
    sns.barplot(x=feat_imp.values, y=feat_imp.index)
    plt.title(f"Top 15 Important Features - {name}")
    plt.show()

# ================== Step 9: Correlation-Based Feature Selection (CFS-like) ==================
# Rank features by absolute correlation with the balanced training labels and
# keep the 20 highest-ranked ones.
balanced_frame = pd.DataFrame(X_train_bal, columns=X.columns)
corr = balanced_frame.corrwith(pd.Series(y_train_bal)).abs().sort_values(ascending=False)
top_corr_features = corr.head(20).index
print("\nTop correlated features selected for CFS-like FS:")
print(list(top_corr_features))

# Map the selected names to column positions once, then slice both splits.
cfs_positions = [X.columns.get_loc(f) for f in top_corr_features]
X_train_cfs = X_train_bal[:, cfs_positions]
X_test_cfs = X_test[:, cfs_positions]

# Train a fresh random forest on the reduced feature set and score it on the test split.
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train_cfs, y_train_bal)
y_pred_cfs = model.predict(X_test_cfs)
cfs_accuracy = accuracy_score(y_test, y_pred_cfs)
print("\nAccuracy with CFS-like Feature Selection:", cfs_accuracy)

# ================== Step 10: Embedded Feature Selection ==================
# Select features with a random-forest-backed SelectFromModel (threshold
# "median"), fitted on the balanced training data only.
selector_forest = RandomForestClassifier(n_estimators=200, random_state=42)
embed_model = SelectFromModel(selector_forest, threshold="median")
X_train_emb = embed_model.fit_transform(X_train_bal, y_train_bal)
X_test_emb = embed_model.transform(X_test)

# Retrain a random forest on the selected features and score it on the test split.
rf_emb = RandomForestClassifier(n_estimators=200, random_state=42)
rf_emb.fit(X_train_emb, y_train_bal)
y_pred_emb = rf_emb.predict(X_test_emb)
emb_accuracy = accuracy_score(y_test, y_pred_emb)
print("Accuracy with Embedded Feature Selection:", emb_accuracy)


Upload your CICDDoS2019 CSV file:


In [5]:
# ================== CICDDoS2019 DDoS Detection Pipeline ==================
# Handles multiple CSVs, merges them, trains 4 ML models with SMOTE balancing
# Displays training progress, ROC-AUC, Feature Importance, and Feature Selection results
# ========================================================================

import pandas as pd
import numpy as np
import os
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import SMOTE
from google.colab import files

# ================== Step 1: Dataset Upload ==================
folder_path = "/content/cicddos2019_csvs"

# Decide whether to prompt based on the folder's CONTENTS, not its existence:
# if the folder were created and the upload then interrupted, an existence
# check would skip the upload on every later run and leave the folder empty.
os.makedirs(folder_path, exist_ok=True)
has_csvs = any(f.endswith(".csv") for f in os.listdir(folder_path))

if not has_csvs:
    print("Upload all your CICDDoS2019 CSV files (8 files for various DDoS types).")
    uploaded = files.upload()
    # Move every uploaded file from the working directory into the dataset folder.
    for fname in uploaded:
        os.rename(fname, os.path.join(folder_path, fname))

# ================== Step 2: Merge All CSV Files ==================
# Sorted so the row order of the merged frame (and therefore the seeded
# train/test split) does not depend on the arbitrary order of os.listdir.
all_files = sorted(
    os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".csv")
)
if not all_files:
    raise FileNotFoundError("No CSV files found in the folder. Please upload the dataset CSVs.")

print(f"Found {len(all_files)} CSV files. Merging...")

dfs = []
for file in all_files:
    # Best effort: a file that cannot be read is reported and skipped.
    try:
        temp = pd.read_csv(file)
        temp = temp.loc[:, ~temp.columns.str.contains('^Unnamed')]
    except Exception as e:
        print(f"Error reading {file}: {e}")
    else:
        dfs.append(temp)
        print(f"Loaded: {file} ({len(temp)} rows)")

if not dfs:
    # Every file failed to load; pd.concat would raise an opaque error here.
    raise ValueError("None of the CSV files could be read; nothing to merge.")

df = pd.concat(dfs, ignore_index=True)
print(f"\nTotal combined dataset shape: {df.shape}")

# ================== Step 3: Detect Label Column ==================
# Use the first column whose name contains "label", "attack" or "target"
# (case-insensitive); otherwise ask for the name interactively.
possible_labels = [col for col in df.columns if any(x in col.lower() for x in ['label', 'attack', 'target'])]
if possible_labels:
    label_col = possible_labels[0]
    print(f"Detected label column: {label_col}")
else:
    print("\nNo column named like 'Label' or 'Attack' found. Available columns:")
    print(df.columns.tolist())
    entered = input("Enter the label column name: ").strip()
    # The typed name is stripped, so compare it against stripped column names;
    # otherwise a column with leading/trailing whitespace could never be chosen.
    matches = [col for col in df.columns if str(col).strip() == entered]
    if not matches:
        # Fail fast rather than after the expensive cleaning step.
        raise KeyError(f"Column '{entered}' not found in the dataset.")
    label_col = matches[0]

# ================== Step 4: Data Cleaning ==================
# Turn +/-inf into NaN so that one dropna removes both kinds of bad rows.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

# Encode target (Normal = 0, Attack = 1): labels containing "BENIGN"
# (case-insensitive, surrounding whitespace removed) map to 0, all others to 1.
df[label_col] = df[label_col].astype(str).str.strip()
is_benign = df[label_col].str.contains("BENIGN", case=False)
df[label_col] = np.where(is_benign, 0, 1)

# Encode non-numeric features as integer codes, one encoder per column.
text_cols = [c for c in df.select_dtypes(include=['object']).columns if c != label_col]
for col in text_cols:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Features are every column except the label; the label is the target.
X = df.drop(columns=[label_col])
y = df[label_col]

# Train/test split (stratified, 80/20). The split is done on the raw features
# FIRST so that the scaler below only ever sees training rows; fitting it on
# the full dataset would leak test-set statistics into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize: fit on the training split, apply the same transform to test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
del X_train_raw, X_test_raw  # raw copies are no longer needed

# Kept so that any later code referring to X_scaled keeps working; it is the
# full feature matrix transformed with the training-split statistics.
X_scaled = scaler.transform(X)

# ================== Step 5: Balance Using SMOTE ==================
# Only the training split is resampled here; X_test / y_test are not touched.
print("\nApplying SMOTE to balance classes...")
sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)

# Show how many samples each class has after resampling.
print("Class distribution after SMOTE:", pd.Series(y_train_bal).value_counts(), sep="\n")

# ================== Step 6: Define Models ==================
# Display name -> unfitted estimator. Insertion order is the order in which
# the models are trained and reported. All estimators use random_state=42.
models = {}
models["Decision Tree"] = DecisionTreeClassifier(random_state=42)
models["Random Forest"] = RandomForestClassifier(random_state=42, n_estimators=300)
models["XGBoost"] = XGBClassifier(
    n_estimators=500,
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
)
models["LightGBM"] = LGBMClassifier(random_state=42, n_estimators=500)

# Number of fit passes per model, plus the containers filled by the training step.
epochs = 10
results, roc_data, feature_importances = [], {}, {}

# ================== Step 7: Train and Evaluate ==================
# For every model: fit `epochs` times on the balanced training data, record the
# training accuracy of each pass, then evaluate once on the test split and
# collect accuracy, ROC-AUC, ROC curve points and feature importances.
for name, model in models.items():
    print(f"\nTraining {name}...")
    start_time = time.time()
    epoch_accuracies = []

    # NOTE(review): every pass calls fit() again on the same data with the same
    # estimator settings, so the per-epoch accuracy presumably stays flat, and
    # total_time below covers all `epochs` fits plus the train-set predictions
    # -- confirm this is intended.
    for i in range(epochs):
        epoch_start = time.time()
        model.fit(X_train_bal, y_train_bal)
        acc = accuracy_score(y_train_bal, model.predict(X_train_bal))
        epoch_accuracies.append(acc)
        elapsed = time.time() - epoch_start
        print(f"Epoch {i+1}/{epochs} - Accuracy: {acc:.4f} - Time: {elapsed:.2f}s")

    total_time = time.time() - start_time

    # Test-set evaluation with the model from the last pass.
    y_proba_test = model.predict_proba(X_test)[:, 1]  # probability of class 1
    y_pred_test = model.predict(X_test)
    roc_auc = roc_auc_score(y_test, y_proba_test)
    acc_test = accuracy_score(y_test, y_pred_test)

    results.append([name, acc_test, roc_auc, total_time])

    # ROC curve points, kept for the combined plot drawn later.
    fpr, tpr, _ = roc_curve(y_test, y_proba_test)
    roc_data[name] = (fpr, tpr, roc_auc)

    # Feature importance (only for estimators that expose it).
    if hasattr(model, "feature_importances_"):
        feature_importances[name] = model.feature_importances_

    # Accuracy vs Epochs, one figure per model.
    epoch_axis = range(1, epochs + 1)
    plt.figure()
    plt.plot(epoch_axis, epoch_accuracies, marker='o')
    plt.xlabel("Epoch")
    plt.ylabel("Training Accuracy")
    plt.title(f"Accuracy vs Epochs - {name}")
    plt.grid(True)
    plt.show()

# ================== Step 8: Results Summary ==================
# One row per trained model, in training order.
summary_columns = ["Model", "Test Accuracy", "ROC-AUC", "Training Time (s)"]
results_df = pd.DataFrame(results, columns=summary_columns)
print("\nFinal Model Performance:", results_df, sep="\n")

# ================== Step 9: ROC-AUC ==================
# Overlay the ROC curve of every model on a single figure.
plt.figure(figsize=(8, 6))
for name in roc_data:
    fpr, tpr, auc_val = roc_data[name]
    plt.plot(fpr, tpr, label=f"{name} (AUC={auc_val:.4f})")

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC-AUC Curves for All Models")
plt.legend()
plt.grid(True)
plt.show()

# ================== Step 10: Feature Importances ==================
# One horizontal bar chart per model showing its 15 highest importances,
# labelled with the feature column names.
for name, importances in feature_importances.items():
    ranked = pd.Series(importances, index=X.columns).sort_values(ascending=False)
    feat_imp = ranked.head(15)
    plt.figure(figsize=(8, 5))
    sns.barplot(x=feat_imp.values, y=feat_imp.index)
    plt.title(f"Top 15 Important Features - {name}")
    plt.show()

# ================== Step 11: CFS-like Feature Selection ==================
# Rank features by absolute correlation with the balanced training labels and
# keep the 20 highest-ranked ones.
balanced_frame = pd.DataFrame(X_train_bal, columns=X.columns)
corr = balanced_frame.corrwith(pd.Series(y_train_bal)).abs().sort_values(ascending=False)
top_corr_features = corr.head(20).index
print("\nTop correlated features selected (CFS-like):")
print(list(top_corr_features))

# Map the selected names to column positions once, then slice both splits.
cfs_positions = [X.columns.get_loc(f) for f in top_corr_features]
X_train_cfs = X_train_bal[:, cfs_positions]
X_test_cfs = X_test[:, cfs_positions]

# Train a fresh random forest on the reduced feature set and score it on the test split.
model = RandomForestClassifier(n_estimators=300, random_state=42)
model.fit(X_train_cfs, y_train_bal)
y_pred_cfs = model.predict(X_test_cfs)
cfs_accuracy = accuracy_score(y_test, y_pred_cfs)
print("\nAccuracy with CFS-like Feature Selection:", cfs_accuracy)

# ================== Step 12: Embedded Feature Selection ==================
# Select features with a random-forest-backed SelectFromModel (threshold
# "median"), fitted on the balanced training data only.
selector_forest = RandomForestClassifier(n_estimators=300, random_state=42)
embed_model = SelectFromModel(selector_forest, threshold="median")
X_train_emb = embed_model.fit_transform(X_train_bal, y_train_bal)
X_test_emb = embed_model.transform(X_test)

# Retrain a random forest on the selected features and score it on the test split.
rf_emb = RandomForestClassifier(n_estimators=300, random_state=42)
rf_emb.fit(X_train_emb, y_train_bal)
y_pred_emb = rf_emb.predict(X_test_emb)
emb_accuracy = accuracy_score(y_test, y_pred_emb)
print("Accuracy with Embedded Feature Selection:", emb_accuracy)


Upload all your CICDDoS2019 CSV files (8 files for various DDoS types).


KeyboardInterrupt: 