<a href="https://colab.research.google.com/github/ARAVINTH342006/ImageGallery/blob/main/WebAttackDetecter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import the required libraries (pandas, NumPy, scikit-learn, XGBoost, imbalanced-learn, and the plotting libraries)

In [7]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier, plot_importance
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc,roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE


**Load all datasets**





In [None]:
# Load every per-day traffic CSV and merge them into a single DataFrame.
#
# The original cell started with `os.kill(os.getpid(), 9)`, which terminates
# the kernel process before any of the code below can run (and wipes the
# imports from the previous cell). It has been removed.
#
# NOTE(review): assumes Google Drive is already mounted at /content/drive — TODO confirm.
DATA_DIR = "/content/drive/MyDrive/WebAttackDetector/WebAttack ML/TrafficLabelling"
CSV_NAMES = [
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "Monday-WorkingHours.pcap_ISCX.csv",
    "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "Tuesday-WorkingHours.pcap_ISCX.csv",
    "Wednesday-workingHours.pcap_ISCX.csv",
]
csv_files = [f"{DATA_DIR}/{name}" for name in CSV_NAMES]

dataframes = []
for file in csv_files:
    if os.path.exists(file):
        df = pd.read_csv(file, encoding='ISO-8859-1', low_memory=False)
        dataframes.append(df)
    else:
        # Missing files are reported but do not abort the load.
        print(f"{file} not found!")

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list;
# fail with a message that points at the actual problem instead.
if not dataframes:
    raise FileNotFoundError(f"None of the expected CSV files were found under {DATA_DIR}")

combined_df = pd.concat(dataframes, ignore_index=True)
# Strip surrounding whitespace from the header names so later cells can
# refer to columns such as 'Label' by their clean name.
combined_df.columns = combined_df.columns.str.strip()

**Preprocess**

In [None]:
# Columns removed before modelling. errors='ignore' keeps the cell working
# when some of them are not present in the loaded data.
columns_to_drop = ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp']
combined_df = combined_df.drop(columns=columns_to_drop, errors='ignore')

**Clean NaNs and infinities**

In [None]:
# Turn +/-infinity into NaN first so that a single dropna() removes every
# row containing either an infinite or a missing value.
combined_df = (
    combined_df
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

**Simplify label: 0 = BENIGN, 1 = ATTACK**

In [None]:
# Binary target: 0 when the (trimmed, case-insensitive) label is BENIGN,
# 1 for every other label.
# NOTE: this overwrites 'Label' in place, so re-running the cell on the
# already-encoded column would map every row to 1.
is_benign = combined_df['Label'].astype(str).str.strip().str.upper().eq('BENIGN')
combined_df['Label'] = (~is_benign).astype('int64')

**Separate features and label**

In [None]:
# Target vector and feature matrix. Only numeric columns are kept as
# features; any remaining non-numeric column is discarded here.
y = combined_df['Label']
feature_frame = combined_df.drop(columns=['Label'])
X = feature_frame.select_dtypes(include=[np.number])

**Scale features**

In [None]:
# Standardise every feature to zero mean / unit variance.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set statistics influence the scaling —
# consider fitting on the training split only.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

**Train/Test split**

In [None]:
# 70/30 split, stratified on the label so both splits keep the same
# benign/attack ratio; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

**Train XGBoost**

In [None]:
# XGBoost classifier; scale_pos_weight=4 gives the positive (attack) class
# four times the weight of the benign class.
xgb_params = dict(
    use_label_encoder=False,
    eval_metric='logloss',
    max_depth=5,
    scale_pos_weight=4,
    learning_rate=0.1,
    n_estimators=200,
    random_state=42,
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier; class_weight='balanced' compensates for the
# benign/attack class imbalance.
lgbm_params = {
    "n_estimators": 1000,
    "learning_rate": 0.05,
    "max_depth": 8,
    "num_leaves": 31,
    "min_child_samples": 20,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "class_weight": 'balanced',
    "random_state": 42,
}
lgbm_model = LGBMClassifier(**lgbm_params)
lgbm_model.fit(X_train, y_train)

In [None]:
!pip install catboost


In [None]:
from catboost import CatBoostClassifier

# CatBoost classifier; verbose=0 silences the per-iteration training log.
cat_params = {
    "iterations": 300,
    "learning_rate": 0.05,
    "depth": 6,
    "verbose": 0,
    "random_state": 42,
}
cat_model = CatBoostClassifier(**cat_params)

cat_model.fit(X_train, y_train)

In [None]:
# HistGradientBoostingClassifier is a stable estimator in current
# scikit-learn releases; the old
# `from sklearn.experimental import enable_hist_gradient_boosting` opt-in is
# no longer required and only emits a warning, so it has been dropped.
# (This notebook already needs a recent scikit-learn: the SGD cell uses
# loss="log_loss".)
from sklearn.ensemble import HistGradientBoostingClassifier

hgb = HistGradientBoostingClassifier(
    max_iter=300,
    learning_rate=0.05,
    max_depth=6,
    random_state=42
)

hgb.fit(X_train, y_train)

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

# Linear baseline: logistic regression trained with SGD (loss="log_loss"),
# L2-regularised and class-balanced.
# NOTE(review): X_train comes from X_scaled, which was already standardised,
# so the "scaler" step standardises the data a second time.
sgd_classifier = SGDClassifier(
    loss="log_loss",
    penalty="l2",
    max_iter=1000,
    tol=1e-3,
    class_weight="balanced",
    random_state=42,
)
pipeline_steps = [
    ("scaler", StandardScaler()),
    ("sgd", sgd_classifier),
]
pipeline = Pipeline(pipeline_steps)

pipeline.fit(X_train, y_train)

**Feature Importance**

In [None]:
# Top-15 XGBoost features ranked by gain.
# plot_importance expects the fitted model itself (the original passed
# `xgb.columns`, which does not exist on an XGBClassifier). The axes are
# passed explicitly so the plot lands on the 12x6 figure created here
# rather than on a new default-sized one.
# NOTE(review): the model was trained on a NumPy array, so features are
# presumably labelled f0, f1, ... (positions in X.columns) — confirm.
fig, ax = plt.subplots(figsize=(12, 6))
plot_importance(xgb, ax=ax, max_num_features=15, height=0.5, importance_type='gain')
ax.set_title("Top 15 Important Features for Attack Detection (by Gain) Xgboost")
fig.tight_layout()
plt.show()

**Evaluation**

In [None]:
# Top-15 LightGBM features ranked by gain.
# xgboost's plot_importance cannot plot a LightGBM model (and
# `lgbm_model.columns` does not exist), so LightGBM's own plotting helper
# is used instead.
from lightgbm import plot_importance as lgbm_plot_importance

fig, ax = plt.subplots(figsize=(12, 6))
lgbm_plot_importance(
    lgbm_model,
    ax=ax,
    max_num_features=15,
    height=0.5,
    importance_type='gain',
)
ax.set_title("Top 15 Important Features for Attack Detection (by Gain) LightGBM")
fig.tight_layout()
plt.show()

In [None]:
# Top-15 CatBoost features.
# xgboost's plot_importance cannot plot a CatBoost model (and
# `cat_model.columns` does not exist). CatBoost does not offer a "gain"
# importance; its PredictionValuesChange importance is plotted instead and
# the title says so.
cat_importance = pd.Series(
    cat_model.get_feature_importance(type="PredictionValuesChange"),
    index=X.columns,  # the model was fitted on an array; names come from X
)
top_cat = cat_importance.nlargest(15).sort_values()

fig, ax = plt.subplots(figsize=(12, 6))
top_cat.plot.barh(ax=ax, width=0.5)
ax.set_xlabel("PredictionValuesChange importance")
ax.set_ylabel("Features")
ax.set_title("Top 15 Important Features for Attack Detection (PredictionValuesChange) Catboost")
fig.tight_layout()
plt.show()

In [None]:
# Top-15 HistGradientBoosting features.
# HistGradientBoostingClassifier exposes no built-in feature importances
# (and `hgb.columns` does not exist), so permutation importance is computed
# instead. It is evaluated on a fixed random subsample of the test set to
# keep the run time reasonable.
from sklearn.inspection import permutation_importance

n_eval = min(20000, len(X_test))
sample_idx = np.random.RandomState(42).choice(len(X_test), size=n_eval, replace=False)
perm_result = permutation_importance(
    hgb,
    X_test[sample_idx],
    y_test.iloc[sample_idx],  # positional: y_test keeps the original row index
    n_repeats=5,
    random_state=42,
)
hgb_importance = pd.Series(perm_result.importances_mean, index=X.columns)
top_hgb = hgb_importance.nlargest(15).sort_values()

fig, ax = plt.subplots(figsize=(12, 6))
top_hgb.plot.barh(ax=ax, width=0.5)
ax.set_xlabel("Mean decrease in score when permuted")
ax.set_ylabel("Features")
ax.set_title("Top 15 Important Features for Attack Detection (Permutation) HistGradientBoostingClassifier")
fig.tight_layout()
plt.show()

In [None]:
# Top-15 features of the SGD (logistic regression) pipeline.
# A linear model has no "gain" (and `pipeline.columns` does not exist); the
# absolute value of each coefficient is used as the importance. The
# coefficients are comparable because the pipeline standardises its input.
sgd_coefficients = pipeline.named_steps["sgd"].coef_[0]
sgd_importance = pd.Series(np.abs(sgd_coefficients), index=X.columns)
top_sgd = sgd_importance.nlargest(15).sort_values()

fig, ax = plt.subplots(figsize=(12, 6))
top_sgd.plot.barh(ax=ax, width=0.5)
ax.set_xlabel("|coefficient|")
ax.set_ylabel("Features")
ax.set_title("Top 15 Important Features for Attack Detection (by |coefficient|) pipeline")
fig.tight_layout()
plt.show()

In [None]:
# XGBoost on the held-out test set: attack-class probability (column 1)
# and hard class predictions, followed by the per-class report.
y_proba_xgb = xgb.predict_proba(X_test)[:, 1]
y_pred_xgb = xgb.predict(X_test)

report_xgb = classification_report(y_test, y_pred_xgb)
print("Classification Report(xgboost):\n", report_xgb)

In [None]:
# LightGBM on the held-out test set.
# The fitted model is named `lgbm_model`; the original referenced an
# undefined `lgbm`, which raised NameError.
y_pred_lgbm = lgbm_model.predict(X_test)
y_proba_lgbm = lgbm_model.predict_proba(X_test)[:, 1]  # attack-class probability

print("Classification Report(lightGBM):\n", classification_report(y_test, y_pred_lgbm))

In [None]:
# CatBoost on the held-out test set.
# The fitted model is named `cat_model`; the original referenced an
# undefined `cat`, which raised NameError.
y_pred_cat = cat_model.predict(X_test)
y_proba_cat = cat_model.predict_proba(X_test)[:, 1]  # attack-class probability

print("Classification Report(catBoost):\n", classification_report(y_test, y_pred_cat))

In [None]:
# HistGradientBoosting on the held-out test set: attack-class probability
# (column 1) and hard class predictions, followed by the per-class report.
y_proba_hgb = hgb.predict_proba(X_test)[:, 1]
y_pred_hgb = hgb.predict(X_test)

report_hgb = classification_report(y_test, y_pred_hgb)
print("Classification Report(HistGradientBoosting):\n", report_hgb)

In [None]:
# SGD pipeline on the held-out test set: attack-class probability
# (column 1) and hard class predictions, followed by the per-class report.
y_proba_pipeline = pipeline.predict_proba(X_test)[:, 1]
y_pred_pipeline = pipeline.predict(X_test)

report_pipeline = classification_report(y_test, y_pred_pipeline)
print("Classification Report(pipeline):\n", report_pipeline)

**Confusion matrix**

In [None]:
# Confusion matrix for the XGBoost predictions (rows: actual, columns: predicted).
class_names = ["Benign", "Attack"]
cm_xgb = confusion_matrix(y_test, y_pred_xgb)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm_xgb,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set(xlabel="Predicted", ylabel="Actual", title="Confusion Matrix")
plt.show()


In [None]:
# Confusion matrix for the LightGBM predictions (rows: actual, columns: predicted).
class_names = ["Benign", "Attack"]
cm_lgbm = confusion_matrix(y_test, y_pred_lgbm)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm_lgbm,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set(xlabel="Predicted", ylabel="Actual", title="Confusion Matrix")
plt.show()

In [None]:
# Confusion matrix for the CatBoost predictions (rows: actual, columns: predicted).
class_names = ["Benign", "Attack"]
cm_cat = confusion_matrix(y_test, y_pred_cat)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm_cat,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set(xlabel="Predicted", ylabel="Actual", title="Confusion Matrix")
plt.show()

In [None]:
# Confusion matrix for the HistGradientBoosting predictions (rows: actual, columns: predicted).
class_names = ["Benign", "Attack"]
cm_hgb = confusion_matrix(y_test, y_pred_hgb)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm_hgb,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set(xlabel="Predicted", ylabel="Actual", title="Confusion Matrix")
plt.show()

In [None]:
# Confusion matrix for the SGD pipeline predictions (rows: actual, columns: predicted).
class_names = ["Benign", "Attack"]
cm_pipeline = confusion_matrix(y_test, y_pred_pipeline)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm_pipeline,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set(xlabel="Predicted", ylabel="Actual", title="Confusion Matrix")
plt.show()

**ROC Curve**

In [None]:
# ROC curve for the XGBoost model.
# The attack-class probabilities are stored in `y_proba_xgb`; the original
# referenced an undefined `y_proba`, which raised NameError.
fpr, tpr, _ = roc_curve(y_test, y_proba_xgb)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'XGBoost (AUC = {roc_auc:.2f})', color='orange')
# Diagonal = performance of a random classifier, for reference.
ax.plot([0, 1], [0, 1], linestyle='--', color='blue')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Score a single, partially specified flow with the trained XGBoost model.

# Step 1: Start with your new partial data
partial_data = {
    'Destination Port': 54865,
    'Flow Duration': 3,
    'Total Fwd Packets': 2,
    'Total Backward Packets': 0,
    'Total Length of Fwd Packets': 12,
    'Total Length of Bwd Packets': 0,
    'Fwd Packet Length Max': 6,
    'Fwd Packet Length Min': 6,
    'Fwd Packet Length Mean': 6,
    'Fwd Packet Length Std': 0.0,
    # You can add more known values here...
}

# Step 2: Report keys that are not model features instead of silently
# dropping them (usually a typo in the feature name).
unknown_keys = [col for col in partial_data if col not in X.columns]
if unknown_keys:
    print("Ignoring keys that are not model features:", unknown_keys)

# Step 3: Build one row with exactly the training columns, in training
# order. Features that were not supplied are filled with 0. The explicit
# float cast avoids the object-dtype frame produced by filling an empty
# DataFrame cell by cell.
# NOTE(review): 0 is a raw-value placeholder, not a "typical" value; the
# prediction may be sensitive to the unspecified features.
full_input = (
    pd.DataFrame([partial_data])
    .reindex(columns=X.columns, fill_value=0)
    .astype(float)
)

# Step 4: Scale it using your existing scaler
full_input_scaled = scaler.transform(full_input)

# Step 5: Predict using your trained XGBoost model
prediction = xgb.predict(full_input_scaled)
proba = xgb.predict_proba(full_input_scaled)[0][1]  # probability of the attack class

print("Prediction:", "Attack" if prediction[0] == 1 else "Benign")
print("Attack Probability:", proba)


In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on test set.
# The XGBoost predictions are stored in `y_pred_xgb`; the original
# referenced an undefined `y_pred`, which raised NameError.
accuracy = accuracy_score(y_test, y_pred_xgb)
print(f"Accuracy of the XGBoost model: {accuracy * 100:.2f}%")


In [None]:
# Summary of the XGBoost model on the test set.
# Every metric is computed here from `y_pred_xgb` / `y_proba_xgb` (the
# original used an undefined `y_pred`), so the cell does not depend on
# variables left behind by the ROC or accuracy cells.
from sklearn.metrics import accuracy_score

summary_accuracy = accuracy_score(y_test, y_pred_xgb)
roc_auc = roc_auc_score(y_test, y_proba_xgb)
summary_cm = confusion_matrix(y_test, y_pred_xgb)

print(f"""
Model Evaluation:
------------------------
✅ Accuracy       : {summary_accuracy:.4f}
📈 ROC AUC        : {roc_auc:.4f}
📊 Confusion Matrix:
{summary_cm}
""")


In [None]:
import joblib

# Persist the trained XGBoost model together with the fitted scaler; the
# scaler is needed at inference time to transform inputs the same way as
# the training data.
model_path = 'xgboost_cicids2017_model.pkl'
scaler_path = 'scaler_cicids2017.pkl'

joblib.dump(xgb, model_path)
joblib.dump(scaler, scaler_path)
