In [6]:
# ============================================
# SELF LEARNING CYBER ATTACK DETECTION BOT
# FINAL TRAINING CODE (CICIDS DATASET)
# ============================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

# ============================================
# 1. LOAD DATASET
# ============================================

file_path = r'E:/Projects/ML/KSR/data/CICD2017.csv'

print("Loading dataset...")
df = pd.read_csv(file_path)

print("Dataset Loaded Successfully")
print("Dataset Shape:", df.shape)

# ============================================
# 2. AUTO DETECT LABEL COLUMN
# ============================================

possible_labels = ['label', 'attack', 'attack type', 'class']

label_col = None
for col in df.columns:
    if col.strip().lower() in possible_labels:
        label_col = col
        break

# if not found, assume last column
if label_col is None:
    label_col = df.columns[-1]

print("\nDetected Label Column:", label_col)

# ============================================
# 3. CLEAN LABEL VALUES
# ============================================

df[label_col] = df[label_col].astype(str)
df[label_col] = df[label_col].str.strip()
df[label_col] = df[label_col].str.lower()

print("\nAvailable Labels:")
print(df[label_col].unique())

# Convert labels safely
# benign / normal → 0
# attack → 1
normal_keywords = ['benign', 'normal']

df[label_col] = df[label_col].apply(
    lambda x: 0 if any(k in x for k in normal_keywords) else 1
)

print("\nLabel Distribution:")
print(df[label_col].value_counts())

# ============================================
# 4. REMOVE INVALID VALUES
# ============================================

print("\nCleaning data...")

df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

print("After cleaning shape:", df.shape)

# ============================================
# 5. SPLIT FEATURES & LABEL
# ============================================

X = df.drop(label_col, axis=1)
y = df[label_col]

# ============================================
# 6. NORMALIZATION
# ============================================

print("\nScaling features...")

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ============================================
# 7. TRAIN ONLY NORMAL TRAFFIC
# ============================================

X_normal = X_scaled[y == 0]

print("Normal samples found:", len(X_normal))

if len(X_normal) == 0:
    raise ValueError(
        "No BENIGN/NORMAL samples found. "
        "This file may contain only attack traffic. "
        "Use a dataset containing normal traffic also."
    )

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# ============================================
# 8. TRAIN ISOLATION FOREST MODEL
# ============================================

print("\nTraining Isolation Forest Model...")

model = IsolationForest(
    n_estimators=120,
    contamination=0.03,
    random_state=42,
    n_jobs=-1
)

model.fit(X_normal)

print("Model Training Completed")

# ============================================
# 9. PREDICTION
# ============================================

print("\nRunning predictions...")

y_pred = model.predict(X_test)

# convert output
y_pred = np.where(y_pred == -1, 1, 0)

# ============================================
# 10. MODEL EVALUATION
# ============================================

print("\nModel Performance Report:")
print(classification_report(y_test, y_pred))

# ============================================
# 11. SAVE MODEL & SCALER
# ============================================

joblib.dump(model, "anomaly_model.pkl")
joblib.dump(scaler, "scaler.pkl")

print("\nModel and scaler saved successfully!")


Loading dataset...
Dataset Loaded Successfully
Dataset Shape: (2520751, 53)

Detected Label Column: Attack Type

Available Labels:
['normal traffic' 'port scanning' 'web attacks' 'brute force' 'ddos'
 'bots' 'dos']

Label Distribution:
Attack Type
0    2095057
1     425694
Name: count, dtype: int64

Cleaning data...
After cleaning shape: (2520751, 53)

Scaling features...
Normal samples found: 2095057

Training Isolation Forest Model...
Model Training Completed

Running predictions...

Model Performance Report:
              precision    recall  f1-score   support

           0       0.89      0.97      0.93    418697
           1       0.73      0.39      0.51     85454

    accuracy                           0.87    504151
   macro avg       0.81      0.68      0.72    504151
weighted avg       0.86      0.87      0.86    504151


Model and scaler saved successfully!
