In [3]:
#Anomaly Detection using IsolationForest
from sklearn.datasets import fetch_kddcup99
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import pandas as pd

# Load data (subset='SA' has normal + smurf attacks)
data = fetch_kddcup99(subset='SA', shuffle=True, percent10=True)
x_raw, y_raw = data.data, data.target

# Convert to DataFrame for easier preprocessing
x_df = pd.DataFrame(x_raw)
y = pd.Series(y_raw).astype(str)

# Encode bytes and categorical columns to numeric
for col in x_df.columns:
    if x_df[col].dtype == object:
        x_df[col] = x_df[col].astype(str)
        x_df[col] = LabelEncoder().fit_transform(x_df[col])

# Encode labels: 'normal.' → 0, others → 1 (attack)
y = y.apply(lambda val: 0 if val == 'normal.' else 1)

# Train Isolation Forest
model = IsolationForest(contamination=0.1, random_state=42)
model.fit(x_df)

# Predict (-1 = anomaly → 1, 1 = normal → 0)
y_pred = (model.predict(x_df) == -1).astype(int)

# Evaluate
print(classification_report(y, y_pred, target_names=['normal', 'attack']))

              precision    recall  f1-score   support

      normal       0.97      0.91      0.94     97278
      attack       0.10      0.29      0.14      3377

    accuracy                           0.89    100655
   macro avg       0.53      0.60      0.54    100655
weighted avg       0.94      0.89      0.91    100655

