# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import IsolationForest
import warnings

# Expose the stdlib ``warnings`` module as ``np.warnings``.
# NOTE(review): presumably a compatibility shim for a dependency that still
# looks up ``np.warnings`` on NumPy versions that no longer provide it --
# confirm, and remove once the dependencies are updated.
np.warnings = warnings

# Fraction of rows sampled from each file.
SAMPLE_FRAC = 0.1
# Seed for the row sampling, so repeated runs train and evaluate on the same
# subset (the model below is seeded as well).
SAMPLE_SEED = 42

# Identify categorical features.
categorical_cols = ['protocol_type', 'service', 'flag']

# Define all feature names based on NSL-KDD dataset
# NOTE(review): the published NSL-KDD schema appears to call the 34th/35th
# columns 'dst_host_same_srv_rate' / 'dst_host_diff_srv_rate'; the names
# below are kept unchanged -- confirm nothing depends on them before renaming.
all_features = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land',
    'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
    'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_count',
    'dst_host_diff_srv_count', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'target', 'difficulty'
]


def _load_nsl_kdd(path, frac=SAMPLE_FRAC, seed=SAMPLE_SEED):
    """Load one header-less NSL-KDD file and prepare it for modelling.

    Args:
        path: Location of the comma-separated data file.
        frac: Fraction of rows to keep (sampled without replacement).
        seed: Random seed used for the sampling.

    Returns:
        A DataFrame whose columns are ``all_features`` minus 'difficulty',
        with 'target' binarized to 0 for 'normal' rows and 1 otherwise.
    """
    frame = pd.read_csv(path, header=None, sep=',', engine='python')
    frame.columns = all_features
    frame = frame.sample(frac=frac, replace=False, random_state=seed)
    # Vectorized equivalent of mapping every non-'normal' label to 1.
    frame['target'] = (frame['target'] != 'normal').astype(int)
    # Drop 'difficulty' column (not needed for model training)
    return frame.drop(columns=['difficulty'])


df = _load_nsl_kdd("../Dataset/KDDTrain+.txt")
test_df = _load_nsl_kdd("../Dataset/KDDTest+.txt")

# One-hot encode the categorical columns. The encoder is fitted on the
# training rows only; categories that appear only in the test rows are
# encoded as all zeros because of handle_unknown='ignore'.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
train_onehot = encoder.fit_transform(df[categorical_cols])
test_onehot = encoder.transform(test_df[categorical_cols])

# Wrap the dense arrays in DataFrames labelled with the generated column names.
encoded_feature_names = encoder.get_feature_names_out(categorical_cols)
df_encoded = pd.DataFrame(train_onehot, columns=encoded_feature_names)
test_df_encoded = pd.DataFrame(test_onehot, columns=encoded_feature_names)

# Swap the raw categorical columns for their encoded counterparts. The index
# is reset first so the rows line up positionally with the freshly built
# (0..n-1 indexed) encoded frames during the column-wise concat.
train_remaining = df.drop(columns=categorical_cols).reset_index(drop=True)
test_remaining = test_df.drop(columns=categorical_cols).reset_index(drop=True)
df = pd.concat([train_remaining, df_encoded], axis=1)
test_df = pd.concat([test_remaining, test_df_encoded], axis=1)

# Standardize every feature column. The label column is left out, and the
# scaling statistics are learned from the training rows only.
train_features = df.drop(columns=['target'])
test_features = test_df.drop(columns=['target'])

scaler = StandardScaler()
df_scaled = scaler.fit_transform(train_features)
test_df_scaled = scaler.transform(test_features)

print(f"Dataset shape after processing: {df_scaled.shape}")

# Percentile of the test-set anomaly scores used as the decision threshold.
ANOMALY_PERCENTILE = 33

# Run isolation forests
iso_forest = IsolationForest(n_estimators=500, max_samples=1.0, contamination='auto', random_state=42)
iso_forest.fit(df_scaled)

# Score the test rows once and reuse the result for both the predictions and
# the plot; lower decision_function values mean "more anomalous".
anomaly_scores = iso_forest.decision_function(test_df_scaled)
threshold = np.percentile(anomaly_scores, ANOMALY_PERCENTILE)
test_preds = (anomaly_scores < threshold).astype(int)  # Anomaly if score < threshold

# Plot anomaly score distribution
plt.figure(figsize=(8,5))
sns.histplot(anomaly_scores, bins=50, kde=True)
plt.xlabel("Anomaly Score")
plt.ylabel("Frequency")
plt.title("Anomaly Score Distribution")
# Draw the threshold that actually produced test_preds, so the figure and the
# reported metrics describe the same decision boundary.
plt.axvline(x=threshold, color='red', linestyle='dashed',
            label=f'Threshold (percentile {ANOMALY_PERCENTILE})')
plt.legend()
plt.show()

# Ground-truth labels (0 = normal, 1 = attack). test_df still carries the
# 'target' column and is in the same row order as the scaled test features
# that test_preds was computed from.
test_labels = test_df['target'].to_numpy()

# Compute & plot confusion matrix
cm = confusion_matrix(test_labels, test_preds)
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['Normal', 'Anomalous'], yticklabels=['Normal', 'Anomalous'])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

# Evaluate the model
accuracy = accuracy_score(test_labels, test_preds)
f1 = f1_score(test_labels, test_preds)
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(classification_report(test_labels, test_preds, zero_division=0))