In [4]:
# =========================================
# STEP 1: Import Libraries
# =========================================
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix

# Seed NumPy's global RNG so every run samples the exact same logs
np.random.seed(42)

# =========================================
# STEP 2: Generate Synthetic Logs
# =========================================
def _sample_logs(choices, size, label):
    """Build one column-dict of randomly sampled log fields.

    Parameters
    ----------
    choices : dict
        Maps a column name to the list of values that column may take.
        Columns are sampled in insertion order, one ``np.random.choice``
        call per column, so the global RNG stream is consumed in a fixed
        sequence.
    size : int
        Number of rows to draw for every column.
    label : int
        Scalar stored under the "label" key (broadcast to every row once
        the dict is turned into a DataFrame).

    Returns
    -------
    dict
        The sampled columns followed by the "label" entry.
    """
    sampled = {
        column: np.random.choice(options, size=size)
        for column, options in choices.items()
    }
    sampled["label"] = label
    return sampled


# Normal log entries (label 0 = normal)
normal_data = _sample_logs(
    {
        "IP": ["10.0.0.1", "10.0.0.2", "10.0.0.3"],
        "URL": ["/home", "/about", "/products", "/contact"],
        "Status": [200, 301, 302],
        "User_Agent": ["Mozilla", "Chrome", "Safari"],
    },
    size=100,
    label=0,
)

# Anomalous log entries (label 1 = anomaly)
anomaly_data = _sample_logs(
    {
        "IP": ["192.168.1.100", "172.16.0.5", "10.10.10.10"],
        "URL": ["/etc/passwd", "/cgi-bin/formmail.cgi", "/admin"],
        "Status": [404, 500],
        "User_Agent": ["sqlmap", "curl", "bot"],
    },
    size=10,
    label=1,
)

# Stack normal rows first, then anomalies, under a fresh 0..N-1 index
df = pd.concat(
    [pd.DataFrame(columns) for columns in (normal_data, anomaly_data)],
    ignore_index=True,
)

print("✅ Synthetic dataset created")
preview = df.head()
print(preview)
print("\nDataset size:", df.shape)

# Persist the raw (un-encoded) dataset without the index column
df.to_csv("synthetic_logs.csv", index=False)
print("💾 Synthetic logs saved as synthetic_logs.csv")

# =========================================
# STEP 3: Feature Encoding (convert text -> numbers)
# =========================================
from sklearn.preprocessing import LabelEncoder

# Fit one LabelEncoder per feature column. Each fitted encoder is kept,
# keyed by column name, so it remains available for later use.
encoders = {}
encoded_columns = {}
for feature in ("IP", "URL", "Status", "User_Agent"):
    encoder = LabelEncoder()
    encoded_columns[feature] = encoder.fit_transform(df[feature])
    encoders[feature] = encoder

# assign() returns a new frame: `df` keeps its original values, the four
# feature columns are replaced by their integer codes, and "label" is
# carried over untouched.
df_encoded = df.assign(**encoded_columns)

print("\n✅ Features encoded")

# =========================================
# STEP 4: Train IsolationForest
# =========================================
y_true = df_encoded["label"]            # ground truth (0 = normal, 1 = anomaly)
X = df_encoded.drop("label", axis=1)    # every remaining column is a feature

# contamination is the expected share of outliers; it sets the score
# threshold used by predict(). random_state pins the tree construction.
model = IsolationForest(contamination=0.1, random_state=42).fit(X)

# =========================================
# STEP 5: Get Predictions
# =========================================
# IsolationForest.predict returns -1 for anomalies and 1 for normal samples
raw_predictions = model.predict(X)

# Remap to this notebook's convention (1 = anomaly, 0 = normal) and keep
# the result as a plain Python list of ints
y_pred = np.where(raw_predictions == -1, 1, 0).tolist()

# Attach predictions to the human-readable (un-encoded) frame
df["predicted_label"] = y_pred

# =========================================
# STEP 6: Evaluate Accuracy
# =========================================
# Confusion matrix: rows are true labels, columns are predicted labels
print("\n📊 Confusion Matrix:", confusion_matrix(y_true, y_pred), sep="\n")

# Per-class precision / recall / F1, with label 0 shown as "Normal" and 1 as "Anomaly"
print(
    "\n📊 Classification Report:",
    classification_report(y_true, y_pred, target_names=["Normal", "Anomaly"]),
    sep="\n",
)

# =========================================
# STEP 7: Save Results
# =========================================
# Write the original log fields plus true and predicted labels, without the index
df.to_csv("synthetic_logs_with_predictions.csv", index=False)
print("💾 Results saved as synthetic_logs_with_predictions.csv")

# Final expression of the cell: rendered as the first ten rows with predictions
df.head(10)


✅ Synthetic dataset created
         IP        URL  Status User_Agent  label
0  10.0.0.3   /contact     200    Mozilla      0
1  10.0.0.1   /contact     302     Safari      0
2  10.0.0.3   /contact     301     Safari      0
3  10.0.0.3  /products     200    Mozilla      0
4  10.0.0.1  /products     200    Mozilla      0

Dataset size: (110, 5)
💾 Synthetic logs saved as synthetic_logs.csv

✅ Features encoded

📊 Confusion Matrix:
[[99  1]
 [ 0 10]]

📊 Classification Report:
              precision    recall  f1-score   support

      Normal       1.00      0.99      0.99       100
     Anomaly       0.91      1.00      0.95        10

    accuracy                           0.99       110
   macro avg       0.95      0.99      0.97       110
weighted avg       0.99      0.99      0.99       110

💾 Results saved as synthetic_logs_with_predictions.csv


Unnamed: 0,IP,URL,Status,User_Agent,label,predicted_label
0,10.0.0.3,/contact,200,Mozilla,0,0
1,10.0.0.1,/contact,302,Safari,0,0
2,10.0.0.3,/contact,301,Safari,0,0
3,10.0.0.3,/products,200,Mozilla,0,0
4,10.0.0.1,/products,200,Mozilla,0,0
5,10.0.0.1,/products,200,Safari,0,0
6,10.0.0.3,/home,302,Safari,0,0
7,10.0.0.2,/contact,301,Safari,0,0
8,10.0.0.3,/products,200,Chrome,0,0
9,10.0.0.3,/products,200,Mozilla,0,0
