# Unsupervised Models Training (Isolation Forest)

In [1]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report

In [2]:
# Columns of the pre-computed URL feature table that feed the model.
feature_cols = [
    'url_length',
    'num_digits',
    'num_special',
    'has_ip',
    'path_length',
    'domain_length',
    'num_subdomains',
    'has_suspicious_words',
    'entropy',
]

# Read the feature table, then split it into the feature matrix and the label column.
df = pd.read_csv('../url_features.csv')
y = df['label']
X = df[feature_cols]

# Sanity check: (rows, number of feature columns).
print("Data shape:", X.shape)

Data shape: (491876, 9)


In [3]:
# Train Isolation Forest
# The model is fit on the features only; the labels in `y` are used further
# down purely as a reference check. With contamination=0.1 the fitted
# threshold flags about 10% of the training rows as anomalies.
iso_forest = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)
iso_forest.fit(X)

# Predict anomalies (1 normal, -1 anomaly)
# decision_function returns the continuous score behind those labels
# (negative values correspond to predicted anomalies).
anomaly_scores = iso_forest.decision_function(X)
anomaly_pred = iso_forest.predict(X)

# Map to 0 benign, 1 malicious (anomaly as malicious)
anomaly_pred_mapped = np.where(anomaly_pred == -1, 1, 0)

print("Anomaly prediction distribution:", pd.Series(anomaly_pred_mapped).value_counts())

# Evaluate on known labels (for reference)
# NOTE: this scores the same rows the model was fit on, so it is not a
# held-out estimate of performance.
print("Classification report for anomaly detection:")
print(classification_report(y, anomaly_pred_mapped))

# Save model
# Create the target directory first so a fresh checkout does not fail with
# FileNotFoundError after the (expensive) fit has already completed.
model_path = '../models/isolation_forest.pkl'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(iso_forest, model_path)
print("Isolation Forest saved.")

Anomaly prediction distribution: 0    442688
1     49188
Name: count, dtype: int64
Classification report for anomaly detection:
              precision    recall  f1-score   support

           0       0.53      0.96      0.68    245938
           1       0.78      0.16      0.26    245938

    accuracy                           0.56    491876
   macro avg       0.65      0.56      0.47    491876
weighted avg       0.65      0.56      0.47    491876

Isolation Forest saved.
