# Isolation Forest

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.datasets import make_blobs


In [2]:

# Generate synthetic data with some outliers
n_samples = 100
n_features = 2
n_outliers = 10  # 10% outliers
X, y = make_blobs(n_samples=n_samples - n_outliers, n_features=n_features, centers=2, random_state=42)
X_outliers = np.random.uniform(low=-5, high=5, size=(n_outliers, n_features))  # Create outliers
X = np.concatenate([X, X_outliers])


In [3]:

# Fit Isolation Forest with specified contamination
contamination = 0.1  # 10% contamination
model = IsolationForest(contamination=contamination, random_state=42)
model.fit(X)

# Get anomaly scores (s values)
anomaly_scores = model.decision_function(X) # Note: this is NOT the s function, but the decision function
s_values = -model.score_samples(X) # This is the anomaly score, higher is more anomalous

# Predict anomalies (-1) and inliers (1)
predictions = model.predict(X)


In [7]:

# Count the number of predicted anomalies
n_predicted_anomalies = np.sum(predictions == -1)

# Verify the number of predicted anomalies is close to the expected number
print(f"Expected number of anomalies: {n_outliers}")
print(f"Predicted number of anomalies: {n_predicted_anomalies}")

# Demonstrate relationship between anomaly scores, decision function, and predictions
df_results = pd.DataFrame({'anomaly_score': s_values, 'decision_function': anomaly_scores, 'prediction': predictions})

# Sort by anomaly score to see how threshold works
df_results_sorted = df_results.sort_values('anomaly_score', ascending=False)

# Demonstrate threshold
threshold = np.quantile(s_values, contamination) # Threshold for anomaly scores
print(f"\nThreshold on anomaly scores: {threshold}")
# print(f"Anomaly scores above threshold ({threshold}):")
# print(s_values[s_values > threshold])


Expected number of anomalies: 10
Predicted number of anomalies: 10

Threshold on anomaly scores: 0.4108313614474001
