# Ensemble Anomaly Detection with KNN/LOF

This notebook implements an ensemble approach using multiple KNN or LOF models with different n_neighbors parameters, combining their predictions using average and maximization strategies.

In [2]:
# Import required libraries
import numpy as np
import scipy.io
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.utils.utility import standardizer
from pyod.models.combination import average, maximization

In [3]:
# Load the cardio.mat dataset
data = scipy.io.loadmat('cardio.mat')

# Extract features and labels
X = data['X']
y = data['y'].ravel()

print(f"Dataset shape: {X.shape}")
print(f"Number of samples: {len(X)}")
print(f"Number of features: {X.shape[1]}")
print(f"Number of anomalies: {np.sum(y == 1)}")
print(f"Number of normal samples: {np.sum(y == 0)}")
print(f"Contamination rate: {np.sum(y == 1) / len(y):.4f}")

Dataset shape: (1831, 21)
Number of samples: 1831
Number of features: 21
Number of anomalies: 176
Number of normal samples: 1655
Contamination rate: 0.0961


In [None]:
# Split the data into train and test sets (stratify on y to keep the ratio of anomalies)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Training anomalies: {np.sum(y_train == 1)}")
print(f"Test anomalies: {np.sum(y_test == 1)}")

Training set size: 1281
Test set size: 550
Training anomalies: 123
Test anomalies: 53


In [5]:
# Normalize the data
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

print("Data normalized successfully")
print(f"Training set mean: {X_train_normalized.mean():.6f}")
print(f"Training set std: {X_train_normalized.std():.6f}")

Data normalized successfully
Training set mean: -0.000000
Training set std: 1.000000


In [6]:
# Calculate contamination rate for the dataset
contamination = np.sum(y_train == 1) / len(y_train)
print(f"Contamination rate: {contamination:.4f}")

Contamination rate: 0.0960


In [None]:
# Define n_neighbors range (30 to 120 with step of 10)
n_neighbors_range = range(30, 121, 10)
n_models = len(n_neighbors_range)

print(f"Creating {n_models} KNN models")
print(f"n_neighbors values: {list(n_neighbors_range)}")

# Initialize storage for scores
train_scores_list = []
test_scores_list = []

# Train each KNN model
for i, n_neighbors in enumerate(n_neighbors_range):
    print(f"\n--- Model {i+1}/{n_models}: KNN with n_neighbors={n_neighbors} ---")
    
    # Create and fit the model
    knn = KNN(n_neighbors=n_neighbors, contamination=contamination)
    knn.fit(X_train_normalized)
    
    y_train_pred = knn.predict(X_train_normalized)
    y_test_pred = knn.predict(X_test_normalized)
    
    train_ba = balanced_accuracy_score(y_train, y_train_pred)
    test_ba = balanced_accuracy_score(y_test, y_test_pred)
    
    print(f"Train BA: {train_ba:.4f}")
    print(f"Test BA: {test_ba:.4f}")
    
    # Store the decision scores
    train_scores = knn.decision_function(X_train_normalized)
    test_scores = knn.decision_function(X_test_normalized)
    
    train_scores_list.append(train_scores)
    test_scores_list.append(test_scores)

print("All models trained successfully")

Creating 10 KNN models
n_neighbors values: [30, 40, 50, 60, 70, 80, 90, 100, 110, 120]

--- Model 1/10: KNN with n_neighbors=30 ---
Train BA: 0.6968
Test BA: 0.6740

--- Model 2/10: KNN with n_neighbors=40 ---
Train BA: 0.7090
Test BA: 0.6864

--- Model 3/10: KNN with n_neighbors=50 ---
Train BA: 0.7176
Test BA: 0.7147

--- Model 4/10: KNN with n_neighbors=60 ---
Train BA: 0.7176
Test BA: 0.7242

--- Model 5/10: KNN with n_neighbors=70 ---
Train BA: 0.7266
Test BA: 0.7430

--- Model 6/10: KNN with n_neighbors=80 ---
Train BA: 0.7441
Test BA: 0.7525

--- Model 7/10: KNN with n_neighbors=90 ---
Train BA: 0.7486
Test BA: 0.7525

--- Model 8/10: KNN with n_neighbors=100 ---
Train BA: 0.7527
Test BA: 0.7525

--- Model 9/10: KNN with n_neighbors=110 ---
Train BA: 0.7486
Test BA: 0.7535

--- Model 10/10: KNN with n_neighbors=120 ---
Train BA: 0.7527
Test BA: 0.7639
All models trained successfully


In [8]:
# Convert lists to numpy arrays for easier manipulation
train_scores_array = np.array(train_scores_list)
test_scores_array = np.array(test_scores_list)

print(f"Train scores array shape: {train_scores_array.shape}")
print(f"Test scores array shape: {test_scores_array.shape}")

Train scores array shape: (10, 1281)
Test scores array shape: (10, 550)


In [9]:
# Normalize the scores using standardizer
# Standardize all train scores together (across all models (n_samples, n_models))
train_scores_normalized = standardizer(train_scores_array.T)
test_scores_normalized = standardizer(test_scores_array.T)

print("Scores normalized using standardizer")
print(f"Normalized train scores shape: {train_scores_normalized.shape}")
print(f"Normalized test scores shape: {test_scores_normalized.shape}")

Scores normalized using standardizer
Normalized train scores shape: (1281, 10)
Normalized test scores shape: (550, 10)


In [10]:
# Combine scores using average ( Not sure if it's all correct because i had to transpose )
train_scores_avg = average(train_scores_normalized)
test_scores_avg = average(test_scores_normalized)

print("Average combination applied")
print(f"Average train scores shape: {train_scores_avg.shape}")
print(f"Average test scores shape: {test_scores_avg.shape}")

Average combination applied
Average train scores shape: (1281,)
Average test scores shape: (550,)


In [11]:
# Find threshold using quantile with contamination
threshold_avg = np.quantile(train_scores_avg, 1 - contamination)
print(f"Average threshold: {threshold_avg:.6f}")

# Make predictions
y_train_pred_avg = (train_scores_avg > threshold_avg).astype(int)
y_test_pred_avg = (test_scores_avg > threshold_avg).astype(int)

# Calculate balanced accuracy
train_ba_avg = balanced_accuracy_score(y_train, y_train_pred_avg)
test_ba_avg = balanced_accuracy_score(y_test, y_test_pred_avg)

print("Average Strategy Results ")
print(f"Train Balanced Accuracy: {train_ba_avg:.4f}")
print(f"Test Balanced Accuracy: {test_ba_avg:.4f}")

Average threshold: 0.941179
Average Strategy Results 
Train Balanced Accuracy: 0.7257
Test Balanced Accuracy: 0.6925


In [12]:
# Combine scores using maximization strategy
train_scores_max = maximization(train_scores_normalized)
test_scores_max = maximization(test_scores_normalized)

print("Maximization combination applied")
print(f"Maximization train scores shape: {train_scores_max.shape}")
print(f"Maximization test scores shape: {test_scores_max.shape}")

Maximization combination applied
Maximization train scores shape: (1281,)
Maximization test scores shape: (550,)


In [13]:
# Find threshold using quantile with contamination rate
threshold_max = np.quantile(train_scores_max, 1 - contamination)
print(f"Maximization strategy threshold: {threshold_max:.6f}")

# Make predictions
y_train_pred_max = (train_scores_max > threshold_max).astype(int)
y_test_pred_max = (test_scores_max > threshold_max).astype(int)

# Calculate balanced accuracy
train_ba_max = balanced_accuracy_score(y_train, y_train_pred_max)
test_ba_max = balanced_accuracy_score(y_test, y_test_pred_max)

print(f"Maximization Strategy Results")
print(f"Train Balanced Accuracy: {train_ba_max:.4f}")
print(f"Test Balanced Accuracy: {test_ba_max:.4f}")

Maximization strategy threshold: 1.041882
Maximization Strategy Results
Train Balanced Accuracy: 0.7482
Test Balanced Accuracy: 0.7113


In [14]:
# Summary of results w ith formatted output
print("="*60)
print("ENSEMBLE RESULTS SUMMARY")
print("="*60)
print(f"\nDataset: cardio.mat")
print(f"Contamination rate: {contamination:.4f}")
print(f"Number of models: {n_models}")
print(f"n_neighbors range: {list(n_neighbors_range)}")
print(f"\n{'Strategy':<20} {'Train BA':<15} {'Test BA':<15}")
print("-"*60)
print(f"{'Average':<20} {train_ba_avg:<15.4f} {test_ba_avg:<15.4f}")
print(f"{'Maximization':<20} {train_ba_max:<15.4f} {test_ba_max:<15.4f}")
print("="*60)

# Determine which strategy performs better
if test_ba_avg > test_ba_max:
    print(f"\n✓ Average strategy performs better on test set (+{test_ba_avg - test_ba_max:.4f})")
elif test_ba_max > test_ba_avg:
    print(f"\n✓ Maximization strategy performs better on test set (+{test_ba_max - test_ba_avg:.4f})")
else:
    print(f"\n✓ Both strategies perform equally on test set")

ENSEMBLE RESULTS SUMMARY

Dataset: cardio.mat
Contamination rate: 0.0960
Number of models: 10
n_neighbors range: [30, 40, 50, 60, 70, 80, 90, 100, 110, 120]

Strategy             Train BA        Test BA        
------------------------------------------------------------
Average              0.7257          0.6925         
Maximization         0.7482          0.7113         

✓ Maximization strategy performs better on test set (+0.0189)
