In [106]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns

# Seed NumPy's global RNG so the np.random.* draws in the cells below are
# repeatable from a fresh kernel.
np.random.seed(67676767)

In [None]:
def leverage_scores(X):
    """Return the leverage (hat-matrix diagonal) of every observation.

    A column of ones (intercept) is prepended to ``X`` to form the design
    matrix ``Xd``; the result is the diagonal of
    ``H = Xd (Xd^T Xd)^+ Xd^T`` where ``^+`` is the Moore-Penrose
    pseudo-inverse.

    Parameters
    ----------
    X : array-like, shape (n_samples,) or (n_samples, n_features)
        Predictor values without an intercept column. A 1-D input is
        treated as a single feature.

    Returns
    -------
    numpy.ndarray, shape (n_samples,)
        Leverage ``h_ii`` of each row.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    design = np.column_stack([np.ones(X.shape[0]), X])
    gram_pinv = np.linalg.pinv(design.T @ design)

    # h_ii = x_i^T (Xd^T Xd)^+ x_i, evaluated row by row. This avoids
    # materialising the full n x n hat matrix just to read its diagonal.
    return np.sum((design @ gram_pinv) * design, axis=1)

# Quick sanity check on five evenly spaced points.
test_x = np.arange(1, 6)
test_leverage = leverage_scores(test_x)
mean_test_leverage = np.mean(test_leverage)
print(f"Test pentru leverage scores: {test_leverage}")
print(f"leverage: {mean_test_leverage:.3f}")

In [None]:
# Ground-truth 1-D model: y = a*x + b + noise
a, b = 2.0, 1.0

# Number of points drawn per category
n_points = 25

# Noise variances swept in the experiments below
noise_variances = list((0.1, 0.5, 1.0, 2.0))

In [None]:
def generate_data_points(n_points, noise_var, point_type, slope=None, intercept=None):
    """Sample noisy points from the line ``y = slope * x + intercept + noise``.

    ``x`` is uniform on a range that depends on ``point_type`` and the noise
    is zero-mean Gaussian whose variance is ``noise_var`` times a
    category-specific multiplier:

    ============  ========  ===================
    point_type    x range   variance multiplier
    ============  ========  ===================
    regular       [0, 2)    1
    high_x_var    [-2, 4)   1
    high_y_var    [0, 2)    5
    high_both     [-2, 4)   5
    ============  ========  ===================

    Parameters
    ----------
    n_points : int
        Number of points to draw.
    noise_var : float
        Base noise variance.
    point_type : str
        One of the categories in the table above.
    slope, intercept : float, optional
        Line coefficients. When omitted they fall back to the notebook-level
        globals ``a`` and ``b`` respectively.

    Returns
    -------
    (x, y) : tuple of numpy.ndarray, each of shape (n_points,)

    Raises
    ------
    ValueError
        If ``point_type`` is not a known category.
    """
    # (x range, noise-variance multiplier) for every supported category
    settings = {
        "regular": ((0, 2), 1),
        "high_x_var": ((-2, 4), 1),
        "high_y_var": ((0, 2), 5),
        "high_both": ((-2, 4), 5),
    }
    # Validate before touching the RNG so a typo does not consume random state.
    if point_type not in settings:
        raise ValueError(
            f"Unknown point_type {point_type!r}; expected one of {sorted(settings)}"
        )
    (x_low, x_high), var_scale = settings[point_type]

    if slope is None:
        slope = a
    if intercept is None:
        intercept = b

    # Draw order (x first, then noise) is kept so seeded runs are unchanged.
    x = np.random.uniform(x_low, x_high, n_points)
    noise = np.random.normal(0, np.sqrt(noise_var * var_scale), n_points)

    y = slope * x + intercept + noise

    return x, y

# Preview: one draw of every point category at a single noise level.
test_noise = 0.5
x_reg, y_reg = generate_data_points(n_points, test_noise, "regular")
x_hx, y_hx = generate_data_points(n_points, test_noise, "high_x_var")
x_hy, y_hy = generate_data_points(n_points, test_noise, "high_y_var")
x_hb, y_hb = generate_data_points(n_points, test_noise, "high_both")

# Colours follow the per-category palette used by the leverage plots further
# down (high_x_var -> red, high_both -> orange) so a category keeps the same
# colour in every figure of the notebook.
plt.figure(figsize=(10, 8))
plt.scatter(x_reg, y_reg, label="Regular Points", color='blue')
plt.scatter(x_hx, y_hx, label="High X Variance", color='red')
plt.scatter(x_hy, y_hy, label="High Y Variance", color='green')
plt.scatter(x_hb, y_hb, label="High Both Variance", color='orange')
plt.xlabel("X")
plt.ylabel("Y")
plt.title("Data Points with Different Variance Characteristics")
plt.legend()
# Render explicitly so the cell does not end on the Legend object's repr.
plt.show()


In [None]:
# For every noise level and point category: draw a sample, compute its
# leverage values, and keep everything for the plots that follow.
point_types = ["regular", "high_x_var", "high_y_var", "high_both"]
results = {}

for noise_var in noise_variances:
    print(f"\nNoise variance: {noise_var}")
    per_type = {}

    for point_type in point_types:
        x, y = generate_data_points(n_points, noise_var, point_type)
        leverage = leverage_scores(x)
        max_lev, avg_lev = np.max(leverage), np.mean(leverage)

        per_type[point_type] = {
            'x': x,
            'y': y,
            'leverage': leverage,
            'max_leverage': max_lev,
            'avg_leverage': avg_lev
        }

        print(f"  {point_type:12}: avg leverage = {avg_lev:.3f}, max = {max_lev:.3f}")

    results[noise_var] = per_type


In [None]:
# Marker colour and legend text for each point category.
colors = {'regular': 'blue', 'high_x_var': 'red', 'high_y_var': 'green', 'high_both': 'orange'}
labels = {'regular': 'Regular', 'high_x_var': 'High X var', 'high_y_var': 'High Y var', 'high_both': 'High Both'}

# One figure per noise level: every category is drawn with marker area
# proportional to its leverage, the noiseless line is overlaid, and the five
# largest leverage values across all categories are ringed in black.
# NOTE(review): the stored leverages come from a separate leverage_scores()
# call per category, so this top-5 ranking compares values computed from
# different design matrices - confirm that is the intended comparison.
for noise_var in noise_variances:
    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    per_type = results[noise_var]

    for point_type in point_types:
        data = per_type[point_type]
        ax.scatter(data['x'], data['y'],
                   c=colors[point_type],
                   s=data['leverage'] * 300,  # scale up so size differences are visible
                   alpha=0.6,
                   label=labels[point_type])

    # Noiseless ground-truth line
    x_line = np.linspace(-3, 5, 100)
    ax.plot(x_line, a * x_line + b, 'k--', linewidth=2, alpha=0.8, label='True model')

    # Pool all categories and ring the five highest-leverage points
    all_x, all_y, all_leverage = (
        np.concatenate([per_type[pt][key] for pt in point_types])
        for key in ('x', 'y', 'leverage')
    )
    top_5_idx = np.argsort(all_leverage)[-5:]
    ax.scatter(all_x[top_5_idx], all_y[top_5_idx],
               s=100, facecolors='none', edgecolors='black', linewidth=2,
               label='Highest leverage')

    ax.set_title(f'Leverage Scores Analysis - Noise variance = {noise_var}',
                 fontsize=14, fontweight='bold')
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.grid(True, alpha=0.3)
    ax.legend(loc='best')

    plt.tight_layout()
    plt.show()
    # Release the figure; otherwise every loop iteration leaves one open.
    plt.close(fig)

In [None]:
# Ground-truth 2-D model: y = a_2d*x1 + b_2d*x2 + c_2d + noise
# (x1 coefficient, x2 coefficient, intercept)
a_2d, b_2d, c_2d = 1.5, 2.0, 0.5

def generate_2d_data(n_points, noise_var, point_type,
                     coef_x1=None, coef_x2=None, intercept=None):
    """Sample noisy points from ``y = coef_x1*x1 + coef_x2*x2 + intercept + noise``.

    Both features are drawn independently from the same uniform range, which
    depends on ``point_type``; the noise is zero-mean Gaussian with variance
    ``noise_var`` times a category-specific multiplier:

    ============  ========  ===================
    point_type    x range   variance multiplier
    ============  ========  ===================
    regular       [0, 2)    1
    high_x_var    [-2, 4)   1
    high_y_var    [0, 2)    5
    high_both     [-2, 4)   5
    ============  ========  ===================

    Parameters
    ----------
    n_points : int
        Number of points to draw.
    noise_var : float
        Base noise variance.
    point_type : str
        One of the categories in the table above.
    coef_x1, coef_x2, intercept : float, optional
        Model coefficients. When omitted they fall back to the notebook-level
        globals ``a_2d``, ``b_2d`` and ``c_2d`` respectively.

    Returns
    -------
    X : numpy.ndarray, shape (n_points, 2)
        Columns are ``x1`` and ``x2``.
    y : numpy.ndarray, shape (n_points,)

    Raises
    ------
    ValueError
        If ``point_type`` is not a known category.
    """
    # (feature range, noise-variance multiplier) for every supported category
    settings = {
        "regular": ((0, 2), 1),
        "high_x_var": ((-2, 4), 1),
        "high_y_var": ((0, 2), 5),
        "high_both": ((-2, 4), 5),
    }
    # Validate before touching the RNG so a typo does not consume random state.
    if point_type not in settings:
        raise ValueError(
            f"Unknown point_type {point_type!r}; expected one of {sorted(settings)}"
        )
    (low, high), var_scale = settings[point_type]

    if coef_x1 is None:
        coef_x1 = a_2d
    if coef_x2 is None:
        coef_x2 = b_2d
    if intercept is None:
        intercept = c_2d

    # Draw order (x1, x2, noise) is kept so seeded runs are unchanged.
    x1 = np.random.uniform(low, high, n_points)
    x2 = np.random.uniform(low, high, n_points)
    noise = np.random.normal(0, np.sqrt(noise_var * var_scale), n_points)

    y = coef_x1 * x1 + coef_x2 * x2 + intercept + noise

    return np.column_stack([x1, x2]), y

def calculate_2d_leverage(X):
    """Return the leverage (hat-matrix diagonal) of every row of ``X``.

    A column of ones (intercept) is prepended to ``X`` to form the design
    matrix ``Xd``; the result is the diagonal of
    ``H = Xd (Xd^T Xd)^+ Xd^T`` where ``^+`` is the Moore-Penrose
    pseudo-inverse. Despite the name, any number of feature columns works.

    Parameters
    ----------
    X : array-like, shape (n_samples,) or (n_samples, n_features)
        Predictor values without an intercept column. A 1-D input is
        treated as a single feature.

    Returns
    -------
    numpy.ndarray, shape (n_samples,)
        Leverage ``h_ii`` of each row.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    design = np.column_stack([np.ones(X.shape[0]), X])
    gram_pinv = np.linalg.pinv(design.T @ design)

    # h_ii = x_i^T (Xd^T Xd)^+ x_i, evaluated row by row. This avoids
    # materialising the full n x n hat matrix just to read its diagonal.
    return np.sum((design @ gram_pinv) * design, axis=1)

# Echo the ground-truth 2-D model. The noise term is the Greek letter
# epsilon, written as a Unicode escape so it no longer prints as a
# mis-decoded two-character sequence.
print(f"2D Model: y = {a_2d}*x1 + {b_2d}*x2 + {c_2d} + \u03b5")

In [None]:
# For every noise level and point category: draw a 2-D sample, compute its
# leverage values, and keep everything for the 3-D plots that follow.
results_2d = {}

for noise_var in noise_variances:
    print(f"\nNoise variance: {noise_var}")
    per_type_2d = {}

    for point_type in point_types:
        X, y = generate_2d_data(n_points, noise_var, point_type)
        leverage = calculate_2d_leverage(X)
        max_lev, avg_lev = np.max(leverage), np.mean(leverage)

        per_type_2d[point_type] = {
            'X': X,
            'y': y,
            'leverage': leverage,
            'max_leverage': max_lev,
            'avg_leverage': avg_lev
        }

        print(f"  {point_type:12}: avg leverage = {avg_lev:.3f}, max = {max_lev:.3f}")

    results_2d[noise_var] = per_type_2d

In [None]:
# 3-D view of the 2-D experiment, one figure per noise level: every category
# is drawn at (x1, x2, y) with marker area proportional to its leverage, and
# the five largest leverage values across all categories are ringed in black.
for noise_var in noise_variances:
    fig = plt.figure(figsize=(12, 10))
    ax = fig.add_subplot(111, projection='3d')
    per_type = results_2d[noise_var]

    for point_type in point_types:
        data = per_type[point_type]
        X = data['X']
        ax.scatter(X[:, 0], X[:, 1], data['y'],
                   c=colors[point_type],
                   s=data['leverage'] * 400,  # scale up so size differences are visible
                   alpha=0.6,
                   label=labels[point_type])

    # Pool all categories and ring the five highest-leverage points
    all_X = np.vstack([per_type[pt]['X'] for pt in point_types])
    all_y = np.concatenate([per_type[pt]['y'] for pt in point_types])
    all_leverage = np.concatenate([per_type[pt]['leverage'] for pt in point_types])

    top_5_idx = np.argsort(all_leverage)[-5:]
    ax.scatter(all_X[top_5_idx, 0], all_X[top_5_idx, 1], all_y[top_5_idx],
               s=150, facecolors='none', edgecolors='black', linewidth=3,
               label='Highest leverage')

    ax.set_title(f'2D Leverage Scores Analysis - Noise variance = {noise_var}',
                 fontsize=14, fontweight='bold')
    ax.set_xlabel('X1')
    ax.set_ylabel('X2')
    ax.set_zlabel('Y')
    ax.legend(loc='best')

    plt.tight_layout()
    plt.show()
    # Release the figure; otherwise every loop iteration leaves one open.
    plt.close(fig)

## Ex. 2

In [None]:
from pyod.utils.data import generate_data_clusters
from pyod.models.knn import KNN
from sklearn.metrics import balanced_accuracy_score

In [None]:
# Synthetic clustered data with injected outliers:
# 400 training rows, 200 test rows, 2 clusters, 2 features, 10% contamination.
cluster_data_kwargs = dict(
    n_train=400,
    n_test=200,
    n_clusters=2,
    n_features=2,
    contamination=0.1,
    random_state=42,
)
X_train, X_test, y_train, y_test = generate_data_clusters(**cluster_data_kwargs)

In [None]:
# Sweep the neighbourhood size of the KNN detector and record, for each k,
# the fitted model, its 0/1 predictions (0 = inlier, 1 = outlier) and the
# balanced accuracy on both splits.
n_neighbors_list = [3, 5, 10, 20, 30, 50]
knn_results = {}

for n_neighbors in n_neighbors_list:
    knn = KNN(n_neighbors=n_neighbors, contamination=0.1)
    knn.fit(X_train)

    # Training labels are read from the fitted detector; test labels are predicted.
    y_train_pred, y_test_pred = knn.labels_, knn.predict(X_test)

    train_acc = balanced_accuracy_score(y_train, y_train_pred)
    test_acc = balanced_accuracy_score(y_test, y_test_pred)

    knn_results[n_neighbors] = dict(zip(
        ('model', 'y_train_pred', 'y_test_pred', 'train_acc', 'test_acc'),
        (knn, y_train_pred, y_test_pred, train_acc, test_acc),
    ))

    print(f"n_neighbors={n_neighbors:2d} | Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}")

In [None]:
def _scatter_inliers_outliers(ax, X, flags, title):
    """Draw one panel: rows flagged 0 as blue dots, rows flagged 1 as red crosses."""
    inlier, outlier = flags == 0, flags == 1
    ax.scatter(X[inlier, 0], X[inlier, 1],
               c='blue', label='Inliers', alpha=0.6, s=30)
    ax.scatter(X[outlier, 0], X[outlier, 1],
               c='red', label='Outliers', alpha=0.8, s=50, marker='x')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.legend()
    ax.grid(True, alpha=0.3)


# For each n_neighbors value: a 2x2 grid with ground truth on the left and
# KNN predictions on the right, training data on top and test data below.
for n_neighbors in n_neighbors_list:
    result = knn_results[n_neighbors]
    y_train_pred = result['y_train_pred']
    y_test_pred = result['y_test_pred']
    train_acc = result['train_acc']
    test_acc = result['test_acc']

    fig, axes = plt.subplots(2, 2, figsize=(14, 12))

    panels = [
        (axes[0, 0], X_train, y_train, 'Ground Truth - Training Data'),
        (axes[0, 1], X_train, y_train_pred, f'Predicted - Training Data (Acc: {train_acc:.4f})'),
        (axes[1, 0], X_test, y_test, 'Ground Truth - Test Data'),
        (axes[1, 1], X_test, y_test_pred, f'Predicted - Test Data (Acc: {test_acc:.4f})'),
    ]
    for ax, X_panel, flags, title in panels:
        _scatter_inliers_outliers(ax, X_panel, flags, title)

    plt.suptitle(f'KNN Anomaly Detection (n_neighbors = {n_neighbors})',
                 fontsize=16, fontweight='bold')
    plt.tight_layout()

    print(f"Plot displayed for n_neighbors = {n_neighbors}")

    plt.show()
    # Release the figure; otherwise every loop iteration leaves one open.
    plt.close(fig)
    

## Ex. 3

In [None]:
from pyod.models.lof import LOF
from sklearn.datasets import make_blobs

In [None]:
# Two equally sized Gaussian blobs with different spread:
#   blob 1 centred at (-10, -10), std 2
#   blob 2 centred at (10, 10),   std 6
n_samples_per_cluster = 200
centers = [(-10, -10), (10, 10)]
cluster_std = [2, 6]

X_ex3, y_clusters = make_blobs(
    n_samples=[n_samples_per_cluster] * 2,
    centers=centers,
    cluster_std=cluster_std,
    n_features=2,
    random_state=42,
)

print(f"Cluster 1: center {centers[0]}, std = {cluster_std[0]}")
print(f"Cluster 2: center {centers[1]}, std = {cluster_std[1]}")

# Pick 7% of the rows at random (without replacement) to act as outliers.
np.random.seed(42)
n_rows_ex3 = len(X_ex3)
n_outliers = int(0.07 * n_rows_ex3)
outlier_indices = np.random.choice(n_rows_ex3, n_outliers, replace=False)

# Ground-truth labels: 0 = inlier, 1 = outlier
y_true_ex3 = np.zeros(n_rows_ex3)
y_true_ex3[outlier_indices] = 1

# Displace the chosen rows with zero-mean Gaussian noise (std 5 per coordinate).
outlier_shift = np.random.normal(0, 5, (n_outliers, 2))
X_ex3[outlier_indices] = X_ex3[outlier_indices] + outlier_shift

print(f"Number of outliers: {n_outliers} ({n_outliers/len(X_ex3):.1%})")

In [None]:
# Ground-truth view of the Ex. 3 data: inliers as blue dots, outliers as red crosses.
fig, ax = plt.subplots(figsize=(10, 8))
inlier_mask, outlier_mask = y_true_ex3 == 0, y_true_ex3 == 1
ax.scatter(X_ex3[inlier_mask, 0], X_ex3[inlier_mask, 1],
           c='blue', label='Inliers', alpha=0.6, s=30)
ax.scatter(X_ex3[outlier_mask, 0], X_ex3[outlier_mask, 1],
           c='red', label='Outliers', alpha=0.8, s=50, marker='x')
ax.set_title('Generated Data with Different Cluster Densities', fontsize=14, fontweight='bold')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

print("Notice: Left cluster is denser (std=2), right cluster is sparser (std=6)")

In [None]:
# Fit a KNN and a LOF detector on the same data for several neighbourhood
# sizes and record their training labels and balanced accuracy.
n_neighbors_list_ex3 = [5, 10, 20, 30, 50]
comparison_results = {}

for n_neighbors in n_neighbors_list_ex3:
    knn_model = KNN(n_neighbors=n_neighbors, contamination=0.07)
    lof_model = LOF(n_neighbors=n_neighbors, contamination=0.07)
    knn_model.fit(X_ex3)
    lof_model.fit(X_ex3)

    # 0/1 labels assigned to the fitted data by each detector
    y_pred_knn, y_pred_lof = knn_model.labels_, lof_model.labels_

    knn_acc = balanced_accuracy_score(y_true_ex3, y_pred_knn)
    lof_acc = balanced_accuracy_score(y_true_ex3, y_pred_lof)

    comparison_results[n_neighbors] = dict(zip(
        ('knn_model', 'lof_model', 'y_pred_knn', 'y_pred_lof', 'knn_acc', 'lof_acc'),
        (knn_model, lof_model, y_pred_knn, y_pred_lof, knn_acc, lof_acc),
    ))

    print(f"n_neighbors={n_neighbors:2d} | KNN Acc: {knn_acc:.4f} | LOF Acc: {lof_acc:.4f}")

In [None]:
# Side-by-side view of the KNN (left) and LOF (right) predictions for every
# n_neighbors value: predicted inliers as blue dots, outliers as red crosses.
for n_neighbors in n_neighbors_list_ex3:
    result = comparison_results[n_neighbors]

    fig, panel_axes = plt.subplots(1, 2, figsize=(16, 6))

    # Both panels are identical apart from the detector, so draw them in a loop.
    for panel_ax, method in zip(panel_axes, ('knn', 'lof')):
        flags = result[f'y_pred_{method}']
        acc = result[f'{method}_acc']

        panel_ax.scatter(X_ex3[flags == 0, 0], X_ex3[flags == 0, 1],
                         c='blue', label='Inliers', alpha=0.6, s=30)
        panel_ax.scatter(X_ex3[flags == 1, 0], X_ex3[flags == 1, 1],
                         c='red', label='Outliers', alpha=0.8, s=50, marker='x')
        panel_ax.set_title(f'{method.upper()} (n_neighbors={n_neighbors}, Acc: {acc:.4f})',
                           fontsize=14, fontweight='bold')
        panel_ax.set_xlabel('Feature 1')
        panel_ax.set_ylabel('Feature 2')
        panel_ax.legend()
        panel_ax.grid(True, alpha=0.3)

    plt.suptitle('KNN vs LOF Comparison - Different Cluster Densities',
                 fontsize=16, fontweight='bold')
    plt.tight_layout()
    print(f"Plot displayed for n_neighbors = {n_neighbors}")

    plt.show()
    # Release the figure; otherwise every loop iteration leaves one open.
    plt.close(fig)
    

In [None]:
# Balanced accuracy of KNN and LOF as a function of the neighbourhood size.
knn_accs = []
lof_accs = []
for k in n_neighbors_list_ex3:
    knn_accs.append(comparison_results[k]['knn_acc'])
    lof_accs.append(comparison_results[k]['lof_acc'])

fig, ax = plt.subplots(figsize=(12, 6))

line_style = dict(linewidth=2, markersize=10)
ax.plot(n_neighbors_list_ex3, knn_accs, 'o-', label='KNN', color='blue', **line_style)
ax.plot(n_neighbors_list_ex3, lof_accs, 's-', label='LOF', color='green', **line_style)

ax.set_xlabel('Number of Neighbors (k)', fontsize=12)
ax.set_ylabel('Balanced Accuracy', fontsize=12)
ax.set_title('KNN vs LOF Performance on Different Density Clusters',
             fontsize=14, fontweight='bold')
ax.set_xticks(n_neighbors_list_ex3)
ax.grid(True, alpha=0.3)
ax.legend(fontsize=11)

fig.tight_layout()
plt.show()

## Ex. 4

In [None]:
from scipy.io import loadmat
from sklearn.model_selection import train_test_split
from pyod.utils.utility import standardizer
from pyod.models.combination import average, maximization

In [None]:
# Load the cardio dataset from a MATLAB file: 'X' holds the samples,
# 'y' the labels (flattened to 1-D; their sum is reported as the outlier count).
data = loadmat('cardio.mat')
X, y = data['X'], data['y'].ravel()

outlier_count = np.sum(y)

print(f"Dataset shape: {X.shape}")
print(f"Number of features: {X.shape[1]}")
print(f"Number of samples: {X.shape[0]}")
print(f"Number of outliers: {outlier_count}")
print(f"Contamination rate: {outlier_count / len(y):.4f}")

In [None]:
# 60/40 train/test split, stratified on the label, with a fixed seed.
cardio_split = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
X_train_cardio, X_test_cardio, y_train_cardio, y_test_cardio = cardio_split

print(f"Training set: {X_train_cardio.shape}")
print(f"Test set: {X_test_cardio.shape}")
print(f"Train outliers: {np.sum(y_train_cardio)} / {len(y_train_cardio)}")
print(f"Test outliers: {np.sum(y_test_cardio)} / {len(y_test_cardio)}")

In [None]:
# Standardize the features; the printout checks the first three training
# columns (expected: mean close to 0, std close to 1).
cardio_norm = standardizer(X_train_cardio, X_test_cardio)
X_train_norm, X_test_norm = cardio_norm

train_col_mean = np.mean(X_train_norm, axis=0)
train_col_std = np.std(X_train_norm, axis=0)
print(f"Train mean: {train_col_mean[:3]}")
print(f"Train std: {train_col_std[:3]}")

In [None]:
# Contamination passed to the detectors = share of outliers in the training labels.
contamination_rate = np.sum(y_train_cardio) / len(y_train_cardio)
print(f"Contamination rate: {contamination_rate:.4f}")

# Ten neighbourhood sizes: 30, 40, ..., 120
n_neighbors_range = range(30, 121, 10)
n_models = len(n_neighbors_range)

print(f"\nTraining {n_models} KNN models with n_neighbors from 30 to 120...")

# Fit one KNN detector per neighbourhood size.
knn_models = []
for n_neighbors in n_neighbors_range:
    knn_model = KNN(n_neighbors=n_neighbors, contamination=contamination_rate)
    knn_model.fit(X_train_norm)
    knn_models.append(knn_model)

    print(f"  KNN with n_neighbors={n_neighbors:3d} trained")

# Raw anomaly scores (not labels), arranged as (n_samples x n_models):
# the list of per-model score vectors is (n_models x n_samples), hence the transpose.
train_scores_knn = np.array([m.decision_scores_ for m in knn_models]).T
test_scores_knn = np.array([m.decision_function(X_test_norm) for m in knn_models]).T

print(f"\nTrain scores shape: {train_scores_knn.shape}")
print(f"Test scores shape: {test_scores_knn.shape}")

In [None]:
# Same sweep as for KNN, now with LOF detectors (n_neighbors = 30, 40, ..., 120).
print(f"Training {n_models} LOF models with n_neighbors from 30 to 120...")

# Fit one LOF detector per neighbourhood size.
lof_models = []
for n_neighbors in n_neighbors_range:
    lof_model = LOF(n_neighbors=n_neighbors, contamination=contamination_rate)
    lof_model.fit(X_train_norm)
    lof_models.append(lof_model)

    print(f"  LOF with n_neighbors={n_neighbors:3d} trained")

# Raw anomaly scores (not labels), arranged as (n_samples x n_models):
# the list of per-model score vectors is (n_models x n_samples), hence the transpose.
train_scores_lof = np.array([m.decision_scores_ for m in lof_models]).T
test_scores_lof = np.array([m.decision_function(X_test_norm) for m in lof_models]).T

print(f"\nTrain scores shape: {train_scores_lof.shape}")
print(f"Test scores shape: {test_scores_lof.shape}")

In [None]:
# Standardize the per-model score matrices (train and test together per
# detector family) before the scores are combined across models.
knn_scores_norm = standardizer(train_scores_knn, test_scores_knn)
lof_scores_norm = standardizer(train_scores_lof, test_scores_lof)
train_scores_knn_norm, test_scores_knn_norm = knn_scores_norm
train_scores_lof_norm, test_scores_lof_norm = lof_scores_norm
print("gata")

In [None]:
# Combine the standardized KNN scores across the models in two ways:
# "average" and "maximization" (pyod.models.combination).
train_avg_knn, train_max_knn = (
    average(train_scores_knn_norm),
    maximization(train_scores_knn_norm),
)
test_avg_knn, test_max_knn = (
    average(test_scores_knn_norm),
    maximization(test_scores_knn_norm),
)

print(f"Average strategy - Train shape: {train_avg_knn.shape}, Test shape: {test_avg_knn.shape}")
print(f"Max strategy - Train shape: {train_max_knn.shape}, Test shape: {test_max_knn.shape}")

In [None]:
# Combine the standardized LOF scores across the models in two ways:
# "average" and "maximization" (pyod.models.combination).
train_avg_lof, train_max_lof = (
    average(train_scores_lof_norm),
    maximization(train_scores_lof_norm),
)
test_avg_lof, test_max_lof = (
    average(test_scores_lof_norm),
    maximization(test_scores_lof_norm),
)

print(f"Average strategy - Train shape: {train_avg_lof.shape}, Test shape: {test_avg_lof.shape}")
print(f"Max strategy - Train shape: {train_max_lof.shape}, Test shape: {test_max_lof.shape}")

In [None]:
# Decision thresholds: the (1 - contamination) quantile of each combined
# score vector, computed separately for the train and the test scores.
# NOTE(review): the test thresholds are quantiles of the test scores
# themselves, i.e. the cut-off applied to the test set is derived from the
# test set - confirm this is intended rather than reusing the train thresholds.
inlier_quantile = 1 - contamination_rate

# KNN ensemble - average / max
threshold_train_avg_knn = np.quantile(train_avg_knn, inlier_quantile)
threshold_test_avg_knn = np.quantile(test_avg_knn, inlier_quantile)
threshold_train_max_knn = np.quantile(train_max_knn, inlier_quantile)
threshold_test_max_knn = np.quantile(test_max_knn, inlier_quantile)

# LOF ensemble - average / max
threshold_train_avg_lof = np.quantile(train_avg_lof, inlier_quantile)
threshold_test_avg_lof = np.quantile(test_avg_lof, inlier_quantile)
threshold_train_max_lof = np.quantile(train_max_lof, inlier_quantile)
threshold_test_max_lof = np.quantile(test_max_lof, inlier_quantile)

print("Thresholds calculated using contamination rate!")
print(f"Contamination rate: {contamination_rate:.4f}")

print()
print(f"KNN - Average: Train threshold = {threshold_train_avg_knn:.4f}, Test threshold = {threshold_test_avg_knn:.4f}")
print(f"KNN - Max: Train threshold = {threshold_train_max_knn:.4f}, Test threshold = {threshold_test_max_knn:.4f}")
print(f"LOF - Average: Train threshold = {threshold_train_avg_lof:.4f}, Test threshold = {threshold_test_avg_lof:.4f}")
print(f"LOF - Max: Train threshold = {threshold_train_max_lof:.4f}, Test threshold = {threshold_test_max_lof:.4f}")

In [None]:
# Turn combined scores into 0/1 labels: strictly above the threshold -> 1
# (outlier), otherwise 0 (inlier).

# KNN ensemble
y_train_pred_avg_knn = np.greater(train_avg_knn, threshold_train_avg_knn).astype(int)
y_test_pred_avg_knn = np.greater(test_avg_knn, threshold_test_avg_knn).astype(int)
y_train_pred_max_knn = np.greater(train_max_knn, threshold_train_max_knn).astype(int)
y_test_pred_max_knn = np.greater(test_max_knn, threshold_test_max_knn).astype(int)

# LOF ensemble
y_train_pred_avg_lof = np.greater(train_avg_lof, threshold_train_avg_lof).astype(int)
y_test_pred_avg_lof = np.greater(test_avg_lof, threshold_test_avg_lof).astype(int)
y_train_pred_max_lof = np.greater(train_max_lof, threshold_train_max_lof).astype(int)
y_test_pred_max_lof = np.greater(test_max_lof, threshold_test_max_lof).astype(int)

In [None]:
# Balanced accuracy of the four ensemble strategies on train and test,
# printed as a table, followed by the strategy with the best test score.

# KNN ensemble
ba_train_avg_knn, ba_test_avg_knn = (
    balanced_accuracy_score(y_train_cardio, y_train_pred_avg_knn),
    balanced_accuracy_score(y_test_cardio, y_test_pred_avg_knn),
)
ba_train_max_knn, ba_test_max_knn = (
    balanced_accuracy_score(y_train_cardio, y_train_pred_max_knn),
    balanced_accuracy_score(y_test_cardio, y_test_pred_max_knn),
)

# LOF ensemble
ba_train_avg_lof, ba_test_avg_lof = (
    balanced_accuracy_score(y_train_cardio, y_train_pred_avg_lof),
    balanced_accuracy_score(y_test_cardio, y_test_pred_avg_lof),
)
ba_train_max_lof, ba_test_max_lof = (
    balanced_accuracy_score(y_train_cardio, y_train_pred_max_lof),
    balanced_accuracy_score(y_test_cardio, y_test_pred_max_lof),
)

print(f"{'Strategy':<20} {'Train BA':<15} {'Test BA':<15}")
print(f"{'KNN - Average':<20} {ba_train_avg_knn:<15.4f} {ba_test_avg_knn:<15.4f}")
print(f"{'KNN - Maximization':<20} {ba_train_max_knn:<15.4f} {ba_test_max_knn:<15.4f}")
print(f"{'LOF - Average':<20} {ba_train_avg_lof:<15.4f} {ba_test_avg_lof:<15.4f}")
print(f"{'LOF - Maximization':<20} {ba_train_max_lof:<15.4f} {ba_test_max_lof:<15.4f}")

# Test-set score per strategy; on ties the first entry listed wins.
strategies = dict([
    ('KNN - Average', ba_test_avg_knn),
    ('KNN - Maximization', ba_test_max_knn),
    ('LOF - Average', ba_test_avg_lof),
    ('LOF - Maximization', ba_test_max_lof),
])

best_strategy, best_score = max(strategies.items(), key=lambda item: item[1])

print()
print(f"Best strategy: {best_strategy} with Test BA = {best_score:.4f}")

In [None]:
# Train vs. test balanced accuracy for the four ensemble strategies.
strategies_list = ['KNN\nAverage', 'KNN\nMax', 'LOF\nAverage', 'LOF\nMax']
train_scores_list = [ba_train_avg_knn, ba_train_max_knn, ba_train_avg_lof, ba_train_max_lof]
test_scores_list = [ba_test_avg_knn, ba_test_max_knn, ba_test_avg_lof, ba_test_max_lof]

fig, ax = plt.subplots()
marker_style = dict(linewidth=2, markersize=10)
ax.plot(strategies_list, train_scores_list, 'o-', label='Train', color='blue', **marker_style)
ax.plot(strategies_list, test_scores_list, 's-', label='Test', color='red', **marker_style)
ax.set_xlabel('Strategy', fontsize=12)
ax.set_ylabel('Balanced Accuracy', fontsize=12)
ax.set_title('Performance Trends', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)
ax.set_ylim([0.5, 1.0])

plt.show()

In [None]:
# Compare a single detector of each kind against the ensembles.
# The baseline is the model at the middle index of the n_neighbors sweep
# (index len // 2; for the 30..120 step-10 sweep that is n_neighbors = 80).
middle_idx = len(knn_models) // 2
middle_n = list(n_neighbors_range)[middle_idx]
print(f"Single-model baseline uses n_neighbors = {middle_n}")

# Training labels are read from the fitted `labels_` attribute instead of
# calling predict() on the training data: predict() scores the rows as if
# they were unseen, so each training row can match itself as a neighbour and
# the labels may differ from those assigned during fit. This also matches how
# the earlier KNN/LOF cells obtain their training predictions.
y_train_pred_single_knn = knn_models[middle_idx].labels_
y_test_pred_single_knn = knn_models[middle_idx].predict(X_test_norm)

y_train_pred_single_lof = lof_models[middle_idx].labels_
y_test_pred_single_lof = lof_models[middle_idx].predict(X_test_norm)

# Balanced accuracy of the single models
ba_train_single_knn = balanced_accuracy_score(y_train_cardio, y_train_pred_single_knn)
ba_test_single_knn = balanced_accuracy_score(y_test_cardio, y_test_pred_single_knn)

ba_train_single_lof = balanced_accuracy_score(y_train_cardio, y_train_pred_single_lof)
ba_test_single_lof = balanced_accuracy_score(y_test_cardio, y_test_pred_single_lof)

print(f"{'Model':<25} {'Train BA':<15} {'Test BA':<15}")
print(f"{'KNN - Single':<25} {ba_train_single_knn:<15.4f} {ba_test_single_knn:<15.4f}")
print(f"{'KNN - Ensemble (Avg)':<25} {ba_train_avg_knn:<15.4f} {ba_test_avg_knn:<15.4f}")
print(f"{'KNN - Ensemble (Max)':<25} {ba_train_max_knn:<15.4f} {ba_test_max_knn:<15.4f}")
print(f"{'LOF - Single':<25} {ba_train_single_lof:<15.4f} {ba_test_single_lof:<15.4f}")
print(f"{'LOF - Ensemble (Avg)':<25} {ba_train_avg_lof:<15.4f} {ba_test_avg_lof:<15.4f}")
print(f"{'LOF - Ensemble (Max)':<25} {ba_train_max_lof:<15.4f} {ba_test_max_lof:<15.4f}")

# Test-set gain (positive) or loss (negative) of each ensemble over its single model
knn_improvement_avg = ba_test_avg_knn - ba_test_single_knn
knn_improvement_max = ba_test_max_knn - ba_test_single_knn
lof_improvement_avg = ba_test_avg_lof - ba_test_single_lof
lof_improvement_max = ba_test_max_lof - ba_test_single_lof

print(f"\nImprovement over single model (Test BA):")
print(f"  KNN - Average: {knn_improvement_avg:+.4f}")
print(f"  KNN - Max: {knn_improvement_max:+.4f}")
print(f"  LOF - Average: {lof_improvement_avg:+.4f}")
print(f"  LOF - Max: {lof_improvement_max:+.4f}")