## Project Overview: Synthetic Document Anomaly Detection

This notebook explores unsupervised anomaly detection techniques applied to synthetic document data. The goal is to identify patterns that deviate from the expected structure, simulating use cases such as document forgery detection.

Key aspects:
- Implements Isolation Forest, One-Class SVM, CBLOF, LOF, KNN and DBSCAN methods.
- Evaluates model similarity using Jaccard Index across binary prediction sets.
- Combines model outputs using:
  - **Majority voting**: flags a record as anomalous when at least a configurable proportion of the models flag it.
  - **Weighted ensembling**: uses model-specific weights to emphasize certain predictors.
- Enables deeper analysis of model agreement and decision diversity.

These ensemble methods provide a stronger foundation for high-stakes applications where **redundancy and robustness** are critical to reliability.

In [3]:
!pip install pyod



In [4]:
!pip install pyodbc



In [5]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from pyod.models.iforest import IForest
from pyod.models.cblof import CBLOF
from sklearn.svm import OneClassSVM
from pyod.models.lof import LOF
from pyod.models.knn import KNN
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances
import logging

In [20]:
# Remove pandas' cap on displayed rows so DataFrames are rendered in full
# rather than truncated.
pd.options.display.max_rows = None

In [25]:
def connect(server, db):
    """Open an autocommit ODBC connection to a SQL Server database.

    The connection string uses ``Trusted_Connection=yes``, so no user name or
    password is passed by this function.

    Args:
        server: Server name inserted into the connection string.
        db: Database name inserted into the connection string.

    Returns:
        The connection object returned by ``pyodbc.connect``.

    Raises:
        ImportError: If ``pyodbc`` is not installed in the kernel environment.
    """
    # pyodbc is not imported in the notebook's import cell, so the original code
    # failed with NameError. Importing here also keeps the rest of the notebook
    # runnable on machines without an ODBC driver.
    import pyodbc

    print(f'You are connecting to: {server}/{db}')
    conn = pyodbc.connect(f'Driver={{SQL Server}};'
                          f'Server={server};'
                          f'Database={db};'
                          'Trusted_Connection=yes;',
                          autocommit=True)
    print('Connection Successful!\n')
    return conn

def load_data(conn, schema, table):
    """Read every row and column of ``schema.table`` into a DataFrame.

    Args:
        conn: Open database connection handed to ``pandas.read_sql``.
        schema: Schema name; interpolated directly into the SQL text.
        table: Table name; interpolated directly into the SQL text.

    Returns:
        A ``(frame, columns)`` tuple: the loaded DataFrame and its column index.

    Note:
        ``schema`` and ``table`` are placed in the query string as-is, so only
        trusted identifiers should be passed.
    """
    print('Loading data')
    query = f"SELECT * FROM {schema}.{table}"
    frame = pd.read_sql(query, conn)
    print('Data loaded successfully')
    return frame, frame.columns

def preprocess_data(df, apply_pca=False, pca_variance=0.95):
    """Impute, scale and one-hot encode ``df``, optionally followed by PCA.

    Numeric columns are mean-imputed and standardised. Columns of dtype
    ``object`` or ``category`` are imputed with the most frequent value and
    one-hot encoded (first level dropped, unknown levels ignored). Columns of
    any other dtype are not routed to either pipeline.

    Args:
        df: Input DataFrame.
        apply_pca: When True, run PCA on the transformed features.
        pca_variance: Passed to ``PCA`` as ``n_components``; with a float in
            (0, 1) scikit-learn keeps enough components to explain that
            fraction of the variance.

    Returns:
        A new DataFrame built from the transformed array. The original column
        names and index are not carried over.
    """
    num_features = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_features = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # One sub-pipeline per column family, combined by the ColumnTransformer.
    num_steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
    cat_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
    ]
    transformer = ColumnTransformer([
        ('num', Pipeline(num_steps), num_features),
        ('cat', Pipeline(cat_steps), cat_features),
    ])

    logging.info("Fitting preprocessor")
    features = transformer.fit_transform(df)

    if not apply_pca:
        return pd.DataFrame(features)

    reducer = PCA(n_components=pca_variance)
    logging.info("Applying PCA")
    return pd.DataFrame(reducer.fit_transform(features))


def run_model(name, model, data):
    """Fit ``model`` on ``data`` and return its predictions for the same data.

    Args:
        name: Label used in the log message and as the name of the result.
        model: Object exposing ``fit(data)`` and ``predict(data)``.
        data: Samples used both for fitting and for prediction.

    Returns:
        A ``pandas.Series`` named ``name`` holding the model's predictions.
    """
    logging.info(f"Running model: {name}")
    # Fit and predict stay separate calls: fit() is not required to return the model.
    model.fit(data)
    return pd.Series(model.predict(data), name=name)


def run_all_models(data, contamination=0.1, random_state=0, ocsvm_nu=0.03,
                   dbscan_eps=0.5, dbscan_min_samples=5):
    """Fit six unsupervised detectors on ``data`` and collect their binary flags.

    Every column of the result uses the same convention: 1 = anomaly,
    0 = normal. OneClassSVM's -1 predictions and DBSCAN's noise label (-1) are
    mapped to 1.

    Args:
        data: Preprocessed feature matrix used for both fitting and prediction.
        contamination: Expected anomaly fraction passed to the PyOD detectors
            (IsolationForest, CBLOF, LOF, KNN). It is NOT used by OneClassSVM
            or DBSCAN. Assurant gadget claim average fraud rate is ca. 30%; the
            default is a conservative 10%.
        random_state: Seed passed to IsolationForest and CBLOF so repeated runs
            give the same flags.
        ocsvm_nu: ``nu`` parameter of OneClassSVM.
        dbscan_eps: ``eps`` parameter of DBSCAN.
        dbscan_min_samples: ``min_samples`` parameter of DBSCAN.

    Returns:
        A DataFrame with one 0/1 column per model, in the order
        IsolationForest, CBLOF, OneClassSVM, LOF, KNN, DBSCAN.

    NOTE(review): in this notebook's recorded output DBSCAN flags every
    displayed row with the default ``eps``; the value may be too small for the
    standardised features -- confirm and tune via ``dbscan_eps``.
    """
    models = {
        # Seeded so the ensemble is reproducible across kernel restarts.
        'IsolationForest': IForest(contamination=contamination, random_state=random_state),
        'CBLOF': CBLOF(contamination=contamination, check_estimator=False, random_state=random_state),
        'OneClassSVM': OneClassSVM(nu=ocsvm_nu, kernel="rbf", gamma='scale'),
        'LOF': LOF(contamination=contamination),
        'KNN': KNN(contamination=contamination),
        'DBSCAN': DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples),
    }

    flags = {}
    for name, model in models.items():
        if name == 'DBSCAN':
            # DBSCAN has no predict(); its noise label -1 is treated as an anomaly.
            cluster_labels = model.fit_predict(data)
            flags[name] = np.where(cluster_labels == -1, 1, 0)
        elif name == 'OneClassSVM':
            # scikit-learn's OneClassSVM predicts -1 for outliers and +1 for inliers.
            svm_labels = model.fit(data).predict(data)
            flags[name] = (svm_labels == -1).astype(int)
        else:
            # PyOD detectors already return 0/1 labels.
            flags[name] = run_model(name, model, data).to_numpy()

    # Build the frame once from plain arrays; the dict keeps the model order.
    return pd.DataFrame(flags)


def compute_jaccard_similarity(df_preds):
    """Compute the pairwise Jaccard index between prediction columns.

    For each pair of columns the index is the number of rows where both equal 1
    divided by the number of rows where at least one equals 1. A pair with no
    1s at all scores 0. Values are rounded to three decimals.

    Args:
        df_preds: DataFrame of binary predictions, one column per model.

    Returns:
        A float DataFrame indexed and labelled by the columns of ``df_preds``.
    """
    # Rows x models indicator matrix of "flagged" entries.
    flagged = (df_preds == 1).to_numpy(dtype=int)

    # both[i, j] = rows flagged by both models; either[i, j] = rows flagged by at least one.
    both = flagged.T @ flagged
    per_model = flagged.sum(axis=0)
    either = per_model[:, None] + per_model[None, :] - both

    # Leave 0 wherever the union is empty instead of dividing by zero.
    scores = np.divide(
        both,
        either,
        out=np.zeros(both.shape, dtype=float),
        where=either != 0,
    )
    return pd.DataFrame(
        np.round(scores, 3),
        index=df_preds.columns,
        columns=df_preds.columns,
    )

def majority_voting(predictions, threshold=0.5):
    """Flag a row as an anomaly when enough models vote for it.

    Args:
        predictions: DataFrame of 0/1 flags, one column per model.
        threshold: Fraction of models that must flag a row (inclusive).

    Returns:
        An integer Series: 1 where the row's vote count is at least
        ``threshold * number_of_models``, otherwise 0.
    """
    required_votes = threshold * predictions.shape[1]
    is_anomaly = predictions.sum(axis=1) >= required_votes
    return is_anomaly.astype(int)


def ensemble_predictions(predictions, strategy="majority", threshold=0.5, weights=None):
    """Combine per-model 0/1 flags into a single ensemble flag per row.

    Args:
        predictions: DataFrame of 0/1 flags, one column per model.
        strategy: ``"majority"`` delegates to ``majority_voting``;
            ``"weighted"`` uses a weighted vote.
        threshold: Fraction of the (weighted) vote a row must reach, inclusive.
        weights: One weight per model column, required for ``"weighted"``.
            Weights need not be normalised; the vote is compared against
            ``threshold * sum(weights)``.

    Returns:
        An integer Series with 1 for rows flagged by the ensemble, else 0.

    Raises:
        ValueError: If ``strategy`` is unknown, ``weights`` is missing or has
            the wrong length, or the weights do not sum to a positive value.
    """
    if strategy == "majority":
        return majority_voting(predictions, threshold)

    if strategy == "weighted":
        if weights is None or len(weights) != predictions.shape[1]:
            raise ValueError("Provide valid weights matching number of models.")

        total_weight = float(pd.Series(weights, dtype=float).sum())
        if not total_weight > 0:
            raise ValueError("Weights must sum to a positive value.")

        weighted_sum = (predictions * weights).sum(axis=1)
        # The cut-off scales with the total weight and honours `threshold`.
        # The old fixed 0.5 cut-off let a single vote flag a row whenever the
        # weights were not normalised. With weights summing to 1 and the
        # default threshold the result is unchanged.
        return (weighted_sum >= threshold * total_weight).astype(int)

    raise ValueError("Unknown strategy: choose 'majority' or 'weighted'.")

In [29]:
# Example usage: load the CSV, preprocess it, run every detector, combine the
# flags with majority voting, save the result and print agreement diagnostics.
# NOTE(review): folder_path is an absolute, machine-specific path; consider a configurable data directory.
folder_path = "/home/sagemaker-user/example-project/Anomaly Detection/"

df = pd.read_csv(f'{folder_path}dummy_image_metadata_with_anomalies.csv')
df_processed = preprocess_data(df, apply_pca=False)

# Assurant gadget claim average fraud rate ca. 30%. Set contamination to a conservative 10%.
predictions = run_all_models(df_processed, contamination=0.1)

# Combine predictions into a final ensemble prediction.
# With the six models built by run_all_models, threshold=0.6 means a row needs
# at least 4 votes (0.6 * 6 = 3.6) to be flagged.
ensemble = ensemble_predictions(predictions, strategy="majority", threshold=0.6)
predictions['Ensemble'] = ensemble

# Save to CSV for downstream usage (written to the current working directory,
# without the index) and print the first rows.
predictions.to_csv('anomaly_predictions.csv', index=False)
print(predictions.head())

# Model agreement diagnostics: pairwise Jaccard index between the individual
# models; the 'Ensemble' column is excluded because it is derived from them.
jaccard_matrix = compute_jaccard_similarity(predictions.drop(columns='Ensemble'))
print(jaccard_matrix)

   IsolationForest  CBLOF  OneClassSVM  LOF  KNN  DBSCAN  Ensemble
0                0      1            0    0    0       1         0
1                0      0            0    0    0       1         0
2                0      0            0    0    0       1         0
3                0      0            0    0    0       1         0
4                0      0            0    0    0       1         0
                 IsolationForest  CBLOF  OneClassSVM    LOF    KNN  DBSCAN
IsolationForest            1.000  0.250        0.125  0.212  0.241   0.100
CBLOF                      0.250  1.000        0.184  0.667  0.565   0.100
OneClassSVM                0.125  0.184        1.000  0.154  0.139   0.125
LOF                        0.212  0.667        0.154  1.000  0.800   0.100
KNN                        0.241  0.565        0.139  0.800  1.000   0.080
DBSCAN                     0.100  0.100        0.125  0.100  0.080   1.000


In [30]:
# Render the pairwise Jaccard similarity matrix computed in the example-usage cell.
jaccard_matrix

Unnamed: 0,IsolationForest,CBLOF,OneClassSVM,LOF,KNN,DBSCAN
IsolationForest,1.0,0.25,0.125,0.212,0.241,0.1
CBLOF,0.25,1.0,0.184,0.667,0.565,0.1
OneClassSVM,0.125,0.184,1.0,0.154,0.139,0.125
LOF,0.212,0.667,0.154,1.0,0.8,0.1
KNN,0.241,0.565,0.139,0.8,1.0,0.08
DBSCAN,0.1,0.1,0.125,0.1,0.08,1.0


In [31]:
# Render the per-model anomaly flags together with the 'Ensemble' column.
# NOTE(review): display.max_rows is set to None earlier in the notebook, so this
# renders every row; consider predictions.head() for large inputs.
predictions

Unnamed: 0,IsolationForest,CBLOF,OneClassSVM,LOF,KNN,DBSCAN,Ensemble
0,0,1,0,0,0,1,0
1,0,0,0,0,0,1,0
2,0,0,0,0,0,1,0
3,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0
5,0,0,0,0,0,1,0
6,0,0,0,0,0,1,0
7,0,0,0,0,0,1,0
8,0,0,0,0,0,1,0
9,0,0,0,0,0,1,0
