# detect_out.py

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import LocalOutlierFactor, NearestNeighbors
from sklearn.preprocessing import StandardScaler

def normalize_scores(scores):
    """Min-max scale ``scores`` into the range [0, 1].

    Args:
        scores: Iterable of numeric scores (list, tuple, NumPy array,
            generator, ...). It is materialised once, so one-shot iterables
            are supported.

    Returns:
        list: One scaled value per input score, in the original order.
        An empty input yields an empty list. When every score is equal
        there is no spread to scale by, so all values are ``0.0``.
    """
    values = list(scores)
    if not values:
        # min()/max() raise ValueError on an empty sequence.
        return []

    low = min(values)
    high = max(values)
    if high == low:
        # Avoid dividing by zero when there is no spread.
        return [0.0 for _ in values]

    span = high - low
    return [(value - low) / span for value in values]

def out_detect_LOF(data_scaled, cleaned_indices, df_full,
                   n_neighbors=20, contamination=0.05):
    """Flag outliers with Local Outlier Factor and score every sample.

    Args:
        data_scaled: Feature matrix, one row per entry of ``cleaned_indices``.
        cleaned_indices: Index labels of the rows in ``data_scaled``. Must
            support boolean-mask indexing (e.g. a pandas ``Index``).
        df_full: Object whose ``.index`` lists every sample, including the
            ones that have no row in ``data_scaled``.
        n_neighbors: Neighbourhood size passed to ``LocalOutlierFactor``.
        contamination: Outlier fraction passed to ``LocalOutlierFactor``.

    Returns:
        tuple: ``(outlier_names, conf)`` where ``outlier_names`` is a list of
        ``"sample_<index>"`` strings for the rows labelled ``-1`` and
        ``conf`` is a list of ``("sample_<index>", confidence)`` pairs for
        every entry of ``df_full.index``. Samples without a LOF score get a
        confidence of ``0.0``.
    """
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    lof_labels = lof.fit_predict(data_scaled)
    # negative_outlier_factor_ is the negated LOF; flip the sign so that a
    # larger score means "more outlying".
    lof_scores = -lof.negative_outlier_factor_

    outliers = cleaned_indices[lof_labels == -1]
    outlier_names = [f"sample_{i}" for i in outliers]

    # Normalise only the scores that were actually computed. Padding the
    # missing rows with 0.0 *before* scaling would make the filler the
    # minimum and squash every real score towards the top of the range.
    normalized = normalize_scores(list(lof_scores))
    score_by_index = dict(zip(cleaned_indices, normalized))
    conf = [
        (f"sample_{i}", score_by_index.get(i, 0.0))
        for i in df_full.index
    ]
    return outlier_names, conf

def out_detect_kNN(data_scaled, cleaned_indices, df_full,
                   n_neighbors=20, percentile=95):
    """Flag outliers by mean distance to the k nearest neighbours.

    Args:
        data_scaled: Feature matrix, one row per entry of ``cleaned_indices``.
        cleaned_indices: Index labels of the rows in ``data_scaled``. Must
            support boolean-mask indexing (e.g. a pandas ``Index``).
        df_full: Object whose ``.index`` lists every sample, including the
            ones that have no row in ``data_scaled``.
        n_neighbors: Number of neighbours to average over. Clamped to
            ``n_samples - 1`` when the dataset is smaller than that.
        percentile: Samples whose score lies strictly above this percentile
            of all scores are flagged as outliers.

    Returns:
        tuple: ``(outlier_names, conf)`` where ``outlier_names`` is a list of
        ``"sample_<index>"`` strings for the flagged rows and ``conf`` is a
        list of ``("sample_<index>", confidence)`` pairs for every entry of
        ``df_full.index``. Samples without a kNN score get ``0.0``.

    Raises:
        ValueError: If ``data_scaled`` has fewer than two rows, since no
            neighbour distance can be computed.
    """
    n_samples = len(data_scaled)
    if n_samples < 2:
        raise ValueError("kNN outlier detection needs at least two samples")

    # A point cannot have more neighbours than there are other points.
    k = min(n_neighbors, n_samples - 1)
    neigh = NearestNeighbors(n_neighbors=k)
    neigh.fit(data_scaled)
    # Query the fitted data itself (no X argument) so that each point is not
    # returned as its own nearest neighbour at distance 0, which would drag
    # the mean down and effectively use only k - 1 real neighbours.
    distances, _ = neigh.kneighbors()
    knn_scores = distances.mean(axis=1)
    threshold = np.percentile(knn_scores, percentile)
    knn_flags = knn_scores > threshold

    outliers = cleaned_indices[knn_flags]
    outlier_names = [f"sample_{i}" for i in outliers]

    # Normalise only the scores that were actually computed. Padding the
    # missing rows with 0.0 *before* scaling would let the filler define the
    # minimum and distort the real scores.
    normalized = normalize_scores(list(knn_scores))
    score_by_index = dict(zip(cleaned_indices, normalized))
    conf = [
        (f"sample_{i}", score_by_index.get(i, 0.0))
        for i in df_full.index
    ]
    return outlier_names, conf

def out_detect_combined(lof_conf, knn_conf, df_full, threshold=0.95):
    """Combine LOF and kNN confidences into one ensemble result.

    Args:
        lof_conf: Confidence values from the LOF detector, one per entry of
            ``df_full.index`` and in the same order.
        knn_conf: Confidence values from the kNN detector, aligned the same
            way.
        df_full: Object whose ``.index`` lists every sample.
        threshold: A sample is reported as an outlier only when *both*
            confidences are strictly greater than this value.

    Returns:
        tuple: ``(outlier_names, conf)`` where ``outlier_names`` is a list of
        ``"sample_<index>"`` strings and ``conf`` is a list of
        ``("sample_<index>", mean_confidence)`` pairs for every sample.

    Raises:
        ValueError: If the two confidence sequences and ``df_full.index`` do
            not all have the same length.
    """
    lof_values = list(lof_conf)
    knn_values = list(knn_conf)
    labels = list(df_full.index)

    # zip() would silently truncate to the shortest input and misreport
    # which samples were scored, so reject misaligned inputs up front.
    if not (len(lof_values) == len(knn_values) == len(labels)):
        raise ValueError(
            "lof_conf, knn_conf and df_full.index must have the same length "
            f"(got {len(lof_values)}, {len(knn_values)}, {len(labels)})"
        )

    names = [f"sample_{i}" for i in labels]
    conf = [
        (name, (lof_c + knn_c) / 2)
        for name, lof_c, knn_c in zip(names, lof_values, knn_values)
    ]
    outlier_names = [
        name
        for name, lof_c, knn_c in zip(names, lof_values, knn_values)
        if lof_c > threshold and knn_c > threshold
    ]

    return outlier_names, conf

def detect_all_models(csv_path='Code_Emiel/Dataset/refData_obf.csv'):
    """Run the LOF, kNN and combined detectors on a CSV dataset.

    Args:
        csv_path: Path of the CSV file to analyse. Defaults to the project's
            reference dataset.

    Returns:
        dict: Maps each model name (``"LOF"``, ``"kNN"``, ``"LOF + kNN"``)
        to an ``(outlier_names, confidences)`` tuple as produced by the
        corresponding ``out_detect_*`` function.

    Raises:
        ValueError: If no numeric, NaN-free rows are left to analyse.
    """
    df = pd.read_csv(csv_path)

    # Use the numeric columns as features. The row position is intentionally
    # NOT stored as a column on the frame: it would be picked up here as a
    # numeric feature and make rows near the start/end of the file look
    # unusual. Any pre-existing bookkeeping column of that name is excluded
    # for the same reason. Row identity is carried by the index instead.
    data = df.select_dtypes(include=[np.number]).drop(
        columns=['original_index'], errors='ignore'
    )
    data_cleaned = data.dropna()
    if data_cleaned.empty:
        raise ValueError(
            f"No numeric rows without missing values found in {csv_path!r}"
        )
    cleaned_indices = data_cleaned.index

    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data_cleaned)

    lof_outliers, lof_conf = out_detect_LOF(data_scaled, cleaned_indices, df)
    knn_outliers, knn_conf = out_detect_kNN(data_scaled, cleaned_indices, df)
    # The combined detector only needs the confidence values; both lists are
    # already ordered like df.index.
    combined_outliers, combined_conf = out_detect_combined(
        [c for _, c in lof_conf],
        [c for _, c in knn_conf],
        df
    )

    return {
        "LOF": (lof_outliers, lof_conf),
        "kNN": (knn_outliers, knn_conf),
        "LOF + kNN": (combined_outliers, combined_conf),
    }


## Code for AI Project

In [None]:
from Code_Patri.detect_out import detect_all_models

# Run every detector once; each entry is (outlier names, confidences).
results = detect_all_models()

# Hand each model's output to the shared `standard_format` object
# (defined outside this cell).
for model_name in results:
    outliers, confidences = results[model_name]
    standard_format.add_outlier_samples(model_name, outliers)
    standard_format.add_confidences(model_name, confidences)

# Compute the similarity scores with plotting enabled.
standard_format.compute_similarity_scores_detected_outliers(
    plot_results=True,
)
standard_format.compute_similarity_scores_confidences(
    plot_results=True,
)
