This notebook compiles all our models and compares them using a standardized format. Make sure the output of your code fits this format as input, so that your code only needs to be imported and can run from the get-go.

I'll also build a plotter to make this work well, and of course ensure there are proper docstrings and comments.

(If you want to make your folder importable, like I did with Code_Emiel, simply add an empty `__init__.py` to the folder.)

In [1]:
import os
import sys

# Make packages that sit next to this notebook (e.g. Code_Emiel) importable.
# Guarded so that re-running the cell does not pile up duplicate sys.path entries.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

In [None]:
# Standard format

from typing import List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class StandardFormat:
    """
    Shared pool in which every model registers its outlier predictions, so that
    all models can be compared against each other in one common format.

    A model can register two things:
      * the names of the samples it flags as outliers (add_outlier_samples)
      * a per-sample outlier confidence for the whole dataset (add_confidences)

    The compute_similarity_scores_* methods turn each pool into a
    model-by-model similarity matrix and can plot it as a heatmap.
    """

    def __init__(self, n_samples: Optional[int]):
        """
        n_samples: int or None = the number of samples in the full dataset. Every list
        passed to add_confidences must have exactly this many entries. Pass None to
        switch that length check off.
        """
        # model name -> list of sample names flagged as outliers
        self.detected_outliers = {}
        # model name -> list of (sample_name, confidence) pairs
        self.confidences = {}
        self.n_samples = n_samples

    def add_outlier_samples(self, model_name: str, outliers: List[str]):
        """
        Register the samples that a model identified as outliers.

        model_name: str = the name of the model whose outliers you want to add to the pool.
        Adding the same name twice replaces the earlier entry.

        outliers: list = the names of the flagged samples/measurements, and nothing else
        (sample_1, sample_4567, etc). The list can have any length.

        Select the outliers with an alpha value of 0.05 (the standard alpha value).
        This is important: with different alpha values we would be comparing apples
        with oranges.

        Raises TypeError if outliers is a single string or contains a non-string item.
        """
        # A bare string iterates character by character, so it would slip through the
        # per-item check below and be stored as a list of single letters.
        if isinstance(outliers, str):
            raise TypeError("Outliers should be a list of strings")

        # Materialize first: the check below would exhaust a generator, and storing a
        # copy keeps later changes to the caller's list out of the pool.
        outliers = list(outliers)
        if not all(isinstance(item, str) for item in outliers):
            raise TypeError("Outliers should be a list of strings")
        self.detected_outliers[model_name] = outliers

    def add_confidences(self, model_name: str, confidences: List[Tuple[str, float]]):
        """
        Register the raw per-sample outlier confidences of a model.

        model_name: str = the name of the model whose confidences you want to add to the pool.
        Adding the same name twice replaces the earlier entry.

        confidences: list = (sample_name, confidence) pairs, and nothing else. The confidence
        is a probability between 0 and 1 that the sample or measurement is an outlier, with 1
        being fully confident that it IS an outlier, and vice versa.

        We're specifically NOT working with alpha values in this one: add the raw confidence
        values, not a thresholded selection.

        The list must contain every sample of the dataset exactly once, nothing more or less.
        When n_samples is set, the length is checked.

        Raises ValueError if the number of entries differs from n_samples.
        """
        # Copy so that later changes to the caller's list do not leak into the pool.
        confidences = list(confidences)
        if self.n_samples is not None and len(confidences) != self.n_samples:
            raise ValueError(f"Expected {self.n_samples} confidences, got {len(confidences)}")
        self.confidences[model_name] = confidences

    def compute_similarity_scores_detected_outliers(
        self, plot_results: bool = False
    ) -> Tuple[List[str], np.ndarray]:
        """
        Compare the registered outlier sets of every pair of models.

        The score of two models is the size of the intersection of their outlier sets
        divided by the size of the union. It is NaN on the diagonal and for a pair in
        which neither model flagged anything.

        plot_results: bool = when True, also show the matrix as a heatmap.

        Returns (models, matrix): the model names in registration order and the square
        score matrix whose rows and columns follow that order.
        """
        models = list(self.detected_outliers)
        # Build every set once instead of once per pair.
        outlier_sets = [set(self.detected_outliers[name]) for name in models]

        n_models = len(models)
        # Start from NaN so that the diagonal and empty-union pairs need no special case.
        matrix = np.full((n_models, n_models), np.nan, dtype=float)

        for i, set_i in enumerate(outlier_sets):
            for j in range(i + 1, n_models):
                set_j = outlier_sets[j]
                union_size = len(set_i | set_j)
                if union_size:
                    # The score is symmetric, so fill both halves at once.
                    score = len(set_i & set_j) / union_size
                    matrix[i, j] = score
                    matrix[j, i] = score

        if plot_results:
            self._plot_matrix(models, matrix, "Similarity Matrix of Detected Outliers")
        return models, matrix

    def compute_similarity_scores_confidences(
        self, plot_results: bool = False
    ) -> Tuple[List[str], np.ndarray]:
        """
        Compare the registered confidence vectors of every pair of models.

        Each model's pairs are sorted by sample name so that all vectors line up, and the
        score of two models is the cosine similarity of their vectors. The diagonal is NaN.

        plot_results: bool = when True, also show the matrix as a heatmap.

        Returns (models, matrix): the model names in registration order and the square
        score matrix whose rows and columns follow that order. With no registered model
        the matrix has shape (0, 0).

        Raises ValueError if two models did not report exactly the same sample names.
        """
        models = list(self.confidences)
        if not models:
            # Nothing to stack or plot.
            return models, np.empty((0, 0), dtype=float)

        ordered = {
            name: sorted(self.confidences[name], key=lambda pair: pair[0])
            for name in models
        }

        # Position k must refer to the same sample in every vector, otherwise the cosine
        # similarity compares unrelated samples without any visible error.
        reference_name = models[0]
        reference_ids = [sample_id for sample_id, _conf in ordered[reference_name]]
        for name in models[1:]:
            sample_ids = [sample_id for sample_id, _conf in ordered[name]]
            if sample_ids != reference_ids:
                raise ValueError(
                    f"Models '{reference_name}' and '{name}' did not report "
                    "the same sample names"
                )

        X = np.vstack([
            [conf for _sample_id, conf in ordered[name]]
            for name in models
        ])
        matrix = cosine_similarity(X)
        np.fill_diagonal(matrix, np.nan)

        if plot_results:
            self._plot_matrix(
                models, matrix, "Similarity Matrix of Confidences", figsize=(6, 6)
            )
        return models, matrix

    @staticmethod
    def _plot_matrix(models, matrix, title, figsize=None):
        """Show a model-by-model matrix as a heatmap labeled with the model names."""
        if not models:
            # An empty matrix has nothing to draw.
            return
        fig, ax = plt.subplots(figsize=figsize)
        image = ax.imshow(matrix, cmap='viridis', interpolation='nearest')
        fig.colorbar(image, ax=ax)
        ticks = range(len(models))
        ax.set_xticks(ticks)
        ax.set_xticklabels(models, rotation=90)
        ax.set_yticks(ticks)
        ax.set_yticklabels(models)
        ax.set_title(title, pad=20)
        plt.show()

# One shared pool for every model in this notebook. n_samples is the number of
# confidence entries each model has to supply to add_confidences.
# NOTE(review): the recorded output of a later cell prints the shape (18548, 51);
# confirm that 20446 is the right sample count for the data the models use.
standard_format = StandardFormat(n_samples=20446)

**Emiel's code:**

In [3]:
from Code_Emiel import Elliptic_Envelope as elliptic_envelope_emiel
from Code_Emiel import Robust_PCA as rpca_emiel

# Run each of Emiel's models and register both of its outputs under the model's name.
# NOTE(review): the recorded output of this cell ends in
# "ValueError: Expected 2D array, got scalar array instead: array=0.05", so one of the
# main() calls presumably fails before anything is registered; the cause is inside
# Code_Emiel and cannot be seen from here.
for emiel_model_name, emiel_module in (
    ("Elliptic Envelope Emiel", elliptic_envelope_emiel),
    ("Robust PCA Emiel", rpca_emiel),
):
    outliers, confidences = emiel_module.main()
    standard_format.add_outlier_samples(emiel_model_name, outliers)
    standard_format.add_confidences(emiel_model_name, confidences)

(18548, 51)


ValueError: Expected 2D array, got scalar array instead:
array=0.05.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

**Nicolas' code:**

**Abhinav's code:**

**Patricia's code:**

**Run Evaluation**

In [None]:
# Agreement between every pair of models on the samples they flagged as outliers
# (size of the intersection divided by size of the union), shown as a heatmap.
standard_format.compute_similarity_scores_detected_outliers(plot_results=True)
# Cosine similarity between every pair of models' per-sample confidence vectors,
# shown as a heatmap.
standard_format.compute_similarity_scores_confidences(plot_results=True)