In [19]:
import pandas as pd
import numpy as np
import time

import warnings
warnings.filterwarnings("ignore")

# import the necessary package
from adbench.datasets.data_generator import DataGenerator
from adbench.myutils import Utils

datagenerator = DataGenerator()
utils = Utils()

In [20]:
from adbench.baseline.PyOD import PYOD
from adbench.baseline.DevNet.run import DevNet
from adbench.baseline.Supervised import supervised
from adbench.baseline.REPEN.run import REPEN
from adbench.baseline.DevNet.run import DevNet
from adbench.baseline.PReNet.run import PReNet
from adbench.baseline.FEAWAD.run import FEAWAD
from adbench.baseline.DAGMM.run import DAGMM # Unsup
from adbench.baseline.DeepSAD.src.run import DeepSAD # Semi


# model_dict = {'DAGMM':DAGMM, 'COPOD': PYOD, 'ECOD': PYOD, 'XGBOD':PYOD, 'DeepSAD':DeepSAD, 'DevNet':DevNet, 'PReNet':PReNet, 'FEAWAD':FEAWAD}

# GANomaly DeepSAD REPEN DevNet PReNet FEAWAD XGBOD NB SVM MLP ResNet FTTrans
# former RF LGB XGB CatB

# dataset and model list / dict
dataset_list = ['1_ALOI']
model_dict = {'XGBOD':PYOD, 'CatB': supervised, 'COPOD': PYOD, 'ECOD': PYOD}

# save the results
df_AUCROC = pd.DataFrame(data=None, index=dataset_list, columns = model_dict.keys())
df_AUCPR = pd.DataFrame(data=None, index=dataset_list, columns = model_dict.keys())

In [21]:
# save the results
df_results = pd.DataFrame(columns=['Model', 'AUCROC', 'AUCPR', 'Execution_Time'])

# seed for reproducibility
seed = 42

for dataset in dataset_list:
    # Import the dataset
    datagenerator.dataset = dataset
    data = datagenerator.generator(la=0.1, realistic_synthetic_mode=None, noise_type=None)
    
    for name, clf in model_dict.items():
        clf = clf(seed=seed, model_name=name)
        
        # Measure execution time
        start_time = time.time()
        clf = clf.fit(X_train=data['X_train'], y_train=data['y_train'])
        execution_time = time.time() - start_time
        
        # Evaluation
        score = clf.predict_score(data['X_test'])
        result = utils.metric(y_true=data['y_test'], y_score=score)
        
        # Save results
        new_row = pd.DataFrame({
            'Model': [name],
            'AUCROC': [result['aucroc']],
            'AUCPR': [result['aucpr']],
            'Execution_Time': [execution_time]
        })
        df_results = pd.concat([df_results, new_row], ignore_index=True)
        df_AUCROC.loc[dataset, name] = result['aucroc']
        df_AUCPR.loc[dataset, name] = result['aucpr']

# Display the results
print(df_results)

subsampling for dataset 1_ALOI...
current noise type: None
{'Samples': 10000, 'Features': 27, 'Anomalies': 302, 'Anomalies Ratio(%)': 3.02}
best param: None
Learning rate set to 0.023648
0:	learn: 0.6279188	total: 4.88ms	remaining: 4.87s
1:	learn: 0.5683493	total: 10.4ms	remaining: 5.21s
2:	learn: 0.5148908	total: 15.6ms	remaining: 5.19s
3:	learn: 0.4640346	total: 20.9ms	remaining: 5.19s
4:	learn: 0.4220544	total: 25.2ms	remaining: 5.02s
5:	learn: 0.3827438	total: 29.4ms	remaining: 4.87s
6:	learn: 0.3480773	total: 33.7ms	remaining: 4.79s
7:	learn: 0.3174367	total: 37.9ms	remaining: 4.7s
8:	learn: 0.2882729	total: 42.1ms	remaining: 4.63s
9:	learn: 0.2630682	total: 46.1ms	remaining: 4.56s
10:	learn: 0.2400025	total: 50.3ms	remaining: 4.52s
11:	learn: 0.2190697	total: 54.5ms	remaining: 4.49s
12:	learn: 0.2011648	total: 58.9ms	remaining: 4.47s
13:	learn: 0.1841713	total: 63ms	remaining: 4.44s
14:	learn: 0.1694869	total: 67.3ms	remaining: 4.42s
15:	learn: 0.1558192	total: 71.5ms	remaining: 

In [22]:
def is_pareto_efficient(costs):
    """
    Identify Pareto-efficient points.
    Args:
        costs (ndarray): Array of shape (n_points, n_costs), where each row is a point and columns are criteria.
    Returns:
        mask (ndarray): Boolean array indicating whether each point is Pareto-efficient.
    """
    is_efficient = np.ones(costs.shape[0], dtype=bool)
    for i, c in enumerate(costs):
        if is_efficient[i]:
            is_efficient[is_efficient] = np.any(costs[is_efficient] < c, axis=1) | np.all(costs[is_efficient] == c, axis=1)
            is_efficient[i] = True
    return is_efficient

# Example: Extract Pareto front
results = df_results[['AUCROC', 'Execution_Time']].values  # Use relevant metrics
pareto_mask = is_pareto_efficient(results)

# Filter Pareto-efficient models
pareto_front = df_results[pareto_mask]
print("Pareto Front Models:")
print(pareto_front)


Pareto Front Models:
   Model    AUCROC     AUCPR  Execution_Time
2  COPOD  0.475179  0.030852        0.064143


In [23]:
# Fonction pour identifier le front de Pareto
def pareto_front(df, performance_col, time_col):
    # Trie par performances décroissantes, puis par temps croissant
    df_sorted = df.sort_values(by=[performance_col, time_col], ascending=[False, True])
    pareto_indices = []  # Index des points sur le front de Pareto
    
    best_time = float('inf')  # Meilleur temps rencontré jusqu'à présent
    for idx, row in df_sorted.iterrows():
        if row[time_col] < best_time:
            pareto_indices.append(idx)
            best_time = row[time_col]
    
    return df.loc[pareto_indices]

In [24]:
# Front de Pareto pour AUCROC vs Execution Time
pareto_df = pareto_front(df_results, performance_col='AUCROC', time_col='Execution_Time')

print("Front de Pareto AUCROC vs Execution Time:")
print(pareto_df[['Model', 'AUCROC', 'Execution_Time']])

Front de Pareto AUCROC vs Execution Time:
   Model    AUCROC  Execution_Time
0  XGBOD  0.721894       46.344571
3   ECOD  0.512396        0.065586
2  COPOD  0.475179        0.064143


In [25]:
# Front de Pareto pour AUCPR vs Execution Time
pareto_df = pareto_front(df_results, performance_col='AUCPR', time_col='Execution_Time')

print("Front de Pareto AUCPR vs Execution Time:")
print(pareto_df[['Model', 'AUCPR', 'Execution_Time']])

Front de Pareto AUCPR vs Execution Time:
   Model     AUCPR  Execution_Time
0  XGBOD  0.067865       46.344571
1   CatB  0.034110        4.954636
3   ECOD  0.033768        0.065586
2  COPOD  0.030852        0.064143


In [26]:
#  Ensemble learning
from sklearn.model_selection import train_test_split

# Concaténation des données d'entraînement et de test existantes
X = np.concatenate((data['X_train'], data['X_test']))
y = np.concatenate((data['y_train'], data['y_test']))

# Division en ensembles d'entraînement, de validation et de test
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=seed)

print(f"Taille de X_train: {X_train.shape}")
print(f"Taille de X_val: {X_val.shape}")
print(f"Taille de X_test: {X_test.shape}")

Taille de X_train: (6000, 27)
Taille de X_val: (2000, 27)
Taille de X_test: (2000, 27)


In [27]:
model_names = pareto_df['Model'].tolist()[:2]  # Sélectionne les deux premiers

# Initialisation des modèles de base
models = {}
for name in model_names:
    clf_class = model_dict[name]
    clf = clf_class(seed=seed, model_name=name)
    clf.fit(X_train, y_train)
    models[name] = clf

best param: None
Learning rate set to 0.022141
0:	learn: 0.6452227	total: 6.81ms	remaining: 6.8s
1:	learn: 0.6003846	total: 12.7ms	remaining: 6.32s
2:	learn: 0.5589926	total: 18.3ms	remaining: 6.08s
3:	learn: 0.5201805	total: 24.3ms	remaining: 6.04s
4:	learn: 0.4852375	total: 30.5ms	remaining: 6.07s
5:	learn: 0.4528917	total: 36.8ms	remaining: 6.09s
6:	learn: 0.4231342	total: 42.1ms	remaining: 5.97s
7:	learn: 0.3953395	total: 47ms	remaining: 5.83s
8:	learn: 0.3699160	total: 52.1ms	remaining: 5.74s
9:	learn: 0.3469536	total: 57.6ms	remaining: 5.7s
10:	learn: 0.3253632	total: 63.6ms	remaining: 5.71s
11:	learn: 0.3055207	total: 69.5ms	remaining: 5.72s
12:	learn: 0.2872380	total: 74.4ms	remaining: 5.65s
13:	learn: 0.2706083	total: 80.2ms	remaining: 5.64s
14:	learn: 0.2552676	total: 85.5ms	remaining: 5.61s
15:	learn: 0.2410029	total: 90.8ms	remaining: 5.58s
16:	learn: 0.2280734	total: 96.6ms	remaining: 5.59s
17:	learn: 0.2159331	total: 102ms	remaining: 5.57s
18:	learn: 0.2047800	total: 107m

In [28]:
# Prédictions sur l'ensemble de validation
val_preds = pd.DataFrame()
for name, model in models.items():
    scores = model.predict_score(X_val)
    val_preds[name] = scores

In [29]:
from sklearn.linear_model import LogisticRegression

# Entraînement du méta-modèle
meta_model = LogisticRegression(random_state=seed)
meta_model.fit(val_preds, y_val)

LogisticRegression(random_state=42)

In [30]:
# Prédictions des modèles de base sur l'ensemble de test
test_preds = pd.DataFrame()
for name, model in models.items():
    scores = model.predict_score(X_test)
    test_preds[name] = scores

# Prédictions finales du méta-modèle
final_scores = meta_model.predict_proba(test_preds)[:, 1]

# Évaluation des performances
result = utils.metric(y_true=y_test, y_score=final_scores)

print("Performances du modèle empilé :")
print(f"AUCROC: {result['aucroc']}")
print(f"AUCPR: {result['aucpr']}")

Performances du modèle empilé :
AUCROC: 0.7410178474008261
AUCPR: 0.04264496669904561


In [33]:
#Comparaison entre le méta-modèle et les modèles individuels
df_results

Unnamed: 0,Model,AUCROC,AUCPR,Execution_Time
0,XGBOD,0.721894,0.067865,46.344571
1,CatB,0.510126,0.03411,4.954636
2,COPOD,0.475179,0.030852,0.064143
3,ECOD,0.512396,0.033768,0.065586
4,Stacked Model,0.741018,0.042645,
