### Running CapyMOA models with the paper's datasets

In [2]:
import os
import gzip
import pandas as pd
import numpy as np
from capymoa.stream import stream_from_file
from capymoa.anomaly import HalfSpaceTrees, OnlineIsolationForest, Autoencoder
from capymoa.evaluation import AnomalyDetectionEvaluator
from sklearn.metrics import average_precision_score, roc_auc_score, roc_curve, auc
from scipy.stats import sem
import time

In [10]:

# Define datasets, their corresponding run counts, and models
datasets = {
    "abalone": 10,
    "annthyroid": 10,
    "magicgamma": 4,
    "kdd_ftp": 10,
    "mammography": 10,
    "thyroid": 10,
    "mnist": 10,
    "musk": 5,
    "satellite": 5,
    "satimages": 5,
    "spambase": 5,
    "shuttle_odds": 2
}

models = {
    "HalfSpaceTrees": HalfSpaceTrees,
    "Autoencoder": Autoencoder,
    "OnlineIsolationForest": OnlineIsolationForest
}

# Define dataset path
dataset_path = r"C:\Users\aleja\OneDrive - Universidad Nacional de Colombia\Documentos\Institut Polytechnique de Paris\courses\P1\Data Streaming\project\actual code\datasets\forStefan\data\public"


In [16]:
# Results storage
all_results = []

# Main loop
for dataset_name, n_runs in datasets.items():
    print(f"Dataset: {dataset_name} (Runs: {n_runs})")

    input_path = os.path.join(dataset_path, f"{dataset_name}.gz")
    output_path = os.path.join(dataset_path, f"{dataset_name}.csv")

    # Unzip the dataset if needed
    if not os.path.exists(output_path):
        with gzip.open(input_path, 'rt') as gz_file:
            df = pd.read_csv(gz_file)
            df.to_csv(output_path, index=False)
            print(f"CSV saved to: {output_path}")

    stream = stream_from_file(output_path, dataset_name=dataset_name)
    schema = stream.get_schema()

    # Load labels for metrics
    df = pd.read_csv(output_path)
    labels = df['label'].to_numpy(dtype='float32')

    for model_name, ModelClass in models.items():
        print(f"Running model: {model_name}")
        ap_scores = []
        auc_scores = []
        auc_paper_scores = []
        auc_capymoa_scores = []
        execution_times = []
        model_results = []

        for run in range(n_runs):
            learner = ModelClass(schema)
            evaluator = AnomalyDetectionEvaluator(schema)

            stream.restart()  # Restart stream for each run
            anomaly_scores = []
            start_time = time.time()

            while stream.has_more_instances():
                instance = stream.next_instance()
                proba = learner.score_instance(instance)
                #We do 1-proba because for capyMOA models 1 means normal and 0 means anomaly, inverse as in streamrhf
                anomaly_scores.append(1-proba)
                evaluator.update(instance.y_index, proba)
                learner.train(instance)

            # Get AUC from evaluator
            auc_score_capymoa = evaluator.auc()

            #####################################
            anomaly_scores = np.array(anomaly_scores)
            ap_score = average_precision_score(labels, anomaly_scores)
            auc_score = roc_auc_score(labels, anomaly_scores)
            fpr, tpr, thresholds = roc_curve(labels, anomaly_scores)
            auc_paper = auc(fpr, tpr)
            #####################################

            execution_time = time.time() - start_time

            print(f"Run {run + 1}: AP = {ap_score:.4f}, AUC = {auc_score:.4f}, Time = {execution_time:.2f}s")

            # Save run results
            run_result = {
                'Dataset': dataset_name,
                'Model': model_name,
                'Run': run + 1,
                'AP': ap_score,
                'AUC_capymoa': auc_score_capymoa,
                'AUC (sklearn)': auc_score,
                'AUC (paper)': auc_paper,
                'Execution Time (s)': execution_time
            }
            model_results.append(run_result)
            all_results.append(run_result)

        # Save checkpoint after each model
        results_df = pd.DataFrame(all_results)
        results_df.to_csv("all_run_results_checkpoint.csv", index=False)
        print(f"Checkpoint saved for model {model_name}")

        # Summarize results for the model
        ap_scores = np.array([res['AP'] for res in model_results])
        auc_scores = np.array([res['AUC (sklearn)'] for res in model_results])
        auc_paper_scores = np.array([res['AUC (paper)'] for res in model_results])
        auc_capymoa_scores = np.array([res['AUC_capymoa'] for res in model_results])
        execution_times = np.array([res['Execution Time (s)'] for res in model_results])

        mean_ap = np.mean(ap_scores)
        mean_auc = np.mean(auc_scores)
        mean_auc_paper = np.mean(auc_paper_scores)
        mean_auc_capymoa = np.mean(auc_capymoa_scores)
        mean_time = np.mean(execution_times)
        ap_sem = sem(ap_scores)
        auc_sem = sem(auc_scores)
        auc_paper_sem = sem(auc_paper_scores)
        auc_capymoa_sem = sem(auc_capymoa_scores)
        time_sem = sem(execution_times)
        confidence_level = 1.96
        ap_ci = confidence_level * ap_sem
        auc_ci = confidence_level * auc_sem
        auc_paper_ci = confidence_level * auc_paper_sem
        auc_capymoa_ci = confidence_level * auc_capymoa_sem
        time_ci = confidence_level * time_sem

        print(f"Summary for {model_name}:")
        print(f"AP: {mean_ap:.4f} ± {ap_ci:.4f} (95% CI)")
        print(f"AUC (sklearn): {mean_auc:.4f} ± {auc_ci:.4f} (95% CI)")
        print(f"AUC (paper): {mean_auc_paper:.4f} ± {auc_paper_ci:.4f} (95% CI)")
        print(f"AUC (CapyMOA): {mean_auc_capymoa:.4f} ± {auc_capymoa_ci:.4f} (95% CI)")
        print(f"Time: {mean_time:.2f} ± {time_ci:.2f} seconds (95% CI)")

        # Save summary
        summary = {
            'Dataset': dataset_name,
            'Model': model_name,
            'Metric': ['AP', 'AUC (sklearn)', 'AUC (paper)', 'AUC (CapyMOA)', 'Execution Time'],
            'Mean': [mean_ap, mean_auc, mean_auc_paper, mean_auc_capymoa, mean_time],
            'CI (95%)': [ap_ci, auc_ci, auc_paper_ci, auc_capymoa_ci, time_ci]
        }
        summary_df = pd.DataFrame(summary)

        # Create a folder for each model in the current working directory if it doesn't exist
        model_folder = os.path.join(os.getcwd(), model_name)
        os.makedirs(model_folder, exist_ok=True)
        
        # Save the summary in the respective model's folder
        summary_df.to_csv(os.path.join(model_folder, f"{dataset_name}_summary.csv"), index=False)



Dataset: abalone (Runs: 2)
Running model: HalfSpaceTrees




Run 1: AP = 0.5313, AUC = 0.9628, Time = 0.26s
Run 2: AP = 0.5313, AUC = 0.9628, Time = 0.65s
Checkpoint saved for model HalfSpaceTrees
Summary for HalfSpaceTrees:
AP: 0.5313 ± 0.0000 (95% CI)
AUC (sklearn): 0.9628 ± 0.0000 (95% CI)
AUC (paper): 0.9628 ± 0.0000 (95% CI)
AUC (CapyMOA): 0.9628 ± 0.0000 (95% CI)
Time: 0.45 ± 0.38 seconds (95% CI)
Running model: Autoencoder
Run 1: AP = 0.1163, AUC = 0.8546, Time = 1.65s
Run 2: AP = 0.1163, AUC = 0.8546, Time = 1.39s
Checkpoint saved for model Autoencoder
Summary for Autoencoder:
AP: 0.1163 ± 0.0000 (95% CI)
AUC (sklearn): 0.8546 ± 0.0000 (95% CI)
AUC (paper): 0.8546 ± 0.0000 (95% CI)
AUC (CapyMOA): 0.8546 ± 0.0000 (95% CI)
Time: 1.52 ± 0.25 seconds (95% CI)
Running model: OnlineIsolationForest
Run 1: AP = 0.0333, AUC = 0.3180, Time = 44.39s
Run 2: AP = 0.0333, AUC = 0.3180, Time = 44.28s
Checkpoint saved for model OnlineIsolationForest
Summary for OnlineIsolationForest:
AP: 0.0333 ± 0.0000 (95% CI)
AUC (sklearn): 0.3180 ± 0.0000 (95% CI)
A

In [None]:
# Save all results to a final CSV
results_df = pd.DataFrame(all_results)
results_df.to_csv("all_run_results.csv", index=False)
print("Final results saved to 'all_run_results.csv'")
