In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from pathlib import Path
import torch
import torch.nn as nn
import re
from utils_notebook import histogram_experiment, dataframe_to_latex

In [None]:
# Root folder that is scanned for one sub-folder per experiment; the path is
# relative, so it resolves against the notebook's working directory.
results_path = "../results/all_db_all_training"


In [None]:
# Collect the metrics of every (experiment, synthetic dataset) folder pair.
# Frames are accumulated in a list and concatenated once at the end, which
# avoids re-copying the growing frame on every iteration and avoids
# concatenating with an empty placeholder frame.
collected_frames = []
for experiment_folder in os.listdir(results_path):
    experiment_dir = Path(results_path, experiment_folder)
    if not experiment_dir.is_dir():
        # A stray file in the results root would make os.listdir() raise below.
        continue
    # To restrict the analysis to one model family, filter here, e.g.:
    # if "DDPM" not in experiment_folder:
    #     continue
    for dataset_folder in os.listdir(experiment_dir):
        if "A_synthetic" not in dataset_folder:
            continue
        metrics_path = experiment_dir / dataset_folder / "model_metrics.csv"
        try:
            dataset_results = pd.read_csv(metrics_path)
        except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
            # Only missing/unreadable/malformed files are skipped; anything else
            # (including KeyboardInterrupt) propagates instead of being swallowed.
            print(f"Error loading {results_path}/{experiment_folder}/{dataset_folder}/model_metrics.csv: {exc}")
            continue
        folder_parts = dataset_folder.split("_")
        dataset_results["experiment"] = experiment_folder
        # Dataset name = folder name without its last two "_"-separated tokens;
        # the very last token is stored as the dataset version.
        dataset_results["dataset_name"] = "_".join(folder_parts[:-2])
        dataset_results['dataset_version'] = folder_parts[-1]
        # dataset_results['nb_bins'] = int(re.search(r"bins(\d+)", experiment_folder).group(1))
        # dataset_results['T'] = int(re.search(r"T(\d+)", experiment_folder).group(1))
        # The dimension is the integer following the first "f" in the dataset name;
        # astype(int) raises if a name carries no such token.
        dataset_results['dimension'] = dataset_results['dataset_name'].str.extract(r"f(\d+)", expand=False).astype(int)
        collected_frames.append(dataset_results)

# pd.concat() rejects an empty list, so fall back to an empty frame when nothing was loaded.
full_results_df = pd.concat(collected_frames) if collected_frames else pd.DataFrame()
# result_path = "../results/grid_search_new_db"
# for experiment_folder in os.listdir("../results/grid_search_new_db"):
#     # Load the data
#     # if experiment starts with DDPM skip
#     for dataset_folder in os.listdir(os.path.join("../results/grid_search_new_db", experiment_folder)):
#         if "A_synthetic" not in dataset_folder:
#             continue
#         try:
#             dataset_results = pd.read_csv(Path("../results/grid_search_new_db", experiment_folder,dataset_folder, "model_metrics.csv"))
#         except:
#             print(f"Error loading {results_path}/{experiment_folder}/{dataset_folder}/model_metrics.csv")
#             continue
#         dataset_results["experiment"] = experiment_folder
#         dataset_results["dataset_name"] = "_".join(dataset_folder.split("_")[:-2])
#         dataset_results['dataset_version'] = dataset_folder.split("_")[-1]
#         # dataset_results['nb_bins'] = int(re.search(r"bins(\d+)", experiment_folder).group(1))
#         # dataset_results['T'] = int(re.search(r"T(\d+)", experiment_folder).group(1))
#         dataset_results['dimension'] = dataset_results['dataset_name'].str.extract(r"f(\d+)", expand=False).astype(int)
#         full_results_df = pd.concat([full_results_df, dataset_results])


In [None]:
# Inspect which columns were loaded from the metrics files and added by the loader.
full_results_df.columns

In [None]:
def extract_ddpm_shap(row, column_name):
    """Return ``row[column_name]`` for DDPM experiments, NaN for every other experiment.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with an ``"experiment"``
            entry and an entry named ``column_name``.
        column_name: Key of the value to pass through for DDPM rows.

    Returns:
        The stored value when ``"DDPM"`` occurs in ``row["experiment"]``,
        otherwise ``np.nan``.
    """
    is_ddpm_run = "DDPM" in row["experiment"]
    return row[column_name] if is_ddpm_run else np.nan
    
# Mirror each SHAP metric into a DDPM-only column (NaN for every non-DDPM experiment).
ddpm_shap_sources = {
    'ddpm_shap_accuracy': "shap_explanation_accuracy",
    'ddpm_shap_ndcg': "shap_feature_importance_ndcg",
    'ddpm_shap_time': "shap_explanation_time",
}
for target_column, source_column in ddpm_shap_sources.items():
    # Bind source_column as a default argument so each lambda keeps its own column.
    full_results_df[target_column] = full_results_df.apply(
        lambda row, source_column=source_column: extract_ddpm_shap(row, source_column), axis=1
    )

In [None]:
# Drop columns that are not needed for the analysis. errors="ignore" keeps this
# cell safe to re-run (the columns are already gone on a second execution) and
# tolerant of metrics files that lack one of them.
full_results_df = full_results_df.drop(
    columns=["Unnamed: 0", "model_name", "sampling_method"], errors="ignore"
)
full_results_df.head(5)

In [None]:
# List the distinct raw dataset names derived from the folder names.
full_results_df["dataset_name"].unique()

In [None]:
# Print every column label on its own line (iterating a DataFrame yields its column labels).
for column_label in list(full_results_df):
    print(column_label)

In [None]:

# Function to extract the sum of ratios
def extract_sum_of_ratios(dataset_name):
    """Return the total anomaly percentage encoded in a dataset name.

    The first ``r<number>[_<number>...]`` token of the name is read as a list of
    ratios, e.g. ``"..._r0.05_0.05"`` -> ``[0.05, 0.05]``. Their sum is converted
    to a percentage.

    Args:
        dataset_name: Dataset name containing an ``r``-prefixed ratio token.

    Returns:
        The summed ratios times 100, rounded to 6 decimals so that binary
        floating-point noise (``0.29 * 100 == 28.999999999999996``) neither
        leaks into group keys nor gets truncated by a later integer cast.
        ``0`` when the name contains no ratio token.
    """
    # Capture everything after the first 'r' that looks like numbers joined by '_'.
    ratio_tokens = re.findall(r'r([\d\.]+(?:_[\d\.]+)*)', dataset_name)
    if not ratio_tokens:
        return 0  # Default value if no ratios are found
    ratio_values = [float(token) for token in ratio_tokens[0].split('_')]
    return round(sum(ratio_values) * 100, 6)

# Parse the total anomaly percentage out of the raw dataset name.
# NOTE: the statement below overwrites 'dataset_name' with a display label, so
# re-running this cell without reloading the data parses the label instead of
# the raw name.
full_results_df['anomaly_ratio'] = full_results_df['dataset_name'].apply(extract_sum_of_ratios)
# Rebuild the dataset names as "Synthetic {dimension}d, {percent}\% anomalies".
# round() before the integer cast: a bare astype(int) truncates, so a value such
# as 28.999999999999996 would be labelled 28 instead of 29.
full_results_df['dataset_name'] = full_results_df['dimension'].apply(lambda x: f"Synthetic {x}d") + ", " + full_results_df['anomaly_ratio'].round().astype(int).astype(str) + "\\% anomalies"
# Add ratio to dataset name


In [None]:
# Sanity check: list the distinct anomaly percentages that were parsed.
full_results_df['anomaly_ratio'].unique()

In [None]:
# Keep only row where nb_bins = 7 and T = 400
# full_results_df = full_results_df[(full_results_df['nb_bins'] == 7) & (full_results_df['T'] == 400)]
# full_results_df = full_results_df.drop(columns=["nb_bins", "T"])

In [None]:
# Display the per-run results table after the column additions above.
full_results_df

In [None]:
# Aggregate over dataset versions: one mean row and one std row per
# (experiment, dataset_name, anomaly_ratio, dimension) combination.
temp_df = full_results_df.drop(columns=['training_method', "dataset_version"])
grouped_runs = temp_df.groupby(["experiment", "dataset_name", "anomaly_ratio", "dimension"])
mean_df = grouped_runs.mean().reset_index()
std_df = grouped_runs.std().reset_index()

In [None]:
# List the distinct experiment names and the distinct dataset names.
print(full_results_df['experiment'].unique())
print(full_results_df['dataset_name'].unique())

In [None]:
# The model name is the first alphanumeric run of the experiment name that is
# followed by an underscore; add it to both aggregate tables.
for stats_df in (mean_df, std_df):
    stats_df['model_name'] = stats_df['experiment'].str.extract(r"([A-Za-z0-9]+)_")


In [None]:
# Preview the aggregated means, including the new model_name column.
mean_df.head(20)

In [None]:
# Keep only the rows whose model_name is exactly "DDPM", for both the mean and std tables.
ddpm_mean_df = mean_df[mean_df['model_name'] == "DDPM"]
ddpm_std_df = std_df[std_df['model_name'] == "DDPM"]
ddpm_std_df.head()


In [None]:
def merge_and_create_latex_table(mean_df, std_df, column_name, caption="", label="", output_path="latex_table.tex"):
    """Build a ``mean(std)`` LaTeX table for one metric and write it to disk.

    The mean and std tables are joined on their identifying columns, the chosen
    metric is rendered as ``$mean(std)$`` with two decimals, and the result is
    pivoted to one row per dataset and one column per experiment before being
    passed to ``dataframe_to_latex``.

    Args:
        mean_df: Per-group means; must contain ``dataset_name``, ``experiment``,
            ``dimension``, ``anomaly_ratio`` and ``column_name``.
        std_df: Per-group standard deviations with the same columns.
        column_name: Metric column to render.
        caption: Caption forwarded to ``dataframe_to_latex``.
        label: Label forwarded to ``dataframe_to_latex``.
        output_path: File the table is written to. Defaults to the previously
            hard-coded ``"latex_table.tex"``; pass a distinct path per table so
            successive calls do not overwrite each other.

    Returns:
        The LaTeX source that was written to ``output_path``.
    """
    key_columns = ["dataset_name", "experiment", "dimension", "anomaly_ratio"]
    mean_col = f"{column_name}_mean"
    std_col = f"{column_name}_std"
    cell_col = f"{column_name}_str"

    # Step 1: join means and stds; overlapping metric columns get the suffixes.
    merged_df = pd.merge(mean_df, std_df, on=key_columns, suffixes=('_mean', '_std'))
    # Step 2: render "mean(std)" cells. A plain comprehension also handles an
    # empty merge, where a row-wise DataFrame.apply would not yield a column.
    merged_df[cell_col] = [
        f"${mean_value:.2f}({std_value:.2f})$"
        for mean_value, std_value in zip(merged_df[mean_col], merged_df[std_col])
    ]
    # Step 3: the separate mean and std columns are no longer needed.
    merged_df = merged_df.drop(columns=[mean_col, std_col])
    # Step 4: one row per dataset, one column per experiment.
    merged_df = merged_df.pivot(index=["dataset_name"], columns="experiment", values=cell_col)
    merged_df = merged_df.reset_index()
    print(merged_df.columns)
    # NOTE(review): column_format is fixed at "llc" regardless of how many
    # experiment columns the pivot produced - confirm how dataframe_to_latex uses it.
    latex_table = dataframe_to_latex(
        merged_df,
        column_format="llc",
        caption=caption,
        label=label,
        index=False
    )
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(latex_table)
    # The previous trailing `latex_table.replace("\n", " ")` discarded its result
    # (strings are immutable), so it had no effect; return the table instead.
    return latex_table

In [None]:
# Write the AUCROC mean(std) LaTeX table for the DDPM experiments.
merge_and_create_latex_table(ddpm_mean_df, ddpm_std_df, "aucroc", caption="AUCROC for DDPM", label="tab:ddpm_aucroc")

In [None]:
# Plot the F1 score of the DDPM experiments (histogram_experiment is imported from utils_notebook).
histogram_experiment(mean_df=ddpm_mean_df, std_df=ddpm_std_df, column="f1_score", ylabel="F1 score", title="F1 score for different experiments using DDPM")

In [None]:
# Plot the AUCROC of the DDPM experiments (histogram_experiment is imported from utils_notebook).
histogram_experiment(mean_df=ddpm_mean_df, std_df=ddpm_std_df, column="aucroc", ylabel="AUCROC", title="AUCROC for different experiments of DDPM")

In [None]:
# Display the aggregated means for all models.
mean_df

In [None]:
# Keep only the rows whose model_name is exactly "DTEC", for both the mean and std tables.
dte_mean_df = mean_df[mean_df['model_name'] == "DTEC"]
dte_std_df = std_df[std_df['model_name'] == "DTEC"]
dte_std_df.head()


In [None]:
# Plot the F1 score of the DTEC experiments (histogram_experiment is imported from utils_notebook).
histogram_experiment(mean_df=dte_mean_df, std_df=dte_std_df, column="f1_score", ylabel="F1 score", title="F1 score for different experiments of DTEC")

In [None]:
# Write the F1 mean(std) LaTeX table for the DTEC experiments.
# NOTE(review): the label says "aucroc" although the table reports f1_score - confirm
# which label the LaTeX document references before renaming it.
# NOTE(review): no output file is given, so this writes "latex_table.tex", the same
# file the DDPM AUCROC table call wrote, and overwrites it.
merge_and_create_latex_table(dte_mean_df, dte_std_df, "f1_score", caption="F1 for DTEC", label="tab:dte_aucroc_synth")

In [None]:
# Function to extract the training method
def extract_training_method(experiment_name):
    """Pull the training-method part out of ``{model_name}_{training_method}_s{seed}``.

    Args:
        experiment_name: Experiment identifier; the training method may itself
            contain underscores.

    Returns:
        The text between the first underscore and the ``_s<digits>`` seed token,
        or ``None`` when the name does not follow that layout.
    """
    found = re.match(r'^[^_]+_([^_]+(?:_[^_]+)*)_s\d+', experiment_name)
    return found.group(1) if found else None

# Derive the training method from the experiment name for both aggregate tables,
# then display the means.
for stats_df in (mean_df, std_df):
    stats_df['training_method'] = stats_df['experiment'].apply(extract_training_method)
mean_df