### **Cell 1: Imports**

This cell contains all the necessary library imports for the project.



In [1]:
import os
import pandas as pd
import re
import csv
import json
from CriticalDifference import draw_cd_diagram
from IPython.display import Image
import warnings
warnings.filterwarnings("ignore")



### **Cell 2: Configuration of Result Files**

This cell defines a dictionary mapping algorithm names to their corresponding result CSV files. This centralized configuration makes it easy to manage file paths.



In [None]:
# Every result file lives under this directory, one sub-folder per algorithm.
_ALGORITHMS_DIR = r'..\..\results\experiments\algorithms'

# Categorical algorithms: <algorithm>\<ALGORITHM>.csv
_CATEGORICAL_ALGORITHMS = ('AVF', 'CBRW', 'CompreX', 'FPOF', 'POP', 'SCAN', 'SDRW', 'Zero++')

# Numerical algorithms are run once per encoding: <algorithm>\<folder>\<FOLDER>.csv
_NUMERICAL_ALGORITHMS = ('LOF', 'KNN', 'iForest', 'FastABOD', 'DeepSVDD', 'McCatch')

# (suffix used in the dictionary key, folder name on disk)
_ENCODINGS = (
    ('ca', 'ca'),
    ('idf', 'idf'),
    ('onehot', 'one_hot'),
    ('pivot', 'pivot'),
    ('nocat', 'nocat'),
)

# CATEGORICAL ALGORITHMS
results = {
    name: f'{_ALGORITHMS_DIR}\\{name}\\{name.upper()}.csv'
    for name in _CATEGORICAL_ALGORITHMS
}

# NUMERICAL ALGORITHMS WITH DIFFERENT ENCODINGS (grouped by encoding, then by algorithm)
results.update({
    f'{algorithm}_{suffix}': f'{_ALGORITHMS_DIR}\\{algorithm}\\{folder}\\{folder.upper()}.csv'
    for suffix, folder in _ENCODINGS
    for algorithm in _NUMERICAL_ALGORITHMS
})



### **Cell 3: Load and Prepare Dataset Summary**

This cell loads a summary of the datasets used in the experiments and prepares a sorted list of all unique dataset files.



In [None]:
# Load the dataset summary and order it by categorical share, then binary share,
# so the displayed table reads from "least categorical" to "most categorical".
df_datasets_summary = (
    pd.read_csv(r'..\resume_datasets.csv', sep=';')
    .sort_values(by=['%_categorics', '%_binaries'])
)

# Alphabetical list of the distinct dataset file names (independent of the row order above).
all_datasets = sorted(df_datasets_summary['file'].unique().tolist())

df_datasets_summary



### **Cell 4: Verify Processed Files**

This cell checks which datasets have been processed by each algorithm, creating a summary DataFrame to show the processing status with checkmarks (✔) or crosses (❌).



In [None]:
# Build one status column per algorithm: a mark for every dataset in `all_datasets`
pd.set_option('display.max_columns', 100)
column_names = ['Datasets']
processed_list = [all_datasets]

for name, path in results.items():
    column_names.append(name)
    try:
        df_result = pd.read_csv(path, sep=';')
    except FileNotFoundError:
        # No result file at all: nothing was processed for this algorithm
        status_list = ['\u274C'] * len(all_datasets)
    else:
        # Strip version suffixes and a doubled extension so names match the summary file
        processed_datasets = sorted(
            df_result['dataset']
            .apply(lambda x: re.sub(r'_v[0-9]+', '', x).replace('.csv.csv', '.csv'))
            .unique()
            .tolist()
        )
        status_list = [
            '\u2714' if dataset_name in processed_datasets else '\u274C'  # Checkmark / Cross
            for dataset_name in all_datasets
        ]

    processed_list.append(status_list)

# Assemble the table: first column is the dataset list, then one column per algorithm
processed_dict = dict(zip(column_names, processed_list))
df_status = pd.DataFrame(processed_dict)
df_status



### **Cell 5: Define Evaluation Metrics**

This cell lists the performance metrics that will be used for comparison throughout the notebook.



In [None]:
# Metric columns compared throughout the notebook. The order matters: the per-metric
# result lists built later are indexed by position in this list.
evaluation_metrics = ['auc', 'adj_r_precision', 'adj_average_precision', 'adj_max_f1']



### **Cell 6: Helper Functions for Data Aggregation**

These functions are used to extract the best-performing row and calculate the average value for a given metric from a DataFrame.



In [None]:
def get_row_with_highest_value(df, column):
    """
    Select the row of `df` whose `column` entry is the largest.

    The row is looked up by the index label that `idxmax` reports, so the
    first maximal row wins when several rows share the maximum.
    """
    return df.loc[df[column].idxmax()]

def get_average_algorithm_value(df, column):
    """Return the arithmetic mean of `df[column]`."""
    return df[column].mean()



### **Cell 7: Result Processing and Analysis Functions**

This block contains core functions for processing the experiment results, including averaging dataset versions, handling missing data, and preparing data for ranking.



In [None]:
def average_dataset_versions(df, metrics=None):
    """
    Calculates the average performance across different versions of the same dataset.

    Rows are grouped by dataset name (with a version suffix such as ``_v01`` removed)
    and by parameter setting; each metric is averaged within a group.

    Args:
        df: Result frame with at least the columns 'dataset', 'parameter',
            'algorithm' and every metric column. It is NOT modified.
        metrics: Metric columns to average. Defaults to the notebook-level
            `evaluation_metrics` list.

    Returns:
        A new DataFrame with columns ['dataset', 'parameter', 'algorithm'] + metrics,
        one row per (dataset, parameter) pair. Datasets appear in order of first
        appearance, and parameters in order of first appearance within each dataset.
        The 'algorithm' value is taken from the first row of `df`.
    """
    if metrics is None:
        metrics = evaluation_metrics
    metrics = list(metrics)
    output_columns = ['dataset', 'parameter', 'algorithm'] + metrics

    if df.empty:
        return pd.DataFrame(columns=output_columns)

    # Work on a copy so the caller's frame keeps its original names and NaNs.
    df_work = df.copy()

    # Only the key columns get '' for missing values (so rows without a parameter
    # still form a group). Metric columns keep their NaNs: filling them with ''
    # would turn them into object columns and break the numeric mean below.
    for key_column in ('dataset', 'parameter', 'algorithm'):
        df_work[key_column] = df_work[key_column].fillna('')

    # Standardize dataset name by removing version suffixes (e.g., _v01)
    df_work['dataset'] = df_work['dataset'].str.replace(r'_v[0-9]{2}', '', regex=True)

    algorithm_name = df_work['algorithm'].iloc[0]

    rows = []
    # sort=False keeps groups in order of first appearance.
    for dataset_name, df_dataset in df_work.groupby('dataset', sort=False):
        mean_by_parameter = df_dataset.groupby('parameter', sort=False)[metrics].mean()
        for param, metric_means in mean_by_parameter.iterrows():
            rows.append([dataset_name, param, algorithm_name] + metric_means.tolist())

    return pd.DataFrame(rows, columns=output_columns)

def add_missing_datasets(df, metric, diagram_type):
    """
    Ensures that every algorithm has a result for every dataset present in `df`.

    For each (dataset, algorithm) pair with no row, a filler row is appended whose
    score is a fixed low default for the metric (0.5 for 'auc', 0 for the others).

    The frame may use either naming scheme: 'dataset'/'algorithm'/'value' or
    'dataset_name'/'classifier_name'/'accuracy'. The default score is written to
    whichever score column the frame actually has, so filler rows never end up
    with a NaN score and a stray extra column.

    Args:
        df: Long-format results (one row per dataset/algorithm pair).
        metric: Metric name; stored in the 'metric' column of filler rows and
            used to choose the default score.
        diagram_type: Label stored in the 'diagram' column of filler rows.

    Returns:
        `df` itself when nothing is missing; otherwise a new frame with the filler
        rows appended and the index reset.

    Raises:
        KeyError: if a row is missing and `metric` has no default score.
    """
    default_value = {'auc': 0.5, 'adj_r_precision': 0, 'adj_average_precision': 0, 'adj_max_f1': 0}

    # Determine column names, which can vary
    dataset_col = 'dataset_name' if 'dataset_name' in df.columns else 'dataset'
    algo_col = 'classifier_name' if 'classifier_name' in df.columns else 'algorithm'
    value_col = 'accuracy' if 'accuracy' in df.columns else 'value'

    # One pass over the frame; membership tests below are then O(1) per pair.
    existing_pairs = set(zip(df[dataset_col], df[algo_col]))

    rows_to_add = [
        {dataset_col: d, algo_col: a, value_col: default_value[metric], 'metric': metric, 'diagram': diagram_type}
        for d in df[dataset_col].unique()
        for a in df[algo_col].unique()
        if (d, a) not in existing_pairs
    ]

    if rows_to_add:
        df = pd.concat([df, pd.DataFrame(rows_to_add)], ignore_index=True)

    return df

def filter_datasets_by_list(df, dataset_list):
    """
    Filters a DataFrame to include only datasets from a specified list.

    Dataset names are compared after removing a version suffix such as ``_v01``;
    the returned frame carries the normalized names.

    Args:
        df: Frame with a 'dataset_name' column or, failing that, a 'dataset' column.
            It is NOT modified.
        dataset_list: Dataset names to keep.

    Returns:
        An independent copy of the matching rows (original index labels kept), so
        callers can rename or modify it in place without touching `df`.
    """
    dataset_col = 'dataset_name' if 'dataset_name' in df.columns else 'dataset'
    normalized_names = df[dataset_col].str.replace(r'_v[0-9]{2}', '', regex=True)
    # assign() builds a new frame, leaving the caller's column as it was.
    df_normalized = df.assign(**{dataset_col: normalized_names})
    return df_normalized[normalized_names.isin(dataset_list)].copy()



### **Cell 8: Ranking and File Generation Functions**

These functions are responsible for calculating ranks based on performance and saving them to CSV files for further analysis (e.g., with Wilcoxon signed-rank test).



In [None]:
def calculate_ranks(values: list):
    """
    Rank a list of values in descending order: the largest value gets rank 1.

    Tied values share the average of the ranks they span, which is the usual
    convention for rank-based statistical tests.
    Example: [10, 20, 20, 30] -> [4.0, 2.5, 2.5, 1.0]

    Returns a list of ranks aligned position-by-position with `values`.
    """
    ranks = pd.Series(values).rank(method='average', ascending=False)
    return ranks.tolist()

def create_wilcoxon_rank_file(ranking_series, file_name):
    """
    Write a per-algorithm ranking table to `file_name` as CSV (no index column).

    `ranking_series` is expected to be indexed by algorithm with the average rank
    as its values. The output has three columns: 'algorithm', 'average_rank' and
    'rank_of_rank', the latter being `calculate_ranks` applied to the average ranks.
    """
    rank_table = ranking_series.reset_index()
    rank_table.columns = ['algorithm', 'average_rank']

    # Re-ranking the average ranks is kept from the original logic; it is not a
    # standard Wilcoxon input but may be used by downstream plots.
    average_rank_values = rank_table['average_rank'].tolist()
    rank_table['rank_of_rank'] = calculate_ranks(average_rank_values)

    rank_table.to_csv(file_name, index=False)



### **Cell 9: Calculate "Best" and "Average" Performance DataFrames**

This block processes all result files to create two main DataFrames:
1.  `df_bests`: Contains the best performance for each algorithm on each dataset (optimized parameters).
2.  `df_averages`: Contains the average performance for each algorithm on each dataset across all its parameters.



In [None]:
# Read and version-average every available result file exactly once. The averaged
# frame does not depend on the metric, so it is shared by all metrics and by both
# the "best" and the "average" summaries below.
averaged_results = {}
for algo_name, file_path in results.items():
    if os.path.isfile(file_path):
        df = pd.read_csv(file_path, sep=';').drop_duplicates()
        averaged_results[algo_name] = average_dataset_versions(df)

summary_columns = ['dataset', 'algorithm', 'value', 'metric', 'diagram']

# One DataFrame per metric, in the same order as `evaluation_metrics`.
df_bests = []
df_averages = []

for metric in evaluation_metrics:
    best_rows = []
    average_rows = []
    for algo_name, df_avg_versions in averaged_results.items():
        for dataset_name in df_avg_versions['dataset'].unique():
            df_dataset = df_avg_versions[df_avg_versions['dataset'] == dataset_name]

            # BEST: the parameter set with the highest score on this dataset
            best_row = get_row_with_highest_value(df_dataset, metric)
            best_rows.append([best_row['dataset'], algo_name, best_row[metric], metric, 'best'])

            # AVERAGE: the mean score over all parameter sets on this dataset
            mean_value = get_average_algorithm_value(df_dataset, metric)
            average_rows.append([dataset_name, algo_name, mean_value, metric, 'average'])

    df_bests.append(filter_datasets_by_list(pd.DataFrame(best_rows, columns=summary_columns), all_datasets))
    df_averages.append(filter_datasets_by_list(pd.DataFrame(average_rows, columns=summary_columns), all_datasets))



### **Cell 10: Generate Overall Critical Difference Diagrams**

This cell generates and saves the main Critical Difference (CD) diagrams, comparing all algorithms across all datasets for both "best" and "average" performance scenarios.



In [None]:
# Create output directories if they don't exist (exist_ok makes a prior check unnecessary)
for output_dir in (
    r'..\..\results\experiments\plot',
    r'..\..\results\experiments\tables\best\ranking',
    r'..\..\results\experiments\tables\average\ranking',
):
    os.makedirs(output_dir, exist_ok=True)

# Column names passed to draw_cd_diagram via df_perf
cd_column_names = {'algorithm': 'classifier_name', 'dataset': 'dataset_name', 'value': 'accuracy'}

# Process BEST performance results, then AVERAGE performance results
for diagram_type, frames in (('best', df_bests), ('average', df_averages)):
    for i, metric in enumerate(evaluation_metrics):
        # Fill in missing (dataset, algorithm) pairs BEFORE renaming: the filler rows
        # are written under 'value', so renaming afterwards puts their default score
        # in 'accuracy' together with the real scores. On a re-run the frame is
        # already complete and renamed, so both steps are no-ops.
        df_ = add_missing_datasets(frames[i], metric, diagram_type)
        df_ = df_.rename(columns=cd_column_names)
        frames[i] = df_

        output_path = f'..\\..\\results\\experiments\\plot\\{diagram_type}-{metric}'
        try:
            average_ranks = draw_cd_diagram(
                df_perf=df_.drop(columns=['metric', 'diagram']),
                title=f'{diagram_type.capitalize()} Overall - Metric: {metric.replace("adj_", "")}',
                labels=True,
                output=output_path,
            )
            create_wilcoxon_rank_file(
                average_ranks,
                f'..\\..\\results\\experiments\\tables\\{diagram_type}\\ranking\\{diagram_type}-{metric}.csv',
            )
        except Exception as e:
            # Best effort: report the failure and continue with the remaining diagrams
            print(f"Error generating '{diagram_type}' diagram for {metric}: {e}")



### **Cell 11: Dataset Grouping Functions**

This section defines a series of functions to categorize datasets based on their properties, such as data type, number of features, number of instances, and domain context. These groupings are used to perform more detailed, comparative analyses.



In [None]:
def get_datasets_by_type(summary_df):
    """
    Return two lists of dataset file names: those whose 'type' is 'mix' and
    those whose 'type' is 'cat', in that order.
    """
    dataset_type = summary_df['type']
    files = summary_df['file']
    return files[dataset_type == 'mix'].tolist(), files[dataset_type == 'cat'].tolist()

def get_datasets_by_num_features(summary_df, low, medium):
    """
    Split dataset file names into three lists by the 'features' column:
    features <= low, low < features <= medium, and features > medium.
    """
    n_features = summary_df['features']
    files = summary_df['file']

    is_low = n_features <= low
    is_high = n_features > medium
    is_medium = (n_features > low) & (n_features <= medium)

    return files[is_low].tolist(), files[is_medium].tolist(), files[is_high].tolist()

def get_datasets_by_num_instances(summary_df, low, medium):
    """
    Split dataset file names into three lists by the 'instances' column:
    instances <= low, low < instances <= medium, and instances > medium.
    """
    n_instances = summary_df['instances']
    files = summary_df['file']

    is_low = n_instances <= low
    is_high = n_instances > medium
    is_medium = (n_instances > low) & (n_instances <= medium)

    return files[is_low].tolist(), files[is_medium].tolist(), files[is_high].tolist()

def get_datasets_by_percent_categorical(summary_df, low, medium):
    """
    Group datasets by the '%_categorics' column.

    Only rows with 'attr_categorics' > 0 are considered. Their file names are
    split into three lists: share <= low, low < share <= medium, share > medium.
    """
    with_categorical = summary_df[summary_df['attr_categorics'] > 0]
    share = with_categorical['%_categorics']
    files = with_categorical['file']

    is_low = share <= low
    is_high = share > medium
    is_medium = (share > low) & (share <= medium)

    return files[is_low].tolist(), files[is_medium].tolist(), files[is_high].tolist()

def get_datasets_by_binary_feature_threshold(summary_df, threshold):
    """
    Split fully categorical datasets by their share of binary features.

    Only rows with '%_categorics' >= 100 are considered. Returns two lists of
    file names: '%_binaries' <= threshold, and '%_binaries' > threshold.
    """
    fully_categorical = summary_df[summary_df['%_categorics'] >= 100]
    files = fully_categorical['file']
    above_threshold = fully_categorical['%_binaries'] > threshold
    at_or_below_threshold = fully_categorical['%_binaries'] <= threshold

    return files[at_or_below_threshold].tolist(), files[above_threshold].tolist()

def get_datasets_by_context(summary_df):
    """
    Return five hard-coded lists of dataset file names, one per context group.

    The lists are returned as a tuple in a fixed order; the local names below
    describe each group. `summary_df` is accepted but not used: the grouping
    is fixed here rather than derived from the summary.
    """
    medicine = [
        'thyroid_disease_variant1ori.csv', 'sick_sick_35_variant1ori.csv', 'cmc-nominal.csv', 'ecoli.csv',
        'heart.csv', 'hepatitis.csv', 'lymphography.csv', 'nursery.csv',
    ]
    finance = [
        'bank-additional-ful-nominal_processed.csv', 'creditA_plus_42_variant1ori.csv', 'australian.csv',
        'crx.csv', 'german.csv', 'Reuters-corn-100.csv',
    ]
    invasion = [
        'kddcup99-corrected-u2rvsnormal-nominal-cleaned.csv', 'KDDTrain20R2LFS.csv',
        'KDDTrain20ProbeFS.csv', 'KDDTrain20FS.csv',
    ]
    science = [
        'solar-flare_FvsAll-cleaned_processed.csv', 'bands_band_16_variant1ori.csv', 'anneal.csv',
        'covertype_nominal_4vs123567.csv', 'AID362red_train_allpossiblenominal.csv',
        'list_attr_celeba_baldvsnonbald.csv', 'w7a-libsvm-nonsparse.csv',
    ]
    synthetic = [
        'scenario-1-1-rcat-0-icat.csv', 'scenario-1-2-rcat-0-icat.csv', 'scenario-1-3-rcat-0-icat.csv',
        'scenario-1-4-rcat-0-icat.csv', 'scenario-1-5-rcat-0-icat.csv', 'scenario-1-6-rcat-0-icat.csv',
        'scenario-1-7-rcat-0-icat.csv', 'scenario-1-8-rcat-0-icat.csv', 'scenario-1-9-rcat-0-icat.csv',
        'scenario-1-9-rcat-1-icat.csv', 'scenario-1-9-rcat-2-icat.csv', 'scenario-1-9-rcat-3-icat.csv',
        'scenario-1-9-rcat-4-icat.csv', 'scenario-1-9-rcat-5-icat.csv', 'scenario-1-9-rcat-6-icat.csv',
        'scenario-1-9-rcat-7-icat.csv', 'scenario-1-9-rcat-8-icat.csv', 'scenario-1-9-rcat-9-icat.csv',
    ]
    return medicine, finance, invasion, science, synthetic
        



### **Cell 12: Generate Grouped Critical Difference Diagrams**

This final, comprehensive cell uses the grouping functions to generate a wide array of CD diagrams. It creates comparisons by data type, instance count, feature count, percentage of categorical features, percentage of binary features, and dataset context. This allows for a nuanced understanding of how algorithms perform under various conditions.



In [None]:

def generate_grouped_diagrams(df_results, metric, dataset_group, group_name, title_prefix, output_folder):
    """
    Draw and save one CD diagram restricted to the datasets in `dataset_group`.

    A copy of `df_results` is filtered to the group. When fewer than two distinct
    datasets remain, a message is printed and nothing is drawn. Otherwise the
    diagram is written under the plot folder `output_folder`, with a file name
    built from `title_prefix`, `group_name` and `metric`. Errors raised while
    drawing are printed rather than propagated.
    """
    df_filtered = filter_datasets_by_list(df_results.copy(), dataset_group)

    has_enough_data = (not df_filtered.empty) and df_filtered['dataset_name'].nunique() >= 2
    if not has_enough_data:
        print(f"Skipping '{group_name}' for metric '{metric}': Not enough data.")
        return

    title = f'{title_prefix.replace("-", " ").title()} {metric} ({group_name})'
    output_path = f'..\\..\\results\\experiments\\plot\\{output_folder}\\{title_prefix}-{group_name}-{metric}'
    df_perf = df_filtered.drop(columns=['metric', 'diagram'])

    try:
        draw_cd_diagram(df_perf=df_perf, title=title, labels=True, output=output_path)
    except Exception as e:
        print(f"Error generating diagram for {title}: {e}")

# Create the per-analysis plot folders. makedirs(exist_ok=True) is already a no-op for
# existing folders, so no separate existence check is needed (the previous check looked
# at a different path than the one being created).
for folder in ['feature', 'instance', 'type', 'percent-categorical', 'binary-feature', 'context']:
    os.makedirs(f'..\\..\\results\\experiments\\plot\\{folder}', exist_ok=True)


def _run_group_analysis(named_groups, output_folder):
    """
    Generate the 'best' and 'average' CD diagrams for every metric and every group.

    `named_groups` is a list of (group_name, dataset_list) pairs. For each metric,
    all 'best' diagrams are generated first (in group order), then all 'average' ones.
    """
    for i, metric in enumerate(evaluation_metrics):
        for title_prefix, frames in (('best', df_bests), ('average', df_averages)):
            for group_name, dataset_group in named_groups:
                generate_grouped_diagrams(frames[i], metric, dataset_group, group_name, title_prefix, output_folder)


# --- Analysis by Data Type ---
mixed_datasets, cat_datasets = get_datasets_by_type(df_datasets_summary)
_run_group_analysis([('Mix', mixed_datasets), ('Cat', cat_datasets)], 'type')

# --- Analysis by Number of Instances ---
low_inst, med_inst, high_inst = get_datasets_by_num_instances(df_datasets_summary, 10000, 20000)
_run_group_analysis(
    [('Low-Instances', low_inst), ('Med-Instances', med_inst), ('High-Instances', high_inst)],
    'instance',
)

# --- Analysis by Number of Features ---
low_feat, med_feat, high_feat = get_datasets_by_num_features(df_datasets_summary, 15, 25)
_run_group_analysis(
    [('Low-Features', low_feat), ('Med-Features', med_feat), ('High-Features', high_feat)],
    'feature',
)

# --- Analysis by Percentage of Categorical Features (for mixed datasets) ---
low_cat, med_cat, high_cat = get_datasets_by_percent_categorical(df_datasets_summary, 34, 67)
_run_group_analysis(
    [('Low-Categorical', low_cat), ('Med-Categorical', med_cat), ('High-Categorical', high_cat)],
    'percent-categorical',
)

# --- Analysis by Percentage of Binary Features (for 100% categorical datasets) ---
low_bin, high_bin = get_datasets_by_binary_feature_threshold(df_datasets_summary, 50)
_run_group_analysis([('Low-Binary', low_bin), ('High-Binary', high_bin)], 'binary-feature')

# --- Analysis by Context of Dataset ---
medicine_context, finance_context, invasion_context, sciency_context, synthetic_context = get_datasets_by_context(df_datasets_summary)
_run_group_analysis(
    [
        ('Medicine', medicine_context),
        ('Finance', finance_context),
        ('Invasion', invasion_context),
        ('Sciency', sciency_context),
        ('Synthetic', synthetic_context),
    ],
    'context',
)

print("\n--- All processing and diagram generation complete. ---")