In [1]:
import pandas as pd

# Load the CSV file to check its contents
data_path = 'Results - Sheet1.csv'
data = pd.read_csv(data_path)

# Display the first few rows of the dataframe to understand its structure
data.head(), data.columns


(   Dataset                Model  Num_features Shap_Lime_Adjusted  Accuracy  \
 0  Abalone  Logistic Regression             8                NaN  0.583732   
 1  Abalone  Logistic Regression             7               Lime  0.574163   
 2  Abalone  Logistic Regression             7               Shap  0.578947   
 3      pc4        Random Forest            37                NaN  0.900685   
 4      pc4        Random Forest            34               Lime  0.900685   
 
    F1-macro  F1-micro  F1-weighted   ROC AUC  
 0  0.572013  0.583732     0.570221  0.771102  
 1  0.560012  0.574163     0.558557  0.769296  
 2  0.564986  0.578947     0.563447  0.773467  
 3  0.718100  0.900685     0.879707  0.957810  
 4  0.708953  0.900685     0.877224  0.954476  ,
 Index(['Dataset', 'Model', 'Num_features', 'Shap_Lime_Adjusted', 'Accuracy',
        'F1-macro', 'F1-micro', 'F1-weighted', 'ROC AUC'],
       dtype='object'))

In [3]:
from scipy.stats import wilcoxon
import numpy as np

In [4]:
# Function to prepare and perform Wilcoxon signed-rank test with adjustments for zero differences
def perform_adjusted_tests(data, metric):
    results = []
    # Filter data for each dataset and model combination
    grouped = data.groupby(['Dataset', 'Model'])

    for (dataset, model), group in grouped:
        none_scores = group[group['Shap_Lime_Adjusted'] == 'None'][metric]
        lime_scores = group[group['Shap_Lime_Adjusted'] == 'Lime'][metric]
        shap_scores = group[group['Shap_Lime_Adjusted'] == 'Shap'][metric]

        # Ensure we have data to compare
        if not lime_scores.empty and not none_scores.empty:
            if np.all(none_scores.values == lime_scores.values):
                lime_stat, lime_p = np.nan, 1  # No change in scores
            else:
                lime_stat, lime_p = wilcoxon(none_scores.values, lime_scores.values, alternative='two-sided')
        else:
            lime_stat, lime_p = np.nan, np.nan

        if not shap_scores.empty and not none_scores.empty:
            if np.all(none_scores.values == shap_scores.values):
                shap_stat, shap_p = np.nan, 1  # No change in scores
            else:
                shap_stat, shap_p = wilcoxon(none_scores.values, shap_scores.values, alternative='two-sided')
        else:
            shap_stat, shap_p = np.nan, np.nan

        # Store results
        results.append({
            'Dataset': dataset,
            'Model': model,
            'Metric': metric,
            'LIME Stat': lime_stat,
            'LIME p-value': lime_p,
            'SHAP Stat': shap_stat,
            'SHAP p-value': shap_p
        })

    return pd.DataFrame(results)

# Perform adjusted tests for Accuracy, F1-macro, F1-weighted, and ROC AUC
adjusted_results_accuracy = perform_adjusted_tests(data, 'Accuracy')
adjusted_results_f1_macro = perform_adjusted_tests(data, 'F1-macro')
adjusted_results_f1_weighted = perform_adjusted_tests(data, 'F1-weighted')
adjusted_results_roc_auc = perform_adjusted_tests(data, 'ROC AUC')

# Combine adjusted results for easier viewing
adjusted_final_results = pd.concat([adjusted_results_accuracy, adjusted_results_f1_macro, adjusted_results_f1_weighted, adjusted_results_roc_auc])
adjusted_final_results


Unnamed: 0,Dataset,Model,Metric,LIME Stat,LIME p-value,SHAP Stat,SHAP p-value
0,Abalone,Logistic Regression,Accuracy,,,,
1,ba,K-Nearest Neighbors,Accuracy,,,,
2,btsc,Decision Tree,Accuracy,,,,
3,credit,Logistic Regression,Accuracy,,,,
4,iris,K-Nearest Neighbors,Accuracy,,,,
5,madelon,XGBoost,Accuracy,,,,
6,pc4,Random Forest,Accuracy,,,,
7,scene,AdaBoost,Accuracy,,,,
8,wbdc,Support Vector Machine,Accuracy,,,,
0,Abalone,Logistic Regression,F1-macro,,,,
