# Predictive performance evaluation

In [None]:
import pandas as pd
import numpy as np
import os

import metrics_normalized
import radar_chart
import fairness_dashboard

import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.metrics import (
    accuracy_score, 
    balanced_accuracy_score,
    recall_score, 
    precision_score, 
    f1_score
)

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Sensitive attribute under evaluation; it selects which fairness-aware
# prediction files are loaded below ("gender" or "nationality").
#sensitive_attribute = "nationality"
sensitive_attribute = "gender"

input_path = "xxx" # add customized local path to input folder
output_path = "xxx" # add customized local path to output folder


def _input_file(filename):
    """Return the path of `filename` inside the `input_path` folder."""
    # os.path.join inserts the separator when `input_path` has no trailing
    # slash; plain string concatenation would silently build a wrong file name.
    return os.path.join(input_path, filename)


# NOTE: loading a pickle can execute arbitrary code; only point `input_path`
# at files you produced yourself.
y_true = pd.read_pickle(_input_file("y_test.pkl"))

# The baseline predictions are used exactly as loaded; for the other models
# entry `[0]` of the loaded object is converted to a NumPy array
# (presumably column 0 of a DataFrame -- TODO confirm).
baseline = pd.read_pickle(_input_file("y_pred_te_baseline.pkl"))
ftu = pd.read_pickle(_input_file("y_pred_lgbm_nosen.pkl"))[0].to_numpy()
#to = pd.read_pickle(_input_file("ypred_to_nat_tprr_recall.pkl"))[0].to_numpy()

if sensitive_attribute == 'gender':
    fgbm = pd.read_pickle(_input_file("y_pred_fgbm_gender.pkl"))[0].to_numpy()
    hpt = pd.read_pickle(_input_file("y_pred_fgbm_gen_hpt_v2.pkl"))[0].to_numpy()
    #eodds = pd.read_csv(_input_file("y_pred_eodds_gen_FU.csv")).to_numpy().flatten()
    #roc = pd.read_csv(_input_file("y_pred_roc_gen_FU.csv"))["0"].to_numpy().flatten()

else:  # nationality
    fgbm = pd.read_pickle(_input_file("y_pred_fgbm_nat.pkl"))[0].to_numpy()
    hpt = pd.read_pickle(_input_file("y_pred_nat_FGBM_HPT_model.pkl"))[0].to_numpy()
    eodds = pd.read_csv(_input_file("y_pred_eodds_nat_AU.csv")).to_numpy().flatten()
    roc = pd.read_csv(_input_file("y_pred_roc_nat_AU.csv"))["0"].to_numpy().flatten()


# Prediction sets to compare, listed in the same order as `model_names`.
#all_pred = [baseline,ftu,fgbm,hpt,to,roc,eodds]
all_pred = [baseline, ftu, fgbm, hpt]


#model_names = ["Baseline", "FTU", "FGBM", "FGBM with HPT", "ThresholdOpt", "ROC", "EOdds"]
model_names = ["Baseline", "FTU", "FGBM", "FGBM with HPT"]

# Binarize every prediction set: values strictly above 0.5 become 1, all
# others become 0.
y_pred_list = [(pred > 0.5).astype(int) for pred in all_pred]

In [None]:
def evaluate_models(y_true, y_pred_list, model_names=None):
    """
    Score several sets of predictions against one shared ground truth.
    
    Parameters:
    -----------
    y_true : array-like
        Ground truth (correct) target values.
    y_pred_list : list of array-like
        One set of predictions per model.
    model_names : list of str, optional
        Display name for each model. If None, the names Model_1, Model_2, ...
        are generated.
        
    Returns:
    --------
    DataFrame with one row per model and the columns 'Model', 'Accuracy',
    'Balanced Accuracy', 'Recall', 'Precision' and 'F1 Score'.
    
    Raises:
    -------
    ValueError
        If the number of model names differs from the number of prediction sets.
    """
    n_models = len(y_pred_list)

    if model_names is None:
        model_names = [f"Model_{idx}" for idx in range(1, n_models + 1)]

    if len(model_names) != n_models:
        raise ValueError("Number of model names must match number of prediction sets")

    # Column-oriented accumulator; the tuple fixes the column order of the result
    column_order = ('Model', 'Accuracy', 'Balanced Accuracy', 'Recall', 'Precision', 'F1 Score')
    table = {column: [] for column in column_order}

    for label, predictions in zip(model_names, y_pred_list):
        # All metrics of one model, computed against the same ground truth
        scores = {
            'Model': label,
            'Accuracy': accuracy_score(y_true, predictions),
            'Balanced Accuracy': balanced_accuracy_score(y_true, predictions),
            'Recall': recall_score(y_true, predictions),
            'Precision': precision_score(y_true, predictions),
            'F1 Score': f1_score(y_true, predictions),
        }
        for column in column_order:
            table[column].append(scores[column])

    return pd.DataFrame(table)

In [None]:
# Score every model's binarized predictions against the ground truth
results_df = evaluate_models(y_true, y_pred_list, model_names=model_names)

# Show the resulting metrics table
print(results_df)

# results_df.to_csv('model_comparison.csv', index=False)

In [None]:
# Apply the plotting style and default palette used by the figures below
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("Set2")

# Alias of the metrics table under the name handed to the plotting call
df_results = results_df

def plot_facet_grid(df, save_path=None):
    """
    Draw one bar-chart panel per metric, colored by model, then save and show it.
    
    Parameters:
    -----------
    df : DataFrame
        Must contain a 'Model' column plus the columns 'Accuracy',
        'Balanced Accuracy', 'Recall', 'Precision' and 'F1 Score'.
    save_path : str, optional
        Where the PNG is written. If None, the file "Perf_metrics_gen.png"
        inside the module-level `output_path` folder is used.
        
    Returns:
    --------
    None. The figure is written to disk and displayed with plt.show().
    """
    metric_columns = ['Accuracy', 'Balanced Accuracy', 'Recall', 'Precision', 'F1 Score']

    if save_path is None:
        # os.path.join adds the separator when `output_path` has no trailing slash
        save_path = os.path.join(output_path, "Perf_metrics_gen.png")

    # Long format: one row per (model, metric) pair
    df_melted = pd.melt(df, id_vars=['Model'],
                         value_vars=metric_columns,
                         var_name='Metric', value_name='Score')

    # Fixed model -> color mapping so a model has the same color in every panel
    unique_models = df['Model'].unique()
    model_colors = dict(zip(unique_models, sns.color_palette("colorblind", len(unique_models))))

    # Single row: wrap only after all metric panels
    g = sns.FacetGrid(df_melted, col='Metric', col_wrap=len(metric_columns),
                      height=4, aspect=0.7, sharey=True)

    # Fraction of the default bar width that is kept
    bar_width_scale = 0.7

    def plot_colored_bars(data, **kwargs):
        """Draw the bars of one panel, narrow them and label them with their values."""
        ax = sns.barplot(data=data, x='Model', y='Score', palette=model_colors, **kwargs)

        for patch in ax.patches:
            full_width = patch.get_width()
            patch.set_width(full_width * bar_width_scale)
            # Shift by half of the removed width so the narrowed bar stays
            # centered on its original position.
            patch.set_x(patch.get_x() + full_width * (1 - bar_width_scale) / 2)

        # Models are identified by the legend, so the x tick labels are hidden
        ax.set_xticklabels([])
        # Add value labels on top of bars
        for container in ax.containers:
            ax.bar_label(container, fmt='%.2f', fontsize=7)

    g.map_dataframe(plot_colored_bars)

    g.set_titles(col_template="{col_name}", fontsize=12 , pad=10)
    g.set_axis_labels("", "Score")

    g.set(ylim=(0, 1))

    # Custom legend placed to the right of the panels, one entry per model
    handles = [plt.Rectangle((0,0),1,1, color=color) for color in model_colors.values()]
    labels = list(model_colors.keys())
    g.fig.legend(handles, labels, title="Model", bbox_to_anchor=(1.02, 0.5), loc='center left')
    # NOTE(review): tight_layout() below runs after this call and may override
    # these margins -- confirm which of the two is intended.
    plt.subplots_adjust(top=0.85, right=0.85)

    #g.fig.suptitle('Performance Metrics by Model Type', fontsize=16)

    plt.tight_layout()
    # bbox_inches='tight' is passed so the saved bounding box is computed from
    # the drawn content rather than the nominal figure size
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

# Render, save and show the per-metric comparison figure for the evaluated models
plot_facet_grid(df_results)