# Imports

In [16]:
# Pull in the shared project namespace. Names used below without a visible import
# (pd, pickle, train_test_split, target) presumably come from this notebook — TODO confirm.
%run Imports.ipynb
name = 'Kred' # Dataset selector ('Kred' or 'Mone'); inserted into the 2_FS pickle paths in the Data section
# NOTE(review): spearmanr is not referenced in the cells visible here; the correlation below uses DataFrame.corr.
from scipy.stats import spearmanr
from sklearn.metrics import accuracy_score, roc_auc_score


# Data

In [2]:
# Stored EBM outputs (test-set predictions, probabilities and feature importances).
# pickle can execute code while loading, so only point this at trusted project artefacts.
with open("../../pickle/Monedo_5/wo_optuna/ebm_best_results.pkl", "rb") as ebm_file:
    ebm_best_res = pickle.load(ebm_file)

In [3]:
# Stored outputs of the model that is labelled 'SKlearn_NN' in the reporting section.
# pickle can execute code while loading, so only point this at trusted project artefacts.
with open("../../pickle/Monedo_7/without_optuna/results_M7_T2.pkl", "rb") as m7_file:
    m7_res = pickle.load(m7_file)

In [19]:

# Feature-selection artefacts for the dataset chosen via `name`:
# the subgroup -> feature-list table and the modelling frame itself.
key_featsubgroups = pd.read_pickle(f'../../pickle/2_FS/{name}/key_featsubgroups.pkl')
df = pd.read_pickle(f'../../pickle/2_FS/{name}/2_df_new_.pkl')

In [20]:
# Share of each target class (counts divided by the number of rows), then the frame dimensions.
print(df[target].value_counts().div(df.shape[0]))
print('df_shape: ', df.shape)

arrears
1   0.646
0   0.354
Name: count, dtype: float64
df_shape:  (129457, 418)


# Functions

In [21]:
def split_data_4(df, key_featsubgroups=key_featsubgroups, target=target, test_size=0.2, random_state=42):
    """
    Splits the dataset into training and testing sets while excluding demographic features.

    The demographic feature names are read from the 'demo' row of ``key_featsubgroups``
    and dropped, together with the target column, before the split, so they never
    appear in the returned feature matrices.

    Note: the defaults for ``key_featsubgroups`` and ``target`` are evaluated once, when
    this definition cell is run. Re-assigning those notebook variables later has no effect
    on the defaults until the cell is re-run (or the values are passed explicitly).

    Parameters:
    df (DataFrame): The dataset containing features and target variable.
    key_featsubgroups (DataFrame): Table with a 'subgroup' column and a 'list_features'
        column; the row whose subgroup is 'demo' holds the demographic feature names.
    target (str): The name of the target variable.
    test_size (float, optional): The proportion of the dataset to allocate for testing. Default is 0.2.
    random_state (int, optional): Random seed for reproducibility. Default is 42.

    Returns:
    tuple: X_train, X_test, y_train, y_test (training and testing datasets)

    Raises:
    IndexError: If ``key_featsubgroups`` contains no 'demo' subgroup.
    KeyError: If the target or one of the demographic features is not a column of ``df``
        (raised by ``DataFrame.drop``).

    Side effects:
    Prints the excluded demographic features and the shapes of the four outputs.
    """

    # Extract demographic features; fail with an explicit message instead of the bare
    # "index 0 is out of bounds" that `.values[0]` produces on an empty selection.
    demo_rows = key_featsubgroups.loc[key_featsubgroups['subgroup'] == 'demo', 'list_features']
    if demo_rows.empty:
        raise IndexError(
            "key_featsubgroups has no 'demo' subgroup; cannot determine which "
            "demographic features to exclude"
        )
    demo_feat = demo_rows.values[0]

    # Normalise to a plain list so the concatenation below also works when the
    # features were stored as a tuple/array, or as a single column name.
    if isinstance(demo_feat, str):
        demo_feat = [demo_feat]
    else:
        demo_feat = list(demo_feat)
    print("Demographic Features:", demo_feat)

    # Separate features (X) and target variable (y), excluding demographic features
    X = df.drop(columns=[target] + demo_feat)
    y = df[target]

    # Random split controlled by test_size / random_state. No stratification is applied;
    # adding it would change which rows end up in the test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Print dataset shapes
    print(f"Training Features Shape: {X_train.shape}")
    print(f"Training Labels Shape: {y_train.shape}")
    print(f"Testing Features Shape: {X_test.shape}")
    print(f"Testing Labels Shape: {y_test.shape}")

    return X_train, X_test, y_train, y_test

In [22]:
# Recreate the train/test split with the function defaults (test_size=0.2, random_state=42).
# Of the four outputs, only y_test is used in the reporting cells visible in this notebook.
# NOTE(review): y_test is compared against pickled predictions below, so this split presumably
# has to reproduce the one used when those predictions were generated — confirm.
x_train, x_test, y_train, y_test = split_data_4(df)

Demographic Features: ['clientdata.demo.gender', 'clientdata.demo.age_year', 'clientdata.demo.age_month', 'clientdata.demo.children', 'clientdata.demo.children_singleparent', 'clientdata.demo.maritalstatus_expand_SINGLE', 'clientdata.demo.maritalstatus_expand_MARRIED', 'clientdata.demo.maritalstatus_expand_DIVORCED', 'clientdata.demo.maritalstatus_expand_WIDOWED', 'clientdata.demo.maritalstatus_expand_newvalue', 'clientdata.demo.maritalstatus_woe']
Training Features Shape: (103565, 406)
Training Labels Shape: (103565,)
Testing Features Shape: (25892, 406)
Testing Labels Shape: (25892,)


# Reporting

## Overview

In [14]:
# Inspect which fields were stored for the EBM run.
ebm_best_res.keys()

dict_keys(['y_test_pred', 'y_test_proba', 'feature_importances'])

In [17]:
# Inspect which fields were stored for the second model; the reporting code expects the same keys as for EBM.
m7_res.keys()

dict_keys(['y_test_pred', 'y_test_proba', 'feature_importances'])

In [6]:
# Registry of the stored result dicts, keyed by the label used in every report below.
results = {}
results['EBM'] = ebm_best_res
results['SKlearn_NN'] = m7_res

In [8]:
# List the model labels that will be compared.
for mth in results:
    print(mth)

EBM
SKlearn_NN


## Performance and Feature-Importance Comparison

In [23]:
# --- 1) Build performance DataFrame ---
# Score every model's stored test-set predictions against the y_test labels of the split.
perf_rows = []
for method, res in results.items():
    y_pred, y_proba = res["y_test_pred"], res["y_test_proba"]

    # Accuracy comes from the stored predictions, ROC AUC from the stored probabilities.
    acc, auc = accuracy_score(y_test, y_pred), roc_auc_score(y_test, y_proba)

    perf_rows.append({"method": method, "accuracy": acc, "roc_auc": auc})

# One row per model, indexed by the model label.
perf_df = pd.DataFrame(perf_rows)
perf_df = perf_df.set_index("method")
print("=== Predictive Performance ===", perf_df, sep="\n")

=== Predictive Performance ===
            accuracy  roc_auc
method                       
EBM            0.698    0.721
SKlearn_NN     0.647    0.500


In [24]:
# --- 2) Build feature-importance DataFrame ---
# One column per model, one row per feature position.
# This assumes each feature_importances array is the same length
# (and presumably lists the features in the same order for every model — TODO confirm).
fi_df = pd.DataFrame(
    {method: res["feature_importances"] for method, res in results.items()}
).rename_axis("feature_index")
print("\n=== Feature Importances (first 5 features) ===", fi_df.head(), sep="\n")


=== Feature Importances (first 5 features) ===
                EBM  SKlearn_NN
feature_index                  
0             0.000       0.000
1             0.001       0.000
2             0.001       0.000
3             0.000       0.000
4             0.000       0.000


In [25]:
# --- 3) Pairwise Spearman rank correlations ---
# Model-by-model matrix measuring how similarly the models rank the features by importance.
corr = fi_df.corr(method="spearman")
print("\n=== Spearman Rank-Correlation of Feature Importances ===", corr, sep="\n")


=== Spearman Rank-Correlation of Feature Importances ===
             EBM  SKlearn_NN
EBM        1.000       0.147
SKlearn_NN 0.147       1.000


In [26]:
# --- 4) (Optional) Long-format correlations table ---
# Unpivot the square matrix into one row per (method1, method2) pair;
# self-pairs and both orderings of each pair are kept.
corr_long = corr.reset_index()
corr_long = corr_long.rename(columns={"index": "method1"})
corr_long = corr_long.melt(id_vars="method1", var_name="method2", value_name="spearman_r")
print("\n=== Pairwise Spearman Correlations (long format) ===", corr_long, sep="\n")


=== Pairwise Spearman Correlations (long format) ===
      method1     method2  spearman_r
0         EBM         EBM       1.000
1  SKlearn_NN         EBM       0.147
2         EBM  SKlearn_NN       0.147
3  SKlearn_NN  SKlearn_NN       1.000
