In [55]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [56]:
# Read the cleaned dataset CSV (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='../database/cleaned_dataset.csv')

In [57]:
# Display the column labels of the loaded DataFrame
df.columns

Index(['age', 'balance_cash', 'balance_investment', 'balance_pension',
       'balance_3a', 'app_logins', 'income', 'has_twint', 'has_credit_card',
       'has_partner_account', 'residence_status', 'account_trx_ls_amount',
       'card_trx_ls_amount', 'investment_risk_score', 'vermoegenszuwachs_3mo',
       'vermoegenszuwachs_6mo', 'TARGET', 'ID'],
      dtype='object')

### Model

In [58]:
# Drop the 'ID' column so it is not used as a model feature
df_cleaned = df.drop(columns=['ID'])

# One-hot encode 'residence_status' when the column is present.
# drop_first=True omits the first category's dummy column.
if 'residence_status' in df_cleaned.columns:
    df_cleaned = pd.get_dummies(df_cleaned, columns=['residence_status'], drop_first=True)

# Balance the dataset by undersampling: both classes end up with as many rows
# as the smaller class. Taking the minimum (instead of assuming TARGET == 0 is
# the majority) avoids the ValueError that sample(n=...) raises when asked for
# more rows than the frame contains.
df_target_1 = df_cleaned[df_cleaned['TARGET'] == 1]
df_target_0 = df_cleaned[df_cleaned['TARGET'] == 0]
n_per_class = min(len(df_target_1), len(df_target_0))
if len(df_target_1) > n_per_class:
    # Only reached when class 1 is the larger class
    df_target_1 = df_target_1.sample(n=n_per_class, random_state=42)
df_target_0 = df_target_0.sample(n=n_per_class, random_state=42)
df_balanced = pd.concat([df_target_1, df_target_0]).reset_index(drop=True)

# Separate features and label
X = df_balanced.drop(columns=['TARGET'])
y = df_balanced['TARGET']

# Cast boolean columns to 0/1 integers once, on the full feature frame.
# Doing this before the split (rather than assigning into the frames returned
# by train_test_split) avoids pandas' SettingWithCopyWarning and guarantees
# that the train and test sets share the same dtypes.
bool_cols = X.select_dtypes(include=['bool']).columns.tolist()
X = X.astype({col: int for col in bool_cols})

# Hold out 20% of the balanced rows for testing (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [59]:
# Fit a 100-tree random forest on the training split (seeded for reproducibility)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Score the held-out split: hard labels plus the second predict_proba column
# (the probability assigned to class 1)
y_pred = rf.predict(X_test)
y_pred_proba = rf.predict_proba(X_test)[:, 1]

# Report overall accuracy and the per-class precision / recall / F1 table
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)

# Rank the features by the fitted forest's importances, largest first
importances = rf.feature_importances_
feature_importances = pd.Series(data=importances, index=X.columns).sort_values(ascending=False)
print("Feature Importances:\n", feature_importances)


Accuracy: 0.8580441640378549
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.92      0.85       143
           1       0.93      0.80      0.86       174

    accuracy                           0.86       317
   macro avg       0.86      0.86      0.86       317
weighted avg       0.87      0.86      0.86       317

Feature Importances:
 vermoegenszuwachs_3mo    0.202179
vermoegenszuwachs_6mo    0.183315
balance_cash             0.135244
income                   0.131366
card_trx_ls_amount       0.058409
account_trx_ls_amount    0.057712
age                      0.048022
balance_pension          0.042673
balance_3a               0.042081
app_logins               0.034578
balance_investment       0.009574
has_partner_account      0.008500
has_twint                0.007922
residence_status_b       0.007557
has_credit_card          0.006695
residence_status_ch      0.006125
residence_status_l       0.006009
investment_risk_scor

### Save model

In [60]:
# Persist the fitted random forest to disk with pickle.
# NOTE: only unpickle this file from a trusted location; pickle.load executes
# arbitrary code embedded in the file.
MODEL_PATH = Path('../models/random_forest_model.pkl')

# Create the target folder first: opening a file for writing raises
# FileNotFoundError when its parent directory does not exist.
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

with MODEL_PATH.open('wb') as file:
    pickle.dump(rf, file)

### SHAP

In [61]:
# File that receives the per-feature SHAP breakdown of the selected customers
SHAP_EXPORT_PATH = 'customer_shap_values.csv'

# Build the SHAP explainer for the fitted forest, passing the training set as
# the second (background data) argument
explainer = shap.Explainer(rf, X_train)

# Calculate SHAP values for the test set
shap_values = explainer(X_test)

# Show the shape of the raw SHAP array
print("Shape of shap_values.values:", shap_values.values.shape)

# The slice below needs a 3-D array (the recorded run printed
# (n_samples, n_features, 2)); fail with a clear message otherwise.
assert shap_values.values.ndim == 3, (
    f"expected 3-D SHAP values, got shape {shap_values.values.shape}"
)

# For binary classification, extract SHAP values for class 1
shap_values_array = shap_values.values[:, :, 1]  # Shape: (n_samples, n_features)

# Total attribution magnitude per test row: sum of absolute SHAP values
sum_abs_shap_values = np.sum(np.abs(shap_values_array), axis=1)  # Shape: (n_samples,)

# Row positions ordered from largest to smallest total magnitude
sorted_indices = np.argsort(-sum_abs_shap_values)

# Select the two rows with the largest and the two with the smallest totals
indices_high_shap = sorted_indices[:2]
indices_low_shap = sorted_indices[-2:]

print("Indices with highest SHAP values:", indices_high_shap)
print("Indices with lowest SHAP values:", indices_low_shap)

# Feature names
feature_names = X_test.columns


def explain_customer(idx):
    """Return the per-feature SHAP breakdown for one row of X_test.

    Parameters
    ----------
    idx : int
        Positional index of the row in ``X_test`` / ``shap_values_array``
        (not the DataFrame index label).

    Returns
    -------
    pandas.DataFrame
        One row per feature with the columns 'Customer Index', 'SHAP Sum',
        'Feature', 'Feature Value' and 'SHAP Value'.
    """
    return pd.DataFrame({
        'Customer Index': idx,
        'SHAP Sum': sum_abs_shap_values[idx],
        'Feature': feature_names,
        'Feature Value': X_test.iloc[idx].values,
        'SHAP Value': shap_values_array[idx],
    })


def report_customers(title, indices):
    """Print the SHAP breakdown of each selected customer and return the frames.

    Parameters
    ----------
    title : str
        Heading printed before the group of customers.
    indices : iterable of int
        Positional indices into ``X_test`` of the customers to report.

    Returns
    -------
    list of pandas.DataFrame
        The full explanation frame of every reported customer, in order.
    """
    print(title)
    explanations = []
    for idx in indices:
        explanation = explain_customer(idx)
        print(f"\nIndividual SHAP Values for customer index {idx}:")
        # Show the (up to 20) features with the largest absolute contribution first
        print(
            explanation[['Feature', 'Feature Value', 'SHAP Value']]
            .sort_values(by='SHAP Value', key=abs, ascending=False)
            .head(20)
        )
        explanations.append(explanation)
    return explanations


selected_explanations = (
    report_customers("\nCustomers with Highest SHAP Values:", indices_high_shap)
    + report_customers("\nCustomers with Lowest SHAP Values:", indices_low_shap)
)

# Write the collected breakdowns to disk so the confirmation message below is
# true (previously the message was printed although no file was written).
customer_shap_values = pd.concat(selected_explanations, ignore_index=True)
customer_shap_values.to_csv(SHAP_EXPORT_PATH, index=False)

print("\nSaved SHAP values for selected customers to 'customer_shap_values.csv'.")




Shape of shap_values.values: (317, 20, 2)
Indices with highest SHAP values: [232 273]
Indices with lowest SHAP values: [189 197]

Customers with Highest SHAP Values:

Individual SHAP Values for customer index 232:
                  Feature  Feature Value  SHAP Value
14  vermoegenszuwachs_6mo      504781.72   -0.284575
6                  income       13259.93    0.153689
13  vermoegenszuwachs_3mo      217006.28    0.102090
1            balance_cash      528762.09   -0.064937
0                     age          64.00   -0.031775
3         balance_pension           0.00    0.026533
4              balance_3a           0.00   -0.014408
5              app_logins           0.00   -0.011256
10  account_trx_ls_amount        2574.29   -0.006784
7               has_twint           1.00   -0.003506
17    residence_status_ch           1.00   -0.003029
18     residence_status_l           0.00    0.002427
15     residence_status_b           0.00   -0.001676
12  investment_risk_score           0.00   -

In [62]:
# Serialize the SHAP explainer object with pickle into the models folder
with open('../models/shap_explainer.pkl', 'wb') as explainer_file:
    pickle.dump(explainer, explainer_file)