In [64]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import shap
from xgboost import XGBClassifier
from collections import OrderedDict
import matplotlib.pyplot as plt
import copy
import math
import tabulate
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import pairwise_distances
from scipy.stats import spearmanr

In [99]:
data = pd.read_csv('student-por.csv')
print(data.shape)

data['G2'] = round(data['G2']/2)
y = np.where(data['G2']>5, 1, 0)

data = data[['sex', 'age', 'studytime', 'failures', 'absences', 'activities', 'G1', 'Medu', 'Fedu']]
originial_features = data.columns
data['G1'] = round(data['G1']/2)
sex = {'M': 0, 'F': 1}
activities = {'no': 0, 'yes': 1}
data = data.replace({'sex': sex})
data = data.replace({'activities': activities})
categorical_vars ={}
X = data
features = X.columns
print(features)


(649, 33)
Index(['sex', 'age', 'studytime', 'failures', 'absences', 'activities', 'G1',
       'Medu', 'Fedu'],
      dtype='object')


In [166]:
########################## GERMAN

data = pd.read_csv('german_credit.csv')

data =  data[['Purpose', 'Telephone', 'Account Balance', 'Duration of Credit (month)', 'Payment Status of Previous Credit',
            'Credit Amount', 'Guarantors', 'Age (years)', 'Creditability']]
#Purpose is a categorical variable, but some of its levels are merged
data.loc[data.Purpose <= 1, 'Credit Purpose'] = 1
data.loc[((data.Purpose>1) & (data.Purpose<=5)), 'Credit Purpose'] = 2
data.loc[data.Purpose>5, 'Credit Purpose'] = 3

data.loc[data['Payment Status of Previous Credit'] <= 1, 'Credit History'] = 1
data.loc[data['Payment Status of Previous Credit'] > 1, 'Credit History'] = 2

data.loc[data['Guarantors'] == 1, 'Guarantor'] = 1
data.loc[data['Guarantors'] > 1, 'Guarantor'] = 2


# making boolean series for a team name 
filter1 = (data["Credit History"]== 1)
filter2 = (data["Creditability"] == 0)

filter3 = (data["Credit History"]== 1)
filter4 = (data["Creditability"] == 1)


data['Weight'] = np.where((filter1 & filter2 | filter3 & filter4), .8, .1)
stupid_dataset = data.sample(frac=.4, random_state=1111, weights = data['Weight'])
data = stupid_dataset
y = data['Creditability']
data = data.drop(columns=['Purpose', 'Creditability', 'Weight',  'Payment Status of Previous Credit', 'Guarantors'])

# y = data['Creditability']
# data = data.drop(columns=['Purpose', 'Creditability', 'Payment Status of Previous Credit', 'Guarantors'])


# The data contains some categorical variables which need to be one-hot-encoded
categorical_vars = {'Credit Purpose': 3}
#check whether I want to keep purpose variable like this or merge certain purposes, also something weird with number of levels
purpose = {1:'Car', 2:'Home Related', 3:'Other'}
sex_marital_status = {1:'Male+Divorced', 2:'Male+Single', 3:'Male+Married/Widowed', 4:'Female'}
concurrent_credit = {1: 'Other Banks', 2:'Dept.Stores', 3:'None'}
telephone = {1: 'Yes', 2: 'No'}
foreign_worker = {1: 'Yes', 2: 'No'}


data_one_hot_encoded = data.replace({"Credit Purpose": purpose})
data_one_hot_encoded = pd.get_dummies(data_one_hot_encoded, columns=categorical_vars.keys())


X = data_one_hot_encoded
features = X.columns.values
print(features)


 

['Telephone' 'Account Balance' 'Duration of Credit (month)'
 'Credit Amount' 'Age (years)' 'Credit History' 'Guarantor'
 'Credit Purpose_Car' 'Credit Purpose_Home Related' 'Credit Purpose_Other']


In [204]:
# The one-hot- encoded data is splitted in a train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=10)
# The one-hot-encoded training data is further splitted into a validation- and training set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=1)

X_train, X_val, X_test = X_train.values, X_val.values, X_test.values


In [205]:
model = XGBClassifier(max_depth=3, min_child_weight=1, gamma = 0.2, missing = np.nan)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0.2,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [206]:
# Here a shap explainer is made to explain the prediction for the test set
shap_explainer = shap.TreeExplainer(model)
shap_values_test = shap_explainer.shap_values(X_test)

print(shap_values_test)



[[-1.52077209e-02  4.29019362e-01 -2.84021329e-02  7.04068065e-01
  -4.31192398e-01  2.91840851e-01 -3.74038704e-02  2.34752335e-02
   0.00000000e+00  1.05744880e-02]
 [-1.79828089e-02 -4.99776155e-01 -7.14113355e-01  3.64628732e-02
  -1.55133054e-01  2.43428200e-01 -5.15499488e-02  7.21724257e-02
   0.00000000e+00 -7.86137860e-03]
 [ 5.44339698e-03 -4.16906416e-01  3.61595362e-01 -3.98885041e-01
  -4.63943243e-01  3.59641165e-01 -4.20507267e-02  5.43503575e-02
   0.00000000e+00 -1.48123130e-03]
 [-1.62343979e-02  7.90923536e-01 -1.44427657e-01  5.25869727e-01
   1.79433212e-01  2.57238388e-01 -5.16829528e-02 -5.90474950e-03
   0.00000000e+00  1.23130409e-02]
 [ 2.28518294e-03 -8.34467933e-02  5.42844892e-01  5.28681517e-01
  -3.46059874e-02  4.52760428e-01 -6.20385744e-02  2.01710984e-02
   0.00000000e+00 -1.04879160e-04]
 [-1.62343979e-02  6.56295598e-01 -1.18756361e-01  4.94234711e-01
   6.87885210e-02  2.62910157e-01 -4.52951379e-02  2.44751759e-02
   0.00000000e+00 -1.04879160e-04

In [207]:
def norm_shap(shaps):
    return (shaps-min(shaps))/(max(shaps) - min(shaps))


#transforms the probability of a prediction to confidence of prediction, by scaling number between 0 and 1
def probability_to_confidence(prob):
    return ((prob-0.5)/0.5)

#maybe do this for " most critical", second most critical etc
def calc_impact(theta, n):
    number_of_impacts = 0
    normalized_shap_values = []
    for i in range(0, X_test.shape[0]):
        test_pred = model.predict([X_test[i]])
        test_prob = model.predict_proba([X_test[i]])[0]
        test_prob_pred = test_prob[test_pred[0]]
        confidence = probability_to_confidence(test_prob_pred)

        #get top2 indeces out of absolute values 
        shaps = abs(shap_values_test[i])
        normalized_shaps = norm_shap(shaps)
        sorted_indices = np.argsort(shaps)
        critical = sorted_indices[len(sorted_indices) - n]
        new_x = X_test[i].copy()
        new_x[critical] = np.nan
        test_pred_new = model.predict([new_x])
        test_prob_new = model.predict_proba([new_x])[0]
        test_prob_pred_new = test_prob_new[test_pred_new[0]]
        test_confidence_new = probability_to_confidence(test_prob_pred_new)
        
        number_of_impacts += ((test_pred_new != test_pred) or (test_confidence_new <= confidence*theta))
        
        normalized_shap_values.append(normalized_shaps[critical])
    
    print(str(sum(normalized_shap_values)/len(normalized_shap_values)))
    return number_of_impacts/X_test.shape[0]

calc_impact(0.5, 1)
calc_impact(0.5, 2)
calc_impact(0.5, 3)
calc_impact(0.5, 4)
calc_impact(0.5, 5)
calc_impact(0.5, 6)
calc_impact(0.5, 7)
calc_impact(0.5, 8)
calc_impact(0.5, 9)
# for i in range(1, 10):
#     print(calc_impact(0.5, i))

# test_pred = model.predict(X_test)
# test_prob = model.predict_proba(X_test)
# true_class_label = y_test[i]
# predicted_class_label = test_pred[i]

# prob_predicted_class_label = test_prob[i]
# print(prob_predicted_class_label[0])



1.0
0.6431545451283455
0.4883608128875494
0.3124878199771047
0.1647652474232018
0.07394859744235874
0.03852517502382398
0.018854462902527302
0.0053153777107581845


0.0