In [11]:
import pickle
import pandas as pd
import shap
import numpy as np
import matplotlib.pyplot as plt

In [12]:
# Tag that selects which persisted model/explainer pair is loaded below.
name = "lr"

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load artifacts that come from a trusted training run.
model_path = f"model_{name}.pkl"
with open(model_path, "rb") as model_file:
    model = pickle.load(model_file)

explainer_path = f"explainer_{name}.pkl"
with open(explainer_path, "rb") as explainer_file:
    explainer = pickle.load(explainer_file)

def predict(data_list):
    """Run the module-level ``model`` on a single sample.

    Parameters
    ----------
    data_list : sequence
        Feature values of one sample. It is wrapped in a list so that
        ``model.predict`` receives a one-element batch.

    Returns
    -------
    Whatever ``model.predict`` returns for that one-element batch.
    """
    single_sample_batch = [data_list]
    return model.predict(single_sample_batch)

In [13]:
def prepare(data_list):
    """Compute SHAP values for one sample and bundle them with its data.

    Parameters
    ----------
    data_list : list
        Feature values of a single sample (the caller in this notebook
        passes one CSV row as a flat list).

    Returns
    -------
    dict
        ``'data_frame'``   - first row of the transposed frame as an array,
        ``'shap_values'``  - transposed output of ``shap_values(...)``,
        ``'feature_names'`` - column labels of the transposed frame.
    """
    # For a flat list pandas builds a single column; transposing turns that
    # column into one row whose columns are the individual features.
    row_frame = pd.DataFrame(data_list).T
    sample_values = row_frame.values[0]

    # Iterating a DataFrame yields its column labels.
    column_labels = list(row_frame)

    # NOTE(review): a new TreeExplainer is built on every call and the
    # module-level `explainer` loaded from disk is not used here - confirm
    # that this is intended.
    tree_explainer = shap.TreeExplainer(model)
    raw_shap = tree_explainer.shap_values(sample_values, check_additivity=False)

    return {
        'data_frame': sample_values,
        'shap_values': raw_shap.transpose(),
        'feature_names': column_labels,
    }

In [14]:
def show_shap(data_list, row_number=None):
    """Print the feature with the largest mean absolute SHAP value.

    The previous version formatted ``i`` into the message before the
    ``for i in ...`` loop bound it; because that loop made ``i`` a local
    name, every call raised ``UnboundLocalError``. The row label is now an
    explicit, optional argument and the loop uses its own variable.

    Parameters
    ----------
    data_list : list
        Feature values of a single sample.
    row_number : optional
        Label printed as the row identifier. When omitted, the notebook-level
        counter ``i`` is used if it exists, otherwise ``"?"``.

    Returns
    -------
    None
        The explanation is written to stdout.
    """
    if row_number is None:
        # Fall back to the loop counter kept by the driver cell.
        row_number = globals().get("i", "?")

    data_frame = pd.DataFrame(data_list)

    # NOTE(review): a new TreeExplainer is built on every call and the
    # module-level `explainer` loaded from disk is not used here - confirm
    # that this is intended.
    tree_explainer = shap.TreeExplainer(model)
    raw_shap = tree_explainer.shap_values(data_frame, check_additivity=False)
    shap_values = raw_shap.transpose()

    # NOTE(review): the indexing below assumes that averaging over axis 0
    # leaves a 2-D array whose first row holds one value per feature -
    # TODO confirm against the explainer's actual output shape.
    importances = np.asarray(np.abs(shap_values).mean(axis=0)[0])
    max_shap = np.max(importances)

    print(f"row {row_number} is safe mainly because> ", end='')

    # Report every position that ties for the maximum; the last one is the
    # position whose input value gets printed.
    index = 0
    for position in np.flatnonzero(importances == max_shap):
        print(f"column index {position}", end=' ')
        index = position

    # Row `index` of the single-column frame holds that feature's value.
    print(f"is: {data_frame.iloc[index, 0]}")

In [15]:
def get_feature_importance_ranking(data_list):
    """Print features ranked by mean absolute SHAP value, largest first.

    ``prepare`` is called once and its result reused; the previous version
    called it twice, which rebuilt the explainer and recomputed the SHAP
    values for the same input.

    Parameters
    ----------
    data_list : list
        Feature values of a single sample, forwarded to ``prepare``.

    Returns
    -------
    None
        The ranking, rounded to 4 decimals and transposed so that each
        feature is a column, is written to stdout.
    """
    prepared = prepare(data_list)
    shap_values = prepared['shap_values']
    feature_names = prepared['feature_names']

    feature_importances = np.abs(shap_values).mean(axis=0)

    importance_df = pd.DataFrame(
        {'feature': feature_names, 'importance': feature_importances}
    ).sort_values(by='importance', ascending=False)

    print(importance_df.round(4).transpose())

In [16]:
def plot_shap_value_distribution(data_list):
    """Show one histogram of SHAP values per column of the SHAP array.

    ``prepare`` is called once and its result reused (it was previously
    called twice for the same input). Each histogram is drawn on its own
    explicitly created figure, which is closed after being shown so figures
    do not accumulate across iterations.

    Parameters
    ----------
    data_list : list
        Feature values of a single sample, forwarded to ``prepare``.

    Returns
    -------
    None
        Figures are displayed via ``plt.show()``.
    """
    prepared = prepare(data_list)
    shap_values = prepared['shap_values']
    feature_names = prepared['feature_names']

    for i in range(shap_values.shape[1]):
        # Build the title first so a bad index fails before a figure exists.
        title = f"Distribution of SHAP Values for {feature_names[i]}"

        fig, ax = plt.subplots()
        ax.hist(shap_values[:, i])
        ax.set_xlabel("SHAP Value")
        ax.set_ylabel("Count")
        ax.set_title(title)
        plt.show()
        plt.close(fig)


In [17]:
def explain_individual_datapoint(data_list):
    """Print each feature name next to the matching entry of ``shap_values[0]``.

    ``prepare`` is called once and its result reused; the previous version
    called it twice, recomputing the SHAP values for the same input.

    Parameters
    ----------
    data_list : list
        Feature values of a single sample, forwarded to ``prepare``.

    Returns
    -------
    None
        One ``Feature: ..., SHAP Value: ...`` line per pair is written to
        stdout; pairing stops at the shorter of the two sequences.
    """
    prepared = prepare(data_list)
    feature_names = prepared['feature_names']
    first_row = prepared['shap_values'][0]

    for feature_name, shap_value in zip(feature_names, first_row):
        print(f"Feature: {feature_name}, SHAP Value: {shap_value}")

In [18]:
# Load the test rows and explain every row the model labels as class 1.
data = pd.read_csv('base files/creditcard_test.csv')
data_list = data.values.tolist()

i = 0  # 1-based row counter; stays 0 when the file has no rows
for i, row in enumerate(data_list, start=1):
    res = predict(row)
    if res[0] != 1:
        continue

    # Best effort: a failure on one row is printed and the loop moves on.
    try:
        # Alternative reports - swap the active call to use one of them:
        #   show_shap(row)
        #   plot_shap_value_distribution(row)
        #   explain_individual_datapoint(row)
        get_feature_importance_ranking(row)
    except Exception as e:
        print(e)

In [19]:
# Completion marker: shows that the notebook ran through to its last cell.
print("done")

done
