In [1]:
import pickle

import lime
import lime.lime_tabular
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import shap
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
def read_csv(df, sep = ","):
    """Load a delimited text file into a DataFrame.

    Args:
        df: Path, URL or file-like object handed to ``pandas.read_csv``
            (the parameter name is kept for backward compatibility).
        sep: Field delimiter; defaults to a comma.

    Returns:
        The ``pandas.DataFrame`` produced by ``pandas.read_csv``.
    """
    # `sep` has to be passed by keyword: pandas 2.x accepts only the first
    # argument of read_csv positionally and raises TypeError otherwise.
    return pd.read_csv(df, sep=sep)

In [3]:
def to_csv(df, filepath):
    """Write ``df`` to ``filepath`` as CSV, leaving the index out of the file."""
    df.to_csv(path_or_buf=filepath, index=False)

In [4]:
def return_columns(df):
    """Return the ``columns`` attribute of ``df`` unchanged."""
    column_labels = df.columns
    return column_labels

In [9]:
def create_decision_tree(criterion="gini", splitter="best"):
    """Build an unfitted ``DecisionTreeClassifier``.

    Args:
        criterion: Forwarded to the classifier's ``criterion`` argument.
        splitter: Forwarded to the classifier's ``splitter`` argument.

    Returns:
        A new classifier whose ``random_state`` is fixed to 1.
    """
    tree_options = dict(criterion=criterion, splitter=splitter, random_state=1)
    return DecisionTreeClassifier(**tree_options)

In [10]:
def fit_decision_tree(train_X, train_y, model):
    """Call ``model.fit(train_X, train_y)``; the model is fitted in place and nothing is returned."""
    fit = model.fit
    fit(train_X, train_y)

In [11]:
def calculate_accuracy(testcases, prediction_result):
    """Return ``accuracy_score(testcases, prediction_result)`` as a percentage rounded to two decimals."""
    fraction_correct = accuracy_score(testcases, prediction_result)
    percentage = fraction_correct * 100.00
    return round(percentage, 2)

In [12]:
def train_decision_tree_agreed_disagreed(all_cases, features):
    """Fit a default decision tree that predicts the ``Agreed`` column.

    Args:
        all_cases: Table holding the ``features`` columns and an ``Agreed`` column.
        features: Column labels used as model inputs.

    Returns:
        The classifier built by ``create_decision_tree()`` after it has been
        passed through ``fit_decision_tree``.
    """
    inputs, target = all_cases[features], all_cases.Agreed
    classifier = create_decision_tree()
    fit_decision_tree(inputs, target, classifier)
    return classifier

In [14]:
def save_ML_model(file_name, model):
    """Pickle ``model`` to ``<file_name>.obj``.

    Args:
        file_name: Target path without extension; ``'.obj'`` is appended.
        model: Any picklable object.
    """
    # The context manager flushes and closes the handle even if pickling fails;
    # previously the handle was left open.
    with open(file_name + '.obj', 'wb') as filehandler:
        pickle.dump(model, filehandler)

In [15]:
def generate_predictions(model, testcases):
    """Return whatever ``model.predict`` yields for ``testcases``."""
    predictions = model.predict(testcases)
    return predictions

In [16]:
def load_ML_model(file_to_open):
    """Unpickle and return the object stored at ``file_to_open``.

    Args:
        file_to_open: Full path of the pickle file (including any extension).

    Returns:
        The object reconstructed by ``pickle.load``.
    """
    # SECURITY: unpickling can execute arbitrary code, so only load files that
    # come from a trusted source.
    # The context manager closes the handle; previously it was left open.
    with open(file_to_open, 'rb') as filehandler:
        return pickle.load(filehandler)

In [17]:
def calculate_similarity(prediction_result1, prediction_result2):
    """Return ``accuracy_score`` of the two prediction sequences as a percentage rounded to two decimals."""
    matching_fraction = accuracy_score(prediction_result1, prediction_result2)
    percentage = matching_fraction * 100.00
    return round(percentage, 2)

In [18]:
def number_of_disagreed_cases(testcases, similarity):
    """Convert a similarity percentage into a rounded count of non-matching cases.

    Args:
        testcases: Any sized collection; only its length is used.
        similarity: Percentage (0-100) of cases on which two predictions match.

    Returns:
        ``len(testcases)`` times the non-matching proportion, rounded with ``round``.
    """
    total_cases = len(testcases)
    disagreement_fraction = 1 - (similarity / 100)
    return round(total_cases * disagreement_fraction)

In [19]:
def return_agreed_cases(testcases, prediction_result1, prediction_result2):
    """Return the rows of ``testcases`` on which both prediction sets match.

    Predictions are compared by position (row ``i`` of ``testcases`` against
    entry ``i`` of each prediction sequence), so pandas Series with arbitrary
    index labels work the same way as lists or NumPy arrays.

    Args:
        testcases: DataFrame of test cases.
        prediction_result1: Predictions of the first model, one per row.
        prediction_result2: Predictions of the second model, one per row.

    Returns:
        A new DataFrame with the matching rows in their original order and an
        extra ``Agreed`` column set to 1. ``testcases`` itself is not modified.

    Raises:
        IndexError: If either prediction sequence has fewer entries than
            ``testcases`` has rows.
    """
    case_count = len(testcases)
    # np.asarray drops any pandas index so the comparison is purely positional.
    first = np.asarray(prediction_result1)[:case_count]
    second = np.asarray(prediction_result2)[:case_count]
    if len(first) < case_count or len(second) < case_count:
        raise IndexError("each prediction sequence needs one entry per test case")

    agreement_mask = first == second
    # Work on an explicit copy so adding the column neither touches the input
    # nor triggers SettingWithCopyWarning.
    agreed = testcases[agreement_mask].copy()
    agreed['Agreed'] = 1
    return agreed

In [20]:
def return_disagreed_cases(testcases, prediction_result1, prediction_result2):
    """Return the rows of ``testcases`` on which the two prediction sets differ.

    Rows are selected by position with a boolean mask, so the result is correct
    even when ``testcases`` has duplicate index labels, and the rows keep their
    original order. (The earlier implementation selected by index-label set
    difference, which dropped every row sharing a label with an agreed row and
    returned rows sorted by label.)

    Args:
        testcases: DataFrame of test cases.
        prediction_result1: Predictions of the first model, one per row.
        prediction_result2: Predictions of the second model, one per row.

    Returns:
        A new DataFrame with the differing rows and an extra ``Agreed`` column
        set to 0. ``testcases`` itself is not modified.

    Raises:
        IndexError: If either prediction sequence has fewer entries than
            ``testcases`` has rows.
    """
    case_count = len(testcases)
    # np.asarray drops any pandas index so the comparison is purely positional.
    first = np.asarray(prediction_result1)[:case_count]
    second = np.asarray(prediction_result2)[:case_count]
    if len(first) < case_count or len(second) < case_count:
        raise IndexError("each prediction sequence needs one entry per test case")

    disagreement_mask = first != second
    # Explicit copy: adding the column must not touch the caller's frame.
    disagreed = testcases[disagreement_mask].copy()
    disagreed['Agreed'] = 0
    return disagreed

In [21]:
#     return feature_importance_output.plot.barh()
def print_feature_importance(fittedmodel, features):
    """Print and plot the ``feature_importances_`` of a fitted model.

    Args:
        fittedmodel: Fitted estimator exposing ``feature_importances_``.
        features: Feature names, used as the row labels; expected to have one
            entry per importance value.
    """
    # Name the column at construction time; DataFrame.set_axis(..., inplace=True)
    # no longer exists in pandas 2.x.
    feature_importance_output = pd.DataFrame(
        fittedmodel.feature_importances_, index=features, columns=['Output']
    )
    print(feature_importance_output)
    feature_importance_output.plot.barh(figsize=(9,5))
    # Render immediately, like the other plotting helpers in this notebook.
    plt.show()

In [22]:
def print_feature_importance_regression(fittedmodel, features):
    """Plot the coefficients (``coef_``) of a fitted linear model as horizontal bars.

    Args:
        fittedmodel: Fitted estimator exposing ``coef_``.
        features: Feature names, used as the row labels; the flattened
            coefficients must have one entry per feature.
    """
    # Flatten to a single column so each coefficient lines up with one feature.
    coefficients = fittedmodel.coef_.reshape(-1,1)
    # Name the column at construction time; DataFrame.set_axis(..., inplace=True)
    # no longer exists in pandas 2.x.
    coefficient_frame = pd.DataFrame(
        coefficients, index=features, columns=['Feature coefficient']
    )
    coefficient_frame.plot.barh(figsize=(9,5))
    plt.show()

In [23]:
def feature_importance_for_disagreement(features, agreed_cases, disagreed_cases):
    """Fit a decision tree on agreed vs. disagreed cases and report its feature importances.

    Args:
        features: Column labels used as model inputs.
        agreed_cases: DataFrame holding the ``features`` columns and an ``Agreed`` column.
        disagreed_cases: DataFrame with the same columns as ``agreed_cases``.
    """
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    all_cases = pd.concat([agreed_cases, disagreed_cases])
    X_train = all_cases[features]
    y_train = all_cases['Agreed']
    model = create_decision_tree()
    fit_decision_tree(X_train, y_train, model)
    print_feature_importance(model, features)

In [24]:
def feature_importance_comparison(model1, model2, features, name1, name2):
    """Plot the ``feature_importances_`` of two fitted models side by side.

    Args:
        model1: First fitted estimator exposing ``feature_importances_``.
        model2: Second fitted estimator exposing ``feature_importances_``.
        features: Feature names, used as the row labels.
        name1: Legend/column name for ``model1``.
        name2: Legend/column name for ``model2``.
    """
    feature_importance1_output = pd.DataFrame(model1.feature_importances_, features)
    feature_importance2_output = pd.DataFrame(model2.feature_importances_, features)
    concat_df = pd.concat([feature_importance1_output, feature_importance2_output], axis=1)
    # Plain attribute assignment replaces set_axis(..., inplace=True), which no
    # longer exists in pandas 2.x; it also copes with name1 == name2.
    concat_df.columns = [name1, name2]
    concat_df.plot.barh(figsize=(9,7))
    plt.show()

In [25]:
def feature_importance_comparison_regression(model1, model2, features, name1, name2):
    """Plot the coefficients (``coef_``) of two fitted linear models side by side.

    Args:
        model1: First fitted estimator exposing ``coef_``.
        model2: Second fitted estimator exposing ``coef_``.
        features: Feature names, used as the row labels; each model's flattened
            coefficients must have one entry per feature.
        name1: Legend/column name for ``model1``.
        name2: Legend/column name for ``model2``.
    """
    # Flatten each coefficient array to a single column aligned with `features`.
    coefficients1 = pd.DataFrame((model1.coef_).reshape(-1,1), features)
    coefficients2 = pd.DataFrame((model2.coef_).reshape(-1,1), features)
    concat_df = pd.concat([coefficients1, coefficients2], axis=1)
    # Plain attribute assignment replaces set_axis(..., inplace=True), which no
    # longer exists in pandas 2.x; it also copes with name1 == name2.
    concat_df.columns = [name1, name2]
    concat_df.plot.barh(figsize=(9,7))
    plt.show()

In [26]:
def disagreed_LIME(train1, train2, all_disagreed_testcases, model1, model2, features, classes):
    """Explain one randomly chosen disagreed case with LIME for two classifiers.

    A ``LimeTabularExplainer`` is built from each training set, a single row is
    drawn at random (``np.random.randint``) from the ``features`` columns of
    ``all_disagreed_testcases``, and both explanations are rendered in the
    notebook.

    Args:
        train1: Training data of ``model1`` (converted with ``to_numpy()``).
        train2: Training data of ``model2`` (converted with ``to_numpy()``).
        all_disagreed_testcases: DataFrame containing the ``features`` columns.
        model1: First classifier; needs ``predict`` and ``predict_proba``.
        model2: Second classifier; needs ``predict`` and ``predict_proba``.
        features: Feature names.
        classes: Class names; indexed with each model's predicted label, so
            predictions are presumably 0-based integer labels - verify against caller.

    Returns:
        Tuple ``(explanation 1 for label 1, explanation 2 for label 1,
        chosen case, classes[prediction of model1], classes[prediction of model2])``.
    """
    def _build_explainer(training_matrix):
        # Both explainers share every setting except the training data.
        return lime.lime_tabular.LimeTabularExplainer(training_matrix,
                                                      feature_names=features,
                                                      class_names=classes,
                                                      discretize_continuous=True)

    first_training_matrix = train1.to_numpy()
    second_training_matrix = train2.to_numpy()
    candidate_rows = all_disagreed_testcases[features].to_numpy()

    first_explainer = _build_explainer(first_training_matrix)

    picked = np.random.randint(0, len(candidate_rows))
    disagreed_case = candidate_rows[picked]

    first_explanation = first_explainer.explain_instance(
        disagreed_case, model1.predict_proba, num_features=len(features))
    first_weights = first_explanation.as_map()
    first_explanation.show_in_notebook()

    second_explainer = _build_explainer(second_training_matrix)

    second_explanation = second_explainer.explain_instance(
        disagreed_case, model2.predict_proba, num_features=len(features))
    second_weights = second_explanation.as_map()
    second_explanation.show_in_notebook()

    # The models expect a 2-D batch, hence the reshape of the single row.
    first_label = model1.predict(disagreed_case.reshape(1,-1))[0]
    second_label = model2.predict(disagreed_case.reshape(1,-1))[0]

    first_name = classes[first_label]
    second_name = classes[second_label]

    return first_weights[1], second_weights[1], disagreed_case, first_name, second_name

In [27]:
def disagreed_LIME_multi(train1, train2, all_disagreed_testcases, model1, model2, features, classes):
    """Multi-class variant of the LIME comparison for one random disagreed case.

    Each model's predicted label, shifted by ``min(classes)``, is used both as
    the LIME label to explain and as the position looked up in ``classes``.

    Args:
        train1: Training data of ``model1`` (converted with ``to_numpy()``).
        train2: Training data of ``model2`` (converted with ``to_numpy()``).
        all_disagreed_testcases: DataFrame containing the ``features`` columns.
        model1: First classifier; needs ``predict`` and ``predict_proba``.
        model2: Second classifier; needs ``predict`` and ``predict_proba``.
        features: Feature names.
        classes: Class labels; ``min(classes)`` is subtracted from predictions,
            so these are presumably consecutive integers - verify against caller.

    Returns:
        Tuple ``(explanation 1 for its predicted label, explanation 2 for its
        predicted label, chosen case, classes[index1], classes[index2])``.
    """
    def _build_explainer(training_matrix):
        # Both explainers share every setting except the training data.
        return lime.lime_tabular.LimeTabularExplainer(training_matrix,
                                                      feature_names=features,
                                                      class_names=classes,
                                                      discretize_continuous=True)

    def _predicted_position(model):
        # Predicted label of the chosen case, shifted by the smallest class label.
        return int(model.predict(disagreed_case.reshape(1,-1))[0]) - min(classes)

    first_training_matrix = train1.to_numpy()
    second_training_matrix = train2.to_numpy()
    candidate_rows = all_disagreed_testcases[features].to_numpy()

    first_explainer = _build_explainer(first_training_matrix)

    picked = np.random.randint(0, len(candidate_rows))
    disagreed_case = candidate_rows[picked]

    index1 = _predicted_position(model1)
    index2 = _predicted_position(model2)

    first_explanation = first_explainer.explain_instance(
        disagreed_case, model1.predict_proba, num_features=len(features), labels=[index1])
    first_weights = first_explanation.as_map()
    first_explanation.show_in_notebook()

    second_explainer = _build_explainer(second_training_matrix)

    second_explanation = second_explainer.explain_instance(
        disagreed_case, model2.predict_proba, num_features=len(features), labels=[index2])
    second_weights = second_explanation.as_map()
    second_explanation.show_in_notebook()

    first_name = classes[index1]
    second_name = classes[index2]

    return first_weights[index1], second_weights[index2], disagreed_case, first_name, second_name

In [28]:
def regression_LIME(train_set, test_set, model, features, class_name):
    """Render a LIME explanation for one randomly chosen row of ``test_set``.

    Args:
        train_set: Training data (converted with ``to_numpy()``) for the explainer.
        test_set: Data from which one row is drawn with ``np.random.randint``.
        model: Fitted regressor; its ``predict`` method is explained.
        features: Feature names.
        class_name: Passed to the explainer as ``class_names``.
    """
    explainer_options = dict(feature_names=features,
                             class_names=class_name,
                             verbose=True,
                             mode='regression')
    explainer = lime.lime_tabular.LimeTabularExplainer(train_set.to_numpy(), **explainer_options)

    candidate_rows = test_set.to_numpy()
    picked = np.random.randint(0, len(candidate_rows))
    chosen_row = candidate_rows[picked]

    explanation = explainer.explain_instance(chosen_row, model.predict, num_features=len(features))
    explanation.show_in_notebook()

In [29]:
def LIME_compare_bar_plot(features, exp1_map, exp2_map, exp1_result, exp2_result):
    """Plot two LIME explanations of the same case as grouped horizontal bars.

    Args:
        features: Feature names; after sorting, row ``i`` of each explanation is
            labelled ``features[i]``, so both explanations must have one entry
            per feature.
        exp1_map: First explanation - presumably the ``(feature_index, weight)``
            list returned by ``disagreed_LIME``; verify against caller.
        exp2_map: Second explanation, same layout as ``exp1_map``.
        exp1_result: Legend entry for the first explanation.
        exp2_result: Legend entry for the second explanation.
    """
    # sorted() builds new lists; the previous in-place .sort() reordered the
    # lists owned by the caller (and by the LIME explanation objects).
    exp1_map_df = pd.DataFrame(sorted(exp1_map))
    exp2_map_df = pd.DataFrame(sorted(exp2_map))

    # Replace the first column with the feature names so the frames can be merged on it.
    exp1_map_df[0] = features
    exp2_map_df[0] = features

    merged_df = pd.merge(exp1_map_df, exp2_map_df, on=0)

    merged_df.plot.barh(figsize=(9,7))
    plt.yticks(np.arange(len(features)), features)
    plt.legend([exp1_result, exp2_result])
    plt.show()

# Example: Sankey diagram built from two precomputed confusion matrices.
# NOTE(review): `younger_y_pred_confusion` and `older_y_pred_opposite_confusion`
# are not defined anywhere in the visible notebook; they must exist before this
# cell is run, otherwise it raises NameError.
labels = ['Younger Diabetes', 'Younger No Diabetes', 'Diabetes', 'No Diabetes', 'Older Diabetes', 'Older No Diabetes']

# Link i flows from node labels[sources[i]] to node labels[targets[i]].
sources = [0, 0, 1, 1, 4, 4, 5, 5] 

targets = [2, 3, 2, 3, 2, 3, 2, 3]

# Link weights, one per (source, target) pair above, in the same order:
#   num(predicted class1 by v1 that are of class1),
#   num(predicted class1 by v1 that are of class2), ...
values = [younger_y_pred_confusion[1][1], younger_y_pred_confusion[1][0],
          younger_y_pred_confusion[0][1], younger_y_pred_confusion[0][0],
          older_y_pred_opposite_confusion[1][1], older_y_pred_opposite_confusion[1][0],
          older_y_pred_opposite_confusion[0][1], older_y_pred_opposite_confusion[0][0],]

fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = labels,
      color = "blue"
    ),
    link = dict(
      source = sources,
      target = targets,
      value = values
  ))])

fig.update_layout(title_text="Sankey Diagram", font_size=10)
fig.show()

In [30]:
def draw_sankey_diagram(title, classes, pred1, pred2, ground_truth):
    """Show a three-column Sankey diagram built from two confusion matrices.

    The nodes form three consecutive groups of class labels. Links between the
    first and second group carry the entries of
    ``confusion_matrix(ground_truth, pred1, labels=classes)``; links between the
    second and third group carry those of
    ``confusion_matrix(ground_truth, pred2, labels=classes)``. Colours are drawn
    at random with ``np.random.randint`` on every call.

    Args:
        title: Figure title.
        classes: Class labels; node labels are generated for every integer from
            ``max(classes)`` down to ``min(classes)``, so these are presumably
            consecutive integers in ascending order - verify against caller.
        pred1: Predictions of the first model.
        pred2: Predictions of the second model.
        ground_truth: True labels.
    """
    number_of_classes = len(classes)

    # One label per integer between max(classes) and min(classes), highest first.
    classes_labels = ["Class " + str(value)
                      for value in range(max(classes), min(classes) - 1, -1)]
    node_labels = classes_labels * 3

    first_matrix = confusion_matrix(ground_truth, pred1, labels=classes)
    second_matrix = confusion_matrix(ground_truth, pred2, labels=classes)

    # Node ids of each group, listed from the highest id to the lowest.
    first_group = range(number_of_classes - 1, -1, -1)
    middle_group = range(2 * number_of_classes - 1, number_of_classes - 1, -1)
    last_group = range(3 * number_of_classes - 1, 2 * number_of_classes - 1, -1)

    # Each middle node repeated once per class: it is the target of the first
    # set of links and the source of the second set.
    middle_repeated = [node for node in middle_group for _ in range(number_of_classes)]
    first_cycled = [node for _ in range(number_of_classes) for node in first_group]
    last_cycled = [node for _ in range(number_of_classes) for node in last_group]

    link_sources = first_cycled + middle_repeated
    link_targets = middle_repeated + last_cycled

    link_values = list(first_matrix.ravel()) + list(second_matrix.ravel())

    # Random "rgba(r,g,b" stems; the alpha channel is appended per use below.
    colour_stems = ['rgba(' + ','.join(str(np.random.randint(0, high = 256)) for _ in range(3))
                    for _ in range(len(classes_labels))]
    node_colours = [stem + ',0.8)' for stem in colour_stems]
    link_palette = [stem + ',0.4)' for stem in colour_stems] * number_of_classes
    reversed_palette = link_palette[::-1]

    node_settings = dict(
        pad = 15,
        thickness = 20,
        line = dict(color = "black", width = 0.5),
        label = node_labels,
        color = node_colours * 3
    )
    link_settings = dict(
        source = link_sources,
        target = link_targets,
        value = link_values,
        color = reversed_palette + reversed_palette
    )

    fig = go.Figure(data=[go.Sankey(node = node_settings, link = link_settings)])
    fig.update_layout(title_text=title, font_size=12)
    fig.show()

In [31]:
def create_LinearExplainer(model, train_data, sample = None):
    """Build a ``shap.LinearExplainer`` for ``model``.

    Args:
        model: Fitted model handed to ``shap.LinearExplainer``.
        train_data: Background data for the explainer.
        sample: If given, ``train_data`` is first reduced with
            ``shap.sample(train_data, sample)``.

    Returns:
        The constructed ``shap.LinearExplainer``.
    """
    # Identity check: `!= None` would invoke element-wise comparison on
    # array-like arguments.
    if sample is not None:
        train_data = shap.sample(train_data, sample)
    return shap.LinearExplainer(model, train_data)

In [32]:
def create_KernelExplainer(model, train_data, sample = None):
    """Build a ``shap.KernelExplainer`` around ``model.predict_proba``.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        train_data: Background data for the explainer.
        sample: If given, ``train_data`` is first reduced with
            ``shap.sample(train_data, sample)``.

    Returns:
        The constructed ``shap.KernelExplainer``.
    """
    # Identity check: `!= None` would invoke element-wise comparison on
    # array-like arguments.
    if sample is not None:
        train_data = shap.sample(train_data, sample)
    return shap.KernelExplainer(model.predict_proba, train_data)

In [33]:
def create_TreeExplainer(model, train_data, sample = None):
    """Build a ``shap.TreeExplainer`` for ``model``.

    Args:
        model: Fitted tree-based model handed to ``shap.TreeExplainer``.
        train_data: Background data for the explainer.
        sample: If given, ``train_data`` is first reduced with
            ``shap.sample(train_data, sample)``.

    Returns:
        The constructed ``shap.TreeExplainer``.
    """
    # Identity check: `!= None` would invoke element-wise comparison on
    # array-like arguments.
    if sample is not None:
        train_data = shap.sample(train_data, sample)
    return shap.TreeExplainer(model, train_data)

In [34]:
def print_SHAP_info(shap_explainer, model, test_data, index):
    """Print the SHAP breakdown of one sample next to the model's own prediction.

    Args:
        shap_explainer: Explainer exposing ``shap_values(data)`` and
            ``expected_value``; ``shap_values`` is expected to return an array
            with one row per sample.
        model: Fitted model exposing ``predict``.
        test_data: DataFrame of samples to explain.
        index: Positional index of the sample to report.
    """
    shap_values = shap_explainer.shap_values(test_data)
    base_value = shap_explainer.expected_value
    instance = test_data.iloc[index]

    print("Base Value : ", base_value)
    print()
    print("Shap Values for Sample %d: " %(index), shap_values[index])
    print("\n")
    # The model expects a 2-D batch, hence the reshape of the single row.
    print("Prediction From Model                            : ", model.predict(instance.to_numpy().reshape(1,-1))[0])
    # Only the selected sample's SHAP values add up to its prediction; summing
    # the whole matrix (as before) mixed in every other sample.
    print("Prediction From Adding SHAP Values to Base Value : ", base_value + shap_values[index].sum())
    print("Input instance:                                  : ", instance)
    

In [35]:
def global_regression_force_plot(shap_explainer, test_data):
    """Return a SHAP force plot covering every row of ``test_data``.

    Args:
        shap_explainer: Explainer exposing ``expected_value`` and ``shap_values``.
        test_data: Samples to explain.

    Returns:
        The object produced by ``shap.force_plot``.
    """
    shap.initjs()

    reference_value = shap_explainer.expected_value
    contributions = shap_explainer.shap_values(test_data)
    plot = shap.force_plot(reference_value, contributions, test_data)
    return plot

In [36]:
def global_classification_force_plot(shap_explainer, test_data, class_index, sample=None):
    """Return a SHAP force plot for one class of a classifier.

    Args:
        shap_explainer: Explainer whose ``expected_value`` and ``shap_values``
            result are both indexable by class.
        test_data: Samples to explain.
        class_index: Position of the class to plot.
        sample: If given, ``test_data`` is first reduced with
            ``shap.sample(test_data, sample)``.

    Returns:
        The object produced by ``shap.force_plot``.
    """
    shap.initjs()

    # Identity check: `!= None` would invoke element-wise comparison on
    # array-like arguments.
    if sample is not None:
        test_data = shap.sample(test_data, sample)

    base_value = shap_explainer.expected_value
    shap_values = shap_explainer.shap_values(test_data)

    return shap.force_plot(base_value[class_index], shap_values[class_index], test_data)

In [37]:
def regression_waterfall_plot(shap_explainer, test_data, sample_index):
    """Return the SHAP waterfall plot of one sample.

    Args:
        shap_explainer: Callable explainer; it is invoked as ``shap_explainer(test_data)``.
        test_data: Samples to explain.
        sample_index: Position of the sample to plot.

    Returns:
        Whatever ``shap.plots.waterfall`` returns.
    """
    shap.initjs()
    explanations = shap_explainer(test_data)
    chosen_explanation = explanations[sample_index]
    return shap.plots.waterfall(chosen_explanation)

In [38]:
def classification_waterfall_plot(shap_explainer, data, class_index, sample_index):
    """Return the SHAP waterfall plot of one sample for one class.

    Args:
        shap_explainer: Explainer whose ``shap_values`` result and
            ``expected_value`` are both indexable by class.
        data: DataFrame of samples; the row is taken with ``iloc``.
        class_index: Position of the class to plot.
        sample_index: Position of the sample to plot.

    Returns:
        Whatever ``shap.waterfall_plot`` returns.
    """
    shap.initjs()

    per_class_values = shap_explainer.shap_values(data)
    # Wrap the single sample in an Explanation, the input type waterfall_plot takes here.
    explanation = shap.Explanation(
        values=per_class_values[class_index][sample_index],
        base_values=shap_explainer.expected_value[class_index],
        data=data.iloc[sample_index],
    )
    return shap.waterfall_plot(explanation)

In [39]:
def shap_feature_importance(shap_explainer, train_data, features, classes, sample=None):
    """Return a SHAP bar summary plot of feature importance.

    Args:
        shap_explainer: Explainer exposing ``shap_values``.
        train_data: Samples to explain.
        features: Feature names shown on the plot.
        classes: Class names shown on the plot.
        sample: If given, ``train_data`` is first reduced with
            ``shap.sample(train_data, sample)``. Defaults to ``None`` (use all
            rows), matching the other SHAP helpers in this notebook.

    Returns:
        Whatever ``shap.summary_plot`` returns.
    """
    shap.initjs()
    # Identity check: `!= None` would invoke element-wise comparison on
    # array-like arguments.
    if sample is not None:
        train_data = shap.sample(train_data, sample)
    shap_values = shap_explainer.shap_values(train_data)
    return shap.summary_plot(shap_values, train_data, feature_names=features, plot_type="bar", class_names=classes)