In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import average_precision_score
from sklearn.metrics import accuracy_score

from sklearn import tree

from matplotlib import pyplot as plt

import lime
import lime.lime_tabular

import plotly.graph_objects as go

In [2]:
def check_missing_values(df):
    """Print ``True`` if ``df`` contains at least one null cell, else ``False``.

    Nothing is returned; the flag is only written to standard output.
    """
    null_mask = df.isnull()
    has_missing = null_mask.values.any()
    print(has_missing)

In [3]:
def split_to_2_predictors(dataset1, dataset2, features):
    """Select the ``features`` columns from two datasets.

    Returns a 2-tuple ``(dataset1[features], dataset2[features])``.
    """
    return dataset1[features], dataset2[features]

In [4]:
def split_to_2_targets(dataset1, dataset2, outcome):
    """Select the ``outcome`` column(s) from two datasets.

    Returns a 2-tuple ``(dataset1[outcome], dataset2[outcome])``.
    """
    return dataset1[outcome], dataset2[outcome]

In [5]:
def train_test_split_data(dataset_X, dataset_y, test_size=0.25, random_state=1, stratify=False):
    """Shuffle and split predictors and targets into train and test subsets.

    Parameters
    ----------
    dataset_X, dataset_y
        Predictors and targets, forwarded to ``train_test_split``.
    test_size
        Forwarded as ``test_size`` (default 0.25, the value that used to be
        hard-coded here).
    random_state
        Seed forwarded as ``random_state`` (default 1, as before).
    stratify : bool
        When true, ``dataset_y`` is passed as the ``stratify`` argument so the
        split is stratified on the targets. The default ``False`` keeps the
        original, non-stratified behaviour.

    Returns
    -------
    Whatever ``train_test_split`` returns for the two inputs.
    """
    return train_test_split(
        dataset_X,
        dataset_y,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
        stratify=dataset_y if stratify else None,
    )

In [6]:
def create_decision_tree(criterion="gini", splitter="best", max_depth=6):
    """Build an unfitted ``DecisionTreeClassifier``.

    ``criterion``, ``splitter`` and ``max_depth`` are forwarded to the
    classifier; ``random_state`` is always fixed to 1.
    """
    return DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        random_state=1,
        max_depth=max_depth,
    )

In [7]:
def fit_decision_tree(train_X, train_y, model):
    """Fit ``model`` in place by calling ``model.fit(train_X, train_y)``.

    Returns ``None``; the value returned by ``fit`` is discarded.
    """
    model.fit(train_X, train_y)

In [8]:
def calculate_accuracy(testcases, prediction_result):
    """Return ``accuracy_score(testcases, prediction_result)`` as a percentage.

    The score is multiplied by 100 and rounded to two decimal places.
    """
    fraction_correct = accuracy_score(testcases, prediction_result)
    percentage = fraction_correct * 100.00
    return round(percentage, 2)

In [9]:
def train_decision_tree_agreed_disagreed(all_cases, features):
    """Train a default decision tree that predicts the ``Agreed`` column.

    The ``features`` columns of ``all_cases`` are the predictors and
    ``all_cases.Agreed`` is the target. The tree comes from
    ``create_decision_tree()`` with its default settings and is fitted via
    ``fit_decision_tree``. Returns the fitted model.
    """
    predictors = all_cases[features]
    agreement_labels = all_cases.Agreed
    tree_model = create_decision_tree()
    fit_decision_tree(predictors, agreement_labels, tree_model)
    return tree_model

In [10]:
def visualise_decision_tree(model, features, class_names):
    """Draw ``model`` with ``tree.plot_tree`` on a new 40x20 figure.

    ``features`` is passed as ``feature_names`` and ``class_names``
    (e.g. ``['0', '1']``) as ``class_names``; nodes are drawn filled.
    Returns the figure created for the plot.
    """
    figure = plt.figure(figsize=(40,20))
    tree.plot_tree(model, feature_names=features, class_names=class_names, filled=True)
    return figure

In [11]:
def save_ML_model(file_name_for_save, model_to_save):
    """Pickle ``model_to_save`` to ``file_name_for_save + '.obj'``.

    Parameters
    ----------
    file_name_for_save
        Path without extension; ``'.obj'`` is appended.
    model_to_save
        Any picklable object.

    The file is opened in a ``with`` block so the handle is flushed and closed
    even if pickling raises.
    """
    with open(file_name_for_save + '.obj', 'wb') as filehandler:
        pickle.dump(model_to_save, filehandler)

# For users:

In [12]:
def read_csv(filepath, sep=None):
    """Read a delimited text file into a DataFrame.

    Parameters
    ----------
    filepath
        Path or file-like object accepted by ``pandas.read_csv``.
    sep
        Field delimiter. With the default ``None`` pandas detects the
        delimiter itself, which requires the Python parsing engine.

    Returns
    -------
    pandas.DataFrame
    """
    if sep is None:
        # Delimiter sniffing is only available in the Python engine; naming it
        # explicitly avoids the engine-fallback warning.
        return pd.read_csv(filepath, sep=None, engine='python')
    # ``sep`` must be passed by keyword: pandas 2.x accepts only the path
    # positionally.
    return pd.read_csv(filepath, sep=sep)

In [13]:
def generate_predictions(model, testcases):
    """Return ``model.predict(testcases)`` unchanged."""
    predictions = model.predict(testcases)
    return predictions

In [14]:
def load_ML_model(file_to_open):
    """Load and return the object pickled in ``file_to_open``.

    Parameters
    ----------
    file_to_open
        Full path of the pickle file, including its extension.

    The file is opened in a ``with`` block so the handle is always closed.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # come from a trusted source.
    with open(file_to_open, 'rb') as filehandler:
        return pickle.load(filehandler)

In [15]:
def calculate_similarity(prediction_result1, prediction_result2):
    """Return the percentage of positions where two prediction sets agree.

    Computed as ``accuracy_score(prediction_result1, prediction_result2)``
    times 100, rounded to two decimal places.
    """
    agreement_fraction = accuracy_score(prediction_result1, prediction_result2)
    agreement_percentage = agreement_fraction * 100.00
    return round(agreement_percentage, 2)

In [16]:
def number_of_disagreed_cases(testcases, similarity):
    """Estimate how many test cases the two models disagree on.

    ``similarity`` is a percentage (0-100). The result is
    ``len(testcases) * (1 - similarity / 100)`` rounded to the nearest integer
    with the built-in ``round``.
    """
    agreed_fraction = similarity / 100
    disagreed_fraction = 1 - agreed_fraction
    return round(len(testcases) * disagreed_fraction)

In [17]:
def return_agreed_cases(testcases, prediction_result1, prediction_result2):
    """Return the rows of ``testcases`` on which both prediction sets agree.

    Parameters
    ----------
    testcases : pandas.DataFrame
        One row per test case.
    prediction_result1, prediction_result2
        Sequences of predictions aligned with ``testcases`` by position.
        Entries beyond ``len(testcases)`` are ignored.

    Returns
    -------
    pandas.DataFrame
        A copy of the agreeing rows, in their original order, with an extra
        ``'Agreed'`` column set to 1. ``testcases`` itself is not modified.

    Raises
    ------
    ValueError
        If either prediction sequence is shorter than ``testcases``.
    """
    n_cases = len(testcases)
    # Convert to plain arrays so the comparison is positional even when a
    # prediction is a pandas Series whose index is not 0..n-1.
    predictions1 = np.asarray(prediction_result1)[:n_cases]
    predictions2 = np.asarray(prediction_result2)[:n_cases]
    if len(predictions1) != n_cases or len(predictions2) != n_cases:
        raise ValueError("prediction results must cover every test case")

    agreement_mask = predictions1 == predictions2
    # Copy before adding the column so we never write into a slice of the
    # caller's frame.
    agreed = testcases.loc[agreement_mask].copy()
    agreed['Agreed'] = 1
    return agreed

In [18]:
def return_disagreed_cases(testcases, prediction_result1, prediction_result2):
    """Return the rows of ``testcases`` on which the two prediction sets differ.

    Parameters
    ----------
    testcases : pandas.DataFrame
        One row per test case.
    prediction_result1, prediction_result2
        Sequences of predictions aligned with ``testcases`` by position.
        Entries beyond ``len(testcases)`` are ignored.

    Returns
    -------
    pandas.DataFrame
        A copy of the disagreeing rows, in their original order, with an extra
        ``'Agreed'`` column set to 0. ``testcases`` itself is not modified.

    Raises
    ------
    ValueError
        If either prediction sequence is shorter than ``testcases``.
    """
    n_cases = len(testcases)
    predictions1 = np.asarray(prediction_result1)[:n_cases]
    predictions2 = np.asarray(prediction_result2)[:n_cases]
    if len(predictions1) != n_cases or len(predictions2) != n_cases:
        raise ValueError("prediction results must cover every test case")

    # Select by positional mask rather than by index-label set difference:
    # a label difference drops rows when index labels are duplicated.
    disagreement_mask = predictions1 != predictions2
    disagreed = testcases.loc[disagreement_mask].copy()
    disagreed['Agreed'] = 0
    return disagreed

In [19]:
#     return feature_importance_output.plot.barh()
def print_feature_importance(fittedmodel, features):
    """Print and bar-plot the feature importances of a fitted model.

    Parameters
    ----------
    fittedmodel
        Object exposing ``feature_importances_``.
    features
        Feature names, used as the row index; must have one entry per
        importance value.

    The importances are printed as a one-column table named ``'Output'`` and
    drawn as a horizontal bar chart (9x5).
    """
    # Name the column at construction time; ``set_axis(..., inplace=True)`` is
    # no longer accepted by pandas 2.x.
    importance_table = pd.DataFrame(
        fittedmodel.feature_importances_,
        index=features,
        columns=['Output'],
    )
    print(importance_table)
    importance_table.plot.barh(figsize=(9,5))

In [20]:
def print_feature_importance_regression(fittedmodel, features):
    """Bar-plot the coefficients of a fitted linear model.

    Parameters
    ----------
    fittedmodel
        Object exposing ``coef_``; it is reshaped to a single column, so it
        must hold exactly one coefficient per entry of ``features``.
    features
        Feature names, used as the row index.

    Draws a horizontal bar chart (9x5) of a one-column table named
    ``'Feature coefficient'`` and shows it.
    """
    coefficients = fittedmodel.coef_.reshape(-1,1)
    # Name the column at construction time; ``set_axis(..., inplace=True)`` is
    # no longer accepted by pandas 2.x.
    coefficient_table = pd.DataFrame(
        coefficients,
        index=features,
        columns=['Feature coefficient'],
    )
    coefficient_table.plot.barh(figsize=(9,5))
    plt.show()

In [21]:
def feature_importance_for_disagreement(features, agreed_cases, disagreed_cases, model = None):
    """Show which features drive agreement vs. disagreement between two models.

    Parameters
    ----------
    features
        Predictor column names.
    agreed_cases, disagreed_cases : pandas.DataFrame
        Frames that both contain the ``features`` columns and an ``'Agreed'``
        column; they are stacked row-wise (agreed first).
    model
        Optional model to report on. When ``None`` a default decision tree is
        created and fitted on the stacked cases with ``'Agreed'`` as target.
        A supplied model is used as-is and is NOT fitted here.

    The importances are reported through ``print_feature_importance``.
    """
    # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` is the
    # equivalent row-wise stack.
    all_cases = pd.concat([agreed_cases, disagreed_cases])
    X_train = all_cases[features]
    y_train = all_cases['Agreed']
    if model is None:
        model = create_decision_tree()
        fit_decision_tree(X_train, y_train, model)
    print_feature_importance(model, features)

In [22]:
def feature_importance_comparison(model1, model2, features, name1, name2):
    """Plot the feature importances of two fitted models side by side.

    Parameters
    ----------
    model1, model2
        Objects exposing ``feature_importances_``.
    features
        Feature names, used as the row index of both importance tables.
    name1, name2
        Column / legend names for ``model1`` and ``model2`` respectively.

    Draws a grouped horizontal bar chart (9x7) and shows it.
    """
    importance1 = pd.DataFrame(model1.feature_importances_, features)
    importance2 = pd.DataFrame(model2.feature_importances_, features)
    comparison = pd.concat([importance1, importance2], axis=1)
    # Plain column assignment replaces ``set_axis(..., inplace=True)``, which
    # pandas 2.x no longer accepts.
    comparison.columns = [name1, name2]
    comparison.plot.barh(figsize=(9,7))
    plt.show()

In [23]:
def feature_importance_comparison_regression(model1, model2, features, name1, name2):
    """Plot the coefficients of two fitted linear models side by side.

    Parameters
    ----------
    model1, model2
        Objects exposing ``coef_``; each is reshaped to a single column, so it
        must hold exactly one coefficient per entry of ``features``.
    features
        Feature names, used as the row index of both coefficient tables.
    name1, name2
        Column / legend names for ``model1`` and ``model2`` respectively.

    Draws a grouped horizontal bar chart (9x7) and shows it.
    """
    coefficients1 = pd.DataFrame((model1.coef_).reshape(-1,1), features)
    coefficients2 = pd.DataFrame((model2.coef_).reshape(-1,1), features)
    comparison = pd.concat([coefficients1, coefficients2], axis=1)
    # Plain column assignment replaces ``set_axis(..., inplace=True)``, which
    # pandas 2.x no longer accepts.
    comparison.columns = [name1, name2]
    comparison.plot.barh(figsize=(9,7))
    plt.show()

In [24]:
def disagreed_LIME(train1, train2, all_disagreed_testcases, model1, model2, features, classes):
    """Explain one randomly chosen disagreed case with LIME for both models.

    Parameters
    ----------
    train1, train2
        Training data for ``model1`` / ``model2``; each is converted with
        ``to_numpy()`` and used to build that model's explainer.
    all_disagreed_testcases
        Frame of disagreed cases; only its ``features`` columns are used.
    model1, model2
        Models exposing ``predict_proba`` and ``predict``.
    features
        Feature names; also sets how many features each explanation covers.
    classes
        Class names handed to the explainers. Also indexed with the value
        returned by ``predict`` -- assumes predicted labels are valid
        positions in ``classes``; TODO confirm with callers.

    Returns
    -------
    tuple
        ``(exp1_map[1], exp2_map[1], disagreed_case, exp1_result, exp2_result)``:
        the ``as_map()`` entries stored under key 1 for each model, the
        explained row, and ``classes[...]`` for each model's prediction.

    Both explanations are also rendered with ``show_in_notebook()``. The case
    is drawn with ``np.random.randint``, so the result depends on NumPy's
    global random state.
    """
    train1_numpy = train1.to_numpy()
    train2_numpy = train2.to_numpy()
    all_disagreed_testcases_numpy = all_disagreed_testcases[features].to_numpy()

    # Explainer for model1, built from model1's training data.
    explainer = lime.lime_tabular.LimeTabularExplainer(train1_numpy,
                                                       feature_names=features,
                                                       class_names=classes,
                                                       discretize_continuous=True)

    # Pick one disagreed row at random; the same row is explained for both models.
    i = np.random.randint(0, len(all_disagreed_testcases_numpy))
    disagreed_case = all_disagreed_testcases_numpy[i]

    exp1 = explainer.explain_instance(disagreed_case, model1.predict_proba, num_features=len(features))
    exp1_map = exp1.as_map()
    exp1.show_in_notebook()

    # Separate explainer for model2, built from model2's training data.
    explainer = lime.lime_tabular.LimeTabularExplainer(train2_numpy,
                                                       feature_names=features,
                                                       class_names=classes,
                                                       discretize_continuous=True)

    exp2 = explainer.explain_instance(disagreed_case, model2.predict_proba, num_features=len(features))
    exp2_map = exp2.as_map()
    exp2.show_in_notebook()
    
    # predict expects a 2-D input, hence the reshape to a single-row array.
    index1= model1.predict(disagreed_case.reshape(1,-1))[0]
    index2= model2.predict(disagreed_case.reshape(1,-1))[0]
    
    exp1_result = classes[index1]
    exp2_result = classes[index2]
    
    # Key 1 is hard-coded: no ``labels`` argument was passed to
    # explain_instance above -- presumably LIME then explains label 1 only;
    # verify against the LIME documentation.
    return exp1_map[1], exp2_map[1], disagreed_case, exp1_result, exp2_result

In [25]:
def disagreed_LIME_multi(train1, train2, all_disagreed_testcases, model1, model2, features, classes):
    """Explain one random disagreed case with LIME, per model's predicted class.

    Unlike ``disagreed_LIME``, each explanation is requested for the label the
    corresponding model actually predicts for the chosen case.

    Parameters
    ----------
    train1, train2
        Training data for ``model1`` / ``model2``; each is converted with
        ``to_numpy()`` and used to build that model's explainer.
    all_disagreed_testcases
        Frame of disagreed cases; only its ``features`` columns are used.
    model1, model2
        Models exposing ``predict_proba`` and ``predict``.
    features
        Feature names; also sets how many features each explanation covers.
    classes
        Class labels. ``min(classes)`` is subtracted from the integer
        prediction to get a position, which is then used as the LIME label,
        as the ``as_map()`` key and as an index into ``classes`` -- assumes
        the labels are consecutive integers in ascending order; TODO confirm.

    Returns
    -------
    tuple
        ``(exp1_map[index1], exp2_map[index2], disagreed_case, exp1_result,
        exp2_result)``.

    Both explanations are also rendered with ``show_in_notebook()``. The case
    is drawn with ``np.random.randint``, so the result depends on NumPy's
    global random state.
    """
    train1_numpy = train1.to_numpy()
    train2_numpy = train2.to_numpy()
    all_disagreed_testcases_numpy = all_disagreed_testcases[features].to_numpy()

    # Explainer for model1, built from model1's training data.
    explainer = lime.lime_tabular.LimeTabularExplainer(train1_numpy,
                                                       feature_names=features,
                                                       class_names=classes,
                                                       discretize_continuous=True)

    # Pick one disagreed row at random; the same row is explained for both models.
    i = np.random.randint(0, len(all_disagreed_testcases_numpy))
    disagreed_case = all_disagreed_testcases_numpy[i]
    
    # Offset of each model's predicted label from the smallest class label.
    index1= int(model1.predict(disagreed_case.reshape(1,-1))[0]) - min(classes)
    index2= int(model2.predict(disagreed_case.reshape(1,-1))[0]) - min(classes)

    exp1 = explainer.explain_instance(disagreed_case, model1.predict_proba, num_features=len(features), labels=[index1])
    exp1_map = exp1.as_map()
    exp1.show_in_notebook()

    # Separate explainer for model2, built from model2's training data.
    explainer = lime.lime_tabular.LimeTabularExplainer(train2_numpy,
                                                       feature_names=features,
                                                       class_names=classes,
                                                       discretize_continuous=True)

    exp2 = explainer.explain_instance(disagreed_case, model2.predict_proba, num_features=len(features), labels=[index2])
    exp2_map = exp2.as_map()
    exp2.show_in_notebook()
    
    exp1_result = classes[index1]
    exp2_result = classes[index2]
    
    return exp1_map[index1], exp2_map[index2], disagreed_case, exp1_result, exp2_result

In [26]:
def regression_LIME(train_set, test_set, model, features, class_name):
    """Show a LIME explanation for one random test row of a regression model.

    A regression-mode ``LimeTabularExplainer`` (verbose) is built from
    ``train_set``; a row of ``test_set`` is drawn with ``np.random.randint``
    and explained against ``model.predict`` over all ``features``. The
    explanation is rendered in the notebook; nothing is returned.
    """
    training_matrix = train_set.to_numpy()
    lime_explainer = lime.lime_tabular.LimeTabularExplainer(
        training_matrix,
        feature_names=features,
        class_names=class_name,
        verbose=True,
        mode='regression',
    )
    test_matrix = test_set.to_numpy()
    row_position = np.random.randint(0, len(test_matrix))
    chosen_row = test_matrix[row_position]
    explanation = lime_explainer.explain_instance(chosen_row, model.predict, num_features=len(features))
    explanation.show_in_notebook()

In [27]:
def LIME_compare_bar_plot(features, exp1_map, exp2_map, exp1_result, exp2_result):
    """Plot two LIME explanations of the same case as grouped horizontal bars.

    Parameters
    ----------
    features
        Feature names. They are assigned by position after sorting each map,
        so each map must contain exactly ``len(features)`` entries --
        presumably one per feature id 0..n-1; verify against the caller.
    exp1_map, exp2_map
        Lists of 2-tuples whose first element is the sort key (presumably
        LIME ``(feature_id, weight)`` pairs). The lists are NOT modified.
    exp1_result, exp2_result
        Legend labels for the first and second explanation.

    Draws a 9x7 chart with one tick per feature and shows it.
    """
    # Sort copies rather than calling ``list.sort()`` so the caller's
    # explanation lists keep their original order.
    exp1_map_df = pd.DataFrame(sorted(exp1_map))
    exp2_map_df = pd.DataFrame(sorted(exp2_map))

    # Column 0 held the sort key; replace it with the readable feature names.
    exp1_map_df[0] = features
    exp2_map_df[0] = features

    merged_df = pd.merge(exp1_map_df, exp2_map_df, on=0)

    merged_df.plot.barh(figsize=(9,7))
    plt.yticks(np.arange(len(features)),features)
    plt.legend([exp1_result, exp2_result])
    plt.show()

labels = ['Younger Diabetes', 'Younger No Diabetes', 'Diabetes', 'No Diabetes', 'Older Diabetes', 'Older No Diabetes']

sources = [0, 0, 1, 1, 4, 4, 5, 5] 

targets = [2, 3, 2, 3, 2, 3, 2, 3]

- # num(cases predicted as class 1 by v1 that are of class 1),
- # num(cases predicted as class 1 by v1 that are of class 2), …

values = [younger_y_pred_confusion[1][1], younger_y_pred_confusion[1][0],
          younger_y_pred_confusion[0][1], younger_y_pred_confusion[0][0],
          older_y_pred_opposite_confusion[1][1], older_y_pred_opposite_confusion[1][0],
          older_y_pred_opposite_confusion[0][1], older_y_pred_opposite_confusion[0][0],]

fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = labels,
      color = "blue"
    ),
    link = dict(
      source = sources,
      target = targets,
      value = values
  ))])

fig.update_layout(title_text="Sankey Diagram", font_size=10)
fig.show()

In [28]:
def draw_sankey_diagram(title, classes_labels, confusion_matrix1, confusion_matrix2):
    """Draw a three-column Sankey diagram from two confusion matrices.

    With ``n = len(classes_labels)`` the diagram has ``3 * n`` nodes: the class
    labels repeated three times (node ids ``0..n-1``, ``n..2n-1``,
    ``2n..3n-1``). The first ``n * n`` links run from the first node group to
    the second and take their widths from ``confusion_matrix1``; the next
    ``n * n`` links run from the second group to the third and take their
    widths from ``confusion_matrix2``.

    Parameters
    ----------
    title
        Figure title.
    classes_labels : list
        Class names; must be a list because it is repeated with ``* 3``.
    confusion_matrix1, confusion_matrix2
        Objects providing ``ravel()`` (e.g. NumPy arrays) -- presumably
        ``n x n`` so that there is one value per link; TODO confirm.

    Node and link colours are drawn with ``np.random.randint``, so they change
    between calls unless NumPy's global seed is fixed. The figure is shown and
    nothing is returned.
    """
    number_of_classes = len(classes_labels)

    labels = classes_labels * 3
    
    # Targets of the first link group: middle-column node ids from 2n-1 down
    # to n, each repeated n times.
    targets = list()
    for i in range(number_of_classes * 2 - 1, number_of_classes * 2 - number_of_classes - 1, -1):
        for j in range(1, 1 + number_of_classes):
            targets.append(i)
    # The same middle-column ids later serve as sources of the second group.
    start = targets.copy()
    # Targets of the second link group: last-column ids 3n-1 down to 2n,
    # with the whole descending run repeated n times.
    for i in range(number_of_classes):
        for j in range(3 * number_of_classes - 1, 2 * number_of_classes -1, -1):
            targets.append(j)

    # Sources of the first link group: first-column ids n-1 down to 0, with
    # the whole descending run repeated n times.
    sources = list()
    for i in range(number_of_classes):
        for j in range(number_of_classes - 1, -1, -1):
            sources.append(j)

    sources += start
    
    # Link widths: both matrices flattened, matrix 1 first.
    values = list(confusion_matrix1.ravel()) + list(confusion_matrix2.ravel())
    
    # One random 'rgba(r,g,b' prefix per class; the alpha value and closing
    # parenthesis are appended below.
    my_colors = [('rgba('+str(np.random.randint(0, high = 256))+','+
                str(np.random.randint(0, high = 256))+','+
                str(np.random.randint(0, high = 256))) for i in range(len(classes_labels))]
    my_colors_node = []
    my_colors_opac = []

    # Nodes use alpha 0.8, links the same colours at alpha 0.4.
    for rgba in my_colors:
        my_colors_node.append(rgba + ',0.8)')
        my_colors_opac.append(rgba + ',0.4)')
    my_colors_opac = my_colors_opac * number_of_classes
    
    fig = go.Figure(data=[go.Sankey(
    node = dict(
        pad = 15,
        thickness = 20,
        line = dict(color = "black", width = 0.5),    
        label = labels,
        color = my_colors_node * 3
    ),
    link = dict(
        source = sources,
        target = targets,
        value = values,
        # Reversed colour list, once per link group.
        color = my_colors_opac[::-1] + my_colors_opac[::-1]
    ))])

    fig.update_layout(title_text=title, font_size=12)
    fig.show()