In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import average_precision_score
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn.model_selection import cross_val_score

import lime
import lime.lime_tabular

import holoviews as hv

In [2]:
def read_csv(filepath):
    """Load a CSV file into a pandas DataFrame.

    ``filepath`` is handed to ``pd.read_csv`` unchanged, with all of that
    function's default options, and the resulting frame is returned.
    """
    return pd.read_csv(filepath)

In [3]:
def split_to_2_predictors(dataset1, dataset2, features):
    """Select the predictor columns from two datasets.

    Both datasets are indexed with the same ``features`` selection.

    Returns a 2-tuple: ``dataset1[features]`` followed by ``dataset2[features]``.
    """
    return dataset1[features], dataset2[features]

In [4]:
def split_to_2_targets(dataset1, dataset2, outcome):
    """Select the target column(s) from two datasets.

    Both datasets are indexed with the same ``outcome`` selection.

    Returns a 2-tuple: ``dataset1[outcome]`` followed by ``dataset2[outcome]``.
    """
    return dataset1[outcome], dataset2[outcome]

In [5]:
def train_test_split_data(dataset_X, dataset_y):
    """Split predictors and targets into train and test subsets.

    The split is shuffled with a fixed ``random_state`` of 1 and stratified
    on ``dataset_y``. No test size is passed, so scikit-learn's default
    applies.

    Returns whatever ``train_test_split`` returns for the two inputs.
    """
    split_options = {
        "random_state": 1,
        "shuffle": True,
        "stratify": dataset_y,
    }
    return train_test_split(dataset_X, dataset_y, **split_options)

In [6]:
def create_decision_tree(criterion="gini", splitter="best", max_depth=6):
    """Build an unfitted ``DecisionTreeClassifier``.

    ``criterion``, ``splitter`` and ``max_depth`` are forwarded to the
    classifier as given; ``random_state`` is always fixed to 1.
    """
    return DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        random_state=1,
    )

In [7]:
def fit_decision_tree(train_X, train_y, model):
    """Fit ``model`` on the given training data.

    Calls ``model.fit(train_X, train_y)`` for its effect on ``model``;
    nothing is returned.
    """
    model.fit(train_X, train_y)
    return None

In [8]:
def test_decision_tree(model, testcases):
    return model.predict(testcases)

In [9]:
def calculate_accuracy(testcases, prediction_result):
    """Return the accuracy of ``prediction_result`` as a percentage.

    ``testcases`` is passed as the first argument of ``accuracy_score`` and
    ``prediction_result`` as the second; the score is scaled by 100.
    """
    score = accuracy_score(testcases, prediction_result)
    return score * 100.00

In [10]:
def calculate_similarity(prediction_result1, prediction_result2):
    """Return how often two sets of predictions match, as a percentage.

    The two prediction sequences are compared with ``accuracy_score`` and
    the resulting score is scaled by 100.
    """
    score = accuracy_score(prediction_result1, prediction_result2)
    return score * 100.00

In [11]:
def number_of_disagreed_cases(testcases, similarity):
    """Convert a similarity percentage into a count of disagreed cases.

    ``similarity`` is a percentage on a 0-100 scale. Its complement is
    applied to ``len(testcases)`` and the product is rounded with the
    built-in ``round``.
    """
    disagreement_fraction = 1 - similarity / 100
    expected_count = len(testcases) * disagreement_fraction
    return round(expected_count)

In [12]:
def return_disagreed_cases(testcases, prediction_result1, prediction_result2):
    """Return the rows of ``testcases`` on which the two predictions differ.

    Parameters
    ----------
    testcases : pandas.DataFrame
        One row per test case, in the same order as the predictions.
    prediction_result1, prediction_result2 : array-like
        Predicted labels, one per row of ``testcases``. They are compared
        position by position.

    Returns
    -------
    pandas.DataFrame
        A copy of the disagreeing rows with an extra ``Agreed`` column set
        to 0. ``testcases`` itself is not modified.

    Raises
    ------
    ValueError
        If the two prediction arrays do not have the same shape.
    """
    first = np.asarray(prediction_result1)
    second = np.asarray(prediction_result2)
    if first.shape != second.shape:
        raise ValueError(
            "prediction results must have the same shape, got "
            f"{first.shape} and {second.shape}"
        )

    # Compare the labels themselves. XOR-ing them only compares truthiness,
    # so two different non-zero labels (1 vs 2, -1 vs +1) would wrongly be
    # treated as agreement, and non-numeric labels would not work at all.
    differs = first != second

    # Work on an explicit copy so that adding the marker column never writes
    # onto a filtered slice of the caller's frame.
    disagreed = testcases[differs].copy()
    disagreed['Agreed'] = 0
    return disagreed

In [13]:
def return_agreed_cases(testcases, prediction_result1, prediction_result2):
    """Return the rows of ``testcases`` on which the two predictions match.

    Parameters
    ----------
    testcases : pandas.DataFrame
        One row per test case, in the same order as the predictions.
    prediction_result1, prediction_result2 : array-like
        Predicted labels, one per row of ``testcases``. They are compared
        position by position.

    Returns
    -------
    pandas.DataFrame
        A copy of the agreeing rows, sorted by index label, with an extra
        ``Agreed`` column set to 1. ``testcases`` itself is not modified.

    Raises
    ------
    ValueError
        If the two prediction arrays do not have the same shape.
    """
    first = np.asarray(prediction_result1)
    second = np.asarray(prediction_result2)
    if first.shape != second.shape:
        raise ValueError(
            "prediction results must have the same shape, got "
            f"{first.shape} and {second.shape}"
        )

    # Select rows by position with an equality mask. Comparing the labels
    # directly handles any label set (not just 0/1), and a positional mask
    # keeps every agreeing row even when index labels are duplicated, which
    # a label-based set difference cannot do.
    matches = first == second

    # Sorting by index keeps the row order that selecting through a sorted
    # label difference used to give; the stable sort leaves rows with equal
    # labels in their original relative order.
    agreed = testcases[matches].sort_index(kind="mergesort").copy()
    agreed['Agreed'] = 1
    return agreed

In [14]:
def train_decision_tree_agreed_disagreed(all_cases, features):
    """Fit a decision tree that predicts the ``Agreed`` column.

    The tree is built by ``create_decision_tree()`` with its default
    settings and fitted on ``all_cases[features]`` against
    ``all_cases.Agreed``. The fitted model is returned.
    """
    classifier = create_decision_tree()
    fit_decision_tree(all_cases[features], all_cases.Agreed, classifier)
    return classifier

In [18]:
def disagreed_LIME(train1, train2, all_disagreed_testcases, model1, model2, features, classes,
                   case_index=None):
    """Explain one disagreed test case with LIME, once for each model.

    Parameters
    ----------
    train1, train2 : objects with ``to_numpy()``
        Training data handed to the LIME explainer built for ``model1`` and
        ``model2`` respectively.
    all_disagreed_testcases : pandas.DataFrame
        Candidate cases; only the ``features`` columns are used.
    model1, model2 : fitted models
        Must provide ``predict`` and ``predict_proba``.
    features : list
        Feature names, also used as the number of features to explain.
    classes : sequence
        Class names. They are indexed with the predicted label, so this
        assumes labels are integer positions into ``classes``.
    case_index : int, optional
        Position of the case to explain. When omitted, a case is drawn with
        ``np.random.randint``, as before.

    Returns
    -------
    tuple
        ``(exp1_map[1], exp2_map[1], disagreed_case, exp1_result, exp2_result)``:
        entry ``1`` of each explanation's ``as_map()``, the explained row as
        a numpy array, and the class name each model predicts for it.

    Raises
    ------
    ValueError
        If ``all_disagreed_testcases`` has no rows.

    Both explanations are also rendered with ``show_in_notebook()``.
    """
    train1_numpy = train1.to_numpy()
    train2_numpy = train2.to_numpy()
    all_disagreed_testcases_numpy = all_disagreed_testcases[features].to_numpy()

    # Fail early with a clear message instead of the opaque "low >= high"
    # error np.random.randint raises for an empty range.
    if len(all_disagreed_testcases_numpy) == 0:
        raise ValueError(
            "all_disagreed_testcases is empty: there is no disagreed case to explain"
        )

    explainer = _build_lime_explainer(train1_numpy, features, classes)

    if case_index is None:
        case_index = np.random.randint(0, len(all_disagreed_testcases_numpy))
    disagreed_case = all_disagreed_testcases_numpy[case_index]

    exp1 = explainer.explain_instance(disagreed_case, model1.predict_proba, num_features=len(features))
    exp1_map = exp1.as_map()
    exp1.show_in_notebook()

    # The second model gets its own explainer, built from its own training data.
    explainer = _build_lime_explainer(train2_numpy, features, classes)

    exp2 = explainer.explain_instance(disagreed_case, model2.predict_proba, num_features=len(features))
    exp2_map = exp2.as_map()
    exp2.show_in_notebook()

    # predict expects a 2-D input, hence the reshape to a single-row array.
    single_row = disagreed_case.reshape(1, -1)
    exp1_result = classes[model1.predict(single_row)[0]]
    exp2_result = classes[model2.predict(single_row)[0]]

    # NOTE(review): only key 1 of each map is returned; explain_instance is
    # called without `labels`, so presumably that is its default label.
    return exp1_map[1], exp2_map[1], disagreed_case, exp1_result, exp2_result


def _build_lime_explainer(training_data, features, classes):
    """Create a LimeTabularExplainer over ``training_data`` with discretized continuous features."""
    return lime.lime_tabular.LimeTabularExplainer(training_data,
                                                  feature_names=features,
                                                  class_names=classes,
                                                  discretize_continuous=True)

In [16]:
#     return feature_importance_output.plot.barh()
def print_feature_importance(fittedmodel, features):
    """Print and plot the feature importances of a fitted model.

    Parameters
    ----------
    fittedmodel : object
        Fitted estimator exposing ``feature_importances_``.
    features : list
        Feature names, used as the row labels of the printed table and of
        the bar chart.

    Returns
    -------
    The ``feature_importances_`` value read from ``fittedmodel``, unchanged.

    The importances are printed as a one-column table named ``Output`` and
    drawn as a horizontal bar chart.
    """
    feature_importance = fittedmodel.feature_importances_
    # Name the column at construction time: renaming afterwards with
    # set_axis(..., inplace=True) raises TypeError on pandas 2.0+, where the
    # `inplace` argument no longer exists.
    feature_importance_output = pd.DataFrame(
        feature_importance, index=features, columns=['Output']
    )
    print(feature_importance_output)
    feature_importance_output.plot.barh()
    return feature_importance

In [17]:
# class_names e.g. ['0', '1']
def visualise_decision_tree(model, features, class_names):
    """Draw ``model`` with ``tree.plot_tree`` and return the figure.

    A new figure is created with ``figsize=(40, 20)`` before plotting.
    ``features`` and ``class_names`` (e.g. ``['0', '1']``) label the nodes,
    and the nodes are drawn filled.
    """
    figure = plt.figure(figsize=(40, 20))
    # The value returned by plot_tree is not needed; only the figure is returned.
    tree.plot_tree(
        model,
        feature_names=features,
        class_names=class_names,
        filled=True,
    )
    return figure